Line data Source code
1 : /* SLP - Basic Block Vectorization
2 : Copyright (C) 2007-2026 Free Software Foundation, Inc.
3 : Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 : and Ira Rosen <irar@il.ibm.com>
5 :
6 : This file is part of GCC.
7 :
8 : GCC is free software; you can redistribute it and/or modify it under
9 : the terms of the GNU General Public License as published by the Free
10 : Software Foundation; either version 3, or (at your option) any later
11 : version.
12 :
13 : GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 : WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 : FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 : for more details.
17 :
18 : You should have received a copy of the GNU General Public License
19 : along with GCC; see the file COPYING3. If not see
20 : <http://www.gnu.org/licenses/>. */
21 :
22 : #include "config.h"
23 : #define INCLUDE_ALGORITHM
24 : #include "system.h"
25 : #include "coretypes.h"
26 : #include "backend.h"
27 : #include "target.h"
28 : #include "rtl.h"
29 : #include "tree.h"
30 : #include "gimple.h"
31 : #include "tree-pass.h"
32 : #include "ssa.h"
33 : #include "optabs-tree.h"
34 : #include "insn-config.h"
35 : #include "recog.h" /* FIXME: for insn_data */
36 : #include "fold-const.h"
37 : #include "stor-layout.h"
38 : #include "gimple-iterator.h"
39 : #include "cfgloop.h"
40 : #include "tree-vectorizer.h"
41 : #include "langhooks.h"
42 : #include "gimple-walk.h"
43 : #include "dbgcnt.h"
44 : #include "tree-vector-builder.h"
45 : #include "vec-perm-indices.h"
46 : #include "gimple-fold.h"
47 : #include "internal-fn.h"
48 : #include "dump-context.h"
49 : #include "cfganal.h"
50 : #include "tree-eh.h"
51 : #include "tree-cfg.h"
52 : #include "alloc-pool.h"
53 : #include "sreal.h"
54 : #include "predict.h"
55 :
/* Evaluate to (S)->first_element after asserting (via gcc_checking_assert)
   that (S)->dr_aux.dr is null.  Judging by the name this is meant to access
   the first element of the reduction group S belongs to -- NOTE(review):
   confirm against the declaration of first_element.  */
56 : #define REDUC_GROUP_FIRST_ELEMENT(S) \
57 : (gcc_checking_assert (!(S)->dr_aux.dr), (S)->first_element)
58 :
/* Forward declarations of static helpers (presumably defined further down
   in this file).  The last two parameters of vect_transform_slp_perm_load_1
   are optional and default to nullptr and false respectively.  */
59 : static bool vect_transform_slp_perm_load_1 (vec_info *, slp_tree,
60 : load_permutation_t &,
61 : const vec<tree> &,
62 : gimple_stmt_iterator *,
63 : poly_uint64, bool, bool,
64 : unsigned *,
65 : unsigned * = nullptr,
66 : bool = false);
67 : static int vectorizable_slp_permutation_1 (vec_info *, gimple_stmt_iterator *,
68 : slp_tree, lane_permutation_t &,
69 : vec<slp_tree> &, bool);
70 : static void vect_print_slp_tree (dump_flags_t, dump_location_t, slp_tree);
71 : static bool vect_slp_can_convert_to_external (const vec<stmt_vec_info> &);
72 :
/* Pool that backs _slp_tree::operator new and operator delete.  Created by
   vect_slp_init and destroyed (and reset to NULL) by vect_slp_fini.  */
73 : static object_allocator<_slp_tree> *slp_tree_pool;
/* Head of the doubly-linked list (prev_node / next_node) of all SLP nodes
   that currently exist.  The _slp_tree constructor pushes onto it and the
   destructor unlinks from it.  */
74 : static slp_tree slp_first_node;
75 :
/* Set up SLP node allocation by creating the "SLP nodes" object pool that
   _slp_tree::operator new allocates from.  */
76 : void
77 1113429 : vect_slp_init (void)
78 : {
79 1113429 : slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
80 1113429 : }
81 :
/* Tear down SLP node allocation: delete every SLP node that is still on
   the global node list, then destroy the pool and clear its pointer.  */
82 : void
83 1113429 : vect_slp_fini (void)
84 : {
/* Deleting the head runs ~_slp_tree, which advances slp_first_node to the
   next node, so the loop ends once the list is empty.  */
85 1776573 : while (slp_first_node)
86 663144 : delete slp_first_node;
87 2226858 : delete slp_tree_pool;
88 1113429 : slp_tree_pool = NULL;
89 1113429 : }
90 :
/* Allocate raw storage for one SLP node from slp_tree_pool.  N is asserted
   to be exactly sizeof (_slp_tree).  */
91 : void *
92 7711454 : _slp_tree::operator new (size_t n)
93 : {
94 7711454 : gcc_assert (n == sizeof (_slp_tree));
95 7711454 : return slp_tree_pool->allocate_raw ();
96 : }
97 :
/* Hand the storage of NODE back to slp_tree_pool.  N is asserted to be
   exactly sizeof (_slp_tree).  */
98 : void
99 7711454 : _slp_tree::operator delete (void *node, size_t n)
100 : {
101 7711454 : gcc_assert (n == sizeof (_slp_tree));
102 7711454 : slp_tree_pool->remove_raw (node);
103 7711454 : }
104 :
105 :
106 : /* Initialize a SLP node: push it onto the front of the global node list
      headed by slp_first_node and reset every field to its empty or default
      value.  The new node starts with a reference count of 1.  */
107 :
108 7711454 : _slp_tree::_slp_tree ()
109 : {
/* Link this node in as the new list head.  */
110 7711454 : this->prev_node = NULL;
111 7711454 : if (slp_first_node)
112 6750082 : slp_first_node->prev_node = this;
113 7711454 : this->next_node = slp_first_node;
114 7711454 : slp_first_node = this;
/* All vectors start out as vNULL.  */
115 7711454 : SLP_TREE_SCALAR_STMTS (this) = vNULL;
116 7711454 : SLP_TREE_SCALAR_OPS (this) = vNULL;
117 7711454 : SLP_TREE_LIVE_LANES (this) = vNULL;
118 7711454 : SLP_TREE_VEC_DEFS (this) = vNULL;
119 7711454 : SLP_TREE_CHILDREN (this) = vNULL;
120 7711454 : SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
121 7711454 : SLP_TREE_LANE_PERMUTATION (this) = vNULL;
/* The def type and code are placeholders; the vect_create_new_slp_node
   overloads set the def type (and, for one overload, the code).  */
122 7711454 : SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
123 7711454 : SLP_TREE_CODE (this) = ERROR_MARK;
124 7711454 : SLP_TREE_GS_SCALE (this) = 0;
125 7711454 : SLP_TREE_GS_BASE (this) = NULL_TREE;
126 7711454 : this->ldst_lanes = false;
127 7711454 : this->avoid_stlf_fail = false;
128 7711454 : SLP_TREE_VECTYPE (this) = NULL_TREE;
129 7711454 : SLP_TREE_REPRESENTATIVE (this) = NULL;
130 7711454 : this->cycle_info.id = -1;
131 7711454 : this->cycle_info.reduc_idx = -1;
132 7711454 : SLP_TREE_REF_COUNT (this) = 1;
/* FAILED and DATA are owned pointers; the destructor releases them when
   they are non-null.  */
133 7711454 : this->failed = NULL;
134 7711454 : this->max_nunits = 1;
135 7711454 : this->lanes = 0;
136 7711454 : SLP_TREE_TYPE (this) = undef_vec_info_type;
137 7711454 : this->data = NULL;
138 7711454 : }
139 :
140 : /* Tear down a SLP node: unlink it from the global node list, release all
      of its vectors and free the FAILED and DATA members if set.  Child
      nodes are not freed here; vect_free_slp_tree handles those.  */
141 :
142 7711454 : _slp_tree::~_slp_tree ()
143 : {
/* Unlink from the doubly-linked list; a node without a predecessor is the
   list head, so the head pointer moves to its successor.  */
144 7711454 : if (this->prev_node)
145 4663488 : this->prev_node->next_node = this->next_node;
146 : else
147 3047966 : slp_first_node = this->next_node;
148 7711454 : if (this->next_node)
149 5816335 : this->next_node->prev_node = this->prev_node;
150 7711454 : SLP_TREE_CHILDREN (this).release ();
151 7711454 : SLP_TREE_SCALAR_STMTS (this).release ();
152 7711454 : SLP_TREE_SCALAR_OPS (this).release ();
153 7711454 : SLP_TREE_LIVE_LANES (this).release ();
154 7711454 : SLP_TREE_VEC_DEFS (this).release ();
155 7711454 : SLP_TREE_LOAD_PERMUTATION (this).release ();
156 7711454 : SLP_TREE_LANE_PERMUTATION (this).release ();
/* FAILED is released with free, DATA with delete.  */
157 7711454 : if (this->failed)
158 2037326 : free (failed);
159 7711454 : if (this->data)
160 1239221 : delete this->data;
161 7711454 : }
162 :
163 : /* Push the single SSA definition in DEF to the vector of vector defs.
      For a PHI the definition is the PHI result; for any other statement it
      is the single def operand found with SSA_OP_ALL_DEFS.  The push uses
      quick_push -- presumably the caller must have reserved space in
      vec_defs beforehand; confirm against vec.h.  */
164 :
165 : void
166 528123 : _slp_tree::push_vec_def (gimple *def)
167 : {
168 528123 : if (gphi *phi = dyn_cast <gphi *> (def))
169 58968 : vec_defs.quick_push (gimple_phi_result (phi));
170 : else
171 : {
172 469155 : def_operand_p defop = single_ssa_def_operand (def, SSA_OP_ALL_DEFS);
173 469155 : vec_defs.quick_push (get_def_from_ptr (defop));
174 : }
175 528123 : }
176 :
177 : /* Drop one reference to the SLP tree rooted at NODE.  Only when the
      reference count reaches zero is the node deleted, after recursively
      dropping one reference on each of its non-null children.  */
178 :
179 : void
180 14607938 : vect_free_slp_tree (slp_tree node)
181 : {
182 14607938 : int i;
183 14607938 : slp_tree child;
184 :
/* Other references remain, so the node must stay alive.  */
185 14607938 : if (--SLP_TREE_REF_COUNT (node) != 0)
186 14607938 : return;
187 :
/* The children vector may contain null entries, which are skipped.  */
188 10924675 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
189 3876365 : if (child)
190 3519902 : vect_free_slp_tree (child);
191 :
192 7048310 : delete node;
193 : }
194 :
195 : /* Return a location suitable for dumps related to the SLP instance: the
      statement of the first root stmt if there are root stmts, otherwise the
      statement of the first scalar stmt of the root node.  */
196 :
197 : dump_user_location_t
198 3390688 : _slp_instance::location () const
199 : {
200 3390688 : if (!root_stmts.is_empty ())
201 316323 : return root_stmts[0]->stmt;
202 : else
203 3074365 : return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
204 : }
205 :
206 :
207 : /* Free the memory allocated for the SLP instance INSTANCE: drop its
      reference on the SLP tree, release the vectors it owns and free the
      instance structure itself with free.  */
208 :
209 : void
210 1551364 : vect_free_slp_instance (slp_instance instance)
211 : {
212 1551364 : vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
213 1551364 : SLP_INSTANCE_LOADS (instance).release ();
214 1551364 : SLP_INSTANCE_ROOT_STMTS (instance).release ();
215 1551364 : SLP_INSTANCE_REMAIN_DEFS (instance).release ();
216 1551364 : instance->subgraph_entries.release ();
217 1551364 : instance->cost_vec.release ();
218 1551364 : free (instance);
219 1551364 : }
220 :
221 :
222 : /* Create a SLP node with NOPS children with CODE, either VEC_PERM_EXPR
223 : for a permute node or else ERROR_MARK (any other CODE trips the assert).
      The node is a vect_internal_def without scalar stmts; space for NOPS
      children is allocated but no child is pushed.  */
224 :
225 : slp_tree
226 95876 : vect_create_new_slp_node (unsigned nops, tree_code code)
227 : {
228 95876 : gcc_assert (code == ERROR_MARK || code == VEC_PERM_EXPR);
229 95876 : slp_tree node = new _slp_tree;
230 95876 : SLP_TREE_SCALAR_STMTS (node) = vNULL;
231 95876 : SLP_TREE_CHILDREN (node).create (nops);
232 95876 : SLP_TREE_DEF_TYPE (node) = vect_internal_def;
233 95876 : SLP_TREE_CODE (node) = code;
234 95876 : return node;
235 : }
236 :
237 : /* Create a SLP node inplace at NODE for SCALAR_STMTS and NOPS children.
      NODE takes over the SCALAR_STMTS vector, becomes a vect_internal_def,
      uses SCALAR_STMTS[0] as its representative and gets one lane per
      scalar stmt.  SCALAR_STMTS is indexed at 0, so it must not be empty.
      Returns NODE.  */
238 :
239 : static slp_tree
240 3744535 : vect_create_new_slp_node (slp_tree node,
241 : vec<stmt_vec_info> scalar_stmts, unsigned nops)
242 : {
243 3744535 : SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
244 3744535 : SLP_TREE_CHILDREN (node).create (nops);
245 3744535 : SLP_TREE_DEF_TYPE (node) = vect_internal_def;
246 3744535 : SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
247 3744535 : SLP_TREE_LANES (node) = scalar_stmts.length ();
248 3744535 : return node;
249 : }
250 :
251 : /* Create an SLP node for SCALAR_STMTS and NOPS children by allocating a
      fresh node and initializing it with the inplace overload.  */
252 :
253 : static slp_tree
254 8164 : vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
255 : {
256 8164 : return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
257 : }
258 :
259 : /* Create a vect_external_def SLP node inplace at NODE for scalar
260 : operands OPS.  NODE takes over the OPS vector and gets one lane per
      operand.  Returns NODE.  */
261 :
262 : static slp_tree
263 1823423 : vect_create_new_slp_node (slp_tree node, vec<tree> ops)
264 : {
265 1823423 : SLP_TREE_SCALAR_OPS (node) = ops;
266 1823423 : SLP_TREE_DEF_TYPE (node) = vect_external_def;
267 0 : SLP_TREE_LANES (node) = ops.length ();
268 1823423 : return node;
269 : }
270 :
271 : /* Create a vect_external_def SLP node for scalar operands OPS by
      allocating a fresh node and initializing it with the inplace
      overload.  */
272 :
273 : static slp_tree
274 1823423 : vect_create_new_slp_node (vec<tree> ops)
275 : {
276 1823423 : return vect_create_new_slp_node (new _slp_tree, ops);
277 : }
278 :
279 :
280 : /* This structure is used in creation of an SLP tree. Each instance
281 : corresponds to the same operand in a group of scalar stmts in an SLP
282 : node.  Instances are allocated by vect_create_oprnd_info and released
      by vect_free_oprnd_info.  */
283 : typedef struct _slp_oprnd_info
284 : {
285 : /* Def-stmts for the operands, one entry per scalar stmt. */
286 : vec<stmt_vec_info> def_stmts;
287 : /* Operands, one entry per scalar stmt. */
288 : vec<tree> ops;
289 : /* Information about the first statement, its vector def-type, type, the
290 : operand itself in case it's constant, and an indication if it's a pattern
291 : stmt and gather/scatter info.  FIRST_GS_INFO is only filled in when
      FIRST_GS_P is set; vect_create_oprnd_info leaves it uninitialized. */
292 : tree first_op_type;
293 : enum vect_def_type first_dt;
294 : bool any_pattern;
295 : bool first_gs_p;
296 : gather_scatter_info first_gs_info;
297 : } *slp_oprnd_info;
298 :
299 :
300 : /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
301 : operand.  Returns a vector of NOPS freshly allocated entries whose
      def_stmts and ops vectors have room for GROUP_SIZE elements.  The
      result is to be released with vect_free_oprnd_info.  */
302 : static vec<slp_oprnd_info>
303 3322682 : vect_create_oprnd_info (int nops, int group_size)
304 : {
305 3322682 : int i;
306 3322682 : slp_oprnd_info oprnd_info;
307 3322682 : vec<slp_oprnd_info> oprnds_info;
308 :
309 3322682 : oprnds_info.create (nops);
310 11918375 : for (i = 0; i < nops; i++)
311 : {
/* Note that first_gs_info is not initialized here.  */
312 5273011 : oprnd_info = XNEW (struct _slp_oprnd_info);
313 5273011 : oprnd_info->def_stmts.create (group_size);
314 5273011 : oprnd_info->ops.create (group_size);
315 5273011 : oprnd_info->first_dt = vect_uninitialized_def;
316 5273011 : oprnd_info->first_op_type = NULL_TREE;
317 5273011 : oprnd_info->any_pattern = false;
318 5273011 : oprnd_info->first_gs_p = false;
319 5273011 : oprnds_info.quick_push (oprnd_info);
320 : }
321 :
322 3322682 : return oprnds_info;
323 : }
324 :
325 :
326 : /* Free operands info: release the def_stmts and ops vectors of every
      entry of OPRNDS_INFO, free each entry, then release OPRNDS_INFO
      itself.  */
327 :
328 : static void
329 3322682 : vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
330 : {
331 3322682 : int i;
332 3322682 : slp_oprnd_info oprnd_info;
333 :
334 8595693 : FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
335 : {
336 5273011 : oprnd_info->def_stmts.release ();
337 5273011 : oprnd_info->ops.release ();
338 5273011 : XDELETE (oprnd_info);
339 : }
340 :
341 3322682 : oprnds_info.release ();
342 3322682 : }
343 :
344 : /* Return the execution frequency of NODE (so that a higher value indicates
345 : a "more important" node when optimizing for speed).  It is computed as
      the profile count of the basic block holding the statement that
      vect_orig_stmt yields for NODE's representative, scaled by the count
      of the function's entry block.  NODE's representative is dereferenced
      unconditionally.  */
346 :
347 : static sreal
348 3465920 : vect_slp_node_weight (slp_tree node)
349 : {
350 3465920 : stmt_vec_info stmt_info = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (node));
351 3465920 : basic_block bb = gimple_bb (stmt_info->stmt);
352 3465920 : return bb->count.to_sreal_scale (ENTRY_BLOCK_PTR_FOR_FN (cfun)->count);
353 : }
354 :
355 : /* Return true if STMTS contains a pattern statement.  Null entries in
      STMTS are skipped.  */
356 :
357 : static bool
358 20353 : vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
359 : {
360 20353 : stmt_vec_info stmt_info;
361 20353 : unsigned int i;
362 66691 : FOR_EACH_VEC_ELT (stmts, i, stmt_info)
363 48365 : if (stmt_info && is_pattern_stmt_p (stmt_info))
364 : return true;
365 : return false;
366 : }
367 :
368 : /* Return true when all lanes in the external or constant NODE have
369 : the same value, i.e. every scalar operand compares equal to the first
      one with operand_equal_p.  NODE must be a vect_constant_def or
      vect_external_def (asserted).  Nodes without scalar operands yield
      false.  */
370 :
371 : static bool
372 578826 : vect_slp_tree_uniform_p (slp_tree node)
373 : {
374 578826 : gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
375 : || SLP_TREE_DEF_TYPE (node) == vect_external_def);
376 :
377 : /* Pre-existing vectors. */
378 1018788 : if (SLP_TREE_SCALAR_OPS (node).is_empty ())
379 : return false;
380 :
381 : unsigned i;
382 : tree op, first = NULL_TREE;
/* The first iteration only records the reference operand.  */
383 1326173 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
384 1187309 : if (!first)
385 : first = op;
386 608483 : else if (!operand_equal_p (first, op, 0))
387 : return false;
388 :
389 : return true;
390 : }
391 :
392 : /* Find the place of the data-ref in STMT_INFO in the interleaving chain
393 : that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
394 : of the chain.  The place is the sum of DR_GROUP_GAP over all chain
      elements after the first up to and including STMT_INFO, so it is 0
      for FIRST_STMT_INFO itself.  */
395 :
396 : int
397 701430 : vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
398 : stmt_vec_info first_stmt_info)
399 : {
400 701430 : stmt_vec_info next_stmt_info = first_stmt_info;
401 701430 : int result = 0;
402 :
/* STMT_INFO belongs to a different group than FIRST_STMT_INFO heads.  */
403 701430 : if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
404 : return -1;
405 :
/* Walk the chain, accumulating the gap of each element stepped to.  */
406 1755568 : do
407 : {
408 1755568 : if (next_stmt_info == stmt_info)
409 : return result;
410 1054138 : next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
411 1054138 : if (next_stmt_info)
412 1054138 : result += DR_GROUP_GAP (next_stmt_info);
413 : }
414 1054138 : while (next_stmt_info);
415 :
416 : return -1;
417 : }
418 :
419 : /* Check whether it is possible to load COUNT elements of type ELT_TYPE
420 : using the method implemented by duplicate_and_interleave. Return true
421 : if so, returning the number of intermediate vectors in *NVECTORS_OUT
422 : (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
423 : (if nonnull).  If PERMUTES is nonnull it must point to at least two
      elements; on success PERMUTES[0] and PERMUTES[1] receive the permute
      masks built from the two interleaving selectors checked below.  VINFO
      is passed to get_vectype_for_scalar_type to look up vector types.  */
424 :
425 : bool
426 0 : can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
427 : tree elt_type, unsigned int *nvectors_out,
428 : tree *vector_type_out,
429 : tree *permutes)
430 : {
431 0 : tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
432 0 : if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
433 0 : return false;
434 :
435 0 : machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
/* Start by trying to fuse all COUNT elements into one integer element.  */
436 0 : poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
437 0 : unsigned int nvectors = 1;
/* Each iteration tries the current fused size; on failure the size is
   halved and NVECTORS doubled at the bottom of the loop.  */
438 0 : for (;;)
439 : {
440 0 : scalar_int_mode int_mode;
441 0 : poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
442 0 : if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
443 : {
444 : /* Get the natural vector type for this SLP group size. */
445 0 : tree int_type = build_nonstandard_integer_type
446 0 : (GET_MODE_BITSIZE (int_mode), 1);
447 0 : tree vector_type
448 0 : = get_vectype_for_scalar_type (vinfo, int_type, count);
449 0 : poly_int64 half_nelts;
/* The candidate must be a real vector mode of the same size as the base
   vector mode and have an even number of elements.  */
450 0 : if (vector_type
451 0 : && VECTOR_MODE_P (TYPE_MODE (vector_type))
452 0 : && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
453 : GET_MODE_SIZE (base_vector_mode))
454 0 : && multiple_p (GET_MODE_NUNITS (TYPE_MODE (vector_type)),
455 : 2, &half_nelts))
456 : {
457 : /* Try fusing consecutive sequences of COUNT / NVECTORS elements
458 : together into elements of type INT_TYPE and using the result
459 : to build NVECTORS vectors. */
460 0 : poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
461 0 : vec_perm_builder sel1 (nelts, 2, 3);
462 0 : vec_perm_builder sel2 (nelts, 2, 3);
463 :
/* SEL1 is encoded as 0, NELTS, 1, NELTS + 1, 2, NELTS + 2 and SEL2 as the
   same sequence offset by HALF_NELTS -- presumably interleaving the low
   and the high halves of the two permute inputs respectively.  */
464 0 : for (unsigned int i = 0; i < 3; ++i)
465 : {
466 0 : sel1.quick_push (i);
467 0 : sel1.quick_push (i + nelts);
468 0 : sel2.quick_push (half_nelts + i);
469 0 : sel2.quick_push (half_nelts + i + nelts);
470 : }
471 0 : vec_perm_indices indices1 (sel1, 2, nelts);
472 0 : vec_perm_indices indices2 (sel2, 2, nelts);
473 0 : machine_mode vmode = TYPE_MODE (vector_type);
/* Succeed only if the target supports both constant permutes.  */
474 0 : if (can_vec_perm_const_p (vmode, vmode, indices1)
475 0 : && can_vec_perm_const_p (vmode, vmode, indices2))
476 : {
477 0 : if (nvectors_out)
478 0 : *nvectors_out = nvectors;
479 0 : if (vector_type_out)
480 0 : *vector_type_out = vector_type;
481 0 : if (permutes)
482 : {
483 0 : permutes[0] = vect_gen_perm_mask_checked (vector_type,
484 : indices1);
485 0 : permutes[1] = vect_gen_perm_mask_checked (vector_type,
486 : indices2);
487 : }
488 0 : return true;
489 : }
490 0 : }
491 : }
/* Retry with half the fused size; presumably multiple_p stores the
   quotient back into ELT_BYTES -- confirm against poly-int.h.  Give up
   when the size cannot be halved.  */
492 0 : if (!multiple_p (elt_bytes, 2, &elt_bytes))
493 : return false;
494 0 : nvectors *= 2;
495 : /* We need to be able to fuse COUNT / NVECTORS elements together. */
496 0 : if (!multiple_p (count, nvectors))
497 : return false;
498 : }
499 : }
500 :
501 : /* Return true if DTA and DTB match, that is if they are equal or if each
      of them is either vect_external_def or vect_constant_def (those two
      kinds are treated as interchangeable).  */
502 :
503 : static bool
504 16990139 : vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
505 : {
506 16990139 : return (dta == dtb
507 351389 : || ((dta == vect_external_def || dta == vect_constant_def)
508 217971 : && (dtb == vect_external_def || dtb == vect_constant_def)));
509 : }
510 :
/* Special operand-map index standing for the offset of a gather/scatter
   access rather than a gimple argument; see vect_get_operand_map.  */
511 : #define GATHER_SCATTER_OFFSET (-3)
512 :
513 : /* For most SLP statements, there is a one-to-one mapping between
514 : gimple arguments and child nodes. If that is not true for STMT,
515 : return an array that contains:
516 :
517 : - the number of child nodes, followed by
518 : - for each child node, the index of the argument associated with that node.
519 : The special index -1 is the first operand of an embedded comparison and
520 : the special index -2 is the second operand of an embedded comparison.
521 : The special index -3 is the offset of a gather as analyzed by
522 : vect_check_gather_scatter.
523 :
524 : SWAP is as for vect_get_and_check_slp_defs.  GATHER_SCATTER_P selects
      the maps whose first child is the gather/scatter offset.  Return
      nullptr when the one-to-one mapping applies.  The returned arrays are
      static and must not be modified or freed.  None of the maps defined
      here uses the index -2.  */
525 :
526 : static const int *
527 24229913 : vect_get_operand_map (const gimple *stmt, bool gather_scatter_p,
528 : unsigned char swap)
529 : {
/* Each map starts with its child count; the names list the gimple
   arguments (argN), assignment operands (opN) or the gather/scatter
   offset (off) in child order.  */
530 24229913 : static const int no_arg_map[] = { 0 };
531 24229913 : static const int arg0_map[] = { 1, 0 };
532 24229913 : static const int arg2_map[] = { 1, 2 };
533 24229913 : static const int arg2_arg3_map[] = { 2, 2, 3 };
534 24229913 : static const int arg2_arg4_map[] = { 2, 2, 4 };
535 24229913 : static const int arg2_arg5_arg6_map[] = { 3, 2, 5, 6 };
536 24229913 : static const int arg2_arg4_arg5_map[] = { 3, 2, 4, 5 };
537 24229913 : static const int arg3_arg2_map[] = { 2, 3, 2 };
538 24229913 : static const int op00_map[] = { 1, -1 };
539 24229913 : static const int op1_op0_map[] = { 2, 1, 0 };
540 24229913 : static const int off_map[] = { 1, GATHER_SCATTER_OFFSET };
541 24229913 : static const int off_op0_map[] = { 2, GATHER_SCATTER_OFFSET, 0 };
542 24229913 : static const int off_arg2_arg3_map[] = { 3, GATHER_SCATTER_OFFSET, 2, 3 };
543 24229913 : static const int off_arg3_arg2_map[] = { 3, GATHER_SCATTER_OFFSET, 3, 2 };
/* Row NARGS - 2 serves an IFN_MASK_CALL with NARGS arguments: it has
   NARGS - 1 children mapped to arguments 1 .. NARGS - 1, so argument 0
   gets no child.  */
544 24229913 : static const int mask_call_maps[6][7] = {
545 : { 1, 1, },
546 : { 2, 1, 2, },
547 : { 3, 1, 2, 3, },
548 : { 4, 1, 2, 3, 4, },
549 : { 5, 1, 2, 3, 4, 5, },
550 : { 6, 1, 2, 3, 4, 5, 6 },
551 : };
552 :
/* A non-zero SWAP is only expected for assignments whose code is a
   comparison or commutative, or for non-assignments.  */
553 24229913 : gcc_checking_assert (!swap
554 : || !is_gimple_assign (stmt)
555 : || TREE_CODE_CLASS
556 : (gimple_assign_rhs_code (stmt)) == tcc_comparison
557 : || commutative_tree_code
558 : (gimple_assign_rhs_code (stmt)));
559 :
560 24229913 : if (auto assign = dyn_cast<const gassign *> (stmt))
561 : {
562 22773587 : tree_code code = gimple_assign_rhs_code (assign);
/* A COND_EXPR with an embedded comparison is not expected here.  */
563 22773587 : if (code == COND_EXPR
564 22773587 : && COMPARISON_CLASS_P (gimple_assign_rhs1 (assign)))
565 0 : gcc_unreachable ();
/* Swapped comparison or commutative operation: children take the two
   operands in reverse order.  */
566 22773587 : else if ((TREE_CODE_CLASS (code) == tcc_comparison
567 21436105 : || commutative_tree_code (code))
568 31701574 : && swap)
569 : return op1_op0_map;
/* The single child is operand 0 of the tree held in argument 0.  */
570 22732778 : else if (code == VIEW_CONVERT_EXPR)
571 : return op00_map;
/* Gather/scatter assignment: when the lhs is not an SSA name the map also
   includes operand 0 after the offset -- presumably the stored value of a
   scatter; confirm.  */
572 22724627 : else if (gather_scatter_p)
573 43351 : return (TREE_CODE (gimple_assign_lhs (assign)) != SSA_NAME
574 43351 : ? off_op0_map : off_map);
575 : }
576 1456326 : else if (auto call = dyn_cast<const gcall *> (stmt))
577 : {
/* Only internal function calls have special maps.  */
578 161292 : if (gimple_call_internal_p (call))
579 92253 : switch (gimple_call_internal_fn (call))
580 : {
581 15952 : case IFN_MASK_LOAD:
582 27210 : return gather_scatter_p ? off_arg2_arg3_map : arg2_arg3_map;
583 :
584 : case IFN_GATHER_LOAD:
585 : return arg2_map;
586 :
587 0 : case IFN_MASK_GATHER_LOAD:
588 0 : case IFN_MASK_LEN_GATHER_LOAD:
589 0 : return arg2_arg5_arg6_map;
590 :
591 0 : case IFN_SCATTER_STORE:
592 0 : return arg2_arg4_map;
593 :
594 0 : case IFN_MASK_SCATTER_STORE:
595 0 : case IFN_MASK_LEN_SCATTER_STORE:
596 0 : return arg2_arg4_arg5_map;
597 :
598 9538 : case IFN_MASK_STORE:
599 17654 : return gather_scatter_p ? off_arg3_arg2_map : arg3_arg2_map;
600 :
601 988 : case IFN_MASK_CALL:
602 988 : {
/* Argument counts outside 2 .. 7 have no table row and fall back to the
   one-to-one mapping.  */
603 988 : unsigned nargs = gimple_call_num_args (call);
604 988 : if (nargs >= 2 && nargs <= 7)
605 988 : return mask_call_maps[nargs-2];
606 : else
607 : return nullptr;
608 : }
609 :
610 278 : case IFN_CLZ:
611 278 : case IFN_CTZ:
612 278 : return arg0_map;
613 :
/* No children at all.  */
614 6306 : case IFN_GOMP_SIMD_LANE:
615 6306 : return no_arg_map;
616 :
617 : default:
618 : break;
619 : }
620 : }
/* Everything else uses the one-to-one mapping.  */
621 : return nullptr;
622 : }
623 :
/* Convenience overload of vect_get_operand_map taking a stmt_vec_info:
   forwards STMT's gimple statement and its STMT_VINFO_GATHER_SCATTER_P
   flag together with SWAP, which defaults to 0.  */
624 : static const int *
625 24213986 : vect_get_operand_map (const stmt_vec_info stmt, unsigned char swap = 0)
626 : {
627 0 : return vect_get_operand_map (stmt->stmt, STMT_VINFO_GATHER_SCATTER_P (stmt),
628 0 : swap);
629 : }
630 :
631 : /* Return the SLP node child index for operand OP of STMT.  When STMT has
      no operand map the index is OP itself; otherwise it is the position of
      OP within the map.  An OP that does not appear in the map hits
      gcc_unreachable.  */
632 :
633 : int
634 1371829 : vect_slp_child_index_for_operand (const stmt_vec_info stmt, int op)
635 : {
636 1371829 : const int *opmap = vect_get_operand_map (stmt);
637 1371829 : if (!opmap)
638 : return op;
/* opmap[0] is the child count; entries 1 .. opmap[0] are operand indices,
   hence the child index is the array position minus one.  */
639 21917 : for (int i = 1; i < 1 + opmap[0]; ++i)
640 21917 : if (opmap[i] == op)
641 12272 : return i - 1;
642 0 : gcc_unreachable ();
643 : }
644 :
645 : /* Helper class for mapping of GIMPLE operands to SLP children.  It caches
      the operand map of a statement and the resulting number of SLP
      children.  */
646 : /* ??? Add vect_slp_child_index_for_operand here and amend opmaps
647 : with the full reverse mapping and indicating the position of the
648 : first commutative operand index, eliding the swap_p argument from
649 : vect_get_operand_map. Adjust all consumers. */
650 :
651 : struct slp_oprnds {
652 : slp_oprnds (stmt_vec_info);
653 : tree get_op_for_slp_child (stmt_vec_info, unsigned);
/* Operand map as returned by vect_get_operand_map, or null when children
   correspond one-to-one to gimple arguments.  Declared before
   NUM_SLP_CHILDREN because the constructor initializes the latter from
   it.  */
654 : const int *opmap;
655 : const unsigned int num_slp_children;
656 : };
657 :
/* Set up the operand mapping for STMT_INFO: look up its operand map
   (without swapping) and take the number of SLP children from the map's
   first entry, or from the number of gimple arguments when there is no
   map.  */
658 4383610 : slp_oprnds::slp_oprnds (stmt_vec_info stmt_info)
659 4383610 : : opmap (vect_get_operand_map (stmt_info)),
660 4383610 : num_slp_children (opmap ? opmap[0] : gimple_num_args (stmt_info->stmt))
661 : {
662 4383610 : }
663 :
664 : /* For SLP child number N get the corresponding tree operand from GIMPLE
665 : statement described by STMT_INFO.  N must be below num_slp_children
      (asserted).  STMT_INFO is presumably the statement this object was
      constructed for or one with the same operand layout -- confirm at the
      callers.  Children that map to the gather/scatter offset are not
      supported yet and hit gcc_unreachable.  */
666 :
667 : tree
668 4822197 : slp_oprnds::get_op_for_slp_child (stmt_vec_info stmt_info, unsigned n)
669 : {
670 4822197 : gcc_assert (n < num_slp_children);
/* Map entries start after the leading child count.  */
671 4822197 : int opno = opmap ? opmap[n + 1] : (int) n;
672 4822197 : if (opno == GATHER_SCATTER_OFFSET)
673 0 : gcc_unreachable (); // TODO
/* Negative indices select an operand of the tree held in argument 0:
   -1 is its operand 0, -2 its operand 1.  */
674 4822197 : else if (opno < 0)
675 1934 : return TREE_OPERAND (gimple_arg (stmt_info->stmt, 0), -1 - opno);
676 : else
677 4820263 : return gimple_arg (stmt_info->stmt, opno);
678 : }
679 :
680 : /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
681 : they are of a valid type and that they match the defs of the first stmt of
682 : the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
683 : by swapping operands of STMTS[STMT_NUM] when possible. Non-zero SWAP
684 : indicates swap is required for cond_expr stmts. Specifically, SWAP
685 : is 1 if STMT is cond and operands of comparison need to be swapped;
686 : SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
687 :
688 : If there was a fatal error return -1; if the error could be corrected by
689 : swapping operands of father node of this one, return 1; if everything is
690 : ok return 0. */
691 : static int
692 12688321 : vect_get_and_check_slp_defs (vec_info *vinfo, tree vectype, unsigned char swap,
693 : bool *skip_args,
694 : vec<stmt_vec_info> stmts, unsigned stmt_num,
695 : vec<slp_oprnd_info> *oprnds_info)
696 : {
697 12688321 : stmt_vec_info stmt_info = stmts[stmt_num];
698 12688321 : tree oprnd;
699 12688321 : unsigned int i, number_of_oprnds;
700 12688321 : enum vect_def_type dt = vect_uninitialized_def;
701 12688321 : slp_oprnd_info oprnd_info;
702 12688321 : gather_scatter_info gs_info;
703 12688321 : unsigned int gs_op = -1u;
704 12688321 : unsigned int commutative_op = -1U;
705 12688321 : bool first = stmt_num == 0;
706 :
707 12688321 : if (!stmt_info)
708 : {
709 0 : for (auto oi : *oprnds_info)
710 : {
711 0 : oi->def_stmts.quick_push (NULL);
712 0 : oi->ops.quick_push (NULL_TREE);
713 : }
714 : return 0;
715 : }
716 :
717 12688321 : if (!is_a<gcall *> (stmt_info->stmt)
718 : && !is_a<gassign *> (stmt_info->stmt)
719 : && !is_a<gphi *> (stmt_info->stmt))
720 : return -1;
721 :
722 12688321 : number_of_oprnds = gimple_num_args (stmt_info->stmt);
723 12688321 : const int *map = vect_get_operand_map (stmt_info, swap);
724 12688321 : if (map)
725 76006 : number_of_oprnds = *map++;
726 12688321 : if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
727 : {
728 49339 : if (gimple_call_internal_p (stmt))
729 : {
730 32675 : internal_fn ifn = gimple_call_internal_fn (stmt);
731 32675 : commutative_op = first_commutative_argument (ifn);
732 32675 : if (internal_gather_scatter_fn_p (ifn))
733 : {
734 0 : vect_describe_gather_scatter_call
735 0 : (stmt_info,
736 0 : first ? &(*oprnds_info)[0]->first_gs_info : &gs_info);
737 0 : if (first)
738 0 : (*oprnds_info)[0]->first_gs_p = true;
739 : gs_op = 0;
740 : }
741 : }
742 : }
743 12638982 : else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
744 : {
745 14752689 : if (commutative_tree_code (gimple_assign_rhs_code (stmt)))
746 8360245 : commutative_op = 0;
747 : }
748 :
749 12688321 : bool swapped = (swap != 0);
750 12688321 : bool backedge = false;
751 12688321 : enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
752 35101252 : for (i = 0; i < number_of_oprnds; i++)
753 : {
754 22414129 : oprnd_info = (*oprnds_info)[i];
755 22414129 : int opno = map ? map[i] : int (i);
756 22414129 : if (opno == GATHER_SCATTER_OFFSET)
757 : {
758 22739 : gcc_assert (STMT_VINFO_GATHER_SCATTER_P (stmt_info));
759 22739 : if (!is_a <loop_vec_info> (vinfo)
760 22739 : || !vect_check_gather_scatter (stmt_info, vectype,
761 : as_a <loop_vec_info> (vinfo),
762 : first ? &oprnd_info->first_gs_info
763 : : &gs_info))
764 1198 : return -1;
765 :
766 22739 : if (first)
767 : {
768 22486 : oprnd_info->first_gs_p = true;
769 22486 : oprnd = oprnd_info->first_gs_info.offset;
770 : }
771 : else
772 : {
773 253 : gs_op = i;
774 253 : oprnd = gs_info.offset;
775 : }
776 : }
777 22391390 : else if (opno < 0)
778 2842 : oprnd = TREE_OPERAND (gimple_arg (stmt_info->stmt, 0), -1 - opno);
779 : else
780 : {
781 22388548 : oprnd = gimple_arg (stmt_info->stmt, opno);
782 22388548 : if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
783 : {
784 1218247 : edge e = gimple_phi_arg_edge (stmt, opno);
785 2436494 : backedge = (is_a <bb_vec_info> (vinfo)
786 1879802 : ? e->flags & EDGE_DFS_BACK
787 661555 : : dominated_by_p (CDI_DOMINATORS, e->src,
788 661555 : gimple_bb (stmt_info->stmt)));
789 : }
790 : }
791 :
792 22414129 : stmt_vec_info def_stmt_info;
793 22414129 : if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
794 : {
795 976 : if (dump_enabled_p ())
796 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
797 : "Build SLP failed: can't analyze def for %T\n",
798 : oprnd);
799 :
800 976 : return -1;
801 : }
802 :
803 22413153 : if (skip_args[i])
804 : {
805 527137 : oprnd_info->def_stmts.quick_push (NULL);
806 527137 : oprnd_info->ops.quick_push (NULL_TREE);
807 527137 : oprnd_info->first_dt = vect_uninitialized_def;
808 527137 : continue;
809 : }
810 :
811 21886016 : oprnd_info->def_stmts.quick_push (def_stmt_info);
812 21886016 : oprnd_info->ops.quick_push (oprnd);
813 :
814 21886016 : if (def_stmt_info
815 21886016 : && is_pattern_stmt_p (def_stmt_info))
816 : {
817 391615 : if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
818 : != def_stmt_info)
819 274343 : oprnd_info->any_pattern = true;
820 : else
821 : /* If we promote this to external use the original stmt def. */
822 117272 : oprnd_info->ops.last ()
823 234544 : = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
824 : }
825 :
826 : /* If there's a extern def on a backedge make sure we can
827 : code-generate at the region start.
828 : ??? This is another case that could be fixed by adjusting
829 : how we split the function but at the moment we'd have conflicting
830 : goals there. */
831 21886016 : if (backedge
832 167943 : && dts[i] == vect_external_def
833 243 : && is_a <bb_vec_info> (vinfo)
834 243 : && TREE_CODE (oprnd) == SSA_NAME
835 222 : && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
836 21886238 : && !dominated_by_p (CDI_DOMINATORS, vinfo->bbs[0],
837 222 : gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
838 : {
839 222 : if (dump_enabled_p ())
840 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
841 : "Build SLP failed: extern def %T only defined "
842 : "on backedge\n", oprnd);
843 222 : return -1;
844 : }
845 :
846 21885794 : if (first)
847 : {
848 4784605 : tree type = TREE_TYPE (oprnd);
849 4784605 : dt = dts[i];
850 :
851 : /* For the swapping logic below force vect_reduction_def
852 : for the reduction op in a SLP reduction group. */
853 4784605 : if (!STMT_VINFO_DATA_REF (stmt_info)
854 3615927 : && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
855 5336 : && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
856 4787233 : && def_stmt_info)
857 2628 : dts[i] = dt = vect_reduction_def;
858 :
859 : /* Check the types of the definition. */
860 4784605 : switch (dt)
861 : {
862 4784605 : case vect_external_def:
863 4784605 : case vect_constant_def:
864 4784605 : case vect_internal_def:
865 4784605 : case vect_reduction_def:
866 4784605 : case vect_double_reduction_def:
867 4784605 : case vect_induction_def:
868 4784605 : case vect_nested_cycle:
869 4784605 : case vect_first_order_recurrence:
870 4784605 : break;
871 :
872 0 : default:
873 : /* FORNOW: Not supported. */
874 0 : if (dump_enabled_p ())
875 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
876 : "Build SLP failed: illegal type of def %T\n",
877 : oprnd);
878 0 : return -1;
879 : }
880 :
881 4784605 : oprnd_info->first_dt = dt;
882 4784605 : oprnd_info->first_op_type = type;
883 : }
884 : }
885 12687123 : if (first)
886 : return 0;
887 :
888 : /* Now match the operand definition types to that of the first stmt. */
889 26214003 : for (i = 0; i < number_of_oprnds;)
890 : {
891 17116110 : if (skip_args[i])
892 : {
893 44120 : ++i;
894 44120 : continue;
895 : }
896 :
897 17071990 : oprnd_info = (*oprnds_info)[i];
898 17071990 : dt = dts[i];
899 17071990 : stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
900 17071990 : oprnd = oprnd_info->ops[stmt_num];
901 17071990 : tree type = TREE_TYPE (oprnd);
902 :
903 17071990 : if (!types_compatible_p (oprnd_info->first_op_type, type))
904 : {
905 87672 : if (dump_enabled_p ())
906 109 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
907 : "Build SLP failed: different operand types\n");
908 87672 : return 1;
909 : }
910 :
911 16984318 : if ((gs_op == i) != oprnd_info->first_gs_p)
912 : {
913 0 : if (dump_enabled_p ())
914 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
915 : "Build SLP failed: mixed gather and non-gather\n");
916 0 : return 1;
917 : }
918 16984318 : else if (gs_op == i)
919 : {
920 223 : if (!operand_equal_p (oprnd_info->first_gs_info.base,
921 223 : gs_info.base))
922 : {
923 16 : if (dump_enabled_p ())
924 6 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
925 : "Build SLP failed: different gather base\n");
926 16 : return 1;
927 : }
928 207 : if (oprnd_info->first_gs_info.scale != gs_info.scale)
929 : {
930 8 : if (dump_enabled_p ())
931 2 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
932 : "Build SLP failed: different gather scale\n");
933 8 : return 1;
934 : }
935 : }
936 :
937 : /* Not first stmt of the group, check that the def-stmt/s match
938 : the def-stmt/s of the first stmt. Allow different definition
939 : types for reduction chains: the first stmt must be a
940 : vect_reduction_def (a phi node), and the rest
941 : end in the reduction chain. */
942 16984294 : if ((!vect_def_types_match (oprnd_info->first_dt, dt)
943 294692 : && !(oprnd_info->first_dt == vect_reduction_def
944 4797 : && !STMT_VINFO_DATA_REF (stmt_info)
945 4797 : && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
946 4771 : && def_stmt_info
947 4769 : && !STMT_VINFO_DATA_REF (def_stmt_info)
948 4769 : && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
949 : == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
950 16694371 : || (!STMT_VINFO_DATA_REF (stmt_info)
951 15386649 : && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
952 9943 : && ((!def_stmt_info
953 9747 : || STMT_VINFO_DATA_REF (def_stmt_info)
954 17960 : || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
955 : != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
956 9943 : != (oprnd_info->first_dt != vect_reduction_def))))
957 : {
958 : /* Try swapping operands if we got a mismatch. For BB
959 : vectorization only in case it will clearly improve things. */
960 292358 : if (i == commutative_op && !swapped
961 289923 : && (!is_a <bb_vec_info> (vinfo)
962 4595 : || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
963 4595 : dts[i+1])
964 1094 : && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
965 : || vect_def_types_match
966 156 : ((*oprnds_info)[i+1]->first_dt, dts[i])))))
967 : {
968 2435 : if (dump_enabled_p ())
969 153 : dump_printf_loc (MSG_NOTE, vect_location,
970 : "trying swapped operands\n");
971 2435 : std::swap (dts[i], dts[i+1]);
972 2435 : std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
973 2435 : (*oprnds_info)[i+1]->def_stmts[stmt_num]);
974 2435 : std::swap ((*oprnds_info)[i]->ops[stmt_num],
975 2435 : (*oprnds_info)[i+1]->ops[stmt_num]);
976 : /* After swapping some operands we lost track whether an
977 : operand has any pattern defs so be conservative here. */
978 2435 : if ((*oprnds_info)[i]->any_pattern
979 2435 : || (*oprnds_info)[i+1]->any_pattern)
980 36 : (*oprnds_info)[i]->any_pattern
981 18 : = (*oprnds_info)[i+1]->any_pattern = true;
982 2435 : swapped = true;
983 2435 : continue;
984 : }
985 :
986 287488 : if (is_a <bb_vec_info> (vinfo)
987 271908 : && !oprnd_info->any_pattern
988 559160 : && number_of_oprnds > 1)
989 : {
990 : /* Now for commutative ops we should see whether we can
991 : make the other operand matching. */
992 103629 : if (dump_enabled_p ())
993 203 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
994 : "treating operand as external\n");
995 103629 : oprnd_info->first_dt = dt = vect_external_def;
996 : }
997 : else
998 : {
999 183859 : if (dump_enabled_p ())
1000 411 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1001 : "Build SLP failed: different types\n");
1002 183859 : return 1;
1003 : }
1004 : }
1005 :
1006 : /* Make sure to demote the overall operand to external. */
1007 16798000 : if (dt == vect_external_def)
1008 333490 : oprnd_info->first_dt = vect_external_def;
1009 : /* For a SLP reduction chain we want to duplicate the reduction to
1010 : each of the chain members. That gets us a sane SLP graph (still
1011 : the stmts are not 100% correct wrt the initial values). */
1012 16464510 : else if ((dt == vect_internal_def
1013 16464510 : || dt == vect_reduction_def)
1014 15542617 : && oprnd_info->first_dt == vect_reduction_def
1015 101224 : && !STMT_VINFO_DATA_REF (stmt_info)
1016 101224 : && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
1017 4769 : && !STMT_VINFO_DATA_REF (def_stmt_info)
1018 16469279 : && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
1019 : == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
1020 : {
1021 4769 : oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
1022 4769 : oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
1023 : }
1024 :
1025 16798000 : ++i;
1026 : }
1027 :
1028 : /* Swap operands. */
1029 9097893 : if (swapped)
1030 : {
1031 40861 : if (dump_enabled_p ())
1032 453 : dump_printf_loc (MSG_NOTE, vect_location,
1033 : "swapped operands to match def types in %G",
1034 : stmt_info->stmt);
1035 : }
1036 :
1037 : return 0;
1038 : }
1039 :
1040 : /* Return true if call statements CALL1 and CALL2 are similar enough
1041 : to be combined into the same SLP group. */
1042 :
1043 : bool
1044 21185 : compatible_calls_p (gcall *call1, gcall *call2, bool allow_two_operators)
1045 : {
1046 21185 : unsigned int nargs = gimple_call_num_args (call1);
1047 21185 : if (nargs != gimple_call_num_args (call2))
1048 : return false;
1049 :
1050 19234 : auto cfn1 = gimple_call_combined_fn (call1);
1051 19234 : auto cfn2 = gimple_call_combined_fn (call2);
1052 19234 : if (cfn1 != cfn2
1053 2 : && (!allow_two_operators
1054 2 : || !((cfn1 == CFN_FMA || cfn1 == CFN_FMS)
1055 2 : && (cfn2 == CFN_FMA || cfn2 == CFN_FMS))))
1056 : return false;
1057 :
1058 19234 : if (gimple_call_internal_p (call1))
1059 : {
1060 7031 : if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
1061 7031 : TREE_TYPE (gimple_call_lhs (call2))))
1062 : return false;
1063 14476 : for (unsigned int i = 0; i < nargs; ++i)
1064 7445 : if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
1065 7445 : TREE_TYPE (gimple_call_arg (call2, i))))
1066 : return false;
1067 : }
1068 : else
1069 : {
1070 12203 : if (!operand_equal_p (gimple_call_fn (call1),
1071 12203 : gimple_call_fn (call2), 0))
1072 : return false;
1073 :
1074 26688 : if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
1075 : return false;
1076 : }
1077 :
1078 : /* Check that any unvectorized arguments are equal. */
1079 15927 : if (const int *map = vect_get_operand_map (call1, false, false))
1080 : {
1081 15 : unsigned int nkept = *map++;
1082 15 : unsigned int mapi = 0;
1083 57 : for (unsigned int i = 0; i < nargs; ++i)
1084 42 : if (mapi < nkept && map[mapi] == int (i))
1085 27 : mapi += 1;
1086 15 : else if (!operand_equal_p (gimple_call_arg (call1, i),
1087 15 : gimple_call_arg (call2, i)))
1088 : return false;
1089 : }
1090 :
1091 : return true;
1092 : }
1093 :
1094 : /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
1095 : caller's attempt to find the vector type in STMT_INFO with the narrowest
1096 : element type. Return true if VECTYPE is nonnull and if it is valid
1097 : for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
1098 : number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
1099 : vect_build_slp_tree. */
1100 :
1101 : static bool
1102 5500535 : vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
1103 : unsigned int group_size,
1104 : tree vectype, poly_uint64 *max_nunits)
1105 : {
1106 5500535 : if (!vectype)
1107 : {
1108 3884 : if (dump_enabled_p ())
1109 7 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1110 : "Build SLP failed: unsupported data-type in %G\n",
1111 : stmt_info->stmt);
1112 : /* Fatal mismatch. */
1113 3884 : return false;
1114 : }
1115 :
1116 : /* If populating the vector type requires unrolling then fail
1117 : before adjusting *max_nunits for basic-block vectorization. */
1118 5496651 : if (is_a <bb_vec_info> (vinfo)
1119 5496651 : && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
1120 : {
1121 141146 : if (dump_enabled_p ())
1122 36 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1123 : "Build SLP failed: unrolling required "
1124 : "in basic block SLP\n");
1125 : /* Fatal mismatch. */
1126 141146 : return false;
1127 : }
1128 :
1129 : /* In case of multiple types we need to detect the smallest type. */
1130 5355505 : vect_update_max_nunits (max_nunits, vectype);
1131 5355505 : return true;
1132 : }
1133 :
1134 : /* Verify if the scalar stmts STMTS are isomorphic, require data
1135 : permutation or are of unsupported types of operation. Return
1136 : true if every stmt matches the first one, otherwise return false and
1137 : indicate in *MATCHES which stmts are not isomorphic to the first one.
1138 : If MATCHES[0] is false then this indicates the comparison could not be
1139 : carried out or the stmts will never be vectorized by SLP.
1140 :
1141 : Note COND_EXPR is possibly isomorphic to another one after swapping its
1142 : operands. Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
1143 : the first stmt by swapping the two operands of comparison; set SWAP[i]
1144 : to 2 if stmt I is isomorphic to the first stmt by inverting the code
1145 : of comparison. Take A1 >= B1 ? X1 : Y1 as an example, it can be swapped
1146 : to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1.
       SWAP[i] is also set to 1 when the code of stmt I is the swapped
       comparison of the first stmt's code, and when stmt I uses a
       commutative tree code with the reduction operand in the other
       position than in the first stmt.

       *NODE_VECTYPE is set to the vector type computed for the first stmt.
       *TWO_OPERATORS is set to true (never to false) when a second
       operation code was accepted for the node. *MAX_NUNITS is updated
       through vect_record_max_nunits. For basic-block vectorization a
       GROUP_SIZE that does not fill whole vectors does not stop the
       analysis: MATCHES is still computed and, if everything else matched,
       the lanes past the last full vector are reported as mismatches. */
1147 :
1148 : static bool
1149 5766336 : vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
1150 : vec<stmt_vec_info> stmts, unsigned int group_size,
1151 : poly_uint64 *max_nunits, bool *matches,
1152 : bool *two_operators, tree *node_vectype)
1153 : {
1154 5766336 : unsigned int i;
1155 5766336 : stmt_vec_info first_stmt_info = stmts[0];
1156 5766336 : code_helper first_stmt_code = ERROR_MARK;
1157 5766336 : code_helper alt_stmt_code = ERROR_MARK;
1158 5766336 : code_helper first_cond_code = ERROR_MARK;
1159 5766336 : bool need_same_oprnds = false;
1160 5766336 : tree first_lhs = NULL_TREE;
1161 5766336 : tree first_op1 = NULL_TREE;
1162 5766336 : stmt_vec_info first_load = NULL, prev_first_load = NULL;
1163 5766336 : bool first_stmt_ldst_p = false, first_stmt_ldst_masklen_p = false;
1164 5766336 : bool first_stmt_phi_p = false;
1165 5766336 : int first_reduc_idx = -1;
1166 5766336 : bool maybe_soft_fail = false;
1167 5766336 : tree soft_fail_nunits_vectype = NULL_TREE;
1168 :
1169 5766336 : tree vectype, nunits_vectype;
1170 5766336 : if (!vect_get_vector_types_for_stmt (vinfo, first_stmt_info, &vectype,
1171 : &nunits_vectype, group_size))
1172 : {
1173 : /* Fatal mismatch. */
1174 207015 : matches[0] = false;
1175 207015 : return false;
1176 : }
1177 5559321 : if (is_a <bb_vec_info> (vinfo)
1178 5559321 : && known_le (TYPE_VECTOR_SUBPARTS (vectype), 1U))
1179 : {
1180 358738 : if (dump_enabled_p ())
1181 296 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1182 : "Build SLP failed: not using single lane "
1183 : "vector type %T\n", vectype);
1184 358738 : matches[0] = false;
1185 358738 : return false;
1186 : }
1187 : /* Record nunits required but continue analysis, producing matches[]
1188 : as if nunits was not an issue. This allows splitting of groups
1189 : to happen. */
1190 5200583 : if (nunits_vectype
1191 5200583 : && !vect_record_max_nunits (vinfo, first_stmt_info, group_size,
1192 : nunits_vectype, max_nunits))
1193 : {
1194 141146 : gcc_assert (is_a <bb_vec_info> (vinfo));
1195 141146 : maybe_soft_fail = true;
1196 141146 : soft_fail_nunits_vectype = nunits_vectype;
1197 : }
1198 :
1199 5200583 : gcc_assert (vectype || !gimple_get_lhs (first_stmt_info->stmt));
1200 5200583 : *node_vectype = vectype;
1201 :
1202 : /* For every stmt in NODE find its def stmt/s. */
1203 5200583 : stmt_vec_info stmt_info;
1204 22191460 : FOR_EACH_VEC_ELT (stmts, i, stmt_info)
1205 : {
1206 17153412 : bool ldst_p = false;
1207 17153412 : bool ldst_masklen_p = false;
1208 17153412 : bool phi_p = false;
1209 17153412 : code_helper rhs_code = ERROR_MARK;
1210 :
1211 17153412 : swap[i] = 0;
1212 17153412 : matches[i] = false;
1213 17153412 : if (!stmt_info)
1214 : {
1215 40707 : matches[i] = true;
1216 17031584 : continue;
1217 : }
1218 :
1219 17112705 : gimple *stmt = stmt_info->stmt;
1220 17112705 : if (dump_enabled_p ())
1221 220005 : dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
1222 :
1223 : /* Fail to vectorize statements marked as unvectorizable, throw
1224 : or are volatile. */
1225 17112705 : if (!STMT_VINFO_VECTORIZABLE (stmt_info)
1226 16922701 : || stmt_can_throw_internal (cfun, stmt)
1227 33241732 : || gimple_has_volatile_ops (stmt))
1228 : {
1229 195503 : if (dump_enabled_p ())
1230 199 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1231 : "Build SLP failed: unvectorizable statement %G",
1232 : stmt);
1233 : /* ??? For BB vectorization we want to commutate operands in a way
1234 : to shuffle all unvectorizable defs into one operand and have
1235 : the other still vectorized. The following doesn't reliably
1236 : work for this though but it's the easiest we can do here. */
1237 195503 : if (is_a <bb_vec_info> (vinfo) && i != 0)
1238 64496 : continue;
1239 : /* Fatal mismatch. */
1240 131007 : matches[0] = false;
1241 131007 : return false;
1242 : }
1243 :
1244 16917202 : gcall *call_stmt = dyn_cast <gcall *> (stmt);
1245 16917202 : tree lhs = gimple_get_lhs (stmt);
1246 16917202 : if (lhs == NULL_TREE && !call_stmt)
1247 : {
1248 36 : if (dump_enabled_p ())
1249 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1250 : "Build SLP failed: not GIMPLE_ASSIGN nor "
1251 : "GIMPLE_CALL %G", stmt);
1252 36 : if (is_a <bb_vec_info> (vinfo) && i != 0)
1253 36 : continue;
1254 : /* Fatal mismatch. */
1255 0 : matches[0] = false;
1256 0 : return false;
1257 : }
1258 :
1259 16917166 : if (call_stmt)
1260 : {
1261 102521 : combined_fn cfn = gimple_call_combined_fn (call_stmt);
1262 102521 : if (cfn != CFN_LAST && cfn != CFN_MASK_CALL)
1263 58793 : rhs_code = cfn;
1264 : else
1265 : rhs_code = CALL_EXPR;
1266 :
1267 102521 : if (cfn == CFN_GATHER_LOAD
1268 102521 : || cfn == CFN_SCATTER_STORE)
1269 : ldst_p = true;
1270 : else if (cfn == CFN_MASK_LOAD
1271 : || cfn == CFN_MASK_GATHER_LOAD
1272 : || cfn == CFN_MASK_LEN_GATHER_LOAD
1273 : || cfn == CFN_MASK_SCATTER_STORE
1274 : || cfn == CFN_MASK_LEN_SCATTER_STORE)
1275 : {
1276 : ldst_p = true;
1277 : ldst_masklen_p = true;
1278 : }
1279 : else if (cfn == CFN_MASK_STORE)
1280 : {
1281 : ldst_p = true;
1282 : ldst_masklen_p = true;
1283 : rhs_code = CFN_MASK_STORE;
1284 : }
1285 : else if (cfn == CFN_GOMP_SIMD_LANE)
1286 : ;
1287 90807 : else if ((cfn != CFN_LAST
1288 : && cfn != CFN_MASK_CALL
1289 47079 : && internal_fn_p (cfn)
1290 36947 : && !vectorizable_internal_fn_p (as_internal_fn (cfn)))
1291 90732 : || gimple_call_tail_p (call_stmt)
1292 90732 : || gimple_call_noreturn_p (call_stmt)
1293 181539 : || gimple_call_chain (call_stmt))
1294 : {
1295 424 : if (dump_enabled_p ())
1296 13 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1297 : "Build SLP failed: unsupported call type %G",
1298 : (gimple *) call_stmt);
1299 424 : if (is_a <bb_vec_info> (vinfo) && i != 0)
1300 64 : continue;
1301 : /* Fatal mismatch. */
1302 360 : matches[0] = false;
1303 360 : return false;
1304 : }
1305 : }
1306 16814645 : else if (gimple_code (stmt) == GIMPLE_PHI)
1307 : {
1308 : rhs_code = ERROR_MARK;
1309 : phi_p = true;
1310 : }
1311 : else
1312 : {
1313 16020971 : rhs_code = gimple_assign_rhs_code (stmt);
1314 16020971 : ldst_p = STMT_VINFO_DATA_REF (stmt_info) != nullptr;
1315 : }
1316 :
1317 : /* Check the operation. For the first stmt record the properties all
       other stmts are compared against; for the others compare. */
1318 16916742 : if (i == 0)
1319 : {
1320 5069216 : first_lhs = lhs;
1321 5069216 : first_stmt_code = rhs_code;
1322 5069216 : first_stmt_ldst_p = ldst_p;
1323 5069216 : first_stmt_ldst_masklen_p = ldst_masklen_p;
1324 5069216 : first_stmt_phi_p = phi_p;
1325 5069216 : first_reduc_idx = STMT_VINFO_REDUC_IDX (stmt_info);
1326 :
1327 : /* Shift arguments should be equal in all the packed stmts for a
1328 : vector shift with scalar shift operand. */
1329 5069216 : if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
1330 4935443 : || rhs_code == LROTATE_EXPR
1331 10004587 : || rhs_code == RROTATE_EXPR)
1332 : {
1333 : /* First see if we have a vector/vector shift. */
1334 134228 : if (!directly_supported_p (rhs_code, vectype, optab_vector))
1335 : {
1336 : /* No vector/vector shift, try for a vector/scalar shift. */
1337 122213 : if (!directly_supported_p (rhs_code, vectype, optab_scalar))
1338 : {
1339 11866 : if (dump_enabled_p ())
1340 386 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1341 : "Build SLP failed: "
1342 : "op not supported by target.\n");
1343 11866 : if (is_a <bb_vec_info> (vinfo) && i != 0) /* Never true, I is 0 in this branch. */
1344 : continue;
1345 : /* Fatal mismatch. */
1346 11866 : matches[0] = false;
1347 11866 : return false;
1348 : }
1349 110347 : need_same_oprnds = true;
1350 110347 : first_op1 = gimple_assign_rhs2 (stmt);
1351 : }
1352 : }
1353 4934988 : else if (rhs_code == WIDEN_LSHIFT_EXPR)
1354 : {
1355 0 : need_same_oprnds = true;
1356 0 : first_op1 = gimple_assign_rhs2 (stmt);
1357 : }
1358 4934988 : else if (!ldst_p
1359 4934988 : && rhs_code == BIT_FIELD_REF)
1360 : {
1361 5504 : tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
1362 5504 : if (!is_a <bb_vec_info> (vinfo)
1363 5378 : || TREE_CODE (vec) != SSA_NAME
1364 : /* When the element types are not compatible we pun the
1365 : source to the target vectype which requires equal size. */
1366 10870 : || ((!VECTOR_TYPE_P (TREE_TYPE (vec))
1367 4649 : || !types_compatible_p (TREE_TYPE (vectype),
1368 4649 : TREE_TYPE (TREE_TYPE (vec))))
1369 1007 : && !operand_equal_p (TYPE_SIZE (vectype),
1370 1007 : TYPE_SIZE (TREE_TYPE (vec)))))
1371 : {
1372 753 : if (dump_enabled_p ())
1373 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1374 : "Build SLP failed: "
1375 : "BIT_FIELD_REF not supported\n");
1376 : /* Fatal mismatch. */
1377 753 : matches[0] = false;
1378 753 : return false;
1379 : }
1380 : }
1381 4929484 : else if (rhs_code == CFN_DIV_POW2)
1382 : {
1383 0 : need_same_oprnds = true;
1384 0 : first_op1 = gimple_call_arg (call_stmt, 1);
1385 : }
1386 4929484 : else if (rhs_code == CFN_GOMP_SIMD_LANE)
1387 : {
1388 3153 : need_same_oprnds = true;
1389 3153 : first_op1 = gimple_call_arg (call_stmt, 1);
1390 : }
1391 : }
1392 : else
1393 : {
1394 11847526 : int comm_arg;
1395 11847904 : if (first_reduc_idx != STMT_VINFO_REDUC_IDX (stmt_info)
1396 : /* For SLP reduction groups the index isn't necessarily
1397 : uniform but only that of the first stmt matters. */
1398 2296 : && !(first_reduc_idx != -1
1399 2296 : && STMT_VINFO_REDUC_IDX (stmt_info) != -1
1400 2296 : && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
1401 11847526 : && !(first_reduc_idx != -1
1402 1049 : && STMT_VINFO_REDUC_IDX (stmt_info) != -1
1403 1049 : && (comm_arg = first_commutative_argument
1404 1049 : (rhs_code, TREE_TYPE (lhs))) >= 0
1405 : && (first_reduc_idx
1406 815 : == 2 * comm_arg + 1 - STMT_VINFO_REDUC_IDX (stmt_info))))
1407 : {
1408 378 : if (dump_enabled_p ())
1409 : {
1410 12 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1411 : "Build SLP failed: different reduc_idx "
1412 : "%d instead of %d in %G",
1413 : STMT_VINFO_REDUC_IDX (stmt_info),
1414 : first_reduc_idx, stmt);
1415 : }
1416 : /* Mismatch. */
1417 378 : continue;
1418 : }
1419 11847148 : if (!ldst_p
1420 9300700 : && first_stmt_code != rhs_code
1421 13270958 : && alt_stmt_code == ERROR_MARK)
1422 : alt_stmt_code = rhs_code;
1423 13247114 : if ((!ldst_p
1424 9300700 : && first_stmt_code != rhs_code
1425 1423810 : && (first_stmt_code != IMAGPART_EXPR
1426 127 : || rhs_code != REALPART_EXPR)
1427 1423790 : && (first_stmt_code != REALPART_EXPR
1428 531 : || rhs_code != IMAGPART_EXPR)
1429 : /* Handle mismatches in plus/minus by computing both
1430 : and merging the results. */
1431 1423779 : && !((((first_stmt_code == PLUS_EXPR
1432 1311519 : || first_stmt_code == MINUS_EXPR)
1433 139578 : && (alt_stmt_code == PLUS_EXPR
1434 131005 : || alt_stmt_code == MINUS_EXPR))
1435 1394774 : || ((first_stmt_code == CFN_FMA
1436 1394772 : || first_stmt_code == CFN_FMS)
1437 2 : && (alt_stmt_code == CFN_FMA
1438 2 : || alt_stmt_code == CFN_FMS)))
1439 29007 : && rhs_code == alt_stmt_code)
1440 1434654 : && !(first_stmt_code.is_tree_code ()
1441 1317296 : && rhs_code.is_tree_code ()
1442 1221624 : && (TREE_CODE_CLASS (tree_code (first_stmt_code))
1443 : == tcc_comparison)
1444 127274 : && (swap_tree_comparison (tree_code (first_stmt_code))
1445 127274 : == tree_code (rhs_code))
1446 : && (first_reduc_idx == -1
1447 0 : || REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
1448 : || (ldst_p
1449 5092896 : && (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1450 2546448 : != STMT_VINFO_GROUPED_ACCESS (first_stmt_info)))
1451 : || (ldst_p
1452 2501940 : && (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1453 2501940 : != STMT_VINFO_GATHER_SCATTER_P (first_stmt_info)))
1454 10447329 : || first_stmt_ldst_p != ldst_p
1455 10447190 : || (ldst_p && first_stmt_ldst_masklen_p != ldst_masklen_p)
1456 22294330 : || first_stmt_phi_p != phi_p)
1457 : {
1458 1399966 : if (dump_enabled_p ())
1459 : {
1460 3130 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1461 : "Build SLP failed: different operation "
1462 : "in stmt %G", stmt);
1463 3130 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1464 : "original stmt %G", first_stmt_info->stmt);
1465 : }
1466 : /* Mismatch. */
1467 1399966 : continue;
1468 : }
1469 :
1470 10449252 : if (!ldst_p
1471 7945377 : && first_stmt_code == BIT_FIELD_REF
1472 10452435 : && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
1473 5253 : != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
1474 : {
1475 2070 : if (dump_enabled_p ())
1476 40 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1477 : "Build SLP failed: different BIT_FIELD_REF "
1478 : "arguments in %G", stmt);
1479 : /* Mismatch. */
1480 2070 : continue;
1481 : }
1482 :
1483 10445112 : if (call_stmt
1484 22165 : && first_stmt_code != CFN_MASK_LOAD
1485 10466713 : && first_stmt_code != CFN_MASK_STORE)
1486 : {
1487 21185 : if (!is_a <gcall *> (stmts[0]->stmt)
1488 21185 : || !compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
1489 : call_stmt, true))
1490 : {
1491 5258 : if (dump_enabled_p ())
1492 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1493 : "Build SLP failed: different calls in %G",
1494 : stmt);
1495 : /* Mismatch. */
1496 5258 : continue;
1497 : }
1498 : }
1499 :
1500 10253672 : if ((phi_p || gimple_could_trap_p (stmt_info->stmt))
1501 11238315 : && (gimple_bb (first_stmt_info->stmt)
1502 984643 : != gimple_bb (stmt_info->stmt)))
1503 : {
1504 27295 : if (dump_enabled_p ())
1505 8 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1506 : "Build SLP failed: different BB for PHI "
1507 : "or possibly trapping operation in %G", stmt);
1508 : /* Mismatch. */
1509 27295 : continue;
1510 : }
1511 :
1512 10412559 : if (need_same_oprnds)
1513 : {
1514 52873 : tree other_op1 = gimple_arg (stmt, 1);
1515 52873 : if (!operand_equal_p (first_op1, other_op1, 0))
1516 : {
1517 5503 : if (dump_enabled_p ())
1518 123 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1519 : "Build SLP failed: different shift "
1520 : "arguments in %G", stmt);
1521 : /* Mismatch. */
1522 5503 : continue;
1523 : }
1524 : }
1525 :
1526 10407793 : if (first_lhs
1527 10407056 : && lhs
1528 10407056 : && !types_compatible_p (TREE_TYPE (lhs), TREE_TYPE (first_lhs)))
1529 : {
1530 737 : if (dump_enabled_p ())
1531 6 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1532 : "Build SLP failed: different vector type "
1533 : "in %G", stmt);
1534 : /* Mismatch. */
1535 737 : continue;
1536 : }
1537 : }
1538 :
1539 : /* Grouped store or load. */
1540 15462916 : if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1541 : {
1542 3864258 : gcc_assert (ldst_p);
1543 3864258 : if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
1544 : {
1545 : /* Store. */
1546 3028510 : gcc_assert (rhs_code == CFN_MASK_STORE
1547 : || REFERENCE_CLASS_P (lhs)
1548 : || DECL_P (lhs));
1549 : }
1550 : else
1551 : {
1552 : /* Load. */
1553 835748 : first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
1554 835748 : if (prev_first_load)
1555 : {
1556 : /* Check that there are no loads from different interleaving
1557 : chains in the same node. */
1558 380380 : if (prev_first_load != first_load)
1559 : {
1560 54189 : if (dump_enabled_p ())
1561 1994 : dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1562 : vect_location,
1563 : "Build SLP failed: different "
1564 : "interleaving chains in one node %G",
1565 : stmt);
1566 : /* Mismatch. */
1567 54189 : continue;
1568 : }
1569 : }
1570 : else
1571 : prev_first_load = first_load;
1572 : }
1573 : }
1574 : /* Non-grouped store or load. */
1575 11598658 : else if (ldst_p)
1576 : {
1577 887228 : if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))
1578 616527 : && rhs_code != CFN_GATHER_LOAD
1579 : && rhs_code != CFN_MASK_GATHER_LOAD
1580 : && rhs_code != CFN_MASK_LEN_GATHER_LOAD
1581 : && rhs_code != CFN_SCATTER_STORE
1582 : && rhs_code != CFN_MASK_SCATTER_STORE
1583 : && rhs_code != CFN_MASK_LEN_SCATTER_STORE
1584 616527 : && !STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1585 : /* Not grouped loads are handled as externals for BB
1586 : vectorization. For loop vectorization we can handle
1587 : splats the same we handle single element interleaving.
1588 : Likewise we can handle a collection of invariant refs. */
1589 1484934 : && (is_a <bb_vec_info> (vinfo)
1590 597706 : || (stmt_info != first_stmt_info
1591 68088 : && !(integer_zerop (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)))
1592 241 : && integer_zerop (DR_STEP (STMT_VINFO_DATA_REF
1593 : (first_stmt_info)))))))
1594 : {
1595 : /* Not grouped load. */
1596 67606 : if (dump_enabled_p ())
1597 145 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1598 : "Build SLP failed: not grouped load %G", stmt);
1599 :
1600 67606 : if (i != 0)
1601 67606 : continue;
1602 : /* Fatal mismatch. */
1603 0 : matches[0] = false;
1604 0 : return false;
1605 : }
1606 : }
1607 : /* Not memory operation. */
1608 : else
1609 : {
1610 10711430 : if (!phi_p
1611 10042837 : && rhs_code.is_tree_code ()
1612 9994236 : && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_binary
1613 1517045 : && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_unary
1614 939313 : && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_expression
1615 877526 : && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_comparison
1616 64319 : && rhs_code != VIEW_CONVERT_EXPR
1617 : && rhs_code != CALL_EXPR
1618 : && rhs_code != BIT_FIELD_REF
1619 10711430 : && rhs_code != SSA_NAME)
1620 : {
1621 18549 : if (dump_enabled_p ())
1622 7 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1623 : "Build SLP failed: operation unsupported %G",
1624 : stmt);
1625 18549 : if (is_a <bb_vec_info> (vinfo) && i != 0)
1626 0 : continue;
1627 : /* Fatal mismatch. */
1628 18549 : matches[0] = false;
1629 18549 : return false;
1630 : }
1631 :
1632 10692881 : if (rhs_code == COND_EXPR)
1633 : {
1634 58935 : tree cond_expr = gimple_assign_rhs1 (stmt);
1635 58935 : enum tree_code cond_code = TREE_CODE (cond_expr);
1636 58935 : enum tree_code swap_code = ERROR_MARK;
1637 58935 : enum tree_code invert_code = ERROR_MARK;
1638 :
1639 58935 : if (i == 0)
1640 49846 : first_cond_code = TREE_CODE (cond_expr);
1641 9089 : else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1642 : {
1643 0 : bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
1644 0 : swap_code = swap_tree_comparison (cond_code);
1645 0 : invert_code = invert_tree_comparison (cond_code, honor_nans);
1646 : }
1647 :
1648 58935 : if (first_cond_code == cond_code)
1649 : ;
1650 : /* Isomorphic can be achieved by swapping. */
1651 0 : else if (first_cond_code == swap_code)
1652 0 : swap[i] = 1;
1653 : /* Isomorphic can be achieved by inverting. */
1654 0 : else if (first_cond_code == invert_code)
1655 0 : swap[i] = 2;
1656 : else
1657 : {
1658 0 : if (dump_enabled_p ())
1659 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1660 : "Build SLP failed: different"
1661 : " operation %G", stmt);
1662 : /* Mismatch. */
1663 0 : continue;
1664 : }
1665 : }
1666 :
1667 10692881 : if (i != 0
1668 7905765 : && first_stmt_code != rhs_code
1669 68487 : && first_stmt_code.is_tree_code ()
1670 68485 : && rhs_code.is_tree_code ()
1671 68485 : && TREE_CODE_CLASS ((tree_code)first_stmt_code) == tcc_comparison
1672 10732562 : && (swap_tree_comparison ((tree_code)first_stmt_code)
1673 39681 : == (tree_code)rhs_code))
1674 39681 : swap[i] = 1;
1675 :
1676 10692881 : if (i != 0
1677 7905765 : && first_reduc_idx != STMT_VINFO_REDUC_IDX (stmt_info)
1678 1648 : && first_reduc_idx != -1
1679 1648 : && STMT_VINFO_REDUC_IDX (stmt_info) != -1
1680 1648 : && rhs_code.is_tree_code ()
1681 1640 : && commutative_tree_code (tree_code (rhs_code))
1682 10694519 : && first_reduc_idx == 1 - STMT_VINFO_REDUC_IDX (stmt_info))
1683 1638 : swap[i] = 1;
1684 : }
1685 :
1686 15322572 : matches[i] = true;
1687 : }
1688 :
1689 20369350 : for (i = 0; i < group_size; ++i)
1690 16028430 : if (!matches[i])
1691 : return false;
1692 :
1693 : /* If we allowed a two-operation SLP node record that in *TWO_OPERATORS.
1694 : An alternate code that is a reference or a comparison does not count. */
1695 4340920 : if (alt_stmt_code != ERROR_MARK
1696 4340920 : && (!alt_stmt_code.is_tree_code ()
1697 53539 : || (TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_reference
1698 53539 : && TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_comparison)))
1699 : {
1700 14449 : *two_operators = true;
1701 : }
1702 :
1703 4340920 : if (maybe_soft_fail)
1704 : {
1705 140738 : unsigned HOST_WIDE_INT const_nunits;
1706 140738 : if (!TYPE_VECTOR_SUBPARTS
1707 140738 : (soft_fail_nunits_vectype).is_constant (&const_nunits)
1708 140738 : || const_nunits > group_size)
1709 0 : matches[0] = false;
1710 : else
1711 : {
1712 : /* With constant vector elements simulate a mismatch at the
1713 : point we need to split.
       NOTE(review): the mask below equals GROUP_SIZE % CONST_NUNITS only
       when CONST_NUNITS is a power of two -- confirm that always holds. */
1714 140738 : unsigned tail = group_size & (const_nunits - 1);
1715 140738 : memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
1716 : }
1717 140738 : return false;
1718 : }
1719 :
1720 : return true;
1721 : }
1722 :
1723 : /* Traits for the hash_set to record failed SLP builds for a stmt set.
1724 : Note we never remove apart from at destruction time so we do not
1725 : need a special value for deleted that differs from empty. */
1726 : struct bst_traits
1727 : {
1728 : typedef vec <stmt_vec_info> value_type;
1729 : typedef vec <stmt_vec_info> compare_type;
1730 : static inline hashval_t hash (value_type);
1731 : static inline bool equal (value_type existing, value_type candidate);
1732 478395520 : static inline bool is_empty (value_type x) { return !x.exists (); }
1733 107128487 : static inline bool is_deleted (value_type x) { return !x.exists (); }
1734 : static const bool empty_zero_p = true;
1735 0 : static inline void mark_empty (value_type &x) { x.release (); }
1736 : static inline void mark_deleted (value_type &x) { x.release (); }
1737 9211504 : static inline void remove (value_type &x) { x.release (); }
1738 : };
1739 : inline hashval_t
1740 93332974 : bst_traits::hash (value_type x)
1741 : {
1742 93332974 : inchash::hash h;
1743 422945170 : for (unsigned i = 0; i < x.length (); ++i)
1744 329612196 : h.add_int (x[i] ? gimple_uid (x[i]->stmt) : -1);
1745 93332974 : return h.end ();
1746 : }
1747 : inline bool
1748 81629928 : bst_traits::equal (value_type existing, value_type candidate)
1749 : {
1750 244889784 : if (existing.length () != candidate.length ())
1751 : return false;
1752 83007214 : for (unsigned i = 0; i < existing.length (); ++i)
1753 78688398 : if (existing[i] != candidate[i])
1754 : return false;
1755 : return true;
1756 : }
1757 :
1758 : typedef hash_map <vec <stmt_vec_info>, slp_tree,
1759 : simple_hashmap_traits <bst_traits, slp_tree> >
1760 : scalar_stmts_to_slp_tree_map_t;
1761 :
1762 : /* Release BST_MAP. */
1763 :
/* Walk every entry of BST_MAP, free each non-NULL SLP node through
   vect_free_slp_tree and finally delete the map itself.  BST_MAP must not
   be used afterwards.  */
1764 : static void
1765 1790680 : release_scalar_stmts_to_slp_tree_map (scalar_stmts_to_slp_tree_map_t *bst_map)
1766 : {
1767 : /* The map keeps a reference on SLP nodes built, release that. */
1768 11002184 : for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
1769 20213688 : it != bst_map->end (); ++it)
1770 9211504 : if ((*it).second)
1771 9211504 : vect_free_slp_tree ((*it).second);
1772 1790680 : delete bst_map;
1773 1790680 : }
1774 :
1775 : /* ??? This was std::pair<std::pair<tree_code, vect_def_type>, tree>
1776 : but then vec::insert does memmove and that's not compatible with
1777 : std::pair. */
/* One leaf of a linearized chain: operand OP with def type DT, entering
   the chain under operation CODE.  */
1778 : struct chain_op_t
1779 : {
1780 3708863 : chain_op_t (tree_code code_, vect_def_type dt_, tree op_)
1781 3708863 : : code (code_), dt (dt_), op (op_) {}
1782 : tree_code code;
1783 : vect_def_type dt;
1784 : tree op;
1785 : };
1786 :
1787 : /* Comparator for sorting associatable chains. */
1788 :
/* OP1_ and OP2_ point to chain_op_t entries.  Order by def type first and
   by operation code second; the third argument is unused.  */
1789 : static int
1790 8251107 : dt_sort_cmp (const void *op1_, const void *op2_, void *)
1791 : {
1792 8251107 : auto *op1 = (const chain_op_t *) op1_;
1793 8251107 : auto *op2 = (const chain_op_t *) op2_;
1794 8251107 : if (op1->dt != op2->dt)
1795 940598 : return (int)op1->dt - (int)op2->dt;
1796 7310509 : return (int)op1->code - (int)op2->code;
1797 : }
1798 :
1799 : /* Linearize the associatable expression chain at START with the
1800 : associatable operation CODE (where PLUS_EXPR also allows MINUS_EXPR),
1801 : filling CHAIN with the result and using WORKLIST as intermediate storage.
1802 : CODE_STMT and ALT_CODE_STMT are filled with the first stmt using CODE
1803 : or MINUS_EXPR. *CHAIN_STMTS if not NULL is filled with all computation
1804 : stmts, starting with START. When ALLOW_ALT_CODE is false, do not
1805 : follow into MINUS_EXPR when building a PLUS chain (treat MINUS as leaf). */
1806 :
/* Implementation notes: WORKLIST entries pair a stmt with the code it was
   queued under (CODE for START).  WORKLIST is drained before returning.
   Operands that are not followed are appended to CHAIN as leaves.  */
1807 : static void
1808 1680052 : vect_slp_linearize_chain (vec_info *vinfo,
1809 : vec<std::pair<tree_code, gimple *> > &worklist,
1810 : vec<chain_op_t> &chain,
1811 : enum tree_code code, gimple *start,
1812 : gimple *&code_stmt, gimple *&alt_code_stmt,
1813 : vec<gimple *> *chain_stmts,
1814 : bool allow_alt_code = true)
1815 : {
1816 : /* For each lane linearize the addition/subtraction (or other
1817 : uniform associatable operation) expression tree. */
1818 1680052 : worklist.safe_push (std::make_pair (code, start));
1819 3708863 : while (!worklist.is_empty ())
1820 : {
1821 2028811 : auto entry = worklist.pop ();
1822 2028811 : gassign *stmt = as_a <gassign *> (entry.second);
/* IN_CODE is the code STMT was queued with, THIS_CODE is STMT's own
   rhs code.  */
1823 2028811 : enum tree_code in_code = entry.first;
1824 4057622 : enum tree_code this_code = gimple_assign_rhs_code (stmt);
1825 : /* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE. */
1826 2028811 : if (!code_stmt
1827 2028811 : && gimple_assign_rhs_code (stmt) == code)
1828 1427004 : code_stmt = stmt;
1829 601807 : else if (!alt_code_stmt
1830 601807 : && gimple_assign_rhs_code (stmt) == MINUS_EXPR)
1831 305611 : alt_code_stmt = stmt;
1832 2028811 : if (chain_stmts)
1833 1948091 : chain_stmts->safe_push (stmt);
/* Visit operands 1 and 2 of STMT.  */
1834 6086433 : for (unsigned opnum = 1; opnum <= 2; ++opnum)
1835 : {
1836 4057622 : tree op = gimple_op (stmt, opnum);
1837 4057622 : vect_def_type dt;
1838 4057622 : stmt_vec_info def_stmt_info;
1839 4057622 : bool res = vect_is_simple_use (op, vinfo, &dt, &def_stmt_info);
1840 4057622 : gcc_assert (res);
/* For a pattern stmt continue with the lhs of the pattern stmt.  */
1841 4057622 : if (dt == vect_internal_def
1842 4057622 : && is_pattern_stmt_p (def_stmt_info))
1843 9112 : op = gimple_get_lhs (def_stmt_info->stmt);
1844 4057622 : gimple *use_stmt;
1845 4057622 : use_operand_p use_p;
/* Follow into the definition only if it is an internal def with a
   single immediate use and is an assignment using CODE, or MINUS_EXPR
   when CODE is PLUS_EXPR and ALLOW_ALT_CODE is set.  */
1846 4057622 : if (dt == vect_internal_def
1847 3767721 : && single_imm_use (op, &use_p, &use_stmt)
1848 2338256 : && is_gimple_assign (def_stmt_info->stmt)
1849 6214990 : && (gimple_assign_rhs_code (def_stmt_info->stmt) == code
1850 1808924 : || (allow_alt_code
1851 56905 : && code == PLUS_EXPR
1852 36106 : && (gimple_assign_rhs_code (def_stmt_info->stmt)
1853 : == MINUS_EXPR))))
1854 : {
/* Compute the code the operand enters with: the first operand of a
   MINUS_EXPR counts as PLUS_EXPR, and if STMT itself was queued as
   MINUS_EXPR the result is inverted (PLUS_EXPR becomes MINUS_EXPR,
   anything else becomes PLUS_EXPR).  */
1855 348759 : tree_code op_def_code = this_code;
1856 348759 : if (op_def_code == MINUS_EXPR && opnum == 1)
1857 50974 : op_def_code = PLUS_EXPR;
1858 348759 : if (in_code == MINUS_EXPR)
1859 135 : op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1860 348759 : worklist.safe_push (std::make_pair (op_def_code,
1861 348759 : def_stmt_info->stmt));
1862 : }
1863 : else
1864 : {
/* Leaf operand: same code computation as above, but record the
   operand in CHAIN instead of queueing its definition.  */
1865 3708863 : tree_code op_def_code = this_code;
1866 3708863 : if (op_def_code == MINUS_EXPR && opnum == 1)
1867 254754 : op_def_code = PLUS_EXPR;
1868 3708863 : if (in_code == MINUS_EXPR)
1869 3769 : op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1870 3708863 : chain.safe_push (chain_op_t (op_def_code, dt, op));
1871 : }
1872 : }
1873 : }
1874 1680052 : }
1875 :
1876 : /* Distance from the node currently being discovered to the closest upthread
1877 : commutative operation whose operand-zero discovery may still be fixed by
1878 : retrying with swapped operands, or -1U if there is none. */
1879 :
/* File-scope mutable state, initialized to the "none" value -1U.  */
1880 : static unsigned least_upthread_swappable_op_distance = -1U;
1881 :
/* Forward declaration; vect_build_slp_tree below calls this before its
   definition.  */
1882 : static slp_tree
1883 : vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1884 : vec<stmt_vec_info> stmts, unsigned int group_size,
1885 : poly_uint64 *max_nunits,
1886 : bool *matches, unsigned *limit, unsigned *tree_size,
1887 : scalar_stmts_to_slp_tree_map_t *bst_map);
1888 :
/* Build, or re-use from BST_MAP, the SLP node for the scalar stmts STMTS
   with GROUP_SIZE lanes.

   If BST_MAP already has an entry for STMTS it is re-used: a successfully
   built node gets its reference count incremented, *MAX_NUNITS updated,
   STMTS released, and is returned; for a failed node the recorded lane
   flags are copied to MATCHES and NULL is returned.

   Otherwise, for STMTS with more than one element *LIMIT is checked and
   decremented (returning NULL with MATCHES cleared when it is zero), a
   stub node is entered into BST_MAP under a copy of STMTS and
   vect_build_slp_tree_2 is asked to fill it.  On failure the stub is
   marked invalid and MATCHES is saved in it; on success the node gets an
   extra reference for the map.  Returns the node or NULL on failure.  */
1889 : static slp_tree
1890 6246869 : vect_build_slp_tree (vec_info *vinfo,
1891 : vec<stmt_vec_info> stmts, unsigned int group_size,
1892 : poly_uint64 *max_nunits,
1893 : bool *matches, unsigned *limit, unsigned *tree_size,
1894 : scalar_stmts_to_slp_tree_map_t *bst_map)
1895 : {
/* Re-use an earlier discovery result for the same stmt set.  */
1896 6246869 : if (slp_tree *leader = bst_map->get (stmts))
1897 : {
1898 475333 : if (dump_enabled_p ())
1899 17184 : dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
1900 17184 : !(*leader)->failed ? "" : "failed ",
1901 : (void *) *leader);
1902 475333 : if (!(*leader)->failed)
1903 : {
1904 430275 : SLP_TREE_REF_COUNT (*leader)++;
1905 430275 : vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
1906 430275 : stmts.release ();
1907 430275 : return *leader;
1908 : }
/* Cached failure: hand back the lane flags recorded at that time.
   STMTS is not released on this path.  */
1909 45058 : memcpy (matches, (*leader)->failed, sizeof (bool) * group_size);
1910 45058 : return NULL;
1911 : }
1912 :
1913 : /* Single-lane SLP doesn't have the chance of run-away, do not account
1914 : it to the limit. */
1915 5771536 : if (stmts.length () > 1)
1916 : {
1917 3185259 : if (*limit == 0)
1918 : {
1919 1239 : if (dump_enabled_p ())
1920 15 : dump_printf_loc (MSG_NOTE, vect_location,
1921 : "SLP discovery limit exceeded\n");
1922 1239 : memset (matches, 0, sizeof (bool) * group_size);
1923 1239 : return NULL;
1924 : }
1925 3184020 : --*limit;
1926 : }
1927 :
1928 : /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
1929 : so we can pick up backedge destinations during discovery. */
1930 5770297 : slp_tree res = new _slp_tree;
1931 5770297 : SLP_TREE_DEF_TYPE (res) = vect_internal_def;
1932 5770297 : SLP_TREE_SCALAR_STMTS (res) = stmts;
/* The map key is a separate copy of STMTS.  */
1933 5770297 : bst_map->put (stmts.copy (), res);
1934 :
1935 5770297 : if (dump_enabled_p ())
1936 146543 : dump_printf_loc (MSG_NOTE, vect_location,
1937 : "starting SLP discovery for node %p\n", (void *) res);
1938 :
1939 5770297 : poly_uint64 this_max_nunits = 1;
1940 5770297 : slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
1941 : &this_max_nunits,
1942 : matches, limit, tree_size, bst_map);
1943 5770297 : if (!res_)
1944 : {
1945 2037326 : if (dump_enabled_p ())
1946 8484 : dump_printf_loc (MSG_NOTE, vect_location,
1947 : "SLP discovery for node %p failed\n", (void *) res);
1948 : /* Mark the node invalid so we can detect those when still in use
1949 : as backedge destinations. */
1950 2037326 : SLP_TREE_SCALAR_STMTS (res) = vNULL;
1951 2037326 : SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1952 2037326 : res->failed = XNEWVEC (bool, group_size);
/* With checking enabled verify that at least one lane is flagged as
   not matching.  */
1953 2037326 : if (flag_checking)
1954 : {
1955 : unsigned i;
1956 3621687 : for (i = 0; i < group_size; ++i)
1957 3621687 : if (!matches[i])
1958 : break;
1959 2037326 : gcc_assert (i < group_size);
1960 : }
1961 2037326 : memcpy (res->failed, matches, sizeof (bool) * group_size);
1962 : }
1963 : else
1964 : {
1965 3732971 : if (dump_enabled_p ())
1966 138059 : dump_printf_loc (MSG_NOTE, vect_location,
1967 : "SLP discovery for node %p succeeded\n",
1968 : (void *) res);
1969 3732971 : gcc_assert (res_ == res);
1970 3732971 : res->max_nunits = this_max_nunits;
1971 3732971 : vect_update_max_nunits (max_nunits, this_max_nunits);
1972 : /* Keep a reference for the bst_map use. */
1973 3732971 : SLP_TREE_REF_COUNT (res)++;
1974 : }
1975 : return res_;
1976 : }
1977 :
1978 : /* Helper for building an associated SLP node chain. */
1979 :
/* Create two new internal nodes, each with children OP0 and OP1 and with
   representatives OPER1 and OPER2 respectively, and turn PERM into a
   VEC_PERM_EXPR node over those two nodes using lane permutation LPERM.
   All three nodes get VECTYPE and the lane count of OP1.  */
1980 : static void
1981 158 : vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype,
1982 : slp_tree op0, slp_tree op1,
1983 : stmt_vec_info oper1, stmt_vec_info oper2,
1984 : vec<std::pair<unsigned, unsigned> > lperm)
1985 : {
1986 158 : unsigned group_size = SLP_TREE_LANES (op1);
1987 :
1988 158 : slp_tree child1 = new _slp_tree;
1989 158 : SLP_TREE_DEF_TYPE (child1) = vect_internal_def;
1990 158 : SLP_TREE_VECTYPE (child1) = vectype;
1991 158 : SLP_TREE_LANES (child1) = group_size;
1992 158 : SLP_TREE_CHILDREN (child1).create (2);
1993 158 : SLP_TREE_CHILDREN (child1).quick_push (op0);
1994 158 : SLP_TREE_CHILDREN (child1).quick_push (op1);
1995 158 : SLP_TREE_REPRESENTATIVE (child1) = oper1;
1996 :
/* The second node uses OP0 and OP1 again, so each gets one more
   reference.  */
1997 158 : slp_tree child2 = new _slp_tree;
1998 158 : SLP_TREE_DEF_TYPE (child2) = vect_internal_def;
1999 158 : SLP_TREE_VECTYPE (child2) = vectype;
2000 158 : SLP_TREE_LANES (child2) = group_size;
2001 158 : SLP_TREE_CHILDREN (child2).create (2);
2002 158 : SLP_TREE_CHILDREN (child2).quick_push (op0);
2003 158 : SLP_TREE_REF_COUNT (op0)++;
2004 158 : SLP_TREE_CHILDREN (child2).quick_push (op1);
2005 158 : SLP_TREE_REF_COUNT (op1)++;
2006 158 : SLP_TREE_REPRESENTATIVE (child2) = oper2;
2007 :
2008 158 : SLP_TREE_DEF_TYPE (perm) = vect_internal_def;
2009 158 : SLP_TREE_CODE (perm) = VEC_PERM_EXPR;
2010 158 : SLP_TREE_VECTYPE (perm) = vectype;
2011 158 : SLP_TREE_LANES (perm) = group_size;
2012 : /* ??? We should set this NULL but that's not expected. */
2013 158 : SLP_TREE_REPRESENTATIVE (perm) = oper1;
2014 158 : SLP_TREE_LANE_PERMUTATION (perm) = lperm;
/* NOTE(review): quick_push without a preceding create here; presumably the
   caller reserved room for two children in PERM -- confirm at call sites.  */
2015 158 : SLP_TREE_CHILDREN (perm).quick_push (child1);
2016 158 : SLP_TREE_CHILDREN (perm).quick_push (child2);
2017 158 : }
2018 :
2019 : /* Recursively build an SLP tree starting from NODE.
2020 : Fail (and return NULL) if def-stmts are not
2021 : isomorphic, require data permutation or are of unsupported types of
2022 : operation. Otherwise, return the built SLP node.
2023 : On failure at least one entry of MATCHES is false, flagging a lane
2024 : that did not match. */
2025 :
2026 : static slp_tree
2027 5770297 : vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
2028 : vec<stmt_vec_info> stmts, unsigned int group_size,
2029 : poly_uint64 *max_nunits,
2030 : bool *matches, unsigned *limit, unsigned *tree_size,
2031 : scalar_stmts_to_slp_tree_map_t *bst_map)
2032 : {
2033 5770297 : unsigned nops, i, this_tree_size = 0;
2034 5770297 : poly_uint64 this_max_nunits = *max_nunits;
2035 :
2036 5770297 : matches[0] = false;
2037 :
2038 5770297 : stmt_vec_info stmt_info = stmts[0];
2039 5770297 : if (!is_a<gcall *> (stmt_info->stmt)
2040 : && !is_a<gassign *> (stmt_info->stmt)
2041 : && !is_a<gphi *> (stmt_info->stmt))
2042 : return NULL;
2043 :
2044 5770226 : nops = gimple_num_args (stmt_info->stmt);
2045 5770226 : if (const int *map = vect_get_operand_map (stmt_info))
2046 35130 : nops = map[0];
2047 :
2048 : /* If the SLP node is a PHI (induction or reduction), terminate
2049 : the recursion. */
2050 5770226 : bool *skip_args = XALLOCAVEC (bool, nops);
2051 5770226 : memset (skip_args, 0, sizeof (bool) * nops);
2052 5770226 : if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2053 2782864 : if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
2054 : {
2055 299972 : tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
2056 299972 : tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
2057 : group_size);
2058 299972 : if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
2059 : max_nunits))
2060 : return NULL;
2061 :
2062 296088 : vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
2063 296088 : if (def_type == vect_induction_def)
2064 : {
2065 : /* Induction PHIs are not cycles but walk the initial
2066 : value. Only for inner loops though, for outer loops
2067 : we need to pick up the value from the actual PHIs
2068 : to more easily support peeling and epilogue vectorization. */
2069 189878 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2070 189878 : if (!nested_in_vect_loop_p (loop, stmt_info))
2071 189054 : skip_args[loop_preheader_edge (loop)->dest_idx] = true;
2072 : else
2073 : loop = loop->inner;
2074 189878 : skip_args[loop_latch_edge (loop)->dest_idx] = true;
2075 : }
2076 106210 : else if (def_type == vect_reduction_def
2077 : || def_type == vect_double_reduction_def
2078 : || def_type == vect_nested_cycle
2079 106210 : || def_type == vect_first_order_recurrence)
2080 : {
2081 : /* Else def types have to match. */
2082 : stmt_vec_info other_info;
2083 : bool all_same = true;
2084 240628 : FOR_EACH_VEC_ELT (stmts, i, other_info)
2085 : {
2086 135732 : if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
2087 1782918 : return NULL;
2088 135726 : if (other_info != stmt_info)
2089 26231 : all_same = false;
2090 : }
2091 104896 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2092 : /* Reduction initial values are not explicitly represented. */
2093 104896 : if (def_type != vect_first_order_recurrence
2094 104896 : && gimple_bb (stmt_info->stmt) == loop->header)
2095 101731 : skip_args[loop_preheader_edge (loop)->dest_idx] = true;
2096 : /* Reduction chain backedge defs are filled manually.
2097 : ??? Need a better way to identify a SLP reduction chain PHI.
2098 : Or a better overall way to SLP match those. */
2099 104896 : if (stmts.length () > 1
2100 104896 : && all_same && def_type == vect_reduction_def)
2101 2354 : skip_args[loop_latch_edge (loop)->dest_idx] = true;
2102 : }
2103 1308 : else if (def_type != vect_internal_def)
2104 : return NULL;
2105 : }
2106 :
2107 :
2108 5766336 : bool two_operators = false;
2109 5766336 : unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
2110 5766336 : tree vectype = NULL_TREE;
2111 5766336 : if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
2112 : &this_max_nunits, matches, &two_operators,
2113 : &vectype))
2114 : return NULL;
2115 :
2116 : /* If the SLP node is a load, terminate the recursion unless masked. */
2117 4200182 : if (STMT_VINFO_DATA_REF (stmt_info)
2118 2034754 : && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2119 : {
2120 900598 : if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2121 : gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
2122 : else
2123 : {
2124 882030 : *max_nunits = this_max_nunits;
2125 882030 : (*tree_size)++;
2126 882030 : node = vect_create_new_slp_node (node, stmts, 0);
2127 882030 : SLP_TREE_VECTYPE (node) = vectype;
2128 : /* And compute the load permutation. Whether it is actually
2129 : a permutation depends on the unrolling factor which is
2130 : decided later. */
2131 882030 : vec<unsigned> load_permutation;
2132 882030 : int j;
2133 882030 : stmt_vec_info load_info;
2134 882030 : load_permutation.create (group_size);
2135 882030 : stmt_vec_info first_stmt_info
2136 882030 : = STMT_VINFO_GROUPED_ACCESS (stmt_info)
2137 882030 : ? DR_GROUP_FIRST_ELEMENT (stmt_info) : stmt_info;
2138 882030 : bool any_permute = false;
2139 2125659 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
2140 : {
2141 1243629 : int load_place;
2142 1243629 : if (! load_info)
2143 : {
2144 40227 : if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2145 : load_place = j;
2146 : else
2147 : load_place = 0;
2148 : }
2149 1203402 : else if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2150 701430 : load_place = vect_get_place_in_interleaving_chain
2151 701430 : (load_info, first_stmt_info);
2152 : else
2153 : /* Recognize the splat case as { 0, 0, ... } but make
2154 : sure to use the appropriate refs for collections
2155 : of invariant refs. */
2156 501972 : load_place = (load_info == stmt_info) ? 0 : j;
2157 741898 : gcc_assert (load_place != -1);
2158 1243629 : any_permute |= load_place != j;
2159 1243629 : load_permutation.quick_push (load_place);
2160 : }
2161 :
2162 882030 : if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
2163 : {
2164 3416 : gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD));
2165 3416 : bool has_gaps = false;
2166 3416 : if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2167 219 : for (stmt_vec_info si = DR_GROUP_NEXT_ELEMENT (first_stmt_info);
2168 1596 : si; si = DR_GROUP_NEXT_ELEMENT (si))
2169 1377 : if (DR_GROUP_GAP (si) != 1)
2170 200 : has_gaps = true;
2171 : /* We cannot handle permuted masked loads directly, see
2172 : PR114375. We cannot handle strided masked loads or masked
2173 : loads with gaps unless the mask is uniform. */
2174 3416 : if ((STMT_VINFO_GROUPED_ACCESS (stmt_info)
2175 219 : && (DR_GROUP_GAP (first_stmt_info) != 0
2176 159 : || (has_gaps
2177 65 : && STMT_VINFO_SLP_VECT_ONLY (first_stmt_info))))
2178 6737 : || STMT_VINFO_STRIDED_P (stmt_info))
2179 : {
2180 108 : load_permutation.release ();
2181 108 : matches[0] = false;
2182 878776 : return NULL;
2183 : }
2184 :
2185 : /* For permuted masked loads do an unpermuted masked load of
2186 : the whole group followed by a SLP permute node. */
2187 3308 : if (any_permute
2188 3308 : || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2189 86 : && DR_GROUP_SIZE (first_stmt_info) != group_size))
2190 : {
2191 : /* Discover the whole unpermuted load. */
2192 54 : vec<stmt_vec_info> stmts2;
2193 54 : unsigned dr_group_size = STMT_VINFO_GROUPED_ACCESS (stmt_info)
2194 98 : ? DR_GROUP_SIZE (first_stmt_info) : 1;
2195 54 : stmts2.create (dr_group_size);
2196 54 : stmts2.quick_grow_cleared (dr_group_size);
2197 54 : unsigned i = 0;
2198 54 : for (stmt_vec_info si = first_stmt_info;
2199 854 : si; si = DR_GROUP_NEXT_ELEMENT (si))
2200 : {
2201 800 : if (si != first_stmt_info)
2202 3146 : for (unsigned k = 1; k < DR_GROUP_GAP (si); ++k)
2203 2400 : stmts2[i++] = NULL;
2204 800 : stmts2[i++] = si;
2205 : }
2206 54 : bool *matches2 = XALLOCAVEC (bool, dr_group_size);
2207 54 : slp_tree unperm_load
2208 54 : = vect_build_slp_tree (vinfo, stmts2, dr_group_size,
2209 : &this_max_nunits, matches2, limit,
2210 54 : &this_tree_size, bst_map);
2211 : /* When we are able to do the full masked load emit that
2212 : followed by 'node' being the desired final permutation. */
2213 54 : if (unperm_load)
2214 : {
2215 16 : gcc_assert
2216 : (!SLP_TREE_LOAD_PERMUTATION (unperm_load).exists ());
2217 16 : lane_permutation_t lperm;
2218 16 : lperm.create (group_size);
2219 56 : for (unsigned j = 0; j < load_permutation.length (); ++j)
2220 40 : lperm.quick_push
2221 40 : (std::make_pair (0, load_permutation[j]));
2222 16 : SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2223 16 : SLP_TREE_CHILDREN (node).safe_push (unperm_load);
2224 16 : SLP_TREE_LANE_PERMUTATION (node) = lperm;
2225 16 : load_permutation.release ();
2226 16 : return node;
2227 : }
2228 38 : stmts2.release ();
2229 38 : load_permutation.release ();
2230 38 : matches[0] = false;
2231 38 : return NULL;
2232 : }
2233 3254 : load_permutation.release ();
2234 : }
2235 : else
2236 : {
2237 878614 : if (!any_permute
2238 766274 : && STMT_VINFO_GROUPED_ACCESS (stmt_info)
2239 1168840 : && group_size == DR_GROUP_SIZE (first_stmt_info))
2240 126494 : load_permutation.release ();
2241 878614 : SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
2242 878614 : return node;
2243 : }
2244 : }
2245 : }
2246 3299584 : else if (gimple_assign_single_p (stmt_info->stmt)
2247 2272402 : && !gimple_vuse (stmt_info->stmt)
2248 3307205 : && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
2249 : {
2250 : /* vect_build_slp_tree_2 determined all BIT_FIELD_REFs reference
2251 : the same SSA name vector of a compatible type to vectype. */
2252 2210 : vec<std::pair<unsigned, unsigned> > lperm = vNULL;
2253 2210 : tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
2254 2210 : stmt_vec_info estmt_info;
2255 6972 : FOR_EACH_VEC_ELT (stmts, i, estmt_info)
2256 : {
2257 4909 : gassign *estmt = as_a <gassign *> (estmt_info->stmt);
2258 4909 : tree bfref = gimple_assign_rhs1 (estmt);
2259 4909 : HOST_WIDE_INT lane;
2260 4909 : if (!known_eq (bit_field_size (bfref),
2261 : tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
2262 9671 : || !constant_multiple_p (bit_field_offset (bfref),
2263 4762 : bit_field_size (bfref), &lane))
2264 : {
2265 147 : lperm.release ();
2266 147 : matches[0] = false;
2267 147 : return NULL;
2268 : }
2269 4762 : lperm.safe_push (std::make_pair (0, (unsigned)lane));
2270 : }
2271 2063 : slp_tree vnode = vect_create_new_slp_node (vNULL);
2272 2063 : if (operand_equal_p (TYPE_SIZE (vectype), TYPE_SIZE (TREE_TYPE (vec))))
2273 : /* ??? We record vectype here but we hide eventually necessary
2274 : punning and instead rely on code generation to materialize
2275 : VIEW_CONVERT_EXPRs as necessary. We instead should make
2276 : this explicit somehow. */
2277 625 : SLP_TREE_VECTYPE (vnode) = vectype;
2278 : else
2279 : {
2280 : /* For different size but compatible elements we can still
2281 : use VEC_PERM_EXPR without punning. */
2282 1438 : gcc_assert (VECTOR_TYPE_P (TREE_TYPE (vec))
2283 : && types_compatible_p (TREE_TYPE (vectype),
2284 : TREE_TYPE (TREE_TYPE (vec))));
2285 1438 : SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
2286 : }
2287 2063 : auto nunits = TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (vnode));
2288 2063 : unsigned HOST_WIDE_INT const_nunits;
2289 2063 : if (nunits.is_constant (&const_nunits))
2290 2063 : SLP_TREE_LANES (vnode) = const_nunits;
2291 2063 : SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
2292 : /* We are always building a permutation node even if it is an identity
2293 : permute to shield the rest of the vectorizer from the odd node
2294 : representing an actual vector without any scalar ops.
2295 : ??? We could hide it completely with making the permute node
2296 : external? */
2297 2063 : node = vect_create_new_slp_node (node, stmts, 1);
2298 2063 : SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2299 2063 : SLP_TREE_LANE_PERMUTATION (node) = lperm;
2300 2063 : SLP_TREE_VECTYPE (node) = vectype;
2301 2063 : SLP_TREE_CHILDREN (node).quick_push (vnode);
2302 2063 : return node;
2303 : }
2304 : /* When discovery reaches an associatable operation see whether we can
2305 : improve that to match up lanes in a way superior to the operand
2306 : swapping code which at most looks at two defs.
2307 : ??? For BB vectorization we cannot do the brute-force search
2308 : for matching as we can succeed by means of builds from scalars
2309 : and have no good way to "cost" one build against another. */
2310 3297374 : else if (is_a <loop_vec_info> (vinfo)
2311 : /* Do not bother for single-lane SLP. */
2312 1965858 : && group_size > 1
2313 : /* ??? We don't handle !vect_internal_def defs below. */
2314 111996 : && STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2315 : /* ??? Do not associate a reduction, this will wreck REDUC_IDX
2316 : mapping as long as that exists on the stmt_info level. */
2317 86482 : && STMT_VINFO_REDUC_IDX (stmt_info) == -1
2318 77886 : && is_gimple_assign (stmt_info->stmt)
2319 77570 : && (associative_tree_code (gimple_assign_rhs_code (stmt_info->stmt))
2320 50947 : || gimple_assign_rhs_code (stmt_info->stmt) == MINUS_EXPR)
2321 3325881 : && ((FLOAT_TYPE_P (vectype) && flag_associative_math)
2322 16252 : || (INTEGRAL_TYPE_P (TREE_TYPE (vectype))
2323 13721 : && TYPE_OVERFLOW_WRAPS (TREE_TYPE (vectype)))))
2324 : {
2325 : /* See if we have a chain of (mixed) adds or subtracts or other
2326 : associatable ops. */
2327 21541 : enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
2328 21541 : if (code == MINUS_EXPR)
2329 799 : code = PLUS_EXPR;
2330 21541 : stmt_vec_info other_op_stmt_info = NULL;
2331 21541 : stmt_vec_info op_stmt_info = NULL;
2332 21541 : unsigned chain_len = 0;
2333 21541 : auto_vec<chain_op_t> chain;
2334 21541 : auto_vec<std::pair<tree_code, gimple *> > worklist;
2335 21541 : auto_vec<vec<chain_op_t> > chains (group_size);
2336 21541 : auto_vec<slp_tree, 4> children;
2337 21541 : bool hard_fail = true;
2338 22568 : for (unsigned lane = 0; lane < group_size; ++lane)
2339 : {
2340 22232 : if (!stmts[lane])
2341 : {
2342 : /* ??? Below we require lane zero is present. */
2343 0 : if (lane == 0)
2344 : {
2345 : hard_fail = false;
2346 21205 : break;
2347 : }
2348 0 : chains.quick_push (vNULL);
2349 0 : continue;
2350 : }
2351 : /* For each lane linearize the addition/subtraction (or other
2352 : uniform associatable operation) expression tree. */
2353 22232 : gimple *op_stmt = NULL, *other_op_stmt = NULL;
2354 22232 : vect_slp_linearize_chain (vinfo, worklist, chain, code,
2355 22232 : stmts[lane]->stmt, op_stmt, other_op_stmt,
2356 : NULL);
2357 22232 : if (!op_stmt_info && op_stmt)
2358 20939 : op_stmt_info = vinfo->lookup_stmt (op_stmt);
2359 22232 : if (!other_op_stmt_info && other_op_stmt)
2360 835 : other_op_stmt_info = vinfo->lookup_stmt (other_op_stmt);
2361 22232 : if (chain.length () == 2)
2362 : {
2363 : /* In a chain of just two elements resort to the regular
2364 : operand swapping scheme. Likewise if we run into a
2365 : length mismatch process regularly as well as we did not
2366 : process the other lanes we cannot report a good hint what
2367 : lanes to try swapping in the parent. */
2368 : hard_fail = false;
2369 : break;
2370 : }
2371 1030 : else if (chain_len == 0)
2372 376 : chain_len = chain.length ();
2373 1308 : else if (chain.length () != chain_len)
2374 : {
2375 : /* ??? Here we could slip in magic to compensate with
2376 : neutral operands. */
2377 3 : matches[lane] = false;
2378 3 : if (lane != group_size - 1)
2379 3 : matches[0] = false;
2380 : break;
2381 : }
2382 1027 : chains.quick_push (chain.copy ());
2383 1027 : chain.truncate (0);
2384 : }
2385 43082 : if (chains.length () == group_size)
2386 : {
2387 : /* We cannot yet use SLP_TREE_CODE to communicate the operation. */
2388 336 : if (!op_stmt_info)
2389 : {
2390 3 : hard_fail = false;
2391 3 : goto out;
2392 : }
2393 : /* Now we have a set of chains with the same length. */
2394 : /* 1. pre-sort according to def_type and operation. */
2395 1248 : for (unsigned lane = 0; lane < group_size; ++lane)
2396 1830 : chains[lane].stablesort (dt_sort_cmp, vinfo);
2397 333 : if (dump_enabled_p ())
2398 : {
2399 157 : dump_printf_loc (MSG_NOTE, vect_location,
2400 : "pre-sorted chains of %s\n",
2401 : get_tree_code_name (code));
2402 685 : for (unsigned lane = 0; lane < group_size; ++lane)
2403 : {
2404 528 : if (!stmts[lane])
2405 0 : dump_printf (MSG_NOTE, "--");
2406 : else
2407 2422 : for (unsigned opnum = 0; opnum < chain_len; ++opnum)
2408 3788 : dump_printf (MSG_NOTE, "%s %T ",
2409 1894 : get_tree_code_name (chains[lane][opnum].code),
2410 1894 : chains[lane][opnum].op);
2411 528 : dump_printf (MSG_NOTE, "\n");
2412 : }
2413 : }
2414 : /* 2. try to build children nodes, associating as necessary. */
2415 : /* 2a. prepare and perform early checks to avoid eating into
2416 : discovery limit unnecessarily. */
2417 333 : vect_def_type *dts = XALLOCAVEC (vect_def_type, chain_len);
2418 1407 : for (unsigned n = 0; n < chain_len; ++n)
2419 : {
2420 1074 : vect_def_type dt = chains[0][n].dt;
2421 1074 : unsigned lane;
2422 4177 : for (lane = 0; lane < group_size; ++lane)
2423 6206 : if (stmts[lane] && chains[lane][n].dt != dt)
2424 : {
2425 0 : if (dt == vect_constant_def
2426 0 : && chains[lane][n].dt == vect_external_def)
2427 : dt = vect_external_def;
2428 0 : else if (dt == vect_external_def
2429 0 : && chains[lane][n].dt == vect_constant_def)
2430 : ;
2431 : else
2432 : break;
2433 : }
2434 1074 : if (lane != group_size)
2435 : {
2436 0 : if (dump_enabled_p ())
2437 0 : dump_printf_loc (MSG_NOTE, vect_location,
2438 : "giving up on chain due to mismatched "
2439 : "def types\n");
2440 0 : matches[lane] = false;
2441 0 : if (lane != group_size - 1)
2442 0 : matches[0] = false;
2443 0 : goto out;
2444 : }
2445 1074 : dts[n] = dt;
2446 1074 : if (dt == vect_constant_def
2447 1074 : || dt == vect_external_def)
2448 : {
2449 : /* Check whether we can build the invariant. If we can't
2450 : we never will be able to. */
2451 93 : tree type = TREE_TYPE (chains[0][n].op);
2452 1074 : if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
2453 : && (TREE_CODE (type) == BOOLEAN_TYPE
2454 : || !can_duplicate_and_interleave_p (vinfo, group_size,
2455 : type)))
2456 : {
2457 : matches[0] = false;
2458 : goto out;
2459 : }
2460 : }
2461 981 : else if (dt != vect_internal_def)
2462 : {
2463 : /* Not sure, we might need sth special.
2464 : gcc.dg/vect/pr96854.c,
2465 : gfortran.dg/vect/fast-math-pr37021.f90
2466 : and gfortran.dg/vect/pr61171.f trigger. */
2467 : /* Soft-fail for now. */
2468 0 : hard_fail = false;
2469 0 : goto out;
2470 : }
2471 : }
2472 : /* 2b. do the actual build. */
2473 1349 : for (unsigned n = 0; n < chain_len; ++n)
2474 : {
2475 1036 : vect_def_type dt = dts[n];
2476 1036 : unsigned lane;
2477 1036 : if (dt == vect_constant_def
2478 1036 : || dt == vect_external_def)
2479 : {
2480 93 : vec<tree> ops;
2481 93 : ops.create (group_size);
2482 461 : for (lane = 0; lane < group_size; ++lane)
2483 275 : if (stmts[lane])
2484 275 : ops.quick_push (chains[lane][n].op);
2485 : else
2486 0 : ops.quick_push (NULL_TREE);
2487 93 : slp_tree child = vect_create_new_slp_node (ops);
2488 93 : SLP_TREE_DEF_TYPE (child) = dt;
2489 93 : children.safe_push (child);
2490 : }
2491 : else
2492 : {
2493 943 : vec<stmt_vec_info> op_stmts;
2494 943 : op_stmts.create (group_size);
2495 943 : slp_tree child = NULL;
2496 : /* Brute-force our way. We have to consider a lane
2497 : failing after fixing an earlier fail up in the
2498 : SLP discovery recursion. So track the current
2499 : permute per lane. */
2500 943 : unsigned *perms = XALLOCAVEC (unsigned, group_size);
2501 943 : memset (perms, 0, sizeof (unsigned) * group_size);
2502 1037 : do
2503 : {
2504 1037 : op_stmts.truncate (0);
2505 5080 : for (lane = 0; lane < group_size; ++lane)
2506 3006 : if (stmts[lane])
2507 3006 : op_stmts.quick_push
2508 3006 : (vinfo->lookup_def (chains[lane][n].op));
2509 : else
2510 0 : op_stmts.quick_push (NULL);
2511 1037 : child = vect_build_slp_tree (vinfo, op_stmts,
2512 : group_size, &this_max_nunits,
2513 : matches, limit,
2514 : &this_tree_size, bst_map);
2515 : /* ??? We're likely getting too many fatal mismatches
2516 : here so maybe we want to ignore them (but then we
2517 : have no idea which lanes fatally mismatched). */
2518 1037 : if (child || !matches[0])
2519 : break;
2520 : /* Swap another lane we have not yet matched up into
2521 : lanes that did not match. If we run out of
2522 : permute possibilities for a lane terminate the
2523 : search. */
2524 287 : bool term = false;
2525 287 : for (lane = 1; lane < group_size; ++lane)
2526 193 : if (!matches[lane])
2527 : {
2528 165 : if (n + perms[lane] + 1 == chain_len)
2529 : {
2530 : term = true;
2531 : break;
2532 : }
2533 146 : if (dump_enabled_p ())
2534 113 : dump_printf_loc (MSG_NOTE, vect_location,
2535 : "swapping operand %d and %d "
2536 : "of lane %d\n",
2537 : n, n + perms[lane] + 1, lane);
2538 292 : std::swap (chains[lane][n],
2539 146 : chains[lane][n + perms[lane] + 1]);
2540 146 : perms[lane]++;
2541 : }
2542 113 : if (term)
2543 : break;
2544 : }
2545 : while (1);
2546 943 : if (!child)
2547 : {
2548 20 : if (dump_enabled_p ())
2549 18 : dump_printf_loc (MSG_NOTE, vect_location,
2550 : "failed to match up op %d\n", n);
2551 20 : op_stmts.release ();
2552 20 : if (lane != group_size - 1)
2553 10 : matches[0] = false;
2554 : else
2555 10 : matches[lane] = false;
2556 20 : goto out;
2557 : }
2558 923 : if (dump_enabled_p ())
2559 : {
2560 421 : dump_printf_loc (MSG_NOTE, vect_location,
2561 : "matched up op %d to\n", n);
2562 421 : vect_print_slp_tree (MSG_NOTE, vect_location, child);
2563 : }
2564 923 : children.safe_push (child);
2565 : }
2566 : }
2567 : /* 3. build SLP nodes to combine the chain. */
2568 1153 : for (unsigned lane = 0; lane < group_size; ++lane)
2569 1692 : if (stmts[lane] && chains[lane][0].code != code)
2570 : {
2571 : /* See if there's any alternate all-PLUS entry. */
2572 : unsigned n;
2573 6 : for (n = 1; n < chain_len; ++n)
2574 : {
2575 30 : for (lane = 0; lane < group_size; ++lane)
2576 48 : if (stmts[lane] && chains[lane][n].code != code)
2577 : break;
2578 6 : if (lane == group_size)
2579 : break;
2580 : }
2581 6 : if (n != chain_len)
2582 : {
2583 : /* Swap that in at first position. */
2584 6 : std::swap (children[0], children[n]);
2585 30 : for (lane = 0; lane < group_size; ++lane)
2586 24 : if (stmts[lane])
2587 24 : std::swap (chains[lane][0], chains[lane][n]);
2588 : }
2589 : else
2590 : {
2591 : /* ??? When this triggers and we end up with two
2592 : vect_constant/external_def up-front things break (ICE)
2593 : spectacularly finding an insertion place for the
2594 : all-constant op. We should have a fully
2595 : vect_internal_def operand though(?) so we can swap
2596 : that into first place and then prepend the all-zero
2597 : constant. */
2598 0 : if (dump_enabled_p ())
2599 0 : dump_printf_loc (MSG_NOTE, vect_location,
2600 : "inserting constant zero to compensate "
2601 : "for (partially) negated first "
2602 : "operand\n");
2603 0 : chain_len++;
2604 0 : for (lane = 0; lane < group_size; ++lane)
2605 0 : if (stmts[lane])
2606 0 : chains[lane].safe_insert
2607 0 : (0, chain_op_t (code, vect_constant_def, NULL_TREE));
2608 0 : vec<tree> zero_ops;
2609 0 : zero_ops.create (group_size);
2610 0 : zero_ops.quick_push (build_zero_cst (TREE_TYPE (vectype)));
2611 0 : for (lane = 1; lane < group_size; ++lane)
2612 0 : if (stmts[lane])
2613 0 : zero_ops.quick_push (zero_ops[0]);
2614 : else
2615 0 : zero_ops.quick_push (NULL_TREE);
2616 0 : slp_tree zero = vect_create_new_slp_node (zero_ops);
2617 0 : SLP_TREE_DEF_TYPE (zero) = vect_constant_def;
2618 0 : children.safe_insert (0, zero);
2619 : }
2620 : break;
2621 : }
2622 1011 : for (unsigned i = 1; i < children.length (); ++i)
2623 : {
2624 698 : slp_tree op0 = children[i - 1];
2625 698 : slp_tree op1 = children[i];
2626 698 : bool this_two_op = false;
2627 2560 : for (unsigned lane = 0; lane < group_size; ++lane)
2628 4040 : if (stmts[lane] && chains[lane][i].code != chains[0][i].code)
2629 : {
2630 : this_two_op = true;
2631 : break;
2632 : }
2633 698 : slp_tree child;
2634 698 : if (i == children.length () - 1)
2635 313 : child = vect_create_new_slp_node (node, stmts, 2);
2636 : else
2637 385 : child = vect_create_new_slp_node (2, ERROR_MARK);
2638 698 : if (this_two_op)
2639 : {
2640 158 : vec<std::pair<unsigned, unsigned> > lperm;
2641 158 : lperm.create (group_size);
2642 570 : for (unsigned lane = 0; lane < group_size; ++lane)
2643 824 : lperm.quick_push (std::make_pair
2644 412 : (chains[lane][i].code != chains[0][i].code, lane));
2645 316 : vect_slp_build_two_operator_nodes (child, vectype, op0, op1,
2646 158 : (chains[0][i].code == code
2647 : ? op_stmt_info
2648 : : other_op_stmt_info),
2649 158 : (chains[0][i].code == code
2650 : ? other_op_stmt_info
2651 : : op_stmt_info),
2652 : lperm);
2653 : }
2654 : else
2655 : {
2656 540 : SLP_TREE_DEF_TYPE (child) = vect_internal_def;
2657 540 : SLP_TREE_VECTYPE (child) = vectype;
2658 540 : SLP_TREE_LANES (child) = group_size;
2659 540 : SLP_TREE_CHILDREN (child).quick_push (op0);
2660 540 : SLP_TREE_CHILDREN (child).quick_push (op1);
2661 540 : SLP_TREE_REPRESENTATIVE (child)
2662 1080 : = (chains[0][i].code == code
2663 540 : ? op_stmt_info : other_op_stmt_info);
2664 : }
2665 698 : children[i] = child;
2666 : }
2667 313 : *tree_size += this_tree_size + 1;
2668 313 : *max_nunits = this_max_nunits;
2669 1513 : while (!chains.is_empty ())
2670 864 : chains.pop ().release ();
2671 : return node;
2672 : }
2673 21205 : out:
2674 21228 : if (dump_enabled_p ())
2675 2815 : dump_printf_loc (MSG_NOTE, vect_location,
2676 : "failed to line up SLP graph by re-associating "
2677 : "operations in lanes%s\n",
2678 : !hard_fail ? " trying regular discovery" : "");
2679 21233 : while (!children.is_empty ())
2680 5 : vect_free_slp_tree (children.pop ());
2681 21391 : while (!chains.is_empty ())
2682 163 : chains.pop ().release ();
2683 : /* Hard-fail, otherwise we might run into quadratic processing of the
2684 : chains starting one stmt into the chain again. */
2685 21228 : if (hard_fail)
2686 : return NULL;
2687 : /* Fall thru to normal processing. */
2688 21541 : }
2689 :
2690 : /* Get at the operands, verifying they are compatible. */
2691 3318860 : vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
2692 3318860 : slp_oprnd_info oprnd_info;
2693 16005983 : FOR_EACH_VEC_ELT (stmts, i, stmt_info)
2694 : {
2695 25376642 : int res = vect_get_and_check_slp_defs (vinfo, vectype,
2696 12688321 : swap[i], skip_args,
2697 : stmts, i, &oprnds_info);
2698 12688321 : if (res != 0)
2699 544308 : matches[(res == -1) ? 0 : i] = false;
2700 12688321 : if (!matches[0])
2701 : break;
2702 : }
2703 15694349 : for (i = 0; i < group_size; ++i)
2704 12588292 : if (!matches[i])
2705 : {
2706 212803 : vect_free_oprnd_info (oprnds_info);
2707 212803 : return NULL;
2708 : }
2709 9318171 : swap = NULL;
2710 :
2711 9318171 : bool has_two_operators_perm = false;
2712 18636342 : auto_vec<unsigned> two_op_perm_indices[2];
2713 3106057 : vec<stmt_vec_info> two_op_scalar_stmts[2] = {vNULL, vNULL};
2714 :
2715 3120298 : if (two_operators && oprnds_info.length () == 2 && group_size > 2)
2716 : {
2717 3822 : unsigned idx = 0;
2718 3822 : hash_map<gimple *, unsigned> seen;
2719 3822 : vec<slp_oprnd_info> new_oprnds_info
2720 3822 : = vect_create_oprnd_info (1, group_size);
2721 3822 : bool success = true;
2722 :
2723 3822 : enum tree_code code = ERROR_MARK;
2724 3822 : if (oprnds_info[0]->def_stmts[0]
2725 3822 : && is_a<gassign *> (oprnds_info[0]->def_stmts[0]->stmt))
2726 3764 : code = gimple_assign_rhs_code (oprnds_info[0]->def_stmts[0]->stmt);
2727 3822 : basic_block bb = nullptr;
2728 :
2729 7424 : for (unsigned j = 0; j < group_size; ++j)
2730 : {
2731 17398 : FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2732 : {
2733 13796 : stmt_vec_info stmt_info = oprnd_info->def_stmts[j];
2734 13796 : if (!stmt_info
2735 13635 : || !is_a<gassign *> (stmt_info->stmt)
2736 13632 : || gimple_assign_rhs_code (stmt_info->stmt) != code
2737 24233 : || skip_args[i])
2738 : {
2739 : success = false;
2740 3363 : break;
2741 : }
2742 : /* Avoid mixing lanes with defs in different basic-blocks. */
2743 10437 : if (!bb)
2744 3940 : bb = gimple_bb (vect_orig_stmt (stmt_info)->stmt);
2745 8261 : else if (gimple_bb (vect_orig_stmt (stmt_info)->stmt) != bb)
2746 : {
2747 : success = false;
2748 : break;
2749 : }
2750 :
2751 10433 : bool exists;
2752 10433 : unsigned &stmt_idx
2753 10433 : = seen.get_or_insert (stmt_info->stmt, &exists);
2754 :
2755 10433 : if (!exists)
2756 : {
2757 9092 : new_oprnds_info[0]->def_stmts.safe_push (stmt_info);
2758 9092 : new_oprnds_info[0]->ops.safe_push (oprnd_info->ops[j]);
2759 9092 : stmt_idx = idx;
2760 9092 : idx++;
2761 : }
2762 :
2763 10433 : two_op_perm_indices[i].safe_push (stmt_idx);
2764 : }
2765 :
2766 6965 : if (!success)
2767 : break;
2768 : }
2769 :
2770 3822 : if (success && idx == group_size)
2771 : {
2772 94 : if (dump_enabled_p ())
2773 : {
2774 0 : dump_printf_loc (MSG_NOTE, vect_location,
2775 : "Replace two_operators operands:\n");
2776 :
2777 0 : FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2778 : {
2779 0 : dump_printf_loc (MSG_NOTE, vect_location,
2780 : "Operand %u:\n", i);
2781 0 : for (unsigned j = 0; j < group_size; j++)
2782 0 : dump_printf_loc (MSG_NOTE, vect_location, "\tstmt %u %G",
2783 0 : j, oprnd_info->def_stmts[j]->stmt);
2784 : }
2785 :
2786 0 : dump_printf_loc (MSG_NOTE, vect_location,
2787 : "With a single operand:\n");
2788 0 : for (unsigned j = 0; j < group_size; j++)
2789 0 : dump_printf_loc (MSG_NOTE, vect_location, "\tstmt %u %G",
2790 0 : j, new_oprnds_info[0]->def_stmts[j]->stmt);
2791 : }
2792 :
2793 94 : two_op_scalar_stmts[0].safe_splice (oprnds_info[0]->def_stmts);
2794 94 : two_op_scalar_stmts[1].safe_splice (oprnds_info[1]->def_stmts);
2795 :
2796 94 : new_oprnds_info[0]->first_op_type = oprnds_info[0]->first_op_type;
2797 94 : new_oprnds_info[0]->first_dt = oprnds_info[0]->first_dt;
2798 94 : new_oprnds_info[0]->any_pattern = oprnds_info[0]->any_pattern;
2799 94 : new_oprnds_info[0]->first_gs_p = oprnds_info[0]->first_gs_p;
2800 94 : new_oprnds_info[0]->first_gs_info = oprnds_info[0]->first_gs_info;
2801 :
2802 94 : vect_free_oprnd_info (oprnds_info);
2803 94 : oprnds_info = new_oprnds_info;
2804 94 : nops = 1;
2805 94 : has_two_operators_perm = true;
2806 : }
2807 : else
2808 3728 : vect_free_oprnd_info (new_oprnds_info);
2809 3822 : }
2810 :
2811 6212114 : auto_vec<slp_tree, 4> children;
2812 :
2813 3106057 : stmt_info = stmts[0];
2814 :
2815 3106057 : int reduc_idx = -1;
2816 3106057 : int gs_scale = 0;
2817 3106057 : tree gs_base = NULL_TREE;
2818 :
2819 : /* Create SLP_TREE nodes for the definition node/s. */
2820 7929154 : FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2821 : {
2822 4950456 : slp_tree child = nullptr;
2823 4950456 : unsigned int j;
2824 4950456 : unsigned old_swap_distance;
2825 4950456 : bool can_swap;
2826 4950456 : bool can_swap_nonmatching;
2827 4950456 : bool *stmt_can_swap;
2828 :
2829 : /* We're skipping certain operands from processing, for example
2830 : outer loop reduction initial defs. */
2831 4950456 : if (skip_args[i])
2832 : {
2833 483017 : children.safe_push (NULL);
2834 5306114 : continue;
2835 : }
2836 :
2837 4467439 : if (oprnd_info->first_dt == vect_uninitialized_def)
2838 : {
2839 : /* COND_EXPR have one too many eventually if the condition
2840 : is a SSA name. */
2841 0 : gcc_assert (i == 3 && nops == 4);
2842 0 : continue;
2843 : }
2844 :
2845 4467439 : if (oprnd_info->first_gs_p)
2846 : {
2847 22438 : gs_scale = oprnd_info->first_gs_info.scale;
2848 22438 : gs_base = oprnd_info->first_gs_info.base;
2849 : }
2850 :
2851 4467439 : if (is_a <bb_vec_info> (vinfo)
2852 1551776 : && oprnd_info->first_dt == vect_internal_def
2853 5267604 : && !oprnd_info->any_pattern)
2854 : {
2855 : /* For BB vectorization, if all defs are the same do not
2856 : bother to continue the build along the single-lane
2857 : graph but use a splat of the scalar value. */
2858 757195 : stmt_vec_info first_def = oprnd_info->def_stmts[0];
2859 813292 : for (j = 1; j < group_size; ++j)
2860 773209 : if (oprnd_info->def_stmts[j] != first_def)
2861 : break;
2862 757195 : if (j == group_size
2863 : /* But avoid doing this for loads where we may be
2864 : able to CSE things, unless the stmt is not
2865 : vectorizable. */
2866 757195 : && (!STMT_VINFO_VECTORIZABLE (first_def)
2867 48786 : || !gimple_vuse (first_def->stmt)))
2868 : {
2869 31199 : if (dump_enabled_p ())
2870 107 : dump_printf_loc (MSG_NOTE, vect_location,
2871 : "Using a splat of the uniform operand %G",
2872 : first_def->stmt);
2873 31199 : oprnd_info->first_dt = vect_external_def;
2874 : }
2875 : }
2876 :
2877 4467439 : if (oprnd_info->first_dt == vect_external_def
2878 4467439 : || oprnd_info->first_dt == vect_constant_def)
2879 : {
2880 1466279 : if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ())
2881 : {
2882 : tree op0;
2883 : tree uniform_val = op0 = oprnd_info->ops[0];
2884 : for (j = 1; j < oprnd_info->ops.length (); ++j)
2885 : if (oprnd_info->ops[j]
2886 : && !operand_equal_p (uniform_val, oprnd_info->ops[j]))
2887 : {
2888 : uniform_val = NULL_TREE;
2889 : break;
2890 : }
2891 : if (!uniform_val
2892 : && !can_duplicate_and_interleave_p (vinfo,
2893 : oprnd_info->ops.length (),
2894 : TREE_TYPE (op0)))
2895 : {
2896 : matches[j] = false;
2897 : if (dump_enabled_p ())
2898 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2899 : "Build SLP failed: invalid type of def "
2900 : "for variable-length SLP %T\n", op0);
2901 : goto fail;
2902 : }
2903 : }
2904 1466279 : slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
2905 1466279 : SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
2906 1466279 : oprnd_info->ops = vNULL;
2907 1466279 : children.safe_push (invnode);
2908 1466279 : continue;
2909 1466279 : }
2910 :
2911 : /* See which SLP operand a reduction chain continues on. We want
2912 : to chain even PHIs but not backedges. */
2913 3001160 : if (STMT_VINFO_REDUC_DEF (oprnd_info->def_stmts[0])
2914 3001160 : || STMT_VINFO_REDUC_IDX (oprnd_info->def_stmts[0]) != -1)
2915 : {
2916 233366 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
2917 : {
2918 776 : if (oprnd_info->first_dt == vect_double_reduction_def)
2919 388 : reduc_idx = i;
2920 : }
2921 232590 : else if (is_a <gphi *> (stmt_info->stmt)
2922 232590 : && gimple_phi_num_args
2923 99770 : (as_a <gphi *> (stmt_info->stmt)) != 1)
2924 : ;
2925 133213 : else if (STMT_VINFO_REDUC_IDX (stmt_info) == -1
2926 393 : && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2927 : ;
2928 133213 : else if (reduc_idx == -1)
2929 124804 : reduc_idx = i;
2930 : else
2931 : /* For .COND_* reduction operations the else value can be the
2932 : same as one of the operation operands. The other def
2933 : stmts have been moved, so we can't check easily. Check
2934 : it's a call at least. */
2935 8409 : gcc_assert (is_a <gcall *> (stmt_info->stmt));
2936 : }
2937 :
2938 : /* When we have a masked load with uniform mask discover this
2939 : as a single-lane mask with a splat permute. This way we can
2940 : recognize this as a masked load-lane by stripping the splat. */
2941 3001160 : if (is_a <gcall *> (STMT_VINFO_STMT (stmt_info))
2942 57555 : && gimple_call_internal_p (STMT_VINFO_STMT (stmt_info),
2943 : IFN_MASK_LOAD)
2944 6075 : && STMT_VINFO_GROUPED_ACCESS (stmt_info)
2945 3001237 : && ! STMT_VINFO_SLP_VECT_ONLY (DR_GROUP_FIRST_ELEMENT (stmt_info)))
2946 : {
2947 35 : vec<stmt_vec_info> def_stmts2;
2948 35 : def_stmts2.create (1);
2949 35 : def_stmts2.quick_push (oprnd_info->def_stmts[0]);
2950 35 : child = vect_build_slp_tree (vinfo, def_stmts2, 1,
2951 : &this_max_nunits,
2952 : matches, limit,
2953 : &this_tree_size, bst_map);
2954 35 : if (child)
2955 : {
2956 35 : slp_tree pnode = vect_create_new_slp_node (1, VEC_PERM_EXPR);
2957 35 : SLP_TREE_VECTYPE (pnode) = SLP_TREE_VECTYPE (child);
2958 35 : SLP_TREE_LANES (pnode) = group_size;
2959 35 : SLP_TREE_SCALAR_STMTS (pnode).create (group_size);
2960 35 : SLP_TREE_LANE_PERMUTATION (pnode).create (group_size);
2961 210 : for (unsigned k = 0; k < group_size; ++k)
2962 : {
2963 175 : SLP_TREE_SCALAR_STMTS (pnode)
2964 175 : .quick_push (oprnd_info->def_stmts[0]);
2965 175 : SLP_TREE_LANE_PERMUTATION (pnode)
2966 175 : .quick_push (std::make_pair (0u, 0u));
2967 : }
2968 35 : SLP_TREE_CHILDREN (pnode).quick_push (child);
2969 35 : pnode->max_nunits = child->max_nunits;
2970 35 : children.safe_push (pnode);
2971 35 : oprnd_info->def_stmts = vNULL;
2972 35 : continue;
2973 35 : }
2974 : else
2975 0 : def_stmts2.release ();
2976 : }
2977 :
2978 6002250 : can_swap = (i == 0
2979 2224575 : && (nops == 2 || nops == 3)
2980 1429788 : && oprnds_info.length () > 1
2981 1429788 : && oprnds_info[1]->first_dt == vect_internal_def
2982 583916 : && (is_gimple_assign (stmt_info->stmt)
2983 49541 : || is_gimple_call (stmt_info->stmt))
2984 : /* Swapping operands for reductions breaks assumptions
2985 : later on. */
2986 3540738 : && STMT_VINFO_REDUC_IDX (stmt_info) == -1);
2987 3001125 : can_swap_nonmatching = can_swap;
2988 3001125 : stmt_can_swap = NULL;
2989 3001125 : if (can_swap)
2990 : {
2991 484258 : stmt_can_swap = XALLOCAVEC (bool, group_size);
2992 8103630 : for (j = 0; j < group_size; ++j)
2993 : {
2994 7619372 : stmt_can_swap[j] = false;
2995 7619372 : if (!stmts[j])
2996 : /* NULL lanes are gaps and have no stmt to swap. */
2997 0 : stmt_can_swap[j] = true;
2998 7619372 : else if (gassign *stmt = dyn_cast <gassign *> (stmts[j]->stmt))
2999 : {
3000 7613668 : tree_code code = gimple_assign_rhs_code (stmt);
3001 15227336 : stmt_can_swap[j] = (commutative_tree_code (code)
3002 7613668 : || commutative_ternary_tree_code (code));
3003 : }
3004 5704 : else if (gcall *call = dyn_cast <gcall *> (stmts[j]->stmt))
3005 : {
3006 5704 : internal_fn fn = (gimple_call_internal_p (call)
3007 5704 : ? gimple_call_internal_fn (call) : IFN_LAST);
3008 11408 : stmt_can_swap[j] = ((commutative_binary_fn_p (fn)
3009 5388 : || commutative_ternary_fn_p (fn))
3010 5740 : && first_commutative_argument (fn) == 0);
3011 : }
3012 :
3013 7619372 : if (j != 0 && !stmt_can_swap[j])
3014 7619372 : can_swap_nonmatching = false;
3015 : }
3016 : }
3017 :
3018 3001125 : old_swap_distance = least_upthread_swappable_op_distance;
3019 3001125 : if (can_swap_nonmatching)
3020 451435 : least_upthread_swappable_op_distance = 1;
3021 2549690 : else if (least_upthread_swappable_op_distance != -1U)
3022 302365 : least_upthread_swappable_op_distance++;
3023 3001125 : child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
3024 : group_size, &this_max_nunits,
3025 : matches, limit,
3026 : &this_tree_size, bst_map);
3027 3001125 : least_upthread_swappable_op_distance = old_swap_distance;
3028 3001125 : if (child != NULL)
3029 : {
3030 2513915 : oprnd_info->def_stmts = vNULL;
3031 2513915 : children.safe_push (child);
3032 2513915 : continue;
3033 : }
3034 :
3035 : /* If the SLP build for operand zero failed and operand zero
3036 : and one can be commuted try that for the scalar stmts
3037 : that failed the match. */
3038 487210 : if (/* A first scalar stmt mismatch signals a fatal mismatch. */
3039 487210 : matches[0]
3040 258013 : && can_swap)
3041 : {
3042 : /* See whether we can swap the matching or the non-matching
3043 : stmt operands. */
3044 : bool swap_not_matching = true;
3045 65186 : do
3046 : {
3047 7103690 : for (j = 0; j < group_size; ++j)
3048 : {
3049 7053440 : if (matches[j] != !swap_not_matching)
3050 86583 : continue;
3051 : /* Verify if we can swap operands of this stmt. */
3052 6966857 : if (!stmt_can_swap[j])
3053 : {
3054 14936 : if (!swap_not_matching)
3055 6951 : goto fail;
3056 : swap_not_matching = false;
3057 : break;
3058 : }
3059 : }
3060 : }
3061 58235 : while (j != group_size);
3062 :
3063 : /* Swap mismatched definition stmts. */
3064 50250 : if (dump_enabled_p ())
3065 392 : dump_printf_loc (MSG_NOTE, vect_location,
3066 : "Re-trying with swapped operands of stmts ");
3067 7079848 : for (j = 0; j < group_size; ++j)
3068 7029598 : if (matches[j] == !swap_not_matching)
3069 : {
3070 13903446 : std::swap (oprnds_info[0]->def_stmts[j],
3071 6951723 : oprnds_info[1]->def_stmts[j]);
3072 13903446 : std::swap (oprnds_info[0]->ops[j],
3073 6951723 : oprnds_info[1]->ops[j]);
3074 6951723 : if (dump_enabled_p ())
3075 1079 : dump_printf (MSG_NOTE, "%d ", j);
3076 : }
3077 50250 : if (dump_enabled_p ())
3078 392 : dump_printf (MSG_NOTE, "\n");
3079 : /* After swapping some operands we lost track whether an
3080 : operand has any pattern defs so be conservative here. */
3081 98065 : if (oprnds_info[0]->any_pattern || oprnds_info[1]->any_pattern)
3082 2888 : oprnds_info[0]->any_pattern = oprnds_info[1]->any_pattern = true;
3083 : /* And try again with scratch 'matches' ... */
3084 50250 : bool *tem = XALLOCAVEC (bool, group_size);
3085 50250 : if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
3086 : group_size, &this_max_nunits,
3087 : tem, limit,
3088 : &this_tree_size, bst_map)) != NULL)
3089 : {
3090 6313 : oprnd_info->def_stmts = vNULL;
3091 6313 : children.safe_push (child);
3092 6313 : continue;
3093 : }
3094 : }
3095 430009 : fail:
3096 :
3097 : /* If the SLP build failed and we analyze a basic-block
3098 : simply treat nodes we fail to build as externally defined
3099 : (and thus build vectors from the scalar defs).
3100 : The cost model will reject outright expensive cases.
3101 : ??? This doesn't treat cases where permutation ultimatively
3102 : fails (or we don't try permutation below). Ideally we'd
3103 : even compute a permutation that will end up with the maximum
3104 : SLP tree size... */
3105 480897 : if (is_a <bb_vec_info> (vinfo)
3106 : /* ??? Rejecting patterns this way doesn't work. We'd have to
3107 : do extra work to cancel the pattern so the uses see the
3108 : scalar version. */
3109 : /* Skip building vector operands from scalars while operand
3110 : discovery may still be fixed by retrying with swapped operands. */
3111 400181 : && (least_upthread_swappable_op_distance != 1
3112 : /* A first scalar stmt mismatch signals a fatal mismatch
3113 : that the parent commutative retry cannot recover. */
3114 26339 : || !matches[0])
3115 382276 : && !is_pattern_stmt_p (stmt_info)
3116 840810 : && !oprnd_info->any_pattern)
3117 : {
3118 : /* But if there's a leading vector sized set of matching stmts
3119 : fail here so we can split the group. This matches the condition
3120 : vect_analyze_slp_instance uses. */
3121 : /* ??? We might want to split here and combine the results to support
3122 : multiple vector sizes better. */
3123 557711 : for (j = 0; j < group_size; ++j)
3124 557711 : if (!matches[j])
3125 : break;
3126 359672 : if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype))
3127 359643 : && vect_slp_can_convert_to_external (oprnd_info->def_stmts))
3128 : {
3129 353538 : if (dump_enabled_p ())
3130 616 : dump_printf_loc (MSG_NOTE, vect_location,
3131 : "Building vector operands from scalars\n");
3132 353538 : this_tree_size++;
3133 353538 : child = vect_create_new_slp_node (oprnd_info->ops);
3134 353538 : children.safe_push (child);
3135 353538 : oprnd_info->ops = vNULL;
3136 353538 : continue;
3137 : }
3138 : }
3139 :
3140 127359 : gcc_assert (child == NULL);
3141 145969 : FOR_EACH_VEC_ELT (children, j, child)
3142 18610 : if (child)
3143 18610 : vect_free_slp_tree (child);
3144 127359 : vect_free_oprnd_info (oprnds_info);
3145 127359 : return NULL;
3146 : }
3147 :
3148 2978698 : vect_free_oprnd_info (oprnds_info);
3149 :
3150 : /* If we have all children of a child built up from uniform scalars
3151 : or does more than one possibly expensive vector construction then
3152 : just throw that away, causing it built up from scalars.
3153 : The exception is the SLP node for the vector store. */
3154 2978698 : if (is_a <bb_vec_info> (vinfo)
3155 1077928 : && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
3156 : /* ??? Rejecting patterns this way doesn't work. We'd have to
3157 : do extra work to cancel the pattern so the uses see the
3158 : scalar version. */
3159 3395756 : && !is_pattern_stmt_p (stmt_info))
3160 : {
3161 : slp_tree child;
3162 : unsigned j;
3163 : bool all_uniform_p = true;
3164 : unsigned n_vector_builds = 0;
3165 1183549 : FOR_EACH_VEC_ELT (children, j, child)
3166 : {
3167 791995 : if (!child)
3168 : ;
3169 791995 : else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3170 : all_uniform_p = false;
3171 576567 : else if (!vect_slp_tree_uniform_p (child))
3172 : {
3173 438512 : all_uniform_p = false;
3174 438512 : if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
3175 404210 : n_vector_builds++;
3176 : }
3177 : }
3178 391554 : if (all_uniform_p
3179 391554 : || n_vector_builds > 1
3180 661297 : || (n_vector_builds == children.length ()
3181 29744 : && is_a <gphi *> (stmt_info->stmt)))
3182 : {
3183 : /* Roll back. */
3184 126733 : matches[0] = false;
3185 402701 : FOR_EACH_VEC_ELT (children, j, child)
3186 275968 : if (child)
3187 275968 : vect_free_slp_tree (child);
3188 :
3189 126733 : if (dump_enabled_p ())
3190 205 : dump_printf_loc (MSG_NOTE, vect_location,
3191 : "Building parent vector operands from "
3192 : "scalars instead\n");
3193 126733 : return NULL;
3194 : }
3195 : }
3196 :
3197 2851965 : *tree_size += this_tree_size + 1;
3198 2851965 : *max_nunits = this_max_nunits;
3199 :
3200 2851965 : if (two_operators)
3201 : {
3202 : /* ??? We'd likely want to either cache in bst_map sth like
3203 : { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
3204 : the true { a+b, a+b, a+b, a+b } ... but there we don't have
3205 : explicit stmts to put in so the keying on 'stmts' doesn't
3206 : work (but we have the same issue with nodes that use 'ops'). */
3207 :
3208 6689 : if (has_two_operators_perm)
3209 : {
3210 40 : slp_tree child = children[0];
3211 40 : children.truncate (0);
3212 120 : for (i = 0; i < 2; i++)
3213 : {
3214 80 : slp_tree pnode
3215 80 : = vect_create_new_slp_node (two_op_scalar_stmts[i], 2);
3216 80 : SLP_TREE_CODE (pnode) = VEC_PERM_EXPR;
3217 80 : SLP_TREE_VECTYPE (pnode) = vectype;
3218 80 : SLP_TREE_CHILDREN (pnode).quick_push (child);
3219 80 : SLP_TREE_CHILDREN (pnode).quick_push (child);
3220 80 : lane_permutation_t& perm = SLP_TREE_LANE_PERMUTATION (pnode);
3221 80 : children.safe_push (pnode);
3222 :
3223 656 : for (unsigned j = 0; j < stmts.length (); j++)
3224 576 : perm.safe_push (std::make_pair (0, two_op_perm_indices[i][j]));
3225 : }
3226 :
3227 40 : SLP_TREE_REF_COUNT (child) += 4;
3228 : }
3229 :
3230 6689 : slp_tree one = new _slp_tree;
3231 6689 : slp_tree two = new _slp_tree;
3232 6689 : SLP_TREE_DEF_TYPE (one) = vect_internal_def;
3233 6689 : SLP_TREE_DEF_TYPE (two) = vect_internal_def;
3234 6689 : SLP_TREE_VECTYPE (one) = vectype;
3235 6689 : SLP_TREE_VECTYPE (two) = vectype;
3236 6689 : SLP_TREE_CHILDREN (one).safe_splice (children);
3237 6689 : SLP_TREE_CHILDREN (two).safe_splice (children);
3238 6689 : slp_tree child;
3239 26758 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
3240 13380 : SLP_TREE_REF_COUNT (child)++;
3241 :
3242 : /* Here we record the original defs since this
3243 : node represents the final lane configuration. */
3244 6689 : node = vect_create_new_slp_node (node, stmts, 2);
3245 6689 : SLP_TREE_VECTYPE (node) = vectype;
3246 6689 : SLP_TREE_CODE (node) = VEC_PERM_EXPR;
3247 6689 : SLP_TREE_CHILDREN (node).quick_push (one);
3248 6689 : SLP_TREE_CHILDREN (node).quick_push (two);
3249 6689 : enum tree_code code0 = ERROR_MARK;
3250 6689 : enum tree_code ocode = ERROR_MARK;
3251 6689 : if (gassign *stmt = dyn_cast <gassign *> (stmts[0]->stmt))
3252 6687 : code0 = gimple_assign_rhs_code (stmt);
3253 6689 : stmt_vec_info ostmt_info;
3254 6689 : unsigned j = 0;
3255 24440 : FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
3256 : {
3257 17751 : int op = 0;
3258 17751 : if (gassign *ostmt = dyn_cast <gassign *> (ostmt_info->stmt))
3259 : {
3260 17747 : if (gimple_assign_rhs_code (ostmt) != code0)
3261 : {
3262 8894 : ocode = gimple_assign_rhs_code (ostmt);
3263 : op = 1;
3264 : j = i;
3265 : }
3266 : }
3267 : else
3268 : {
3269 8 : if (gimple_call_combined_fn (stmts[0]->stmt)
3270 4 : != gimple_call_combined_fn (ostmt_info->stmt))
3271 : {
3272 2 : op = 1;
3273 2 : j = i;
3274 : }
3275 : }
3276 17751 : SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (op, i));
3277 : }
3278 6689 : SLP_TREE_CODE (one) = code0;
3279 6689 : SLP_TREE_CODE (two) = ocode;
3280 6689 : SLP_TREE_LANES (one) = stmts.length ();
3281 6689 : SLP_TREE_LANES (two) = stmts.length ();
3282 6689 : SLP_TREE_REPRESENTATIVE (one) = stmts[0];
3283 6689 : SLP_TREE_REPRESENTATIVE (two) = stmts[j];
3284 :
3285 6689 : return node;
3286 : }
3287 :
3288 2845276 : node = vect_create_new_slp_node (node, stmts, nops);
3289 2845276 : SLP_TREE_VECTYPE (node) = vectype;
3290 2845276 : SLP_TREE_CHILDREN (node).splice (children);
3291 2845276 : SLP_TREE_GS_SCALE (node) = gs_scale;
3292 2845276 : SLP_TREE_GS_BASE (node) = gs_base;
3293 2845276 : if (reduc_idx != -1)
3294 : {
3295 116410 : gcc_assert (STMT_VINFO_REDUC_IDX (stmt_info) != -1
3296 : || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
3297 : || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def);
3298 116410 : SLP_TREE_REDUC_IDX (node) = reduc_idx;
3299 116410 : node->cycle_info.id = SLP_TREE_CHILDREN (node)[reduc_idx]->cycle_info.id;
3300 : }
3301 : /* When reaching the reduction PHI, create a vect_reduc_info. */
3302 2728866 : else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
3303 2728866 : || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
3304 2728866 : && is_a <gphi *> (STMT_VINFO_STMT (stmt_info)))
3305 : {
3306 101731 : loop_vec_info loop_vinfo = as_a <loop_vec_info> (vinfo);
3307 101731 : gcc_assert (STMT_VINFO_REDUC_IDX (stmt_info) == -1);
3308 101731 : node->cycle_info.id = loop_vinfo->reduc_infos.length ();
3309 101731 : vect_reduc_info reduc_info = new vect_reduc_info_s ();
3310 101731 : loop_vinfo->reduc_infos.safe_push (reduc_info);
3311 101731 : stmt_vec_info reduc_phi = stmt_info;
3312 : /* ??? For double reductions vect_is_simple_reduction stores the
3313 : reduction type and code on the inner loop header PHI. */
3314 101731 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
3315 : {
3316 388 : use_operand_p use_p;
3317 388 : gimple *use_stmt;
3318 388 : bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
3319 : &use_p, &use_stmt);
3320 388 : gcc_assert (res);
3321 388 : reduc_phi = loop_vinfo->lookup_stmt (use_stmt);
3322 : }
3323 101731 : VECT_REDUC_INFO_DEF_TYPE (reduc_info) = STMT_VINFO_DEF_TYPE (stmt_info);
3324 101731 : VECT_REDUC_INFO_TYPE (reduc_info) = STMT_VINFO_REDUC_TYPE (reduc_phi);
3325 101731 : VECT_REDUC_INFO_CODE (reduc_info) = STMT_VINFO_REDUC_CODE (reduc_phi);
3326 101731 : VECT_REDUC_INFO_FN (reduc_info) = IFN_LAST;
3327 : }
3328 : return node;
3329 9318171 : }
3330 :
3331 : /* Dump a single SLP tree NODE.
   DUMP_KIND selects the dump flags and LOC supplies both the
   implementation location and the user location used for the output.
   The dump consists of a header line (def kind tag, node address,
   max_nunits, reference count, optional vector type, avoid-stlf-fail
   marker and cycle info), the operation for internal defs, the per-lane
   scalar stmts or scalar operands, any load and lane permutation, and
   finally the addresses of the children.  Children are only listed by
   address; this function does not recurse into them.  */
3332 :
3333 : static void
3334 445871 : vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
3335 : slp_tree node)
3336 : {
3337 445871 : unsigned i, j;
3338 445871 : slp_tree child;
3339 445871 : stmt_vec_info stmt_info;
3340 445871 : tree op;
3341 :
/* Split LOC into the metadata (built from DUMP_KIND plus the
   implementation location) and the user location that the
   dump_printf_loc calls below take.  */
3342 445871 : dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
3343 445871 : dump_user_location_t user_loc = loc.get_user_location ();
/* Header line: external and constant nodes get a tag, every other
   def type gets none.  */
3344 445871 : dump_printf_loc (metadata, user_loc,
3345 : "node%s %p (max_nunits=" HOST_WIDE_INT_PRINT_UNSIGNED
3346 : ", refcnt=%u)",
3347 445871 : SLP_TREE_DEF_TYPE (node) == vect_external_def
3348 : ? " (external)"
3349 : : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
3350 430148 : ? " (constant)"
3351 : : ""), (void *) node,
3352 445871 : estimated_poly_value (node->max_nunits),
3353 : SLP_TREE_REF_COUNT (node));
/* The vector type is only printed when it is set.  */
3354 445871 : if (SLP_TREE_VECTYPE (node))
3355 378124 : dump_printf (metadata, " %T", SLP_TREE_VECTYPE (node));
3356 445871 : dump_printf (metadata, "%s",
3357 445871 : node->avoid_stlf_fail ? " (avoid-stlf-fail)" : "");
/* Cycle info is only printed when either field differs from -1.  */
3358 445871 : if (node->cycle_info.id != -1 || node->cycle_info.reduc_idx != -1)
3359 23994 : dump_printf (metadata, " cycle %d, link %d", node->cycle_info.id,
3360 : node->cycle_info.reduc_idx);
3361 445871 : dump_printf (metadata, "\n");
/* Internal defs name their operation: a permute node is printed as
   VEC_PERM_EXPR, anything else via its representative stmt.  */
3362 445871 : if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
3363 : {
3364 363073 : if (SLP_TREE_PERMUTE_P (node))
3365 13801 : dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
3366 : else
3367 349272 : dump_printf_loc (metadata, user_loc, "op template: %G",
3368 349272 : SLP_TREE_REPRESENTATIVE (node)->stmt);
3369 : }
/* Per-lane contents.  When the node has scalar stmts print one line per
   lane: a NULL lane prints as "---", "[l*]" marks lanes contained in
   SLP_TREE_LIVE_LANES and "[l] " marks stmts with STMT_VINFO_LIVE_P.
   Otherwise print the scalar operands as a comma separated list in
   braces.  Note the first 'else' below belongs to 'if (stmt_info)' and
   the second one to the 'exists ()' check.  */
3370 445871 : if (SLP_TREE_SCALAR_STMTS (node).exists ())
3371 869239 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3372 514321 : if (stmt_info)
3373 508956 : dump_printf_loc (metadata, user_loc, "\t%sstmt %u %G",
3374 508956 : SLP_TREE_LIVE_LANES (node).contains (i)
3375 505284 : ? "[l*]" : (STMT_VINFO_LIVE_P (stmt_info)
3376 505284 : ? "[l] " : ""),
3377 : i, stmt_info->stmt);
3378 : else
3379 5365 : dump_printf_loc (metadata, user_loc, "\tstmt %u ---\n", i);
3380 : else
3381 : {
3382 90953 : dump_printf_loc (metadata, user_loc, "\t{ ");
3383 199800 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
3384 108847 : dump_printf (metadata, "%T%s ", op,
3385 108847 : i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
3386 90953 : dump_printf (metadata, "}\n");
3387 : }
/* NOTE(review): the continuation dump_printf calls from here on pass
   DUMP_KIND while the ones above pass METADATA; presumably this is
   equivalent through an implicit conversion -- confirm it is
   intentional.  */
3388 445871 : if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
3389 : {
3390 64912 : dump_printf_loc (metadata, user_loc, "\tload permutation {");
3391 148299 : FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
3392 83387 : dump_printf (dump_kind, " %u", j);
3393 64912 : dump_printf (dump_kind, " }\n");
3394 : }
/* Each lane permutation entry is printed as FIRST[SECOND]; a node with
   ldst_lanes set gets a "(load-lanes)" tag here.  */
3395 445871 : if (SLP_TREE_LANE_PERMUTATION (node).exists ())
3396 : {
3397 13809 : dump_printf_loc (metadata, user_loc, "\tlane permutation {");
3398 51913 : for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
3399 38104 : dump_printf (dump_kind, " %u[%u]",
3400 38104 : SLP_TREE_LANE_PERMUTATION (node)[i].first,
3401 38104 : SLP_TREE_LANE_PERMUTATION (node)[i].second);
3402 13809 : dump_printf (dump_kind, " }%s\n",
3403 13809 : node->ldst_lanes ? " (load-lanes)" : "");
3404 : }
/* Nodes without children get no children line.  */
3405 445871 : if (SLP_TREE_CHILDREN (node).is_empty ())
3406 169958 : return;
/* List the children by address.  "(store-lanes)" is appended when
   ldst_lanes is set and there is no lane permutation; with a lane
   permutation the "(load-lanes)" tag was already emitted above.  */
3407 275913 : dump_printf_loc (metadata, user_loc, "\tchildren");
3408 727946 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3409 452033 : dump_printf (dump_kind, " %p", (void *)child);
3410 275913 : dump_printf (dump_kind, "%s\n",
3411 275913 : node->ldst_lanes && !SLP_TREE_LANE_PERMUTATION (node).exists ()
3412 : ? " (store-lanes)" : "");
3413 : }
3414 :
3415 : DEBUG_FUNCTION void
3416 0 : debug (slp_tree node)
3417 : {
3418 0 : debug_dump_context ctx;
3419 0 : vect_print_slp_tree (MSG_NOTE,
3420 0 : dump_location_t::from_location_t (UNKNOWN_LOCATION),
3421 : node);
3422 0 : }
3423 :
3424 : /* Recursive helper for the dot producer below. */
3425 :
3426 : static void
3427 0 : dot_slp_tree (FILE *f, slp_tree node, hash_set<slp_tree> &visited)
3428 : {
3429 0 : if (visited.add (node))
3430 : return;
3431 :
3432 0 : fprintf (f, "\"%p\" [label=\"", (void *)node);
3433 0 : vect_print_slp_tree (MSG_NOTE,
3434 0 : dump_location_t::from_location_t (UNKNOWN_LOCATION),
3435 : node);
3436 0 : fprintf (f, "\"];\n");
3437 :
3438 :
3439 0 : for (slp_tree child : SLP_TREE_CHILDREN (node))
3440 0 : fprintf (f, "\"%p\" -> \"%p\";", (void *)node, (void *)child);
3441 :
3442 0 : for (slp_tree child : SLP_TREE_CHILDREN (node))
3443 0 : if (child)
3444 0 : dot_slp_tree (f, child, visited);
3445 : }
3446 :
3447 : DEBUG_FUNCTION void
3448 0 : dot_slp_tree (const char *fname, slp_tree node)
3449 : {
3450 0 : FILE *f = fopen (fname, "w");
3451 0 : fprintf (f, "digraph {\n");
3452 0 : fflush (f);
3453 0 : {
3454 0 : debug_dump_context ctx (f);
3455 0 : hash_set<slp_tree> visited;
3456 0 : dot_slp_tree (f, node, visited);
3457 0 : }
3458 0 : fflush (f);
3459 0 : fprintf (f, "}\n");
3460 0 : fclose (f);
3461 0 : }
3462 :
3463 : DEBUG_FUNCTION void
3464 0 : dot_slp_tree (const char *fname, const vec<slp_instance> &slp_instances)
3465 : {
3466 0 : FILE *f = fopen (fname, "w");
3467 0 : fprintf (f, "digraph {\n");
3468 0 : fflush (f);
3469 0 : {
3470 0 : debug_dump_context ctx (f);
3471 0 : hash_set<slp_tree> visited;
3472 0 : for (auto inst : slp_instances)
3473 0 : dot_slp_tree (f, SLP_INSTANCE_TREE (inst), visited);
3474 0 : }
3475 0 : fflush (f);
3476 0 : fprintf (f, "}\n");
3477 0 : fclose (f);
3478 0 : }
3479 :
3480 : /* Dump a slp tree NODE using flags specified in DUMP_KIND. */
3481 :
3482 : static void
3483 484806 : vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
3484 : slp_tree node, hash_set<slp_tree> &visited)
3485 : {
3486 484806 : unsigned i;
3487 484806 : slp_tree child;
3488 :
3489 484806 : if (visited.add (node))
3490 484806 : return;
3491 :
3492 445397 : vect_print_slp_tree (dump_kind, loc, node);
3493 :
3494 1342313 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3495 451519 : if (child)
3496 408685 : vect_print_slp_graph (dump_kind, loc, child, visited);
3497 : }
3498 :
3499 : static void
3500 46744 : vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
3501 : slp_tree entry)
3502 : {
3503 46744 : hash_set<slp_tree> visited;
3504 46744 : vect_print_slp_graph (dump_kind, loc, entry, visited);
3505 46744 : }
3506 :
3507 : DEBUG_FUNCTION void
3508 0 : debug (slp_instance instance)
3509 : {
3510 0 : debug_dump_context ctx;
3511 0 : vect_print_slp_graph (MSG_NOTE,
3512 0 : dump_location_t::from_location_t (UNKNOWN_LOCATION),
3513 : SLP_INSTANCE_TREE (instance));
3514 0 : }
3515 :
3516 :
3517 : /* Compute the set of scalar stmts participating in external nodes. */
3518 :
3519 : static void
3520 1553024 : vect_slp_gather_extern_scalar_stmts (vec_info *vinfo, slp_tree node,
3521 : hash_set<slp_tree> &visited,
3522 : hash_set<stmt_vec_info> &estmts)
3523 : {
3524 1553024 : if (visited.add (node))
3525 : return;
3526 :
3527 1509135 : if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
3528 : {
3529 : slp_tree child;
3530 : int i;
3531 1739136 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3532 872475 : if (child)
3533 872475 : vect_slp_gather_extern_scalar_stmts (vinfo, child, visited, estmts);
3534 : }
3535 : else
3536 3617994 : for (tree def : SLP_TREE_SCALAR_OPS (node))
3537 : {
3538 1691838 : stmt_vec_info def_stmt = vinfo->lookup_def (def);
3539 1691838 : if (def_stmt)
3540 333070 : estmts.add (def_stmt);
3541 : }
3542 : }
3543 :
3544 : /* Mark the original scalar stmt coverage of the vector SLP graph of VINFO
3545 : with STMT_SLP_TYPE == pure_slp. */
3546 :
3547 : static void
3548 234430 : vect_bb_slp_mark_stmts_vectorized (bb_vec_info vinfo)
3549 : {
3550 : /* Gather the scalar stmt leafs of the SLP graph to stop the below DFS
3551 : walk on. */
3552 234430 : hash_set<stmt_vec_info> scalar_stmts_in_externs;
3553 234430 : hash_set<slp_tree> visited;
3554 1383839 : for (auto instance : BB_VINFO_SLP_INSTANCES (vinfo))
3555 680549 : vect_slp_gather_extern_scalar_stmts (vinfo, SLP_INSTANCE_TREE (instance),
3556 : visited, scalar_stmts_in_externs);
3557 :
3558 : /* DFS walk scalar stmts to compute the vectorized coverage indicated
3559 : by STMT_SLP_TYPE (stmt) == pure_slp on the original scalar (non-pattern)
3560 : stmts. */
3561 1383839 : for (auto instance : BB_VINFO_SLP_INSTANCES (vinfo))
3562 : {
3563 789986 : for (auto stmt : SLP_INSTANCE_ROOT_STMTS (instance))
3564 51905 : if (!scalar_stmts_in_externs.contains (stmt))
3565 51377 : STMT_SLP_TYPE (stmt) = pure_slp;
3566 680549 : auto_vec<stmt_vec_info> worklist;
3567 3845941 : for (auto stmt : SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance)))
3568 : {
3569 1804294 : stmt = vect_orig_stmt (stmt);
3570 1804294 : if (!scalar_stmts_in_externs.contains (stmt)
3571 1804294 : && STMT_SLP_TYPE (stmt) != pure_slp)
3572 : {
3573 1794686 : STMT_SLP_TYPE (stmt) = pure_slp;
3574 1794686 : worklist.safe_push (stmt);
3575 : }
3576 : }
3577 3586899 : while (!worklist.is_empty ())
3578 : {
3579 2228675 : stmt_vec_info stmt = worklist.pop ();
3580 :
3581 : /* Now walk relevant parts of the SSA use-def graph. */
3582 2228675 : slp_oprnds child_ops (stmt);
3583 4690227 : for (unsigned i = 0; i < child_ops.num_slp_children; ++i)
3584 : {
3585 2461552 : tree op = child_ops.get_op_for_slp_child (stmt, i);
3586 2461552 : stmt_vec_info def = vinfo->lookup_def (op);
3587 2461552 : if (def
3588 849887 : && !scalar_stmts_in_externs.contains (def)
3589 2979498 : && STMT_SLP_TYPE (def) != pure_slp)
3590 : {
3591 433989 : STMT_SLP_TYPE (def) = pure_slp;
3592 433989 : worklist.safe_push (def);
3593 : }
3594 : }
3595 : }
3596 680549 : }
3597 234430 : }
3598 :
3599 : /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
3600 :
3601 : static void
3602 2403231 : vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
3603 : {
3604 2403231 : int i;
3605 2403231 : stmt_vec_info stmt_info;
3606 2403231 : slp_tree child;
3607 :
3608 2403231 : if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
3609 : return;
3610 :
3611 1432140 : if (visited.add (node))
3612 : return;
3613 :
3614 4298133 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3615 3002514 : if (stmt_info)
3616 : {
3617 3002514 : gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
3618 : || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
3619 3002514 : STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
3620 : }
3621 :
3622 2918644 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3623 1623025 : if (child)
3624 1623025 : vect_mark_slp_stmts_relevant (child, visited);
3625 : }
3626 :
3627 : static void
3628 780206 : vect_mark_slp_stmts_relevant (slp_tree node)
3629 : {
3630 780206 : hash_set<slp_tree> visited;
3631 780206 : vect_mark_slp_stmts_relevant (node, visited);
3632 780206 : }
3633 :
3634 :
3635 : /* Gather loads in the SLP graph NODE and populate the INST loads array. */
3636 :
3637 : static void
3638 10506174 : vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
3639 : hash_set<slp_tree> &visited)
3640 : {
3641 10506174 : if (!node || visited.add (node))
3642 1734858 : return;
3643 :
3644 8771316 : if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
3645 : return;
3646 :
3647 6509904 : if (!SLP_TREE_PERMUTE_P (node))
3648 : {
3649 6304258 : stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
3650 6304258 : if (STMT_VINFO_DATA_REF (stmt_info)
3651 2752610 : && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
3652 1552607 : loads.safe_push (node);
3653 : }
3654 :
3655 : unsigned i;
3656 : slp_tree child;
3657 14794840 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3658 8284936 : vect_gather_slp_loads (loads, child, visited);
3659 : }
3660 :
3661 :
3662 : /* Find the last store in SLP INSTANCE. */
3663 :
3664 : stmt_vec_info
3665 2726617 : vect_find_last_scalar_stmt_in_slp (slp_tree node)
3666 : {
3667 2726617 : stmt_vec_info last = NULL;
3668 2726617 : stmt_vec_info stmt_vinfo;
3669 :
3670 9941361 : for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
3671 7214744 : if (stmt_vinfo)
3672 : {
3673 7214744 : stmt_vinfo = vect_orig_stmt (stmt_vinfo);
3674 7214744 : last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
3675 : }
3676 :
3677 2726617 : return last;
3678 : }
3679 :
3680 : /* Find the first stmt in NODE. */
3681 :
3682 : stmt_vec_info
3683 527585 : vect_find_first_scalar_stmt_in_slp (slp_tree node)
3684 : {
3685 527585 : stmt_vec_info first = NULL;
3686 527585 : stmt_vec_info stmt_vinfo;
3687 :
3688 1790377 : for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
3689 1262792 : if (stmt_vinfo)
3690 : {
3691 1260098 : stmt_vinfo = vect_orig_stmt (stmt_vinfo);
3692 1260098 : if (!first
3693 1260098 : || get_later_stmt (stmt_vinfo, first) == first)
3694 : first = stmt_vinfo;
3695 : }
3696 :
3697 527585 : return first;
3698 : }
3699 :
3700 : /* Splits a group of stores, currently beginning at FIRST_VINFO, into
3701 : two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
3702 : (also containing the first GROUP1_SIZE stmts, since stores are
3703 : consecutive), the second containing the remainder.
3704 : Return the first stmt in the second group. */
3705 :
3706 : static stmt_vec_info
3707 156668 : vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
3708 : {
3709 156668 : gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
3710 156668 : gcc_assert (group1_size > 0);
3711 156668 : int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
3712 156668 : gcc_assert (group2_size > 0);
3713 156668 : DR_GROUP_SIZE (first_vinfo) = group1_size;
3714 :
3715 156668 : stmt_vec_info stmt_info = first_vinfo;
3716 526293 : for (unsigned i = group1_size; i > 1; i--)
3717 : {
3718 369625 : stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
3719 369625 : gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
3720 : }
3721 : /* STMT is now the last element of the first group. */
3722 156668 : stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
3723 156668 : DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
3724 :
3725 156668 : DR_GROUP_SIZE (group2) = group2_size;
3726 438742 : for (stmt_info = group2; stmt_info;
3727 282074 : stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
3728 : {
3729 282074 : DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
3730 282074 : gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
3731 : }
3732 :
3733 : /* For the second group, the DR_GROUP_GAP is that before the original group,
3734 : plus skipping over the first vector. */
3735 156668 : DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
3736 :
3737 : /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
3738 156668 : DR_GROUP_GAP (first_vinfo) += group2_size;
3739 :
3740 156668 : if (dump_enabled_p ())
3741 70 : dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
3742 : group1_size, group2_size);
3743 :
3744 156668 : return group2;
3745 : }
3746 :
3747 : /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
3748 : statements and a vector of NUNITS elements. */
3749 :
3750 : static poly_uint64
3751 4156729 : calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
3752 : {
3753 4156729 : return exact_div (common_multiple (nunits, group_size), group_size);
3754 : }
3755 :
3756 : /* Helper that checks to see if a node is a load node. */
3757 :
3758 : static inline bool
3759 102 : vect_is_slp_load_node (slp_tree root)
3760 : {
3761 102 : return (!SLP_TREE_PERMUTE_P (root)
3762 102 : && SLP_TREE_DEF_TYPE (root) == vect_internal_def
3763 96 : && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
3764 166 : && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root))));
3765 : }
3766 :
3767 :
3768 : /* Helper function of optimize_load_redistribution that performs the operation
3769 : recursively. */
3770 :
3771 : static slp_tree
3772 18862 : optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
3773 : vec_info *vinfo, unsigned int group_size,
3774 : hash_map<slp_tree, slp_tree> *load_map,
3775 : slp_tree root)
3776 : {
3777 18862 : if (slp_tree *leader = load_map->get (root))
3778 3517 : return *leader;
3779 :
3780 15345 : slp_tree node;
3781 15345 : unsigned i;
3782 :
3783 : /* For now, we don't know anything about externals so do not do anything. */
3784 15345 : if (!root || SLP_TREE_DEF_TYPE (root) != vect_internal_def)
3785 : return NULL;
3786 11386 : else if (SLP_TREE_PERMUTE_P (root))
3787 : {
3788 : /* First convert this node into a load node and add it to the leaves
3789 : list and flatten the permute from a lane to a load one. If it's
3790 : unneeded it will be elided later. */
3791 70 : vec<stmt_vec_info> stmts;
3792 70 : stmts.create (SLP_TREE_LANES (root));
3793 70 : lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
3794 134 : for (unsigned j = 0; j < lane_perm.length (); j++)
3795 : {
3796 102 : std::pair<unsigned, unsigned> perm = lane_perm[j];
3797 102 : node = SLP_TREE_CHILDREN (root)[perm.first];
3798 :
3799 102 : if (!vect_is_slp_load_node (node)
3800 102 : || SLP_TREE_CHILDREN (node).exists ())
3801 : {
3802 38 : stmts.release ();
3803 38 : goto next;
3804 : }
3805 :
3806 64 : stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
3807 : }
3808 :
3809 32 : if (dump_enabled_p ())
3810 0 : dump_printf_loc (MSG_NOTE, vect_location,
3811 : "converting stmts on permute node %p\n",
3812 : (void *) root);
3813 :
3814 32 : bool *matches = XALLOCAVEC (bool, group_size);
3815 32 : poly_uint64 max_nunits = 1;
3816 32 : unsigned tree_size = 0, limit = 1;
3817 32 : node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
3818 : matches, &limit, &tree_size, bst_map);
3819 32 : if (!node)
3820 0 : stmts.release ();
3821 :
3822 32 : load_map->put (root, node);
3823 32 : return node;
3824 : }
3825 :
3826 11316 : next:
3827 11354 : load_map->put (root, NULL);
3828 :
3829 26495 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
3830 : {
3831 15141 : slp_tree value
3832 15141 : = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
3833 : node);
3834 15141 : if (value)
3835 : {
3836 32 : SLP_TREE_REF_COUNT (value)++;
3837 32 : SLP_TREE_CHILDREN (root)[i] = value;
3838 : /* ??? We know the original leafs of the replaced nodes will
3839 : be referenced by bst_map, only the permutes created by
3840 : pattern matching are not. */
3841 32 : if (SLP_TREE_REF_COUNT (node) == 1)
3842 32 : load_map->remove (node);
3843 32 : vect_free_slp_tree (node);
3844 : }
3845 : }
3846 :
3847 : return NULL;
3848 : }
3849 :
3850 : /* Temporary workaround for loads not being CSEd during SLP build. This
3851 : function will traverse the SLP tree rooted in ROOT for INSTANCE and find
3852 : VEC_PERM nodes that blend vectors from multiple nodes that all read from the
3853 : same DR such that the final operation is equal to a permuted load. Such
3854 : NODES are then directly converted into LOADS themselves. The nodes are
3855 : CSEd using BST_MAP. */
3856 :
3857 : static void
3858 2838 : optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
3859 : vec_info *vinfo, unsigned int group_size,
3860 : hash_map<slp_tree, slp_tree> *load_map,
3861 : slp_tree root)
3862 : {
3863 2838 : slp_tree node;
3864 2838 : unsigned i;
3865 :
3866 6559 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
3867 : {
3868 3721 : slp_tree value
3869 3721 : = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
3870 : node);
3871 3721 : if (value)
3872 : {
3873 0 : SLP_TREE_REF_COUNT (value)++;
3874 0 : SLP_TREE_CHILDREN (root)[i] = value;
3875 : /* ??? We know the original leafs of the replaced nodes will
3876 : be referenced by bst_map, only the permutes created by
3877 : pattern matching are not. */
3878 0 : if (SLP_TREE_REF_COUNT (node) == 1)
3879 0 : load_map->remove (node);
3880 0 : vect_free_slp_tree (node);
3881 : }
3882 : }
3883 2838 : }
3884 :
3885 : /* Helper function of vect_match_slp_patterns.
3886 :
3887 : Attempts to match patterns against the slp tree rooted in REF_NODE using
3888 : VINFO. Patterns are matched in post-order traversal.
3889 :
3890 : If matching is successful the value in REF_NODE is updated and returned, if
3891 : not then it is returned unchanged. */
3892 :
3893 : static bool
3894 6079507 : vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
3895 : slp_tree_to_load_perm_map_t *perm_cache,
3896 : slp_compat_nodes_map_t *compat_cache,
3897 : hash_set<slp_tree> *visited)
3898 : {
3899 6079507 : unsigned i;
3900 6079507 : slp_tree node = *ref_node;
3901 6079507 : bool found_p = false;
3902 6079507 : if (!node || visited->add (node))
3903 870089 : return false;
3904 :
3905 : slp_tree child;
3906 9740484 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3907 4531066 : found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
3908 : vinfo, perm_cache, compat_cache,
3909 : visited);
3910 :
3911 15628254 : for (unsigned x = 0; x < num__slp_patterns; x++)
3912 : {
3913 10418836 : vect_pattern *pattern
3914 10418836 : = slp_patterns[x] (perm_cache, compat_cache, ref_node);
3915 10418836 : if (pattern)
3916 : {
3917 1107 : pattern->build (vinfo);
3918 1107 : delete pattern;
3919 1107 : found_p = true;
3920 : }
3921 : }
3922 :
3923 : return found_p;
3924 : }
3925 :
3926 : /* Applies pattern matching to the given SLP tree rooted in REF_NODE using
3927 : vec_info VINFO.
3928 :
3929 : The modified tree is returned. Patterns are tried in order and multiple
3930 : patterns may match. */
3931 :
3932 : static bool
3933 1548441 : vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
3934 : hash_set<slp_tree> *visited,
3935 : slp_tree_to_load_perm_map_t *perm_cache,
3936 : slp_compat_nodes_map_t *compat_cache)
3937 : {
3938 1548441 : DUMP_VECT_SCOPE ("vect_match_slp_patterns");
3939 1548441 : slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
3940 :
3941 1548441 : if (dump_enabled_p ())
3942 30574 : dump_printf_loc (MSG_NOTE, vect_location,
3943 : "Analyzing SLP tree %p for patterns\n",
3944 30574 : (void *) SLP_INSTANCE_TREE (instance));
3945 :
3946 1548441 : return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, compat_cache,
3947 1548441 : visited);
3948 : }
3949 :
3950 : /* STMT_INFO is a store group of size GROUP_SIZE that we are considering
3951 : vectorizing with VECTYPE that might be NULL. MASKED_P indicates whether
3952 : the stores are masked.
3953 : Return true if we could use IFN_STORE_LANES instead and if that appears
3954 : to be the better approach. */
3955 :
3956 : static bool
3957 6115 : vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
3958 : tree vectype, bool masked_p,
3959 : unsigned int group_size,
3960 : unsigned int new_group_size)
3961 : {
3962 6115 : if (!vectype)
3963 : {
3964 6115 : tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3965 6115 : vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
3966 : }
3967 6115 : if (!vectype)
3968 : return false;
3969 : /* Allow the split if one of the two new groups would operate on full
3970 : vectors *within* rather than across one scalar loop iteration.
3971 : This is purely a heuristic, but it should work well for group
3972 : sizes of 3 and 4, where the possible splits are:
3973 :
3974 : 3->2+1: OK if the vector has exactly two elements
3975 : 4->2+2: Likewise
3976 : 4->3+1: Less clear-cut. */
3977 6115 : if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
3978 3446 : || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
3979 2692 : return false;
3980 3423 : return vect_store_lanes_supported (vectype, group_size, masked_p) != IFN_LAST;
3981 : }
3982 :
3983 : /* Analyze an SLP instance starting from a group of grouped stores. Call
3984 : vect_build_slp_tree to build a tree of packed stmts if possible.
3985 : Return FALSE if it's impossible to SLP any stmt in the loop. */
3986 :
3987 : static bool
3988 : vect_analyze_slp_instance (vec_info *vinfo,
3989 : scalar_stmts_to_slp_tree_map_t *bst_map,
3990 : stmt_vec_info stmt_info, slp_instance_kind kind,
3991 : unsigned max_tree_size, unsigned *limit,
3992 : bool force_single_lane);
3993 :
3994 : /* Build an interleaving scheme for the store sources RHS_NODES from
3995 : SCALAR_STMTS.
     :
     : A new node is created for SCALAR_STMTS with one child per child of
     : RHS_NODES[0] and the vector type of RHS_NODES[0]. For each such child
     : index L a VEC_PERM_EXPR node is created whose children are the L-th
     : children of all RHS_NODES and whose lane permutation lists, per RHS node
     : in order, one lane for each of its scalar stmts. While such a permute
     : has more than two children, two of them are merged into an intermediate
     : two-input VEC_PERM_EXPR node and the lane permutation is re-indexed.
     : MAX_NUNITS is recorded as max_nunits on every node created here.
     : Returns the node created for SCALAR_STMTS. */
3996 :
3997 : static slp_tree
3998 8041 : vect_build_slp_store_interleaving (vec<slp_tree> &rhs_nodes,
3999 : vec<stmt_vec_info> &scalar_stmts,
4000 : poly_uint64 max_nunits)
4001 : {
4002 8041 : unsigned int group_size = scalar_stmts.length ();
4003 16082 : slp_tree node = vect_create_new_slp_node (scalar_stmts,
4004 8041 : SLP_TREE_CHILDREN
4005 : (rhs_nodes[0]).length ());
4006 8041 : SLP_TREE_VECTYPE (node) = SLP_TREE_VECTYPE (rhs_nodes[0]);
4007 8041 : node->max_nunits = max_nunits;
4008 8041 : for (unsigned l = 0;
4009 16109 : l < SLP_TREE_CHILDREN (rhs_nodes[0]).length (); ++l)
4010 : {
4011 : /* And a permute merging all RHS SLP trees. */
4012 8068 : slp_tree perm = vect_create_new_slp_node (rhs_nodes.length (),
4013 8068 : VEC_PERM_EXPR);
4014 8068 : SLP_TREE_CHILDREN (node).quick_push (perm);
4015 8068 : SLP_TREE_LANE_PERMUTATION (perm).create (group_size);
4016 8068 : SLP_TREE_VECTYPE (perm) = SLP_TREE_VECTYPE (node);
4017 8068 : perm->max_nunits = max_nunits;
4018 8068 : SLP_TREE_LANES (perm) = group_size;
4019 : /* ??? We should set this NULL but that's not expected. */
4020 8068 : SLP_TREE_REPRESENTATIVE (perm)
4021 8068 : = SLP_TREE_REPRESENTATIVE (SLP_TREE_CHILDREN (rhs_nodes[0])[l]);
4022 31405 : for (unsigned j = 0; j < rhs_nodes.length (); ++j)
4023 : {
4024 23337 : SLP_TREE_CHILDREN (perm)
4025 23337 : .quick_push (SLP_TREE_CHILDREN (rhs_nodes[j])[l]);
4026 23337 : SLP_TREE_CHILDREN (rhs_nodes[j])[l]->refcnt++; /* PERM now refers to this child as well. */
4027 23337 : for (unsigned k = 0;
4028 48990 : k < SLP_TREE_SCALAR_STMTS (rhs_nodes[j]).length (); ++k)
4029 : {
4030 : /* ??? We should populate SLP_TREE_SCALAR_STMTS
4031 : or SLP_TREE_SCALAR_OPS but then we might have
4032 : a mix of both in our children. */
4033 25653 : SLP_TREE_LANE_PERMUTATION (perm)
4034 25653 : .quick_push (std::make_pair (j, k));
4035 : }
4036 : }
4037 :
4038 : /* Now we have a single permute node but we cannot code-generate
4039 : the case with more than two inputs.
4040 : Perform pairwise reduction, reducing the two inputs
4041 : with the least number of lanes to one and then repeat until
4042 : we end up with two inputs. That scheme makes sure we end
4043 : up with permutes satisfying the restriction of requiring at
4044 : most two vector inputs to produce a single vector output
4045 : when the number of lanes is even. */
4046 15269 : while (SLP_TREE_CHILDREN (perm).length () > 2)
4047 : {
4048 : /* When we have three equal sized groups left the pairwise
4049 : reduction does not result in a scheme that avoids using
4050 : three vectors. Instead merge the first two groups
4051 : to the final size with do-not-care elements (chosen
4052 : from the first group) and then merge with the third.
4053 : { A0, B0, x, A1, B1, x, ... }
4054 : -> { A0, B0, C0, A1, B1, C1, ... }
4055 : This handles group size of three (and at least
4056 : power-of-two multiples of that). */
4057 7201 : if (SLP_TREE_CHILDREN (perm).length () == 3
4058 3291 : && (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[0])
4059 3291 : == SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[1]))
4060 7201 : && (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[0])
4061 2459 : == SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[2])))
4062 : {
4063 2153 : int ai = 0;
4064 2153 : int bi = 1;
4065 2153 : slp_tree a = SLP_TREE_CHILDREN (perm)[ai];
4066 2153 : slp_tree b = SLP_TREE_CHILDREN (perm)[bi];
4067 2153 : unsigned n = SLP_TREE_LANES (perm); /* The merged node gets the full lane count of PERM. */
4068 :
4069 2153 : slp_tree permab = vect_create_new_slp_node (2, VEC_PERM_EXPR);
4070 2153 : SLP_TREE_LANES (permab) = n;
4071 2153 : SLP_TREE_LANE_PERMUTATION (permab).create (n);
4072 2153 : SLP_TREE_VECTYPE (permab) = SLP_TREE_VECTYPE (perm);
4073 2153 : permab->max_nunits = max_nunits;
4074 : /* ??? Should be NULL but that's not expected. */
4075 2153 : SLP_TREE_REPRESENTATIVE (permab) = SLP_TREE_REPRESENTATIVE (perm);
4076 2153 : SLP_TREE_CHILDREN (permab).quick_push (a);
4077 4320 : for (unsigned k = 0; k < SLP_TREE_LANES (a); ++k)
4078 2167 : SLP_TREE_LANE_PERMUTATION (permab)
4079 2167 : .quick_push (std::make_pair (0, k));
4080 2153 : SLP_TREE_CHILDREN (permab).quick_push (b);
4081 4320 : for (unsigned k = 0; k < SLP_TREE_LANES (b); ++k)
4082 2167 : SLP_TREE_LANE_PERMUTATION (permab)
4083 2167 : .quick_push (std::make_pair (1, k));
4084 : /* Push the do-not-care lanes. */
4085 4320 : for (unsigned k = 0; k < SLP_TREE_LANES (a); ++k)
4086 2167 : SLP_TREE_LANE_PERMUTATION (permab)
4087 2167 : .quick_push (std::make_pair (0, k));
4088 :
4089 : /* Put the merged node into 'perm', in place of a. */
4090 2153 : SLP_TREE_CHILDREN (perm)[ai] = permab;
4091 : /* Adjust the references to b in the permutation
4092 : of perm and to the later children which we'll
4093 : remove. */
4094 8654 : for (unsigned k = 0; k < SLP_TREE_LANES (perm); ++k)
4095 : {
4096 6501 : std::pair<unsigned, unsigned> &p
4097 6501 : = SLP_TREE_LANE_PERMUTATION (perm)[k];
4098 6501 : if (p.first == (unsigned) bi)
4099 : {
4100 2167 : p.first = ai;
4101 2167 : p.second += SLP_TREE_LANES (a);
4102 : }
4103 4334 : else if (p.first > (unsigned) bi)
4104 2167 : p.first--;
4105 : }
4106 2153 : SLP_TREE_CHILDREN (perm).ordered_remove (bi);
4107 2153 : break; /* PERM had three children and one was just removed. */
4108 : }
4109 :
4110 : /* Pick the two nodes with the least number of lanes,
4111 : prefer the earliest candidate and maintain ai < bi. */
4112 : int ai = -1;
4113 : int bi = -1;
4114 45840 : for (unsigned ci = 0; ci < SLP_TREE_CHILDREN (perm).length (); ++ci)
4115 : {
4116 40792 : if (ai == -1)
4117 5048 : ai = ci;
4118 35744 : else if (bi == -1)
4119 5048 : bi = ci;
4120 30696 : else if ((SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ci])
4121 30696 : < SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ai]))
4122 30696 : || (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ci])
4123 25256 : < SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[bi])))
4124 : {
4125 11768 : if (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ai])
4126 5884 : <= SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[bi]))
4127 2727 : bi = ci;
4128 : else
4129 : {
4130 3157 : ai = bi;
4131 3157 : bi = ci;
4132 : }
4133 : }
4134 : }
4135 :
4136 : /* Produce a merge of nodes ai and bi. */
4137 5048 : slp_tree a = SLP_TREE_CHILDREN (perm)[ai];
4138 5048 : slp_tree b = SLP_TREE_CHILDREN (perm)[bi];
4139 5048 : unsigned n = SLP_TREE_LANES (a) + SLP_TREE_LANES (b);
4140 5048 : slp_tree permab = vect_create_new_slp_node (2, VEC_PERM_EXPR);
4141 5048 : SLP_TREE_LANES (permab) = n;
4142 5048 : SLP_TREE_LANE_PERMUTATION (permab).create (n);
4143 5048 : SLP_TREE_VECTYPE (permab) = SLP_TREE_VECTYPE (perm);
4144 5048 : permab->max_nunits = max_nunits;
4145 : /* ??? Should be NULL but that's not expected. */
4146 5048 : SLP_TREE_REPRESENTATIVE (permab) = SLP_TREE_REPRESENTATIVE (perm);
4147 5048 : SLP_TREE_CHILDREN (permab).quick_push (a);
4148 13340 : for (unsigned k = 0; k < SLP_TREE_LANES (a); ++k)
4149 8292 : SLP_TREE_LANE_PERMUTATION (permab)
4150 8292 : .quick_push (std::make_pair (0, k));
4151 5048 : SLP_TREE_CHILDREN (permab).quick_push (b);
4152 12664 : for (unsigned k = 0; k < SLP_TREE_LANES (b); ++k)
4153 7616 : SLP_TREE_LANE_PERMUTATION (permab)
4154 7616 : .quick_push (std::make_pair (1, k));
4155 :
4156 : /* Put the merged node into 'perm', in place of a. */
4157 5048 : SLP_TREE_CHILDREN (perm)[ai] = permab;
4158 : /* Adjust the references to b in the permutation
4159 : of perm and to the later children which we'll
4160 : remove. */
4161 73221 : for (unsigned k = 0; k < SLP_TREE_LANES (perm); ++k)
4162 : {
4163 68173 : std::pair<unsigned, unsigned> &p
4164 68173 : = SLP_TREE_LANE_PERMUTATION (perm)[k];
4165 68173 : if (p.first == (unsigned) bi)
4166 : {
4167 7616 : p.first = ai;
4168 7616 : p.second += SLP_TREE_LANES (a);
4169 : }
4170 60557 : else if (p.first > (unsigned) bi)
4171 25450 : p.first--;
4172 : }
4173 5048 : SLP_TREE_CHILDREN (perm).ordered_remove (bi);
4174 : }
4175 : }
4176 :
4177 8041 : return node;
4178 : }
4179 :
4180 : /* Analyze an SLP instance starting from SCALAR_STMTS which are a group
4181 : of KIND and, if successful, push it onto VINFO->slp_instances and return
4182 : true. SCALAR_STMTS is owned by this function, REMAIN and ROOT_STMT_INFOS
4183 : ownership is transferred back to the caller upon failure. */
4184 :
4185 : static bool
4186 1899689 : vect_build_slp_instance (vec_info *vinfo,
4187 : slp_instance_kind kind,
4188 : vec<stmt_vec_info> &scalar_stmts,
4189 : vec<stmt_vec_info> &root_stmt_infos,
4190 : vec<tree> &remain,
4191 : unsigned max_tree_size, unsigned *limit,
4192 : scalar_stmts_to_slp_tree_map_t *bst_map,
4193 : bool force_single_lane)
4194 : {
4195 : /* If there's no budget left bail out early. */
4196 1899689 : if (*limit == 0)
4197 : {
4198 22260 : scalar_stmts.release ();
4199 22260 : return false;
4200 : }
4201 :
4202 1877429 : if (kind == slp_inst_kind_ctor)
4203 : {
4204 13097 : if (dump_enabled_p ())
4205 86 : dump_printf_loc (MSG_NOTE, vect_location,
4206 : "Analyzing vectorizable constructor: %G\n",
4207 43 : root_stmt_infos[0]->stmt);
4208 : }
4209 1864332 : else if (kind == slp_inst_kind_gcond)
4210 : {
4211 277494 : if (dump_enabled_p ())
4212 5720 : dump_printf_loc (MSG_NOTE, vect_location,
4213 : "Analyzing vectorizable control flow: %G",
4214 2860 : root_stmt_infos[0]->stmt);
4215 : }
4216 :
4217 1877429 : if (dump_enabled_p ())
4218 : {
4219 25587 : dump_printf_loc (MSG_NOTE, vect_location,
4220 : "Starting SLP discovery for\n");
4221 54634 : for (unsigned i = 0; i < scalar_stmts.length (); ++i)
4222 58094 : dump_printf_loc (MSG_NOTE, vect_location,
4223 29047 : " %G", scalar_stmts[i]->stmt);
4224 : }
4225 :
4226 : /* Build the tree for the SLP instance. With FORCE_SINGLE_LANE a multi-lane group skips discovery, leaving NODE NULL so the failure path below is taken. */
4227 1877429 : unsigned int group_size = scalar_stmts.length ();
4228 1877429 : bool *matches = XALLOCAVEC (bool, group_size);
4229 1877429 : poly_uint64 max_nunits = 1;
4230 1877429 : unsigned tree_size = 0;
4231 :
4232 1877429 : slp_tree node = NULL;
4233 1877429 : if (group_size > 1 && force_single_lane)
4234 : {
4235 0 : matches[0] = true;
4236 0 : matches[1] = false;
4237 : }
4238 : else
4239 1877429 : node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
4240 : &max_nunits, matches, limit,
4241 : &tree_size, bst_map);
4242 1877429 : if (node != NULL)
4243 : {
4244 : /* Calculate the unrolling factor based on the smallest type. */
4245 762507 : poly_uint64 unrolling_factor
4246 762507 : = calculate_unrolling_factor (max_nunits, group_size);
4247 :
4248 762507 : if (maybe_ne (unrolling_factor, 1U)
4249 762507 : && is_a <bb_vec_info> (vinfo))
4250 : {
4251 0 : unsigned HOST_WIDE_INT const_max_nunits;
4252 0 : if (!max_nunits.is_constant (&const_max_nunits)
4253 0 : || const_max_nunits > group_size)
4254 : {
4255 0 : if (dump_enabled_p ())
4256 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4257 : "Build SLP failed: store group "
4258 : "size not a multiple of the vector size "
4259 : "in basic block SLP\n");
4260 0 : vect_free_slp_tree (node);
4261 0 : return false;
4262 : }
4263 : /* Fatal mismatch. */
4264 0 : if (dump_enabled_p ())
4265 0 : dump_printf_loc (MSG_NOTE, vect_location,
4266 : "SLP discovery succeeded but node needs "
4267 : "splitting\n");
4268 0 : memset (matches, true, group_size);
4269 0 : matches[group_size / const_max_nunits * const_max_nunits] = false;
4270 0 : vect_free_slp_tree (node);
4271 : }
4272 : else
4273 : {
4274 : /* Create a new SLP instance. */
4275 762507 : slp_instance new_instance = XNEW (class _slp_instance);
4276 762507 : SLP_INSTANCE_TREE (new_instance) = node;
4277 762507 : SLP_INSTANCE_LOADS (new_instance) = vNULL;
4278 762507 : SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
4279 762507 : SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
4280 762507 : SLP_INSTANCE_KIND (new_instance) = kind;
4281 762507 : new_instance->reduc_phis = NULL;
4282 762507 : new_instance->cost_vec = vNULL;
4283 762507 : new_instance->subgraph_entries = vNULL;
4284 :
4285 762507 : if (dump_enabled_p ())
4286 22531 : dump_printf_loc (MSG_NOTE, vect_location,
4287 : "SLP size %u vs. limit %u.\n",
4288 : tree_size, max_tree_size);
4289 :
4290 762507 : vinfo->slp_instances.safe_push (new_instance);
4291 :
4292 : /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
4293 : the number of scalar stmts in the root in a few places.
4294 : Verify that assumption holds. */
4295 1525014 : gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
4296 : .length () == group_size);
4297 :
4298 762507 : if (dump_enabled_p ())
4299 : {
4300 22531 : if (kind == slp_inst_kind_reduc_group)
4301 1455 : dump_printf_loc (MSG_NOTE, vect_location,
4302 : "SLP discovery of size %d reduction group "
4303 : "succeeded\n", group_size);
4304 22531 : dump_printf_loc (MSG_NOTE, vect_location,
4305 : "Final SLP tree for instance %p:\n",
4306 : (void *) new_instance);
4307 22531 : vect_print_slp_graph (MSG_NOTE, vect_location,
4308 : SLP_INSTANCE_TREE (new_instance));
4309 : }
4310 :
4311 762507 : return true;
4312 : }
4313 : }
4314 : /* Failed to SLP. */
4315 :
4316 : /* While we arrive here even with slp_inst_kind_store we should only
4317 : do so for group_size == 1. The code to split store groups is only in
4318 : vect_analyze_slp_instance now. */
4319 1114922 : gcc_assert (kind != slp_inst_kind_store || group_size == 1);
4320 :
4321 : /* Free the allocated memory. */
4322 1114922 : scalar_stmts.release ();
4323 :
4324 : /* Failed to SLP. */
4325 1114922 : if (dump_enabled_p ())
4326 3056 : dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
4327 : return false;
4328 : }
4329 :
4330 : /* Analyze an SLP instance starting from the start of a reduction chain,
4331 : SCALAR_STMT. Call vect_build_slp_tree to build a tree of packed stmts if
4332 : possible and record the instance in VINFO. Return FALSE if SLP build fails. */
4333 :
4334 : static bool
4335 73013 : vect_analyze_slp_reduc_chain (loop_vec_info vinfo,
4336 : scalar_stmts_to_slp_tree_map_t *bst_map,
4337 : stmt_vec_info scalar_stmt,
4338 : unsigned max_tree_size, unsigned *limit)
4339 : {
4340 73013 : vec<stmt_vec_info> scalar_stmts = vNULL;
4341 :
4342 73013 : bool fail = false;
4343 : /* ??? We could leave operation code checking to SLP discovery. NOTE(review): the FAIL return below skips scalar_stmts.release () unlike the other failure exits -- confirm this does not leak. */
4344 73013 : code_helper code = STMT_VINFO_REDUC_CODE (STMT_VINFO_REDUC_DEF
4345 : (vect_orig_stmt (scalar_stmt)));
4346 73013 : bool first = true;
4347 73013 : stmt_vec_info next_stmt = scalar_stmt;
4348 82978 : do
4349 : {
4350 82978 : stmt_vec_info stmt = next_stmt;
4351 82978 : gimple_match_op op;
4352 82978 : if (!gimple_extract_op (STMT_VINFO_STMT (stmt), &op))
4353 0 : gcc_unreachable ();
4354 165956 : tree reduc_def = gimple_arg (STMT_VINFO_STMT (stmt),
4355 82978 : STMT_VINFO_REDUC_IDX (stmt));
4356 82978 : next_stmt = vect_stmt_to_vectorize (vinfo->lookup_def (reduc_def));
4357 82978 : gcc_assert (is_a <gphi *> (STMT_VINFO_STMT (next_stmt))
4358 : || STMT_VINFO_REDUC_IDX (next_stmt) != -1);
4359 88881 : if (!gimple_extract_op (STMT_VINFO_STMT (vect_orig_stmt (stmt)), &op))
4360 0 : gcc_unreachable ();
4361 82978 : if (CONVERT_EXPR_CODE_P (op.code)
4362 4917 : && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0]))
4363 87883 : && (first
4364 2440 : || is_a <gphi *> (STMT_VINFO_STMT (next_stmt))))
4365 : ;
4366 78077 : else if (code != op.code)
4367 : {
4368 2610 : fail = true;
4369 2610 : break;
4370 : }
4371 : else
4372 75467 : scalar_stmts.safe_push (stmt);
4373 80368 : first = false;
4374 : }
4375 80368 : while (!is_a <gphi *> (STMT_VINFO_STMT (next_stmt)));
4376 73013 : if (fail)
4377 2610 : return false;
4378 :
4379 : /* Remember a stmt with the actual reduction operation. */
4380 70403 : stmt_vec_info reduc_scalar_stmt = scalar_stmts[0];
4381 :
4382 : /* When the SSA def chain through reduc-idx does not form a natural
4383 : reduction chain try to linearize an associative operation manually. */
4384 70403 : if (scalar_stmts.length () == 1
4385 67733 : && code.is_tree_code ()
4386 61661 : && associative_tree_code ((tree_code)code)
4387 : /* We may not associate if a fold-left reduction is required. */
4388 130646 : && !needs_fold_left_reduction_p (TREE_TYPE (gimple_get_lhs
4389 : (reduc_scalar_stmt->stmt)),
4390 : code))
4391 : {
4392 57026 : auto_vec<chain_op_t> chain;
4393 57026 : auto_vec<std::pair<tree_code, gimple *> > worklist;
4394 57026 : gimple *op_stmt = NULL, *other_op_stmt = NULL;
4395 57026 : if (is_a <gassign *> (scalar_stmts[0]->stmt)
4396 : /* We cannot linearize an operation that vect_slp_linearize_chain
4397 : would not put on its worklist. */
4398 57026 : && gimple_assign_rhs_code (scalar_stmts[0]->stmt) == (tree_code)code)
4399 : {
4400 56379 : vect_slp_linearize_chain (vinfo, worklist, chain, (tree_code)code,
4401 56379 : scalar_stmts[0]->stmt, op_stmt,
4402 : other_op_stmt,
4403 : NULL);
4404 :
4405 56379 : scalar_stmts.truncate (0);
4406 56379 : stmt_vec_info tail = NULL;
4407 282092 : for (auto el : chain)
4408 : {
4409 113312 : if (el.dt == vect_external_def
4410 113312 : || el.dt == vect_constant_def
4411 113312 : || el.code != (tree_code) code)
4412 : {
4413 357 : scalar_stmts.release ();
4414 357 : return false;
4415 : }
4416 112955 : stmt_vec_info stmt = vinfo->lookup_def (el.op);
4417 112955 : if (STMT_VINFO_REDUC_IDX (stmt) != -1
4418 110671 : || STMT_VINFO_REDUC_DEF (stmt))
4419 : {
4420 56255 : gcc_assert (tail == NULL);
4421 56255 : tail = stmt;
4422 56255 : continue;
4423 : }
4424 56700 : scalar_stmts.safe_push (stmt);
4425 : }
4426 56022 : gcc_assert (tail);
4427 : }
4428 :
4429 : /* When this linearization didn't produce a chain see if stripping
4430 : a wrapping sign conversion produces one. */
4431 56669 : if (scalar_stmts.length () == 1
4432 56669 : && (code == PLUS_EXPR || code == MULT_EXPR || code == BIT_IOR_EXPR
4433 : || code == BIT_AND_EXPR || code == BIT_XOR_EXPR))
4434 : {
4435 54882 : gimple *stmt = scalar_stmts[0]->stmt;
4436 54882 : if (!is_gimple_assign (stmt)
4437 53716 : || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt))
4438 4579 : || TREE_CODE (gimple_assign_rhs1 (stmt)) != SSA_NAME
4439 59461 : || !tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
4440 4579 : TREE_TYPE (gimple_assign_rhs1 (stmt))))
4441 : {
4442 53127 : scalar_stmts.release ();
4443 53127 : return false;
4444 : }
4445 1755 : stmt = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (stmt));
4446 1755 : if (!is_gimple_assign (stmt)
4447 1755 : || gimple_assign_rhs_code (stmt) != (tree_code)code)
4448 : {
4449 1736 : scalar_stmts.release ();
4450 1736 : return false;
4451 : }
4452 19 : chain.truncate (0);
4453 19 : vect_slp_linearize_chain (vinfo, worklist, chain, (tree_code)code,
4454 : stmt, op_stmt, other_op_stmt, NULL);
4455 :
4456 19 : scalar_stmts.truncate (0);
4457 19 : stmt_vec_info tail = NULL;
4458 93 : for (auto el : chain)
4459 : {
4460 44 : if (el.dt == vect_external_def
4461 44 : || el.dt == vect_constant_def
4462 44 : || el.code != (tree_code) code)
4463 : {
4464 8 : scalar_stmts.release ();
4465 8 : return false;
4466 : }
4467 36 : stmt_vec_info stmt = vinfo->lookup_def (el.op);
4468 36 : if (STMT_VINFO_REDUC_IDX (stmt) != -1
4469 36 : || STMT_VINFO_REDUC_DEF (stmt))
4470 : {
4471 0 : gcc_assert (tail == NULL);
4472 0 : tail = stmt;
4473 0 : continue;
4474 : }
4475 36 : scalar_stmts.safe_push (stmt);
4476 : }
4477 : /* Unlike the above this does not include the reduction SSA
4478 : cycle. */
4479 11 : gcc_assert (!tail);
4480 : }
4481 :
4482 1798 : if (scalar_stmts.length () < 2)
4483 : {
4484 1673 : scalar_stmts.release ();
4485 1673 : return false;
4486 : }
4487 :
4488 125 : if (dump_enabled_p ())
4489 : {
4490 34 : dump_printf_loc (MSG_NOTE, vect_location,
4491 : "Starting SLP discovery of reduction chain for\n");
4492 140 : for (unsigned i = 0; i < scalar_stmts.length (); ++i)
4493 212 : dump_printf_loc (MSG_NOTE, vect_location,
4494 106 : " %G", scalar_stmts[i]->stmt);
4495 : }
4496 :
4497 125 : unsigned int group_size = scalar_stmts.length ();
4498 125 : bool *matches = XALLOCAVEC (bool, group_size);
4499 125 : poly_uint64 max_nunits = 1;
4500 125 : unsigned tree_size = 0;
4501 125 : slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
4502 : &max_nunits, matches, limit,
4503 125 : &tree_size, bst_map);
4504 125 : if (!node)
4505 : {
4506 47 : scalar_stmts.release ();
4507 47 : return false;
4508 : }
4509 :
4510 78 : unsigned cycle_id = vinfo->reduc_infos.length ();
4511 78 : vect_reduc_info reduc_info = new vect_reduc_info_s ();
4512 78 : vinfo->reduc_infos.safe_push (reduc_info);
4513 78 : VECT_REDUC_INFO_DEF_TYPE (reduc_info) = STMT_VINFO_DEF_TYPE (next_stmt);
4514 78 : VECT_REDUC_INFO_TYPE (reduc_info) = STMT_VINFO_REDUC_TYPE (next_stmt);
4515 78 : VECT_REDUC_INFO_CODE (reduc_info) = STMT_VINFO_REDUC_CODE (next_stmt);
4516 78 : VECT_REDUC_INFO_FN (reduc_info) = IFN_LAST;
4517 78 : reduc_info->is_reduc_chain = true;
4518 :
4519 : /* Build the node for the PHI and possibly the conversions. */
4520 78 : slp_tree phis = vect_create_new_slp_node (2, ERROR_MARK);
4521 78 : SLP_TREE_REPRESENTATIVE (phis) = next_stmt;
4522 78 : phis->cycle_info.id = cycle_id;
4523 78 : SLP_TREE_LANES (phis) = group_size;
4524 78 : if (reduc_scalar_stmt == scalar_stmt)
4525 74 : SLP_TREE_VECTYPE (phis) = SLP_TREE_VECTYPE (node);
4526 : else
4527 4 : SLP_TREE_VECTYPE (phis)
4528 4 : = signed_or_unsigned_type_for (TYPE_UNSIGNED
4529 : (TREE_TYPE (gimple_get_lhs
4530 : (scalar_stmt->stmt))),
4531 : SLP_TREE_VECTYPE (node));
4532 : /* ??? vect_cse_slp_nodes cannot cope with cycles without any
4533 : SLP_TREE_SCALAR_STMTS. */
4534 78 : SLP_TREE_SCALAR_STMTS (phis).create (group_size);
4535 393 : for (unsigned i = 0; i < group_size; ++i)
4536 315 : SLP_TREE_SCALAR_STMTS (phis).quick_push (next_stmt);
4537 :
4538 78 : slp_tree op_input = phis;
4539 78 : if (reduc_scalar_stmt != scalar_stmt)
4540 : {
4541 4 : slp_tree conv = vect_create_new_slp_node (1, ERROR_MARK);
4542 4 : SLP_TREE_REPRESENTATIVE (conv)
4543 4 : = vinfo->lookup_def (gimple_arg (reduc_scalar_stmt->stmt,
4544 4 : STMT_VINFO_REDUC_IDX
4545 : (reduc_scalar_stmt)));
4546 4 : SLP_TREE_CHILDREN (conv).quick_push (phis);
4547 4 : conv->cycle_info.id = cycle_id;
4548 4 : SLP_TREE_REDUC_IDX (conv) = 0;
4549 4 : SLP_TREE_LANES (conv) = group_size;
4550 4 : SLP_TREE_VECTYPE (conv) = SLP_TREE_VECTYPE (node);
4551 4 : SLP_TREE_SCALAR_STMTS (conv) = vNULL;
4552 4 : op_input = conv;
4553 : }
4554 :
4555 78 : slp_tree reduc = vect_create_new_slp_node (2, ERROR_MARK);
4556 78 : SLP_TREE_REPRESENTATIVE (reduc) = reduc_scalar_stmt;
4557 78 : SLP_TREE_CHILDREN (reduc).quick_push (op_input);
4558 78 : SLP_TREE_CHILDREN (reduc).quick_push (node);
4559 78 : reduc->cycle_info.id = cycle_id;
4560 78 : SLP_TREE_REDUC_IDX (reduc) = 0;
4561 78 : SLP_TREE_LANES (reduc) = group_size;
4562 78 : SLP_TREE_VECTYPE (reduc) = SLP_TREE_VECTYPE (node);
4563 : /* ??? For the reduction epilogue we need a live lane. */
4564 78 : SLP_TREE_SCALAR_STMTS (reduc).create (group_size);
4565 78 : SLP_TREE_SCALAR_STMTS (reduc).quick_push (reduc_scalar_stmt);
4566 315 : for (unsigned i = 1; i < group_size; ++i)
4567 237 : SLP_TREE_SCALAR_STMTS (reduc).quick_push (NULL);
4568 :
4569 78 : if (reduc_scalar_stmt != scalar_stmt)
4570 : {
4571 4 : slp_tree conv = vect_create_new_slp_node (1, ERROR_MARK);
4572 4 : SLP_TREE_REPRESENTATIVE (conv) = scalar_stmt;
4573 4 : SLP_TREE_CHILDREN (conv).quick_push (reduc);
4574 4 : conv->cycle_info.id = cycle_id;
4575 4 : SLP_TREE_REDUC_IDX (conv) = 0;
4576 4 : SLP_TREE_LANES (conv) = group_size;
4577 4 : SLP_TREE_VECTYPE (conv) = SLP_TREE_VECTYPE (phis);
4578 : /* ??? For the reduction epilogue we need a live lane. */
4579 4 : SLP_TREE_SCALAR_STMTS (conv).create (group_size);
4580 4 : SLP_TREE_SCALAR_STMTS (conv).quick_push (scalar_stmt);
4581 8 : for (unsigned i = 1; i < group_size; ++i)
4582 4 : SLP_TREE_SCALAR_STMTS (conv).quick_push (NULL);
4583 4 : reduc = conv;
4584 : }
4585 :
4586 78 : edge le = loop_latch_edge (LOOP_VINFO_LOOP (vinfo));
4587 78 : SLP_TREE_CHILDREN (phis).quick_push (NULL);
4588 78 : SLP_TREE_CHILDREN (phis).quick_push (NULL);
4589 78 : SLP_TREE_CHILDREN (phis)[le->dest_idx] = reduc;
4590 78 : SLP_TREE_REF_COUNT (reduc)++;
4591 :
4592 : /* Create a new SLP instance. */
4593 78 : slp_instance new_instance = XNEW (class _slp_instance);
4594 78 : SLP_INSTANCE_TREE (new_instance) = reduc;
4595 78 : SLP_INSTANCE_LOADS (new_instance) = vNULL;
4596 78 : SLP_INSTANCE_ROOT_STMTS (new_instance) = vNULL;
4597 78 : SLP_INSTANCE_REMAIN_DEFS (new_instance) = vNULL;
4598 78 : SLP_INSTANCE_KIND (new_instance) = slp_inst_kind_reduc_chain;
4599 78 : new_instance->reduc_phis = NULL;
4600 78 : new_instance->cost_vec = vNULL;
4601 78 : new_instance->subgraph_entries = vNULL;
4602 :
4603 78 : vinfo->slp_instances.safe_push (new_instance);
4604 :
4605 78 : if (dump_enabled_p ())
4606 : {
4607 24 : dump_printf_loc (MSG_NOTE, vect_location,
4608 : "Final SLP tree for instance %p:\n",
4609 : (void *) new_instance);
4610 24 : vect_print_slp_graph (MSG_NOTE, vect_location,
4611 : SLP_INSTANCE_TREE (new_instance));
4612 : }
4613 :
4614 78 : return true;
4615 57026 : }
4616 :
4617 13377 : if (scalar_stmts.length () <= 1)
4618 : {
4619 10707 : scalar_stmts.release ();
4620 10707 : return false;
4621 : }
4622 :
4623 2670 : scalar_stmts.reverse ();
4624 2670 : stmt_vec_info reduc_phi_info = next_stmt;
4625 :
4626 : /* Build the tree for the SLP instance. */
4627 2670 : vec<stmt_vec_info> root_stmt_infos = vNULL;
4628 2670 : vec<tree> remain = vNULL;
4629 :
4630 2670 : if (dump_enabled_p ())
4631 : {
4632 193 : dump_printf_loc (MSG_NOTE, vect_location,
4633 : "Starting SLP discovery of reduction chain for\n");
4634 1029 : for (unsigned i = 0; i < scalar_stmts.length (); ++i)
4635 1672 : dump_printf_loc (MSG_NOTE, vect_location,
4636 836 : " %G", scalar_stmts[i]->stmt);
4637 : }
4638 :
4639 : /* Build the tree for the SLP instance. */
4640 2670 : unsigned int group_size = scalar_stmts.length ();
4641 2670 : bool *matches = XALLOCAVEC (bool, group_size);
4642 2670 : poly_uint64 max_nunits = 1;
4643 2670 : unsigned tree_size = 0;
4644 :
4645 : /* ??? We need this only for SLP discovery; it is reset to NULL right after the vect_build_slp_tree call. */
4646 10378 : for (unsigned i = 0; i < scalar_stmts.length (); ++i)
4647 7708 : REDUC_GROUP_FIRST_ELEMENT (scalar_stmts[i]) = scalar_stmts[0];
4648 :
4649 2670 : slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
4650 : &max_nunits, matches, limit,
4651 2670 : &tree_size, bst_map);
4652 :
4653 10378 : for (unsigned i = 0; i < scalar_stmts.length (); ++i)
4654 7708 : REDUC_GROUP_FIRST_ELEMENT (scalar_stmts[i]) = NULL;
4655 :
4656 2670 : if (node != NULL)
4657 : {
4658 : /* Create a new SLP instance. */
4659 2329 : slp_instance new_instance = XNEW (class _slp_instance);
4660 2329 : SLP_INSTANCE_TREE (new_instance) = node;
4661 2329 : SLP_INSTANCE_LOADS (new_instance) = vNULL;
4662 2329 : SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
4663 2329 : SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
4664 2329 : SLP_INSTANCE_KIND (new_instance) = slp_inst_kind_reduc_chain;
4665 2329 : new_instance->reduc_phis = NULL;
4666 2329 : new_instance->cost_vec = vNULL;
4667 2329 : new_instance->subgraph_entries = vNULL;
4668 :
4669 2329 : vect_reduc_info reduc_info = info_for_reduction (vinfo, node);
4670 2329 : reduc_info->is_reduc_chain = true;
4671 :
4672 2329 : if (dump_enabled_p ())
4673 144 : dump_printf_loc (MSG_NOTE, vect_location,
4674 : "SLP size %u vs. limit %u.\n",
4675 : tree_size, max_tree_size);
4676 :
4677 : /* Fixup SLP reduction chains. If this is a reduction chain with
4678 : a conversion in front amend the SLP tree with a node for that. */
4679 2329 : gimple *scalar_def = STMT_VINFO_REDUC_DEF (reduc_phi_info)->stmt;
4680 2329 : if (is_gimple_assign (scalar_def)
4681 2329 : && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (scalar_def)))
4682 : {
4683 43 : stmt_vec_info conv_info = vect_stmt_to_vectorize
4684 43 : (STMT_VINFO_REDUC_DEF (reduc_phi_info));
4685 43 : scalar_stmts = vNULL;
4686 43 : scalar_stmts.create (group_size);
4687 135 : for (unsigned i = 0; i < group_size; ++i)
4688 92 : scalar_stmts.quick_push (conv_info);
4689 43 : slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
4690 43 : SLP_TREE_VECTYPE (conv)
4691 43 : = get_vectype_for_scalar_type (vinfo,
4692 43 : TREE_TYPE
4693 : (gimple_assign_lhs (scalar_def)),
4694 : group_size);
4695 43 : SLP_TREE_REDUC_IDX (conv) = 0;
4696 43 : conv->cycle_info.id = node->cycle_info.id;
4697 43 : SLP_TREE_CHILDREN (conv).quick_push (node);
4698 43 : SLP_INSTANCE_TREE (new_instance) = conv;
4699 : }
4700 : /* Fill the backedge child of the PHI SLP node. The
4701 : general matching code cannot find it because the
4702 : scalar code does not reflect how we vectorize the
4703 : reduction. */
4704 2329 : use_operand_p use_p;
4705 2329 : imm_use_iterator imm_iter;
4706 2329 : class loop *loop = LOOP_VINFO_LOOP (vinfo);
4707 11195 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
4708 : gimple_get_lhs (scalar_def))
4709 : /* There are exactly two non-debug uses, the reduction
4710 : PHI and the loop-closed PHI node. */
4711 6537 : if (!is_gimple_debug (USE_STMT (use_p))
4712 6537 : && gimple_bb (USE_STMT (use_p)) == loop->header)
4713 : {
4714 2329 : auto_vec<stmt_vec_info, 64> phis (group_size);
4715 2329 : stmt_vec_info phi_info = vinfo->lookup_stmt (USE_STMT (use_p));
4716 9180 : for (unsigned i = 0; i < group_size; ++i)
4717 6851 : phis.quick_push (phi_info);
4718 2329 : slp_tree *phi_node = bst_map->get (phis);
4719 2329 : unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
4720 4658 : SLP_TREE_CHILDREN (*phi_node)[dest_idx]
4721 2329 : = SLP_INSTANCE_TREE (new_instance);
4722 2329 : SLP_INSTANCE_TREE (new_instance)->refcnt++;
4723 2329 : }
4724 :
4725 2329 : vinfo->slp_instances.safe_push (new_instance);
4726 :
4727 : /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
4728 : the number of scalar stmts in the root in a few places.
4729 : Verify that assumption holds. */
4730 4658 : gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
4731 : .length () == group_size);
4732 :
4733 2329 : if (dump_enabled_p ())
4734 : {
4735 144 : dump_printf_loc (MSG_NOTE, vect_location,
4736 : "Final SLP tree for instance %p:\n",
4737 : (void *) new_instance);
4738 144 : vect_print_slp_graph (MSG_NOTE, vect_location,
4739 : SLP_INSTANCE_TREE (new_instance));
4740 : }
4741 :
4742 2329 : return true;
4743 : }
4744 :
4745 : /* Failed to SLP. */
4746 341 : scalar_stmts.release ();
4747 341 : if (dump_enabled_p ())
4748 49 : dump_printf_loc (MSG_NOTE, vect_location,
4749 : "SLP discovery of reduction chain failed\n");
4750 : return false;
4751 : }
4752 :
4753 : /* Analyze a single-lane SLP reduction instance rooted at SCALAR_STMT, first
4754 : trying to discover it as a reduction chain. Return true if successful. */
4755 :
4756 : static bool
4757 99087 : vect_analyze_slp_reduction (loop_vec_info vinfo,
4758 : stmt_vec_info scalar_stmt,
4759 : unsigned max_tree_size, unsigned *limit,
4760 : scalar_stmts_to_slp_tree_map_t *bst_map,
4761 : bool force_single_lane)
4762 : {
4763 99087 : slp_instance_kind kind = slp_inst_kind_reduc_group;
4764 :
4765 : /* Try to gather a reduction chain. Only attempt if there's budget left
4766 : since chain analysis may build multi-lane trees that consume limit. */
4767 99087 : if (! force_single_lane
4768 73298 : && *limit != 0
4769 73298 : && STMT_VINFO_DEF_TYPE (scalar_stmt) == vect_reduction_def
4770 172100 : && vect_analyze_slp_reduc_chain (vinfo, bst_map, scalar_stmt,
4771 : max_tree_size, limit))
4772 : return true;
4773 :
4774 96680 : vec<stmt_vec_info> scalar_stmts;
4775 96680 : scalar_stmts.create (1);
4776 96680 : scalar_stmts.quick_push (scalar_stmt);
4777 :
4778 96680 : if (dump_enabled_p ())
4779 : {
4780 3864 : dump_printf_loc (MSG_NOTE, vect_location,
4781 : "Starting SLP discovery for\n");
4782 7728 : for (unsigned i = 0; i < scalar_stmts.length (); ++i)
4783 7728 : dump_printf_loc (MSG_NOTE, vect_location,
4784 3864 : " %G", scalar_stmts[i]->stmt);
4785 : }
4786 :
4787 : /* Build the tree for the SLP instance. */
4788 96680 : unsigned int group_size = scalar_stmts.length ();
4789 96680 : bool *matches = XALLOCAVEC (bool, group_size);
4790 96680 : poly_uint64 max_nunits = 1;
4791 96680 : unsigned tree_size = 0;
4792 :
4793 96680 : slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
4794 : &max_nunits, matches, limit,
4795 : &tree_size, bst_map);
4796 96680 : if (node != NULL)
4797 : {
4798 : /* Create a new SLP instance. */
4799 93681 : slp_instance new_instance = XNEW (class _slp_instance);
4800 93681 : SLP_INSTANCE_TREE (new_instance) = node;
4801 93681 : SLP_INSTANCE_LOADS (new_instance) = vNULL;
4802 93681 : SLP_INSTANCE_ROOT_STMTS (new_instance) = vNULL;
4803 93681 : SLP_INSTANCE_REMAIN_DEFS (new_instance) = vNULL;
4804 93681 : SLP_INSTANCE_KIND (new_instance) = kind;
4805 93681 : new_instance->reduc_phis = NULL;
4806 93681 : new_instance->cost_vec = vNULL;
4807 93681 : new_instance->subgraph_entries = vNULL;
4808 :
4809 93681 : if (dump_enabled_p ())
4810 3744 : dump_printf_loc (MSG_NOTE, vect_location,
4811 : "SLP size %u vs. limit %u.\n",
4812 : tree_size, max_tree_size);
4813 :
4814 93681 : vinfo->slp_instances.safe_push (new_instance);
4815 :
4816 : /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
4817 : the number of scalar stmts in the root in a few places.
4818 : Verify that assumption holds. */
4819 187362 : gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
4820 : .length () == group_size);
4821 :
4822 93681 : if (dump_enabled_p ())
4823 : {
4824 3744 : dump_printf_loc (MSG_NOTE, vect_location,
4825 : "Final SLP tree for instance %p:\n",
4826 : (void *) new_instance);
4827 3744 : vect_print_slp_graph (MSG_NOTE, vect_location,
4828 : SLP_INSTANCE_TREE (new_instance));
4829 : }
4830 :
4831 93681 : return true;
4832 : }
4833 : /* Failed to SLP. */
4834 :
4835 : /* Free the allocated memory. */
4836 2999 : scalar_stmts.release ();
4837 :
4838 : /* Failed to SLP. */
4839 2999 : if (dump_enabled_p ())
4840 120 : dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
4841 : return false;
4842 : }
4843 :
4844 : /* Analyze a single SLP reduction group. If successful add a SLP instance
4845 : for it and return true, otherwise return false and have *MATCHES populated,
4846 : except for groups of fewer than two lanes which are rejected before discovery. MATCHES may be NULL. */
4847 :
4848 : static bool
4849 24148 : vect_analyze_slp_reduction_group (loop_vec_info loop_vinfo,
4850 : vec<stmt_vec_info> scalar_stmts,
4851 : scalar_stmts_to_slp_tree_map_t *bst_map,
4852 : unsigned max_tree_size, unsigned *limit,
4853 : bool *matches)
4854 : {
4855 : /* Try to form a reduction group. Size-1 groups are not suitable
4856 : for SLP reduction and should fall back to single-lane reduction. */
4857 45533 : unsigned int group_size = scalar_stmts.length ();
4858 24148 : if (group_size <= 1)
4859 : return false;
4860 17471 : if (!matches)
4861 4550 : matches = XALLOCAVEC (bool, group_size);
4862 17471 : poly_uint64 max_nunits = 1;
4863 17471 : unsigned tree_size = 0;
4864 17471 : slp_tree node = vect_build_slp_tree (loop_vinfo, scalar_stmts,
4865 : group_size,
4866 : &max_nunits, matches, limit,
4867 : &tree_size, bst_map);
4868 17471 : if (!node)
4869 : return false;
4870 :
4871 : /* Create a new SLP instance. */
4872 2763 : slp_instance new_instance = XNEW (class _slp_instance);
4873 2763 : SLP_INSTANCE_TREE (new_instance) = node;
4874 2763 : SLP_INSTANCE_LOADS (new_instance) = vNULL;
4875 2763 : SLP_INSTANCE_ROOT_STMTS (new_instance) = vNULL;
4876 2763 : SLP_INSTANCE_REMAIN_DEFS (new_instance) = vNULL;
4877 2763 : SLP_INSTANCE_KIND (new_instance) = slp_inst_kind_reduc_group;
4878 2763 : new_instance->reduc_phis = NULL;
4879 2763 : new_instance->cost_vec = vNULL;
4880 2763 : new_instance->subgraph_entries = vNULL;
4881 :
4882 2763 : if (dump_enabled_p ())
4883 213 : dump_printf_loc (MSG_NOTE, vect_location,
4884 : "SLP size %u vs. limit %u.\n",
4885 : tree_size, max_tree_size);
4886 :
4887 2763 : loop_vinfo->slp_instances.safe_push (new_instance);
4888 :
4889 : /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
4890 : the number of scalar stmts in the root in a few places.
4891 : Verify that assumption holds. */
4892 5526 : gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
4893 : .length () == group_size);
4894 :
4895 2763 : if (dump_enabled_p ())
4896 : {
4897 213 : dump_printf_loc (MSG_NOTE, vect_location,
4898 : "SLP discovery of size %d reduction group "
4899 : "succeeded\n", group_size);
4900 213 : dump_printf_loc (MSG_NOTE, vect_location,
4901 : "Final SLP tree for instance %p:\n",
4902 : (void *) new_instance);
4903 213 : vect_print_slp_graph (MSG_NOTE, vect_location,
4904 : SLP_INSTANCE_TREE (new_instance));
4905 : }
4906 :
4907 : return true;
4908 : }
4909 :
4910 : /* Analyze reductions in LOOP_VINFO and populate SLP instances
4911 : accordingly. Returns false if single-lane discovery of a reduction fails. */
4912 :
4913 : static bool
4914 491259 : vect_analyze_slp_reductions (loop_vec_info loop_vinfo,
4915 : unsigned max_tree_size, unsigned *limit,
4916 : scalar_stmts_to_slp_tree_map_t *bst_map,
4917 : bool force_single_lane)
4918 : {
4919 560015 : if (loop_vinfo->reductions.is_empty ())
4920 : return true;
4921 :
4922 : /* Collect reduction statements we can combine into
4923 : a SLP reduction. */
4924 73311 : vec<stmt_vec_info> scalar_stmts;
4925 73311 : scalar_stmts.create (loop_vinfo->reductions.length ());
4926 325418 : for (auto next_info : loop_vinfo->reductions)
4927 : {
4928 105485 : next_info = vect_stmt_to_vectorize (next_info);
4929 105485 : if ((STMT_VINFO_RELEVANT_P (next_info)
4930 14 : || STMT_VINFO_LIVE_P (next_info))
4931 : /* ??? Make sure we didn't skip a conversion around a
4932 : reduction path. In that case we'd have to reverse
4933 : engineer that conversion stmt following the chain using
4934 : reduc_idx and from the PHI using reduc_def. */
4935 105471 : && (STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def
4936 105471 : || (STMT_VINFO_DEF_TYPE (next_info)
4937 : == vect_double_reduction_def)))
4938 : {
4939 : /* Do not discover SLP reductions combining lane-reducing
4940 : ops, that will fail later. */
4941 105471 : if (!force_single_lane
4942 105471 : && !lane_reducing_stmt_p (STMT_VINFO_STMT (next_info)))
4943 78991 : scalar_stmts.quick_push (next_info);
4944 : /* Do SLP discovery for single-lane reductions. */
4945 26480 : else if (! vect_analyze_slp_reduction (loop_vinfo, next_info,
4946 : max_tree_size, limit,
4947 : bst_map,
4948 : force_single_lane))
4949 : {
4950 0 : scalar_stmts.release ();
4951 0 : return false;
4952 : }
4953 : }
4954 : }
4955 :
4956 73311 : if (scalar_stmts.length () > 1)
4957 : {
4958 : /* Try to form a reduction group. */
4959 4644 : unsigned int group_size = scalar_stmts.length ();
4960 4644 : bool *matches = XALLOCAVEC (bool, group_size);
4961 4644 : if (vect_analyze_slp_reduction_group (loop_vinfo, scalar_stmts, bst_map,
4962 : max_tree_size, limit, matches))
4963 1581 : return true;
4964 :
4965 : /* When analysis as a single SLP reduction group failed try to
4966 : form sub-groups by collecting matching lanes. Do not recurse
4967 : that on failure (to limit compile-time costs), but recurse
4968 : for the initial non-matching parts. Everything not covered
4969 : by a sub-group gets single-reduction treatment. NOTE(review): CANDS is only released on the failure return inside the loop -- confirm the other exits do not leak it. */
4970 3518 : vec<stmt_vec_info> cands = vNULL;
4971 11365 : while (matches[0])
4972 : {
4973 11227 : cands.truncate (0);
4974 11227 : cands.reserve (group_size, true);
4975 88333 : for (unsigned i = 0; i < group_size; ++i)
4976 77106 : if (matches[i])
4977 19560 : cands.quick_push (scalar_stmts[i]);
4978 :
4979 : /* Try to form a reduction group. */
4980 11227 : if (vect_analyze_slp_reduction_group (loop_vinfo, cands, bst_map,
4981 : max_tree_size, limit, NULL))
4982 1207 : cands = vNULL;
4983 : else
4984 : {
4985 : /* Do SLP discovery for single-lane reductions. */
4986 47153 : for (auto stmt_info : cands)
4987 17118 : if (! vect_analyze_slp_reduction (loop_vinfo,
4988 : vect_stmt_to_vectorize
4989 : (stmt_info),
4990 : max_tree_size, limit,
4991 : bst_map, force_single_lane))
4992 : {
4993 25 : scalar_stmts.release ();
4994 25 : cands.release ();
4995 25 : return false;
4996 : }
4997 : }
4998 : /* Remove the handled stmts from scalar_stmts and try again,
4999 : possibly repeating the above with updated matches[]. */
5000 : unsigned j = 0;
5001 88238 : for (unsigned i = 0; i < group_size; ++i)
5002 77036 : if (!matches[i])
5003 : {
5004 57516 : scalar_stmts[j] = scalar_stmts[i];
5005 57516 : ++j;
5006 : }
5007 11202 : scalar_stmts.truncate (j);
5008 11202 : group_size = scalar_stmts.length ();
5009 11202 : if (group_size <= 1)
5010 : break;
5011 8277 : if (vect_analyze_slp_reduction_group (loop_vinfo, scalar_stmts,
5012 : bst_map, max_tree_size, limit,
5013 : matches))
5014 : return true;
5015 : }
5016 : }
5017 : /* Do SLP discovery for single-lane reductions. */
5018 267705 : for (auto stmt_info : scalar_stmts)
5019 55489 : if (! vect_analyze_slp_reduction (loop_vinfo,
5020 : vect_stmt_to_vectorize (stmt_info),
5021 : max_tree_size, limit,
5022 : bst_map, force_single_lane))
5023 : {
5024 2974 : scalar_stmts.release ();
5025 2974 : return false;
5026 : }
5027 :
5028 68756 : scalar_stmts.release ();
5029 68756 : return true;
5030 : }
5031 :
5032 : /* Analyze an SLP instance starting from a group of grouped stores. Call
5033 : vect_build_slp_tree to build a tree of packed stmts if possible.
5034 : Return FALSE if it's impossible to SLP any stmt in the group. */
5035 :
5036 : static bool
5037 1097162 : vect_analyze_slp_instance (vec_info *vinfo,
5038 : scalar_stmts_to_slp_tree_map_t *bst_map,
5039 : stmt_vec_info stmt_info,
5040 : slp_instance_kind kind,
5041 : unsigned max_tree_size, unsigned *limit,
5042 : bool force_single_lane)
5043 : {
5044 1097162 : vec<stmt_vec_info> scalar_stmts;
5045 :
5046 1097162 : if (is_a <bb_vec_info> (vinfo))
5047 1067754 : vect_location = stmt_info->stmt;
5048 :
5049 1097162 : gcc_assert (kind == slp_inst_kind_store);
5050 :
5051 : /* Collect the stores and store them in scalar_stmts. */
5052 1097162 : scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
5053 1097162 : stmt_vec_info next_info = stmt_info;
5054 5454343 : while (next_info)
5055 : {
5056 3260019 : scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
5057 3260019 : next_info = DR_GROUP_NEXT_ELEMENT (next_info);
5058 : }
5059 :
5060 1097162 : vec<stmt_vec_info> root_stmt_infos = vNULL;
5061 1097162 : vec<tree> remain = vNULL;
5062 :
5063 : /* Build the tree for the SLP instance. */
5064 :
5065 : /* If there's no budget left bail out early. */
5066 1097162 : if (*limit == 0)
5067 : return false;
5068 :
5069 1097142 : if (dump_enabled_p ())
5070 : {
5071 4164 : dump_printf_loc (MSG_NOTE, vect_location,
5072 : "Starting SLP discovery for\n");
5073 24166 : for (unsigned i = 0; i < scalar_stmts.length (); ++i)
5074 40004 : dump_printf_loc (MSG_NOTE, vect_location,
5075 20002 : " %G", scalar_stmts[i]->stmt);
5076 : }
5077 :
5078 : /* Build the tree for the SLP instance. */
5079 1097142 : unsigned int group_size = scalar_stmts.length ();
5080 1097142 : bool *matches = XALLOCAVEC (bool, group_size);
5081 1097142 : poly_uint64 max_nunits = 1;
5082 1097142 : unsigned tree_size = 0;
5083 1097142 : unsigned i;
5084 :
5085 1097142 : slp_tree node = NULL;
5086 1097142 : if (group_size > 1 && force_single_lane)
5087 : {
5088 1730 : matches[0] = true;
5089 1730 : matches[1] = false;
5090 : }
5091 : else
5092 1095412 : node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
5093 : &max_nunits, matches, limit,
5094 : &tree_size, bst_map);
5095 1097142 : if (node != NULL)
5096 : {
5097 : /* Calculate the unrolling factor based on the smallest type. */
5098 681965 : poly_uint64 unrolling_factor
5099 681965 : = calculate_unrolling_factor (max_nunits, group_size);
5100 :
5101 681965 : if (maybe_ne (unrolling_factor, 1U)
5102 681965 : && is_a <bb_vec_info> (vinfo))
5103 : {
5104 0 : unsigned HOST_WIDE_INT const_max_nunits;
5105 0 : if (!max_nunits.is_constant (&const_max_nunits)
5106 0 : || const_max_nunits > group_size)
5107 : {
5108 0 : if (dump_enabled_p ())
5109 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5110 : "Build SLP failed: store group "
5111 : "size not a multiple of the vector size "
5112 : "in basic block SLP\n");
5113 0 : vect_free_slp_tree (node);
5114 0 : return false;
5115 : }
5116 : /* Fatal mismatch. */
5117 0 : if (dump_enabled_p ())
5118 0 : dump_printf_loc (MSG_NOTE, vect_location,
5119 : "SLP discovery succeeded but node needs "
5120 : "splitting\n");
5121 0 : memset (matches, true, group_size);
5122 0 : matches[group_size / const_max_nunits * const_max_nunits] = false;
5123 0 : vect_free_slp_tree (node);
5124 : }
5125 : else
5126 : {
5127 : /* Create a new SLP instance. */
5128 681965 : slp_instance new_instance = XNEW (class _slp_instance);
5129 681965 : SLP_INSTANCE_TREE (new_instance) = node;
5130 681965 : SLP_INSTANCE_LOADS (new_instance) = vNULL;
5131 681965 : SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
5132 681965 : SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
5133 681965 : SLP_INSTANCE_KIND (new_instance) = kind;
5134 681965 : new_instance->reduc_phis = NULL;
5135 681965 : new_instance->cost_vec = vNULL;
5136 681965 : new_instance->subgraph_entries = vNULL;
5137 :
5138 681965 : if (dump_enabled_p ())
5139 3171 : dump_printf_loc (MSG_NOTE, vect_location,
5140 : "SLP size %u vs. limit %u.\n",
5141 : tree_size, max_tree_size);
5142 :
5143 681965 : vinfo->slp_instances.safe_push (new_instance);
5144 :
5145 : /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
5146 : the number of scalar stmts in the root in a few places.
5147 : Verify that assumption holds. */
5148 1363930 : gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
5149 : .length () == group_size);
5150 :
5151 681965 : if (dump_enabled_p ())
5152 : {
5153 3171 : dump_printf_loc (MSG_NOTE, vect_location,
5154 : "Final SLP tree for instance %p:\n",
5155 : (void *) new_instance);
5156 3171 : vect_print_slp_graph (MSG_NOTE, vect_location,
5157 : SLP_INSTANCE_TREE (new_instance));
5158 : }
5159 :
5160 681965 : return true;
5161 : }
5162 : }
5163 : /* Failed to SLP. */
5164 :
5165 : /* Try to break the group up into pieces. */
5166 415177 : if (*limit > 0 && kind == slp_inst_kind_store)
5167 : {
5168 : /* ??? We could delay all the actual splitting of store-groups
5169 : until after SLP discovery of the original group completed.
5170 : Then we can recurse to vect_build_slp_instance directly. */
5171 1085187 : for (i = 0; i < group_size; i++)
5172 1085187 : if (!matches[i])
5173 : break;
5174 :
5175 : /* For basic block SLP, try to break the group up into multiples of
5176 : a vector size. */
5177 415176 : if (is_a <bb_vec_info> (vinfo)
5178 415176 : && (i > 1 && i < group_size))
5179 : {
5180 : /* Free the allocated memory. */
5181 154214 : scalar_stmts.release ();
5182 :
5183 154214 : tree scalar_type
5184 154214 : = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
5185 308428 : tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
5186 154214 : 1 << floor_log2 (i));
5187 154214 : unsigned HOST_WIDE_INT const_nunits;
5188 154214 : if (vectype
5189 154214 : && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
5190 : {
5191 : /* Split into two groups at the first vector boundary. */
5192 154214 : gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
5193 154214 : unsigned group1_size = i & ~(const_nunits - 1);
5194 :
5195 154214 : if (dump_enabled_p ())
5196 66 : dump_printf_loc (MSG_NOTE, vect_location,
5197 : "Splitting SLP group at stmt %u\n", i);
5198 154214 : stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
5199 : group1_size);
5200 154214 : bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
5201 : kind, max_tree_size,
5202 : limit, false);
5203 : /* Split the rest at the failure point and possibly
5204 : re-analyze the remaining matching part if it has
5205 : at least two lanes. */
5206 154214 : if (group1_size < i
5207 5376 : && (i + 1 < group_size
5208 2950 : || i - group1_size > 1))
5209 : {
5210 2454 : stmt_vec_info rest2 = rest;
5211 2454 : rest = vect_split_slp_store_group (rest, i - group1_size);
5212 2454 : if (i - group1_size > 1)
5213 57 : res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
5214 : kind, max_tree_size,
5215 : limit, false);
5216 : }
5217 : /* Re-analyze the non-matching tail if it has at least
5218 : two lanes. */
5219 154214 : if (i + 1 < group_size)
5220 22014 : res |= vect_analyze_slp_instance (vinfo, bst_map,
5221 : rest, kind, max_tree_size,
5222 : limit, false);
5223 154214 : return res;
5224 : }
5225 : }
5226 :
5227 : /* For loop vectorization split the RHS into arbitrary pieces of
5228 : size >= 1. */
5229 260962 : else if (is_a <loop_vec_info> (vinfo)
5230 260962 : && (group_size != 1 && i < group_size))
5231 : {
5232 8302 : gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
5233 28 : bool masked_p = call
5234 28 : && gimple_call_internal_p (call)
5235 28 : && internal_fn_mask_index (gimple_call_internal_fn (call)) != -1;
5236 : /* There are targets that cannot do even/odd interleaving schemes
5237 : so they absolutely need to use load/store-lanes. For now
5238 : force single-lane SLP for them - they would be happy with
5239 : uniform power-of-two lanes (but depending on element size),
5240 : but even if we can use 'i' as indicator we would need to
5241 : backtrack when later lanes fail to discover with the same
5242 : granularity. We cannot turn any of strided or scatter store
5243 : into store-lanes. */
5244 : /* ??? If this is not in sync with what get_load_store_type
5245 : later decides the SLP representation is not good for other
5246 : store vectorization methods. */
5247 8302 : bool want_store_lanes
5248 8302 : = (! STMT_VINFO_GATHER_SCATTER_P (stmt_info)
5249 8302 : && ! STMT_VINFO_STRIDED_P (stmt_info)
5250 6202 : && ! STMT_VINFO_SLP_VECT_ONLY (stmt_info)
5251 6198 : && compare_step_with_zero (vinfo, stmt_info) > 0
5252 14417 : && vect_slp_prefer_store_lanes_p (vinfo, stmt_info, NULL_TREE,
5253 16604 : masked_p, group_size, i));
5254 8302 : if (want_store_lanes || force_single_lane)
5255 : i = 1;
5256 :
5257 : /* A fatal discovery fail doesn't always mean single-lane SLP
5258 : isn't a possibility, so try. */
5259 6572 : if (i == 0)
5260 : i = 1;
5261 :
5262 8302 : if (dump_enabled_p ())
5263 885 : dump_printf_loc (MSG_NOTE, vect_location,
5264 : "Splitting SLP group at stmt %u\n", i);
5265 :
5266 : /* Analyze the stored values and pinch them together with
5267 : a permute node so we can preserve the whole store group. */
5268 8302 : auto_vec<slp_tree> rhs_nodes;
5269 8302 : poly_uint64 max_nunits = 1;
5270 :
5271 8302 : unsigned int rhs_common_nlanes = 0;
5272 8302 : unsigned int start = 0, end = i;
5273 37183 : while (start < group_size)
5274 : {
5275 29142 : gcc_assert (end - start >= 1);
5276 29142 : vec<stmt_vec_info> substmts;
5277 29142 : substmts.create (end - start);
5278 90653 : for (unsigned j = start; j < end; ++j)
5279 61511 : substmts.quick_push (scalar_stmts[j]);
5280 29142 : max_nunits = 1;
5281 29142 : node = vect_build_slp_tree (vinfo, substmts, end - start,
5282 : &max_nunits,
5283 : matches, limit, &tree_size, bst_map);
5284 29142 : if (node)
5285 : {
5286 23282 : rhs_nodes.safe_push (node);
5287 23282 : vect_update_max_nunits (&max_nunits, node->max_nunits);
5288 23282 : if (start == 0)
5289 8047 : rhs_common_nlanes = SLP_TREE_LANES (node);
5290 15235 : else if (rhs_common_nlanes != SLP_TREE_LANES (node))
5291 1375 : rhs_common_nlanes = 0;
5292 23282 : start = end;
5293 23282 : if (want_store_lanes || force_single_lane)
5294 5202 : end = start + 1;
5295 : else
5296 : end = group_size;
5297 : }
5298 : else
5299 : {
5300 5860 : substmts.release ();
5301 5860 : if (end - start == 1)
5302 : {
5303 : /* Single-lane discovery failed. Free resources. */
5304 281 : for (auto node : rhs_nodes)
5305 8 : vect_free_slp_tree (node);
5306 261 : scalar_stmts.release ();
5307 261 : if (dump_enabled_p ())
5308 39 : dump_printf_loc (MSG_NOTE, vect_location,
5309 : "SLP discovery failed\n");
5310 261 : return false;
5311 : }
5312 :
5313 : /* ??? It really happens that we soft-fail SLP
5314 : build at a mismatch but the matching part hard-fails
5315 : later. As we know we arrived here with a group
5316 : larger than one try a group of size one! */
5317 5599 : if (!matches[0])
5318 44 : end = start + 1;
5319 : else
5320 12209 : for (unsigned j = start; j < end; j++)
5321 12209 : if (!matches[j - start])
5322 : {
5323 : end = j;
5324 : break;
5325 : }
5326 : }
5327 : }
5328 :
5329 : /* Now re-assess whether we want store lanes in case the
5330 : discovery ended up producing all single-lane RHSs. */
5331 8041 : if (! want_store_lanes
5332 8041 : && rhs_common_nlanes == 1
5333 6992 : && ! STMT_VINFO_GATHER_SCATTER_P (stmt_info)
5334 6992 : && ! STMT_VINFO_STRIDED_P (stmt_info)
5335 5257 : && ! STMT_VINFO_SLP_VECT_ONLY (stmt_info)
5336 5254 : && compare_step_with_zero (vinfo, stmt_info) > 0
5337 13238 : && (vect_store_lanes_supported (SLP_TREE_VECTYPE (rhs_nodes[0]),
5338 : group_size, masked_p)
5339 : != IFN_LAST))
5340 : want_store_lanes = true;
5341 :
5342 : /* Now we assume we can build the root SLP node from all stores. */
5343 8041 : if (want_store_lanes)
5344 : {
5345 : /* For store-lanes feed the store node with all RHS nodes
5346 : in order. */
5347 0 : node = vect_create_new_slp_node (scalar_stmts,
5348 0 : SLP_TREE_CHILDREN
5349 : (rhs_nodes[0]).length ());
5350 0 : SLP_TREE_VECTYPE (node) = SLP_TREE_VECTYPE (rhs_nodes[0]);
5351 0 : node->max_nunits = max_nunits;
5352 0 : node->ldst_lanes = true;
5353 0 : SLP_TREE_CHILDREN (node)
5354 0 : .reserve_exact (SLP_TREE_CHILDREN (rhs_nodes[0]).length ()
5355 0 : + rhs_nodes.length () - 1);
5356 : /* First store value and possibly mask. */
5357 0 : SLP_TREE_CHILDREN (node)
5358 0 : .splice (SLP_TREE_CHILDREN (rhs_nodes[0]));
5359 : /* Rest of the store values. All mask nodes are the same,
5360 : this should be guaranteed by dataref group discovery. */
5361 0 : for (unsigned j = 1; j < rhs_nodes.length (); ++j)
5362 0 : SLP_TREE_CHILDREN (node)
5363 0 : .quick_push (SLP_TREE_CHILDREN (rhs_nodes[j])[0]);
5364 0 : for (slp_tree child : SLP_TREE_CHILDREN (node))
5365 0 : child->refcnt++;
5366 : }
5367 : else
5368 8041 : node = vect_build_slp_store_interleaving (rhs_nodes, scalar_stmts,
5369 : max_nunits);
5370 :
5371 31315 : while (!rhs_nodes.is_empty ())
5372 23274 : vect_free_slp_tree (rhs_nodes.pop ());
5373 :
5374 : /* Create a new SLP instance. */
5375 8041 : slp_instance new_instance = XNEW (class _slp_instance);
5376 8041 : SLP_INSTANCE_TREE (new_instance) = node;
5377 8041 : SLP_INSTANCE_LOADS (new_instance) = vNULL;
5378 8041 : SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
5379 8041 : SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
5380 8041 : SLP_INSTANCE_KIND (new_instance) = kind;
5381 8041 : new_instance->reduc_phis = NULL;
5382 8041 : new_instance->cost_vec = vNULL;
5383 8041 : new_instance->subgraph_entries = vNULL;
5384 :
5385 8041 : if (dump_enabled_p ())
5386 846 : dump_printf_loc (MSG_NOTE, vect_location,
5387 : "SLP size %u vs. limit %u.\n",
5388 : tree_size, max_tree_size);
5389 :
5390 8041 : vinfo->slp_instances.safe_push (new_instance);
5391 :
5392 : /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
5393 : the number of scalar stmts in the root in a few places.
5394 : Verify that assumption holds. */
5395 16082 : gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
5396 : .length () == group_size);
5397 :
5398 8041 : if (dump_enabled_p ())
5399 : {
5400 846 : dump_printf_loc (MSG_NOTE, vect_location,
5401 : "Final SLP tree for instance %p:\n",
5402 : (void *) new_instance);
5403 846 : vect_print_slp_graph (MSG_NOTE, vect_location,
5404 : SLP_INSTANCE_TREE (new_instance));
5405 : }
5406 8041 : return true;
5407 8302 : }
5408 : else
5409 : /* Free the allocated memory. */
5410 252660 : scalar_stmts.release ();
5411 :
5412 : /* Even though the first vector did not all match, we might be able to SLP
5413 : (some) of the remainder. FORNOW ignore this possibility. */
5414 : }
5415 : else
5416 : /* Free the allocated memory. */
5417 1 : scalar_stmts.release ();
5418 :
5419 : /* Failed to SLP. */
5420 252661 : if (dump_enabled_p ())
5421 42 : dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
5422 : return false;
5423 : }
5424 :
5425 : /* qsort comparator ordering SLP load nodes. */
5426 :
5427 : static int
5428 2647652 : vllp_cmp (const void *a_, const void *b_)
5429 : {
5430 2647652 : const slp_tree a = *(const slp_tree *)a_;
5431 2647652 : const slp_tree b = *(const slp_tree *)b_;
5432 2647652 : stmt_vec_info a0 = SLP_TREE_SCALAR_STMTS (a)[0];
5433 2647652 : stmt_vec_info b0 = SLP_TREE_SCALAR_STMTS (b)[0];
5434 2647652 : if (STMT_VINFO_GROUPED_ACCESS (a0)
5435 1541665 : && STMT_VINFO_GROUPED_ACCESS (b0)
5436 4128011 : && DR_GROUP_FIRST_ELEMENT (a0) == DR_GROUP_FIRST_ELEMENT (b0))
5437 : {
5438 : /* Same group, order after lanes used. */
5439 344808 : if (SLP_TREE_LANES (a) < SLP_TREE_LANES (b))
5440 : return 1;
5441 336017 : else if (SLP_TREE_LANES (a) > SLP_TREE_LANES (b))
5442 : return -1;
5443 : else
5444 : {
5445 : /* Try to order loads using the same lanes together, breaking
5446 : the tie with the lane number that first differs. */
5447 326477 : if (!SLP_TREE_LOAD_PERMUTATION (a).exists ()
5448 326477 : && !SLP_TREE_LOAD_PERMUTATION (b).exists ())
5449 : return 0;
5450 326477 : else if (SLP_TREE_LOAD_PERMUTATION (a).exists ()
5451 326477 : && !SLP_TREE_LOAD_PERMUTATION (b).exists ())
5452 : return 1;
5453 322422 : else if (!SLP_TREE_LOAD_PERMUTATION (a).exists ()
5454 322422 : && SLP_TREE_LOAD_PERMUTATION (b).exists ())
5455 : return -1;
5456 : else
5457 : {
5458 314944 : for (unsigned i = 0; i < SLP_TREE_LANES (a); ++i)
5459 314944 : if (SLP_TREE_LOAD_PERMUTATION (a)[i]
5460 314944 : != SLP_TREE_LOAD_PERMUTATION (b)[i])
5461 : {
5462 : /* In-order lane first, that's what the above case for
5463 : no permutation does. */
5464 313632 : if (SLP_TREE_LOAD_PERMUTATION (a)[i] == i)
5465 : return -1;
5466 191852 : else if (SLP_TREE_LOAD_PERMUTATION (b)[i] == i)
5467 : return 1;
5468 100482 : else if (SLP_TREE_LOAD_PERMUTATION (a)[i]
5469 100482 : < SLP_TREE_LOAD_PERMUTATION (b)[i])
5470 : return -1;
5471 : else
5472 : return 1;
5473 : }
5474 : return 0;
5475 : }
5476 : }
5477 : }
5478 : else /* Different groups or non-groups. */
5479 : {
5480 : /* Order groups as their first element to keep them together. */
5481 2302844 : if (STMT_VINFO_GROUPED_ACCESS (a0))
5482 2302844 : a0 = DR_GROUP_FIRST_ELEMENT (a0);
5483 2302844 : if (STMT_VINFO_GROUPED_ACCESS (b0))
5484 2302844 : b0 = DR_GROUP_FIRST_ELEMENT (b0);
5485 2302844 : if (a0 == b0)
5486 : return 0;
5487 : /* Tie using UID. */
5488 2302724 : else if (gimple_uid (STMT_VINFO_STMT (a0))
5489 2302724 : < gimple_uid (STMT_VINFO_STMT (b0)))
5490 : return -1;
5491 : else
5492 : {
5493 1022835 : gcc_assert (gimple_uid (STMT_VINFO_STMT (a0))
5494 : != gimple_uid (STMT_VINFO_STMT (b0)));
5495 : return 1;
5496 : }
5497 : }
5498 : }
5499 :
5500 : /* Return whether if the load permutation of NODE is consecutive starting
5501 : with value START_VAL in the first element. If START_VAL is not given
5502 : the first element's value is used. */
5503 :
5504 : bool
5505 623134 : vect_load_perm_consecutive_p (slp_tree node, unsigned start_val)
5506 : {
5507 623134 : load_permutation_t perm = SLP_TREE_LOAD_PERMUTATION (node);
5508 :
5509 623134 : if (!perm.exists () || !perm.length ())
5510 : return false;
5511 :
5512 623134 : if (start_val == UINT_MAX)
5513 79408 : start_val = perm[0];
5514 :
5515 1230198 : for (unsigned int i = 0; i < perm.length (); i++)
5516 630425 : if (perm[i] != start_val + (unsigned int) i)
5517 : return false;
5518 :
5519 : return true;
5520 : }
5521 :
5522 : /* Process the set of LOADS that are all from the same dataref group. */
5523 :
5524 : static void
5525 161418 : vect_lower_load_permutations (loop_vec_info loop_vinfo,
5526 : scalar_stmts_to_slp_tree_map_t *bst_map,
5527 : const array_slice<slp_tree> &loads,
5528 : bool force_single_lane)
5529 : {
5530 : /* We at this point want to lower without a fixed VF or vector
5531 : size in mind which means we cannot actually compute whether we
5532 : need three or more vectors for a load permutation yet. So always
5533 : lower. */
5534 161418 : stmt_vec_info first
5535 161418 : = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (loads[0])[0]);
5536 161418 : unsigned group_lanes = DR_GROUP_SIZE (first);
5537 :
5538 : /* Verify if all load permutations can be implemented with a suitably
5539 : large element load-lanes operation. */
5540 161418 : unsigned ld_lanes_lanes = SLP_TREE_LANES (loads[0]);
5541 161418 : if (STMT_VINFO_STRIDED_P (first)
5542 158971 : || compare_step_with_zero (loop_vinfo, first) <= 0
5543 156311 : || exact_log2 (ld_lanes_lanes) == -1
5544 : /* ??? For now only support the single-lane case as there is
5545 : missing support on the store-lane side and code generation
5546 : isn't up to the task yet. */
5547 153530 : || ld_lanes_lanes != 1
5548 303978 : || vect_load_lanes_supported (SLP_TREE_VECTYPE (loads[0]),
5549 : group_lanes / ld_lanes_lanes,
5550 : false) == IFN_LAST)
5551 : ld_lanes_lanes = 0;
5552 : else
5553 : /* Verify the loads access the same number of lanes aligned to
5554 : ld_lanes_lanes. */
5555 0 : for (slp_tree load : loads)
5556 : {
5557 0 : if (SLP_TREE_LANES (load) != ld_lanes_lanes)
5558 : {
5559 : ld_lanes_lanes = 0;
5560 : break;
5561 : }
5562 0 : unsigned first = SLP_TREE_LOAD_PERMUTATION (load)[0];
5563 0 : if (first % ld_lanes_lanes != 0)
5564 : {
5565 : ld_lanes_lanes = 0;
5566 : break;
5567 : }
5568 0 : if (!vect_load_perm_consecutive_p (load))
5569 : {
5570 : ld_lanes_lanes = 0;
5571 : break;
5572 : }
5573 : }
5574 :
5575 : /* Only a power-of-two number of lanes matches interleaving with N levels.
5576 : ??? An even number of lanes could be reduced to 1<<ceil_log2(N)-1 lanes
5577 : at each step. */
5578 262146 : if (ld_lanes_lanes == 0 && exact_log2 (group_lanes) == -1 && group_lanes != 3)
5579 : return;
5580 :
5581 265193 : for (slp_tree load : loads)
5582 : {
5583 : /* Leave masked or gather loads alone for now. */
5584 187209 : if (!SLP_TREE_CHILDREN (load).is_empty ())
5585 60470 : continue;
5586 :
5587 : /* For single-element interleaving spanning multiple vectors avoid
5588 : lowering, we want to use VMAT_ELEMENTWISE later. */
5589 187203 : if (ld_lanes_lanes == 0
5590 187203 : && SLP_TREE_LANES (load) == 1
5591 167843 : && !DR_GROUP_NEXT_ELEMENT (first)
5592 266779 : && maybe_gt (group_lanes,
5593 : TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (load))))
5594 51332 : return;
5595 :
5596 : /* We want to pattern-match special cases here and keep those
5597 : alone. Candidates are splats and load-lane. */
5598 :
5599 : /* We need to lower only loads of less than half of the groups
5600 : lanes, including duplicate lanes. Note this leaves nodes
5601 : with a non-1:1 load permutation around instead of canonicalizing
5602 : those into a load and a permute node. Removing this early
5603 : check would do such canonicalization. */
5604 135871 : if (SLP_TREE_LANES (load) >= (group_lanes + 1) / 2
5605 56896 : && ld_lanes_lanes == 0)
5606 56896 : continue;
5607 :
5608 : /* Build the permute to get the original load permutation order. */
5609 78975 : bool contiguous = vect_load_perm_consecutive_p (load);
5610 78975 : lane_permutation_t final_perm;
5611 78975 : final_perm.create (SLP_TREE_LANES (load));
5612 158864 : for (unsigned i = 0; i < SLP_TREE_LANES (load); ++i)
5613 159778 : final_perm.quick_push (
5614 79889 : std::make_pair (0, SLP_TREE_LOAD_PERMUTATION (load)[i]));
5615 :
5616 : /* When the load permutation accesses a contiguous unpermuted,
5617 : power-of-two aligned and sized chunk leave the load alone.
5618 : We can likely (re-)load it more efficiently rather than
5619 : extracting it from the larger load.
5620 : ??? Long-term some of the lowering should move to where
5621 : the vector types involved are fixed. */
5622 82543 : if (!force_single_lane
5623 78975 : && ld_lanes_lanes == 0
5624 53231 : && contiguous
5625 52988 : && (SLP_TREE_LANES (load) > 1 || loads.size () == 1)
5626 6563 : && pow2p_hwi (SLP_TREE_LANES (load))
5627 6527 : && pow2p_hwi (group_lanes)
5628 3568 : && SLP_TREE_LOAD_PERMUTATION (load)[0] % SLP_TREE_LANES (load) == 0
5629 82543 : && group_lanes % SLP_TREE_LANES (load) == 0)
5630 : {
5631 3568 : final_perm.release ();
5632 3568 : continue;
5633 : }
5634 :
5635 : /* First build (and possibly re-use) a load node for the
5636 : unpermuted group. Gaps in the middle and on the end are
5637 : represented with NULL stmts. */
5638 75407 : vec<stmt_vec_info> stmts;
5639 75407 : stmts.create (group_lanes);
5640 268221 : for (stmt_vec_info s = first; s; s = DR_GROUP_NEXT_ELEMENT (s))
5641 : {
5642 192814 : if (s != first)
5643 122260 : for (unsigned i = 1; i < DR_GROUP_GAP (s); ++i)
5644 4853 : stmts.quick_push (NULL);
5645 192814 : stmts.quick_push (s);
5646 : }
5647 138200 : for (unsigned i = 0; i < DR_GROUP_GAP (first); ++i)
5648 62793 : stmts.quick_push (NULL);
5649 75407 : poly_uint64 max_nunits = 1;
5650 75407 : bool *matches = XALLOCAVEC (bool, group_lanes);
5651 75407 : unsigned limit = 1;
5652 75407 : unsigned tree_size = 0;
5653 75407 : slp_tree l0 = vect_build_slp_tree (loop_vinfo, stmts,
5654 : group_lanes,
5655 : &max_nunits, matches, &limit,
5656 75407 : &tree_size, bst_map);
5657 75407 : gcc_assert (!SLP_TREE_LOAD_PERMUTATION (l0).exists ());
5658 :
5659 75407 : if (ld_lanes_lanes != 0)
5660 : {
5661 : /* ??? If this is not in sync with what get_load_store_type
5662 : later decides the SLP representation is not good for other
5663 : store vectorization methods. */
5664 0 : l0->ldst_lanes = true;
5665 0 : load->ldst_lanes = true;
5666 : }
5667 :
5668 234275 : while (1)
5669 : {
5670 154841 : unsigned group_lanes = SLP_TREE_LANES (l0);
5671 154841 : if (ld_lanes_lanes != 0
5672 154841 : || SLP_TREE_LANES (load) >= (group_lanes + 1) / 2)
5673 : break;
5674 :
5675 : /* Try to lower by reducing the group to half its size using an
5676 : interleaving scheme. For this try to compute whether all
5677 : elements needed for this load are in even or odd elements of
5678 : an even/odd decomposition with N consecutive elements.
5679 : Thus { e, e, o, o, e, e, o, o } would be an even/odd decomposition
5680 : with N == 2. */
5681 : /* ??? Only an even number of lanes can be handed this way, but the
5682 : fallback below could work for any number. We have to make sure
5683 : to round up in that case. */
5684 79434 : gcc_assert ((group_lanes & 1) == 0 || group_lanes == 3);
5685 11362 : unsigned even = 0, odd = 0;
5686 11362 : if ((group_lanes & 1) == 0)
5687 : {
5688 11362 : even = (1 << ceil_log2 (group_lanes)) - 1;
5689 11362 : odd = even;
5690 46125 : for (auto l : final_perm)
5691 : {
5692 12039 : even &= ~l.second;
5693 12039 : odd &= l.second;
5694 : }
5695 : }
5696 :
5697 : /* Now build an even or odd extraction from the unpermuted load. */
5698 79434 : lane_permutation_t perm;
5699 79434 : perm.create ((group_lanes + 1) / 2);
5700 79434 : unsigned even_level = even ? 1 << ctz_hwi (even) : 0;
5701 79434 : unsigned odd_level = odd ? 1 << ctz_hwi (odd) : 0;
5702 79434 : if (even_level
5703 10439 : && group_lanes % (2 * even_level) == 0
5704 : /* ??? When code generating permutes we do not try to pun
5705 : to larger component modes so level != 1 isn't a natural
5706 : even/odd extract. Prefer one if possible. */
5707 10439 : && (even_level == 1 || !odd_level || odd_level != 1))
5708 : {
5709 : /* { 0, 1, ... 4, 5 ..., } */
5710 37447 : for (unsigned i = 0; i < group_lanes / 2 / even_level; ++i)
5711 59114 : for (unsigned j = 0; j < even_level; ++j)
5712 29730 : perm.quick_push (std::make_pair (0, 2 * i * even_level + j));
5713 : }
5714 68995 : else if (odd_level)
5715 : {
5716 : /* { ..., 2, 3, ... 6, 7 } */
5717 3269 : gcc_assert (group_lanes % (2 * odd_level) == 0);
5718 14261 : for (unsigned i = 0; i < group_lanes / 2 / odd_level; ++i)
5719 22038 : for (unsigned j = 0; j < odd_level; ++j)
5720 11046 : perm.quick_push
5721 11046 : (std::make_pair (0, (2 * i + 1) * odd_level + j));
5722 : }
5723 : else
5724 : {
5725 : /* As fallback extract all used lanes and fill to half the
5726 : group size by repeating the last element.
5727 : ??? This is quite a bad strathegy for re-use - we could
5728 : brute force our way to find more optimal filling lanes to
5729 : maximize re-use when looking at all loads from the group. */
5730 68102 : auto_bitmap l;
5731 272464 : for (auto p : final_perm)
5732 68158 : bitmap_set_bit (l, p.second);
5733 68102 : unsigned i = 0;
5734 68102 : bitmap_iterator bi;
5735 136260 : EXECUTE_IF_SET_IN_BITMAP (l, 0, i, bi)
5736 68158 : perm.quick_push (std::make_pair (0, i));
5737 272560 : while (perm.length () < (group_lanes + 1) / 2)
5738 68178 : perm.quick_push (perm.last ());
5739 68102 : }
5740 :
5741 : /* Update final_perm with the intermediate permute. */
5742 159545 : for (unsigned i = 0; i < final_perm.length (); ++i)
5743 : {
5744 80111 : unsigned l = final_perm[i].second;
5745 80111 : unsigned j;
5746 88713 : for (j = 0; j < perm.length (); ++j)
5747 88713 : if (perm[j].second == l)
5748 : {
5749 80111 : final_perm[i].second = j;
5750 80111 : break;
5751 : }
5752 80111 : gcc_assert (j < perm.length ());
5753 : }
5754 :
5755 : /* And create scalar stmts. */
5756 79434 : vec<stmt_vec_info> perm_stmts;
5757 79434 : perm_stmts.create (perm.length ());
5758 256546 : for (unsigned i = 0; i < perm.length (); ++i)
5759 177112 : perm_stmts.quick_push (SLP_TREE_SCALAR_STMTS (l0)[perm[i].second]);
5760 :
5761 79434 : slp_tree p = vect_create_new_slp_node (1, VEC_PERM_EXPR);
5762 79434 : SLP_TREE_CHILDREN (p).quick_push (l0);
5763 79434 : SLP_TREE_LANE_PERMUTATION (p) = perm;
5764 79434 : SLP_TREE_VECTYPE (p) = SLP_TREE_VECTYPE (load);
5765 79434 : SLP_TREE_LANES (p) = perm.length ();
5766 79434 : SLP_TREE_REPRESENTATIVE (p) = SLP_TREE_REPRESENTATIVE (load);
5767 : /* ??? As we have scalar stmts for this intermediate permute we
5768 : could CSE it via bst_map but we do not want to pick up
5769 : another SLP node with a load permutation. We instead should
5770 : have a "local" CSE map here. */
5771 79434 : SLP_TREE_SCALAR_STMTS (p) = perm_stmts;
5772 :
5773 : /* We now have a node for (group_lanes + 1) / 2 lanes. */
5774 79434 : l0 = p;
5775 79434 : }
5776 :
5777 : /* And finally from the ordered reduction node create the
5778 : permute to shuffle the lanes into the original load-permutation
5779 : order. We replace the original load node with this. */
5780 75407 : SLP_TREE_CODE (load) = VEC_PERM_EXPR;
5781 75407 : SLP_TREE_LOAD_PERMUTATION (load).release ();
5782 75407 : SLP_TREE_LANE_PERMUTATION (load) = final_perm;
5783 75407 : SLP_TREE_CHILDREN (load).create (1);
5784 75407 : SLP_TREE_CHILDREN (load).quick_push (l0);
5785 : }
5786 : }
5787 :
5788 : /* Transform SLP loads in the SLP graph created by SLP discovery to
5789 : group loads from the same group and lower load permutations that
5790 : are unlikely to be supported into a series of permutes.
5791 : In the degenerate case of having only single-lane SLP instances
5792 : this should result in a series of permute nodes emulating an
5793 : interleaving scheme. */
5794 :
5795 : static void
5796 473646 : vect_lower_load_permutations (loop_vec_info loop_vinfo,
5797 : scalar_stmts_to_slp_tree_map_t *bst_map,
5798 : bool force_single_lane)
5799 : {
5800 : /* Gather and sort loads across all instances. */
5801 473646 : hash_set<slp_tree> visited;
5802 473646 : auto_vec<slp_tree> loads;
5803 2178749 : for (auto inst : loop_vinfo->slp_instances)
5804 759757 : vect_gather_slp_loads (loads, SLP_INSTANCE_TREE (inst), visited);
5805 473646 : if (loads.is_empty ())
5806 90213 : return;
5807 383433 : loads.qsort (vllp_cmp);
5808 :
5809 : /* Now process each dataref group separately. */
5810 383433 : unsigned firsti = 0;
5811 719257 : for (unsigned i = 1; i < loads.length (); ++i)
5812 : {
5813 335824 : slp_tree first = loads[firsti];
5814 335824 : slp_tree next = loads[i];
5815 335824 : stmt_vec_info a0 = SLP_TREE_SCALAR_STMTS (first)[0];
5816 335824 : stmt_vec_info b0 = SLP_TREE_SCALAR_STMTS (next)[0];
5817 335824 : if (STMT_VINFO_GROUPED_ACCESS (a0)
5818 158113 : && STMT_VINFO_GROUPED_ACCESS (b0)
5819 480892 : && DR_GROUP_FIRST_ELEMENT (a0) == DR_GROUP_FIRST_ELEMENT (b0))
5820 62964 : continue;
5821 : /* Now we have one or multiple SLP loads of the same group from
5822 : firsti to i - 1. */
5823 272860 : if (STMT_VINFO_GROUPED_ACCESS (a0))
5824 95149 : vect_lower_load_permutations (loop_vinfo, bst_map,
5825 95149 : make_array_slice (&loads[firsti],
5826 : i - firsti),
5827 : force_single_lane);
5828 : firsti = i;
5829 : }
5830 766866 : if (firsti < loads.length ()
5831 766866 : && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (loads[firsti])[0]))
5832 66269 : vect_lower_load_permutations (loop_vinfo, bst_map,
5833 66269 : make_array_slice (&loads[firsti],
5834 66269 : loads.length () - firsti),
5835 : force_single_lane);
5836 473646 : }
5837 :
5838 : /* Check if there are stmts in the loop that can be vectorized using SLP. Build
5839 : SLP trees of packed scalar stmts if SLP is possible. */
5840 :
5841 : opt_result
5842 1111744 : vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size,
5843 : bool force_single_lane)
5844 : {
5845 1111744 : loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5846 1111744 : unsigned int i;
5847 1111744 : stmt_vec_info first_element;
5848 1111744 : slp_instance instance;
5849 :
5850 1111744 : DUMP_VECT_SCOPE ("vect_analyze_slp");
5851 :
5852 1111744 : unsigned limit = max_tree_size;
5853 :
5854 1111744 : scalar_stmts_to_slp_tree_map_t *bst_map
5855 1111744 : = new scalar_stmts_to_slp_tree_map_t ();
5856 :
5857 : /* Find SLP sequences starting from groups of grouped stores. */
5858 3144096 : FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
5859 920877 : if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
5860 : slp_inst_kind_store, max_tree_size, &limit,
5861 : force_single_lane)
5862 920877 : && loop_vinfo)
5863 : {
5864 269 : release_scalar_stmts_to_slp_tree_map (bst_map);
5865 269 : return opt_result::failure_at (vect_location, "SLP build failed.\n");
5866 : }
5867 :
5868 : /* For loops also start SLP discovery from non-grouped stores. */
5869 1111475 : if (loop_vinfo)
5870 : {
5871 : data_reference_p dr;
5872 1629947 : FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr)
5873 1138688 : if (DR_IS_WRITE (dr))
5874 : {
5875 370781 : stmt_vec_info stmt_info = vinfo->lookup_dr (dr)->stmt;
5876 : /* Grouped stores are already handled above. */
5877 370781 : if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
5878 100080 : continue;
5879 270701 : vec<stmt_vec_info> stmts;
5880 270701 : vec<stmt_vec_info> roots = vNULL;
5881 270701 : vec<tree> remain = vNULL;
5882 270701 : stmts.create (1);
5883 270701 : stmts.quick_push (stmt_info);
5884 270701 : if (! vect_build_slp_instance (vinfo, slp_inst_kind_store,
5885 : stmts, roots, remain, max_tree_size,
5886 : &limit, bst_map, force_single_lane))
5887 : {
5888 6983 : release_scalar_stmts_to_slp_tree_map (bst_map);
5889 6983 : return opt_result::failure_at (vect_location,
5890 : "SLP build failed.\n");
5891 : }
5892 : }
5893 :
5894 : stmt_vec_info stmt_info;
5895 491299 : FOR_EACH_VEC_ELT (LOOP_VINFO_ALTERNATE_DEFS (loop_vinfo), i, stmt_info)
5896 : {
5897 20 : vec<stmt_vec_info> stmts;
5898 20 : vec<stmt_vec_info> roots = vNULL;
5899 20 : vec<tree> remain = vNULL;
5900 20 : stmts.create (1);
5901 20 : stmts.quick_push (stmt_info);
5902 20 : if (! vect_build_slp_instance (vinfo, slp_inst_kind_store,
5903 : stmts, roots, remain, max_tree_size,
5904 : &limit, bst_map, force_single_lane))
5905 : {
5906 0 : release_scalar_stmts_to_slp_tree_map (bst_map);
5907 0 : return opt_result::failure_at (vect_location,
5908 : "SLP build failed.\n");
5909 : }
5910 : }
5911 : }
5912 :
5913 1104492 : if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
5914 : {
5915 1858290 : for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
5916 : {
5917 1245057 : vect_location = bb_vinfo->roots[i].roots[0]->stmt;
5918 : /* Apply patterns. */
5919 3889721 : for (unsigned j = 0; j < bb_vinfo->roots[i].stmts.length (); ++j)
5920 5289328 : bb_vinfo->roots[i].stmts[j]
5921 2727668 : = vect_stmt_to_vectorize (bb_vinfo->roots[i].stmts[j]);
5922 1245057 : if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
5923 1245057 : bb_vinfo->roots[i].stmts,
5924 1245057 : bb_vinfo->roots[i].roots,
5925 1245057 : bb_vinfo->roots[i].remain,
5926 : max_tree_size, &limit, bst_map, false))
5927 : {
5928 127817 : bb_vinfo->roots[i].roots = vNULL;
5929 127817 : bb_vinfo->roots[i].remain = vNULL;
5930 : }
5931 1245057 : bb_vinfo->roots[i].stmts = vNULL;
5932 : }
5933 : }
5934 :
5935 1104492 : if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
5936 : {
5937 : /* Find SLP sequences starting from groups of reductions. */
5938 491259 : if (!vect_analyze_slp_reductions (loop_vinfo, max_tree_size, &limit,
5939 : bst_map, force_single_lane))
5940 : {
5941 2999 : release_scalar_stmts_to_slp_tree_map (bst_map);
5942 2999 : return opt_result::failure_at (vect_location, "SLP build failed.\n");
5943 : }
5944 :
5945 : /* Make sure to vectorize only-live stmts, usually inductions. */
5946 2199375 : for (edge e : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
5947 1422268 : for (auto gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi);
5948 678476 : gsi_next (&gsi))
5949 : {
5950 687673 : gphi *lc_phi = *gsi;
5951 687673 : tree def = gimple_phi_arg_def_from_edge (lc_phi, e);
5952 687673 : stmt_vec_info stmt_info;
5953 687673 : if (TREE_CODE (def) == SSA_NAME
5954 575444 : && !virtual_operand_p (def)
5955 298863 : && (stmt_info = loop_vinfo->lookup_def (def))
5956 268104 : && ((stmt_info = vect_stmt_to_vectorize (stmt_info)), true)
5957 268104 : && STMT_VINFO_RELEVANT (stmt_info) == vect_used_only_live
5958 208321 : && STMT_VINFO_LIVE_P (stmt_info)
5959 208321 : && !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info))
5960 794177 : && STMT_VINFO_REDUC_IDX (stmt_info) == -1)
5961 : {
5962 106417 : vec<stmt_vec_info> stmts;
5963 106417 : vec<stmt_vec_info> roots = vNULL;
5964 106417 : vec<tree> remain = vNULL;
5965 106417 : stmts.create (1);
5966 106417 : stmts.quick_push (vect_stmt_to_vectorize (stmt_info));
5967 106417 : if (! vect_build_slp_instance (vinfo,
5968 : slp_inst_kind_reduc_group,
5969 : stmts, roots, remain,
5970 : max_tree_size, &limit,
5971 : bst_map, force_single_lane))
5972 : {
5973 9197 : release_scalar_stmts_to_slp_tree_map (bst_map);
5974 9197 : return opt_result::failure_at (vect_location,
5975 : "SLP build failed.\n");
5976 : }
5977 : }
5978 9197 : }
5979 :
5980 : /* Find SLP sequences starting from gconds. */
5981 1189875 : for (auto cond : LOOP_VINFO_LOOP_CONDS (loop_vinfo))
5982 : {
5983 279149 : auto cond_info = loop_vinfo->lookup_stmt (cond);
5984 :
5985 279149 : cond_info = vect_stmt_to_vectorize (cond_info);
5986 279149 : vec<stmt_vec_info> roots = vNULL;
5987 279149 : roots.safe_push (cond_info);
5988 279149 : gimple *stmt = STMT_VINFO_STMT (cond_info);
5989 279149 : tree args0 = gimple_cond_lhs (stmt);
5990 279149 : tree args1 = gimple_cond_rhs (stmt);
5991 :
5992 : /* These should be enforced by cond lowering, but if it failed
5993 : bail. */
5994 279149 : if (gimple_cond_code (stmt) != NE_EXPR
5995 278059 : || TREE_TYPE (args0) != boolean_type_node
5996 556643 : || !integer_zerop (args1))
5997 : {
5998 1655 : roots.release ();
5999 1655 : release_scalar_stmts_to_slp_tree_map (bst_map);
6000 1655 : return opt_result::failure_at (vect_location,
6001 : "SLP build failed.\n");
6002 : }
6003 :
6004 : /* An argument without a loop def will be codegened from vectorizing the
6005 : root gcond itself. As such we don't need to try to build an SLP tree
6006 : from them. It's highly likely that the resulting SLP tree here if both
6007 : arguments have a def will be incompatible, but we rely on it being split
6008 : later on. */
6009 277494 : auto varg = loop_vinfo->lookup_def (args0);
6010 277494 : vec<stmt_vec_info> stmts;
6011 277494 : vec<tree> remain = vNULL;
6012 277494 : stmts.create (1);
6013 277494 : stmts.quick_push (vect_stmt_to_vectorize (varg));
6014 :
6015 277494 : if (! vect_build_slp_instance (vinfo, slp_inst_kind_gcond,
6016 : stmts, roots, remain,
6017 : max_tree_size, &limit,
6018 : bst_map, force_single_lane))
6019 : {
6020 3762 : roots.release ();
6021 3762 : release_scalar_stmts_to_slp_tree_map (bst_map);
6022 3762 : return opt_result::failure_at (vect_location,
6023 : "SLP build failed.\n");
6024 : }
6025 : }
6026 : }
6027 :
6028 1086879 : hash_set<slp_tree> visited_patterns;
6029 1086879 : slp_tree_to_load_perm_map_t perm_cache;
6030 1086879 : slp_compat_nodes_map_t compat_cache;
6031 :
6032 : /* See if any patterns can be found in the SLP tree. */
6033 1086879 : bool pattern_found = false;
6034 3722199 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
6035 1548441 : pattern_found |= vect_match_slp_patterns (instance, vinfo,
6036 : &visited_patterns, &perm_cache,
6037 : &compat_cache);
6038 :
6039 : /* If any were found optimize permutations of loads. */
6040 1086879 : if (pattern_found)
6041 : {
6042 264 : hash_map<slp_tree, slp_tree> load_map;
6043 3366 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
6044 : {
6045 2838 : slp_tree root = SLP_INSTANCE_TREE (instance);
6046 2838 : optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
6047 : &load_map, root);
6048 : }
6049 264 : }
6050 :
6051 : /* Check whether we should force some SLP instances to use load/store-lanes
6052 : and do so by forcing SLP re-discovery with single lanes. We used
6053 : to cancel SLP when this applied to all instances in a loop but now
6054 : we decide this per SLP instance. It's important to do this only
6055 : after SLP pattern recognition. */
6056 1086879 : if (is_a <loop_vec_info> (vinfo))
6057 1233403 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
6058 759757 : if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
6059 291045 : && !SLP_INSTANCE_TREE (instance)->ldst_lanes)
6060 : {
6061 291045 : slp_tree slp_root = SLP_INSTANCE_TREE (instance);
6062 291045 : unsigned int group_size = SLP_TREE_LANES (slp_root);
6063 291045 : tree vectype = SLP_TREE_VECTYPE (slp_root);
6064 :
6065 291045 : stmt_vec_info rep_info = SLP_TREE_REPRESENTATIVE (slp_root);
6066 291045 : gimple *rep = STMT_VINFO_STMT (rep_info);
6067 291045 : bool masked = (is_gimple_call (rep)
6068 2556 : && gimple_call_internal_p (rep)
6069 293581 : && internal_fn_mask_index
6070 2536 : (gimple_call_internal_fn (rep)) != -1);
6071 291025 : if (!STMT_VINFO_GROUPED_ACCESS (rep_info)
6072 29104 : || slp_root->ldst_lanes
6073 320149 : || (vect_store_lanes_supported (vectype, group_size, masked)
6074 : == IFN_LAST))
6075 291045 : continue;
6076 :
6077 0 : auto_vec<slp_tree> loads;
6078 0 : hash_set<slp_tree> visited;
6079 0 : vect_gather_slp_loads (loads, slp_root, visited);
6080 :
6081 : /* Check whether any load in the SLP instance is possibly
6082 : permuted. */
6083 0 : bool loads_permuted = false;
6084 0 : slp_tree load_node;
6085 0 : unsigned j;
6086 0 : FOR_EACH_VEC_ELT (loads, j, load_node)
6087 : {
6088 0 : if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
6089 0 : continue;
6090 : unsigned k;
6091 : stmt_vec_info load_info;
6092 0 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), k, load_info)
6093 0 : if (SLP_TREE_LOAD_PERMUTATION (load_node)[k] != k)
6094 : {
6095 : loads_permuted = true;
6096 : break;
6097 : }
6098 : }
6099 :
6100 : /* If the loads and stores can use load/store-lanes force re-discovery
6101 : with single lanes. */
6102 0 : if (loads_permuted)
6103 : {
6104 0 : bool can_use_lanes = true;
6105 : bool prefer_load_lanes = false;
6106 0 : FOR_EACH_VEC_ELT (loads, j, load_node)
6107 0 : if (STMT_VINFO_GROUPED_ACCESS
6108 : (SLP_TREE_REPRESENTATIVE (load_node)))
6109 : {
6110 0 : stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
6111 : (SLP_TREE_REPRESENTATIVE (load_node));
6112 0 : rep = STMT_VINFO_STMT (stmt_vinfo);
6113 0 : masked = (is_gimple_call (rep)
6114 0 : && gimple_call_internal_p (rep)
6115 0 : && internal_fn_mask_index
6116 0 : (gimple_call_internal_fn (rep)));
6117 : /* Use SLP for strided accesses (or if we can't
6118 : load-lanes). */
6119 0 : if (STMT_VINFO_STRIDED_P (stmt_vinfo)
6120 0 : || compare_step_with_zero (vinfo, stmt_vinfo) <= 0
6121 0 : || vect_load_lanes_supported
6122 0 : (SLP_TREE_VECTYPE (load_node),
6123 0 : DR_GROUP_SIZE (stmt_vinfo), masked) == IFN_LAST
6124 : /* ??? During SLP re-discovery with a single lane
6125 : a masked grouped load will appear permuted and
6126 : discovery will fail. We have to rework this
6127 : on the discovery side - for now avoid ICEing. */
6128 0 : || masked)
6129 : {
6130 : can_use_lanes = false;
6131 : break;
6132 : }
6133 : /* Make sure that the target would prefer store-lanes
6134 : for at least one of the loads.
6135 :
6136 : ??? Perhaps we should instead require this for
6137 : all loads? */
6138 0 : prefer_load_lanes
6139 : = (prefer_load_lanes
6140 0 : || SLP_TREE_LANES (load_node) == group_size
6141 0 : || (vect_slp_prefer_store_lanes_p
6142 0 : (vinfo, stmt_vinfo,
6143 : SLP_TREE_VECTYPE (load_node), masked,
6144 : group_size, SLP_TREE_LANES (load_node))));
6145 : }
6146 :
6147 0 : if (can_use_lanes && prefer_load_lanes)
6148 : {
6149 0 : if (dump_enabled_p ())
6150 0 : dump_printf_loc (MSG_NOTE, vect_location,
6151 : "SLP instance %p can use load/store-lanes,"
6152 : " re-discovering with single-lanes\n",
6153 : (void *) instance);
6154 :
6155 0 : stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (slp_root);
6156 :
6157 0 : vect_free_slp_instance (instance);
6158 0 : limit = max_tree_size;
6159 0 : bool res = vect_analyze_slp_instance (vinfo, bst_map,
6160 : stmt_info,
6161 : slp_inst_kind_store,
6162 : max_tree_size, &limit,
6163 : true);
6164 0 : gcc_assert (res);
6165 0 : auto new_inst = LOOP_VINFO_SLP_INSTANCES (vinfo).pop ();
6166 0 : LOOP_VINFO_SLP_INSTANCES (vinfo)[i] = new_inst;
6167 : }
6168 : }
6169 0 : }
6170 :
6171 : /* When we end up with load permutations that we cannot possibly handle,
6172 : like those requiring three vector inputs, lower them using interleaving
6173 : like schemes. */
6174 1086879 : if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
6175 : {
6176 473646 : vect_lower_load_permutations (loop_vinfo, bst_map, force_single_lane);
6177 473646 : if (dump_enabled_p ())
6178 : {
6179 20019 : dump_printf_loc (MSG_NOTE, vect_location,
6180 : "SLP graph after lowering permutations:\n");
6181 20019 : hash_set<slp_tree> visited;
6182 89263 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
6183 29231 : vect_print_slp_graph (MSG_NOTE, vect_location,
6184 : SLP_INSTANCE_TREE (instance), visited);
6185 20019 : }
6186 : }
6187 :
6188 1086879 : release_scalar_stmts_to_slp_tree_map (bst_map);
6189 :
6190 1086879 : if (pattern_found && dump_enabled_p ())
6191 : {
6192 18 : dump_printf_loc (MSG_NOTE, vect_location,
6193 : "Pattern matched SLP tree\n");
6194 18 : hash_set<slp_tree> visited;
6195 90 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
6196 36 : vect_print_slp_graph (MSG_NOTE, vect_location,
6197 : SLP_INSTANCE_TREE (instance), visited);
6198 18 : }
6199 :
6200 1086879 : return opt_result::success ();
6201 1086879 : }
6202 :
6203 : /* Estimates the cost of inserting layout changes into the SLP graph.
6204 : It can also say that the insertion is impossible, which is represented by a DEPTH of sreal::max (). */
6205 :
6206 : struct slpg_layout_cost
6207 : {
6208 10489186 : slpg_layout_cost () = default;
6209 : slpg_layout_cost (sreal, bool);
6210 :
6211 430955 : static slpg_layout_cost impossible () { return { sreal::max (), 0 }; }
6212 4677176 : bool is_possible () const { return depth != sreal::max (); }
6213 :
6214 : bool operator== (const slpg_layout_cost &) const;
6215 : bool operator!= (const slpg_layout_cost &) const;
6216 :
6217 : bool is_better_than (const slpg_layout_cost &, bool) const;
6218 :
6219 : void add_parallel_cost (const slpg_layout_cost &);
6220 : void add_serial_cost (const slpg_layout_cost &);
6221 : void split (unsigned int);
6222 :
6223 : /* The longest sequence of layout changes needed during any traversal
6224 : of the partition dag, weighted by execution frequency.
6225 :
6226 : This is the most important metric when optimizing for speed, since
6227 : it helps to ensure that we keep the number of operations on
6228 : critical paths to a minimum. */
6229 : sreal depth = 0;
6230 :
6231 : /* An estimate of the total number of operations needed. It is weighted by
6232 : execution frequency when optimizing for speed but not when optimizing for
6233 : size. In order to avoid double-counting, a node with a fanout of N will
6234 : distribute 1/N of its total cost to each successor.
6235 :
6236 : This is the most important metric when optimizing for size, since
6237 : it helps to keep the total number of operations to a minimum. */
6238 : sreal total = 0;
6239 : };
6240 :
6241 : /* Construct costs for a node with weight WEIGHT. A higher weight
6242 : indicates more frequent execution. IS_FOR_SIZE is true if we are
6243 : optimizing for size rather than speed; in that case any positive WEIGHT gives a TOTAL of 1, otherwise TOTAL is WEIGHT. DEPTH is always WEIGHT. */
6244 :
6245 1109172 : slpg_layout_cost::slpg_layout_cost (sreal weight, bool is_for_size)
6246 1109862 : : depth (weight), total (is_for_size && weight > 0 ? 1 : weight)
6247 : {
6248 1109172 : }
6249 : /* Return true if both DEPTH and TOTAL are equal to those of OTHER. */
6250 : bool
6251 0 : slpg_layout_cost::operator== (const slpg_layout_cost &other) const
6252 : {
6253 0 : return depth == other.depth && total == other.total;
6254 : }
6255 : /* Return true if DEPTH or TOTAL differs from that of OTHER. */
6256 : bool
6257 0 : slpg_layout_cost::operator!= (const slpg_layout_cost &other) const
6258 : {
6259 0 : return !operator== (other);
6260 : }
6261 :
6262 : /* Return true if these costs are better than OTHER. IS_FOR_SIZE is
6263 : true if we are optimizing for size rather than speed. Equal costs are not better. */
6264 :
6265 : bool
6266 268700 : slpg_layout_cost::is_better_than (const slpg_layout_cost &other,
6267 : bool is_for_size) const
6268 : {
6269 268700 : if (is_for_size)
6270 : {
6271 : /* For size, TOTAL decides and DEPTH only breaks ties. */
6272 301 : if (total != other.total)
6273 117 : return total < other.total;
6274 184 : return depth < other.depth;
6275 : }
6276 : else
6277 : {
6278 : /* For speed, DEPTH decides and TOTAL only breaks ties. */
6279 268399 : if (depth != other.depth)
6280 110562 : return depth < other.depth;
6281 157837 : return total < other.total;
6282 : }
6283 : }
6282 :
6283 : /* Increase the costs to account for something with cost INPUT_COST
6284 : happening in parallel with the current costs: DEPTH becomes the larger of the two depths while TOTAL is summed. */
6285 :
6286 : void
6287 326382 : slpg_layout_cost::add_parallel_cost (const slpg_layout_cost &input_cost)
6288 : {
6289 326382 : depth = std::max (depth, input_cost.depth);
6290 326382 : total += input_cost.total;
6291 326382 : }
6292 :
6293 : /* Increase the costs to account for something with cost OTHER
6294 : happening in series with the current costs: both DEPTH and TOTAL are summed. */
6295 :
6296 : void
6297 1321715 : slpg_layout_cost::add_serial_cost (const slpg_layout_cost &other)
6298 : {
6299 1321715 : depth += other.depth;
6300 1321715 : total += other.total;
6301 1321715 : }
6302 :
6303 : /* Split the total cost among TIMES successors or predecessors. DEPTH is left unchanged, and nothing happens when TIMES is 0 or 1. */
6304 :
6305 : void
6306 1090412 : slpg_layout_cost::split (unsigned int times)
6307 : {
6308 1090412 : if (times > 1)
6309 507182 : total /= times;
6310 1090412 : }
6311 :
6312 : /* Information about one node in the SLP graph, for use during
6313 : vect_optimize_slp_pass. Constructing a vertex only records NODE; every other field starts at the default given below. */
6314 :
6315 : struct slpg_vertex
6316 : {
6317 9872731 : slpg_vertex (slp_tree node_) : node (node_) {}
6318 :
6319 : /* The node itself. */
6320 : slp_tree node;
6321 :
6322 : /* Which partition the node belongs to, or -1 if none. Nodes outside of
6323 : partitions are flexible; they can have whichever layout consumers
6324 : want them to have. */
6325 : int partition = -1;
6326 :
6327 : /* The number of nodes that directly use the result of this one
6328 : (i.e. the number of nodes that count this one as a child). */
6329 : unsigned int out_degree = 0;
6330 :
6331 : /* The execution frequency of the node. */
6332 : sreal weight = 0;
6333 :
6334 : /* The total execution frequency of all nodes that directly use the
6335 : result of this one. */
6336 : sreal out_weight = 0;
6337 : };
6338 :
6339 : /* Information about one partition of the SLP graph, for use during
6340 : vect_optimize_slp_pass. A default-constructed object describes an empty node range with no layout chosen and no edges. */
6341 :
6342 : struct slpg_partition_info
6343 : {
6344 : /* The nodes in the partition occupy indices [NODE_BEGIN, NODE_END)
6345 : of m_partitioned_nodes. */
6346 : unsigned int node_begin = 0;
6347 : unsigned int node_end = 0;
6348 :
6349 : /* Which layout we've chosen to use for this partition, or -1 if
6350 : we haven't picked one yet. */
6351 : int layout = -1;
6352 :
6353 : /* The number of predecessors and successors in the partition dag.
6354 : The predecessors always have lower partition numbers and the
6355 : successors always have higher partition numbers.
6356 :
6357 : Note that the directions of these edges are not necessarily the
6358 : same as in the data flow graph. For example, if an SCC has separate
6359 : partitions for an inner loop and an outer loop, the inner loop's
6360 : partition will have at least two incoming edges from the outer loop's
6361 : partition: one for a live-in value and one for a live-out value.
6362 : In data flow terms, one of these edges would also be from the outer loop
6363 : to the inner loop, but the other would be in the opposite direction. */
6364 : unsigned int in_degree = 0;
6365 : unsigned int out_degree = 0;
6366 : };
6367 :
6368 : /* Information about the costs of using a particular layout for a
6369 : particular partition. It can also say that the combination is
6370 : impossible, which is recorded by making INTERNAL_COST impossible. */
6371 :
6372 : struct slpg_partition_layout_costs
6373 : {
6374 1349234 : bool is_possible () const { return internal_cost.is_possible (); }
6375 50722 : void mark_impossible () { internal_cost = slpg_layout_cost::impossible (); }
6376 :
6377 : /* The costs inherited from predecessor partitions. */
6378 : slpg_layout_cost in_cost;
6379 :
6380 : /* The inherent cost of the layout within the node itself. For example,
6381 : this is nonzero for a load if choosing a particular layout would require
6382 : the load to permute the loaded elements. It is nonzero for a
6383 : VEC_PERM_EXPR if the permutation cannot be eliminated or converted
6384 : to full-vector moves. */
6385 : slpg_layout_cost internal_cost;
6386 :
6387 : /* The costs inherited from successor partitions. */
6388 : slpg_layout_cost out_cost;
6389 : };
6390 :
6391 : /* This class tries to optimize the layout of vectors in order to avoid
6392 : unnecessary shuffling. At the moment, the set of possible layouts are
6393 : restricted to bijective permutations.
6394 :
6395 : The goal of the pass depends on whether we're optimizing for size or
6396 : for speed. When optimizing for size, the goal is to reduce the overall
6397 : number of layout changes (including layout changes implied by things
6398 : like load permutations). When optimizing for speed, the goal is to
6399 : reduce the maximum latency attributable to layout changes on any
6400 : non-cyclical path through the data flow graph.
6401 :
6402 : For example, when optimizing a loop nest for speed, we will prefer
6403 : to make layout changes outside of a loop rather than inside of a loop,
6404 : and will prefer to make layout changes in parallel rather than serially,
6405 : even if that increases the overall number of layout changes.
6406 :
6407 : The high-level procedure is:
6408 :
6409 : (1) Build a graph in which edges go from uses (parents) to definitions
6410 : (children).
6411 :
6412 : (2) Divide the graph into a dag of strongly-connected components (SCCs).
6413 :
6414 : (3) When optimizing for speed, partition the nodes in each SCC based
6415 : on their containing cfg loop. When optimizing for size, treat
6416 : each SCC as a single partition.
6417 :
6418 : This gives us a dag of partitions. The goal is now to assign a
6419 : layout to each partition.
6420 :
6421 : (4) Construct a set of vector layouts that are worth considering.
6422 : Record which nodes must keep their current layout.
6423 :
6424 : (5) Perform a forward walk over the partition dag (from loads to stores)
6425 : accumulating the "forward" cost of using each layout. When visiting
6426 : each partition, assign a tentative choice of layout to the partition
6427 : and use that choice when calculating the cost of using a different
6428 : layout in successor partitions.
6429 :
6430 : (6) Perform a backward walk over the partition dag (from stores to loads),
6431 : accumulating the "backward" cost of using each layout. When visiting
6432 : each partition, make a final choice of layout for that partition based
6433 : on the accumulated forward costs (from (5)) and backward costs
6434 : (from (6)).
6435 :
6436 : (7) Apply the chosen layouts to the SLP graph.
6437 :
6438 : For example, consider the SLP statements:
6439 :
6440 : S1: a_1 = load
6441 : loop:
6442 : S2: a_2 = PHI<a_1, a_3>
6443 : S3: b_1 = load
6444 : S4: a_3 = a_2 + b_1
6445 : exit:
6446 : S5: a_4 = PHI<a_3>
6447 : S6: store a_4
6448 :
6449 : S2 and S4 form an SCC and are part of the same loop. Every other
6450 : statement is in a singleton SCC. In this example there is a one-to-one
6451 : mapping between SCCs and partitions and the partition dag looks like this:
6452 :
6453 :     S1     S3
6454 :      \     /
6455 :       S2+S4
6456 :         |
6457 :         S5
6458 :         |
6459 :         S6
6460 :
6461 : S2, S3 and S4 will have a higher execution frequency than the other
6462 : statements, so when optimizing for speed, the goal is to avoid any
6463 : layout changes:
6464 :
6465 : - within S3
6466 : - within S2+S4
6467 : - on the S3->S2+S4 edge
6468 :
6469 : For example, if S3 was originally a reversing load, the goal of the
6470 : pass is to make it an unreversed load and change the layout on the
6471 : S1->S2+S4 and S2+S4->S5 edges to compensate. (Changing the layout
6472 : on S1->S2+S4 and S5->S6 would also be acceptable.)
6473 :
6474 : The difference between SCCs and partitions becomes important if we
6475 : add an outer loop:
6476 :
6477 : S1: a_1 = ...
6478 : loop1:
6479 : S2: a_2 = PHI<a_1, a_6>
6480 : S3: b_1 = load
6481 : S4: a_3 = a_2 + b_1
6482 : loop2:
6483 : S5: a_4 = PHI<a_3, a_5>
6484 : S6: c_1 = load
6485 : S7: a_5 = a_4 + c_1
6486 : exit2:
6487 : S8: a_6 = PHI<a_5>
6488 : S9: store a_6
6489 : exit1:
6490 :
6491 : Here, S2, S4, S5, S7 and S8 form a single SCC. However, when optimizing
6492 : for speed, we usually do not want restrictions in the outer loop to "infect"
6493 : the decision for the inner loop. For example, if an outer-loop node
6494 : in the SCC contains a statement with a fixed layout, that should not
6495 : prevent the inner loop from using a different layout. Conversely,
6496 : the inner loop should not dictate a layout to the outer loop: if the
6497 : outer loop does a lot of computation, then it may not be efficient to
6498 : do all of that computation in the inner loop's preferred layout.
6499 :
6500 : So when optimizing for speed, we partition the SCC into S2+S4+S8 (outer)
6501 : and S5+S7 (inner). We also try to arrange partitions so that:
6502 :
6503 : - the partition for an outer loop comes before the partition for
6504 : an inner loop
6505 :
6506 : - if a sibling loop A dominates a sibling loop B, A's partition
6507 : comes before B's
6508 :
6509 : This gives the following partition dag for the example above:
6510 :
6511 :     S1      S3
6512 :      \      /
6513 :      S2+S4+S8    S6
6514 :       |   \\    /
6515 :       |    S5+S7
6516 :       |
6517 :       S9
6518 :
6519 : There are two edges from S2+S4+S8 to S5+S7: one for the edge S4->S5 and
6520 : one for a reversal of the edge S7->S8.
6521 :
6522 : The backward walk picks a layout for S5+S7 before S2+S4+S8. The choice
6523 : for S2+S4+S8 therefore has to balance the cost of using the outer loop's
6524 : preferred layout against the cost of changing the layout on entry to the
6525 : inner loop (S4->S5) and on exit from the inner loop (S7->S8 reversed).
6526 :
6527 : Although this works well when optimizing for speed, it has the downside
6528 : when optimizing for size that the choice of layout for S5+S7 is completely
6529 : independent of S9, which lessens the chance of reducing the overall number
6530 : of permutations. We therefore do not partition SCCs when optimizing
6531 : for size.
6532 :
6533 : To give a concrete example of the difference between optimizing
6534 : for size and speed, consider:
6535 :
6536 : a[0] = (b[1] << c[3]) - d[1];
6537 : a[1] = (b[0] << c[2]) - d[0];
6538 : a[2] = (b[3] << c[1]) - d[3];
6539 : a[3] = (b[2] << c[0]) - d[2];
6540 :
6541 : There are three different layouts here: one for a, one for b and d,
6542 : and one for c. When optimizing for speed it is better to permute each
6543 : of b, c and d into the order required by a, since those permutations
6544 : happen in parallel. But when optimizing for size, it is better to:
6545 :
6546 : - permute c into the same order as b
6547 : - do the arithmetic
6548 : - permute the result into the order required by a
6549 :
6550 : This gives 2 permutations rather than 3. */
6551 :
6552 : class vect_optimize_slp_pass
6553 : {
6554 : public:
6555 678936 : vect_optimize_slp_pass (vec_info *vinfo) : m_vinfo (vinfo) {}
6556 : void run ();
6557 :
6558 : private:
6559 : /* Graph building. */
6560 : struct loop *containing_loop (slp_tree);
6561 : bool is_cfg_latch_edge (graph_edge *);
6562 : void build_vertices (hash_set<slp_tree> &, slp_tree);
6563 : void build_vertices ();
6564 : void build_graph ();
6565 :
6566 : /* Partitioning. */
6567 : void create_partitions ();
6568 : template<typename T> void for_each_partition_edge (unsigned int, T);
6569 :
6570 : /* Layout selection. */
6571 : bool is_compatible_layout (slp_tree, unsigned int);
6572 : bool is_compatible_layout (const slpg_partition_info &, unsigned int);
6573 : int change_layout_cost (slp_tree, unsigned int, unsigned int);
6574 : slpg_partition_layout_costs &partition_layout_costs (unsigned int,
6575 : unsigned int);
6576 : void change_vec_perm_layout (slp_tree, lane_permutation_t &,
6577 : int, unsigned int);
6578 : int internal_node_cost (slp_tree, int, unsigned int);
6579 : void start_choosing_layouts ();
6580 : bool legitimize ();
6581 :
6582 : /* Cost propagation. */
6583 : slpg_layout_cost edge_layout_cost (graph_edge *, unsigned int,
6584 : unsigned int, unsigned int);
6585 : slpg_layout_cost total_in_cost (unsigned int);
6586 : slpg_layout_cost forward_cost (graph_edge *, unsigned int, unsigned int);
6587 : slpg_layout_cost backward_cost (graph_edge *, unsigned int, unsigned int);
6588 : void forward_pass ();
6589 : void backward_pass ();
6590 :
6591 : /* Rematerialization. */
6592 : slp_tree get_result_with_layout (slp_tree, unsigned int);
6593 : void materialize ();
6594 :
6595 : /* Clean-up. */
6596 : void remove_redundant_permutations ();
6597 :
6598 : /* Masked load lanes discovery. */
6599 : void decide_masked_load_lanes ();
6600 :
6601 : void dump ();
6602 :
6603 : /* The vectorization info this pass operates on, as given to the constructor. */
6604 : vec_info *m_vinfo;
6605 : /* True if we should optimize the graph for size, false if we should
6606 : optimize it for speed. (It wouldn't be easy to make this decision
6607 : more locally.) NOTE(review): there is no default initializer and build_vertices only ever clears the flag, so it is presumably set to true before the vertices are built -- confirm. */
6608 : bool m_optimize_size;
6609 :
6610 : /* A graph of all SLP nodes, with edges leading from uses to definitions.
6611 : In other words, a node's predecessors are its slp_tree parents and
6612 : a node's successors are its slp_tree children. */
6613 : graph *m_slpg = nullptr;
6614 :
6615 : /* The vertices of M_SLPG, indexed by slp_tree::vertex. */
6616 : auto_vec<slpg_vertex> m_vertices;
6617 :
6618 : /* The list of all leaves of M_SLPG, such as external definitions, constants,
6619 : and loads. */
6620 : auto_vec<int> m_leafs;
6621 :
6622 : /* This array has one entry for every vector layout that we're considering.
6623 : Element 0 is null and indicates "no change". Other entries describe
6624 : permutations that are inherent in the current graph and that we would
6625 : like to reverse if possible.
6626 :
6627 : For example, a permutation { 1, 2, 3, 0 } means that something has
6628 : effectively been permuted in that way, such as a load group
6629 : { a[1], a[2], a[3], a[0] } (viewed as a permutation of a[0:3]).
6630 : We'd then like to apply the reverse permutation { 3, 0, 1, 2 }
6631 : in order to put things "back" in order. */
6632 : auto_vec<vec<unsigned> > m_perms;
6633 :
6634 : /* A partitioning of the nodes for which a layout must be chosen.
6635 : Each partition represents an <SCC, cfg loop> pair; that is,
6636 : nodes in different SCCs belong to different partitions, and nodes
6637 : within an SCC can be further partitioned according to a containing
6638 : cfg loop. Partition <SCC1, L1> comes before <SCC2, L2> if:
6639 :
6640 : - SCC1 != SCC2 and SCC1 is a predecessor of SCC2 in a forward walk
6641 : from leaves (such as loads) to roots (such as stores).
6642 :
6643 : - SCC1 == SCC2 and L1's header strictly dominates L2's header. */
6644 : auto_vec<slpg_partition_info> m_partitions;
6645 :
6646 : /* The list of all nodes for which a layout must be chosen. Nodes for
6647 : partition P come before the nodes for partition P+1. Nodes within a
6648 : partition are in reverse postorder. */
6649 : auto_vec<unsigned int> m_partitioned_nodes;
6650 :
6651 : /* Index P * num-layouts + L contains the cost of using layout L
6652 : for partition P. */
6653 : auto_vec<slpg_partition_layout_costs> m_partition_layout_costs;
6654 :
6655 : /* Index N * num-layouts + L, if nonnull, is a node that provides the
6656 : original output of node N adjusted to have layout L. */
6657 : auto_vec<slp_tree> m_node_layouts;
6658 : };
6659 :
6660 : /* Add NODE and, recursively, all of its children that are not yet in VISITED to
6661 : m_vertices, storing each node's index in its vertex field. Nodes without children, or with a NULL child, are also pushed onto m_leafs.
6662 : Also clear m_optimize_size if a node's representative stmt is in a block that is optimized for speed. */
6663 :
6664 : void
6665 10674921 : vect_optimize_slp_pass::build_vertices (hash_set<slp_tree> &visited,
6666 : slp_tree node)
6667 : {
6668 10674921 : unsigned i;
6669 10674921 : slp_tree child;
6670 :
6671 10674921 : if (visited.add (node))
6672 10674921 : return;
6673 :
6674 9872731 : if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
6675 : {
6676 7784818 : basic_block bb = gimple_bb (vect_orig_stmt (rep)->stmt);
6677 6932394 : if (optimize_bb_for_speed_p (bb))
6678 6812674 : m_optimize_size = false;
6679 : }
6680 :
6681 9872731 : node->vertex = m_vertices.length ();
6682 9872731 : m_vertices.safe_push (slpg_vertex (node));
6683 :
6684 9872731 : bool leaf = true;
6685 9872731 : bool force_leaf = false;
6686 18487438 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6687 8614707 : if (child)
6688 : {
6689 7751959 : leaf = false;
6690 7751959 : build_vertices (visited, child);
6691 : }
6692 : else
6693 : force_leaf = true;
6694 : /* Since SLP discovery works along use-def edges all cycles have an
6695 : entry - but there's the exception of cycles where we do not handle
6696 : the entry explicitly (but with a NULL SLP node), like some reductions
6697 : and inductions. Force those SLP PHIs to act as leafs to make them
6698 : backwards reachable. */
6699 9872731 : if (leaf || force_leaf)
6700 4879546 : m_leafs.safe_push (node->vertex);
6701 : }
6702 :
6703 : /* Fill the vertices and leafs vectors with all nodes in the SLP graph.
Both vectors are emptied first and then rebuilt by walking the tree of
every SLP instance in m_vinfo. One visited set is shared by all of the
walks, so a node that is reachable from several instances is added only
once. */
6704 :
6705 : void
6706 1357872 : vect_optimize_slp_pass::build_vertices ()
6707 : {
6708 1357872 : hash_set<slp_tree> visited;
6709 1357872 : unsigned i;
6710 1357872 : slp_instance instance;
6711 1357872 : m_vertices.truncate (0);
6712 1357872 : m_leafs.truncate (0);
6713 6996578 : FOR_EACH_VEC_ELT (m_vinfo->slp_instances, i, instance)
6714 2922962 : build_vertices (visited, SLP_INSTANCE_TREE (instance));
6715 1357872 : }
6716 :
6717 : /* Apply (reverse) bijective PERM to VEC, in place.
If REVERSE is false, element I of the result is the old element PERM[I].
If REVERSE is true, the old element I ends up at position PERM[I] of the
result. PERM is indexed with every valid index of VEC, so it must have
at least as many elements as VEC. */
6718 :
6719 : template <class T>
6720 : static void
6721 172837 : vect_slp_permute (vec<unsigned> perm,
6722 : vec<T> &vec, bool reverse)
6723 : {
/* Take a copy of the old contents, since VEC is overwritten below. */
6724 172837 : auto_vec<T, 64> saved;
6725 172837 : saved.create (vec.length ());
6726 572225 : for (unsigned i = 0; i < vec.length (); ++i)
6727 399388 : saved.quick_push (vec[i]);
6728 :
6729 172837 : if (reverse)
6730 : {
6731 1135438 : for (unsigned i = 0; i < vec.length (); ++i)
6732 398056 : vec[perm[i]] = saved[i];
/* If PERM used an index twice, the second store above overwrote the
first. The check below catches that whenever the two stored values
differ. */
6733 570295 : for (unsigned i = 0; i < vec.length (); ++i)
6734 699787 : gcc_assert (vec[perm[i]] == saved[i]);
6735 : }
6736 : else
6737 : {
6738 3860 : for (unsigned i = 0; i < vec.length (); ++i)
6739 1332 : vec[i] = saved[perm[i]];
6740 174169 : for (unsigned i = 0; i < vec.length (); ++i)
6741 1998 : gcc_assert (vec[i] == saved[perm[i]]);
6742 : }
6743 172837 : }
6744 :
6745 : /* Return the cfg loop that contains NODE. This is the loop_father of the
block that contains the original statement of NODE's representative.
A node without a representative is treated as belonging to the
loop_father of the function's entry block. */
6746 :
6747 : struct loop *
6748 3869483 : vect_optimize_slp_pass::containing_loop (slp_tree node)
6749 : {
6750 3869483 : stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
6751 3869483 : if (!rep)
6752 5133 : return ENTRY_BLOCK_PTR_FOR_FN (cfun)->loop_father;
6753 4303735 : return gimple_bb (vect_orig_stmt (rep)->stmt)->loop_father;
6754 : }
6755 :
6756 : /* Return true if UD (an edge from a use to a definition) is associated
6757 : with a loop latch edge in the cfg. This is taken to be the case when
the original statement of the use's representative is a PHI in a loop
header block and the definition is in the same cfg loop as the use.
UD never qualifies if the use is a permute node or if either end of
the edge is not a vect_internal_def. */
6758 :
6759 : bool
6760 7751959 : vect_optimize_slp_pass::is_cfg_latch_edge (graph_edge *ud)
6761 : {
6762 7751959 : slp_tree use = m_vertices[ud->src].node;
6763 7751959 : slp_tree def = m_vertices[ud->dest].node;
6764 7751959 : if ((SLP_TREE_DEF_TYPE (use) != vect_internal_def
6765 7751959 : || SLP_TREE_PERMUTE_P (use))
6766 7440800 : || SLP_TREE_DEF_TYPE (def) != vect_internal_def)
6767 : return false;
6768 :
6769 4513592 : stmt_vec_info use_rep = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (use));
6770 4513592 : return (is_a<gphi *> (use_rep->stmt)
6771 376248 : && bb_loop_header_p (gimple_bb (use_rep->stmt))
6772 4724988 : && containing_loop (def) == containing_loop (use));
6773 : }
6774 :
6775 : /* Build the graph. Mark edges that correspond to cfg loop latch edges with
6776 : a nonnull data field.
The graph has one vertex per entry of m_vertices and one edge from each
node to each of its nonnull children, so edges run from uses to
definitions. */
6777 :
6778 : void
6779 1357872 : vect_optimize_slp_pass::build_graph ()
6780 : {
/* Start by assuming that we optimize for size. build_vertices clears
the flag if it finds a node that should be optimized for speed. */
6781 1357872 : m_optimize_size = true;
6782 1357872 : build_vertices ();
6783 :
6784 2715744 : m_slpg = new_graph (m_vertices.length ());
6785 13946347 : for (slpg_vertex &v : m_vertices)
6786 29517700 : for (slp_tree child : SLP_TREE_CHILDREN (v.node))
6787 8614707 : if (child)
6788 : {
6789 7751959 : graph_edge *ud = add_edge (m_slpg, v.node->vertex, child->vertex);
/* Only the fact that the data field is nonnull matters (see
skip_cfg_latch_edges). */
6790 7751959 : if (is_cfg_latch_edge (ud))
6791 202676 : ud->data = this;
6792 : }
6793 1357872 : }
6794 :
6795 : /* Return true if E corresponds to a loop latch edge in the cfg, that is,
if E's data field is nonnull. This is the edge callback that
create_partitions passes to graphds_dfs. */
6796 :
6797 : static bool
6798 3977038 : skip_cfg_latch_edges (graph_edge *e)
6799 : {
6800 3977038 : return e->data;
6801 : }
6802 :
6803 : /* Create the node partitions: fill m_partitions, set the partition field
of every vertex, and rebuild m_partitioned_nodes so that it lists the
partitioned nodes in partition order. Vertices for constants, and for
externals that have no existing vector defs, get partition -1 and are
left out of m_partitioned_nodes. */
6804 :
6805 : void
6806 678936 : vect_optimize_slp_pass::create_partitions ()
6807 : {
6808 : /* Calculate a postorder of the graph, ignoring edges that correspond
6809 : to natural latch edges in the cfg. Reading the vector from the end
6810 : to the beginning gives the reverse postorder. */
6811 678936 : auto_vec<int> initial_rpo;
6812 1357872 : graphds_dfs (m_slpg, &m_leafs[0], m_leafs.length (), &initial_rpo,
6813 : false, NULL, skip_cfg_latch_edges);
6814 2036808 : gcc_assert (initial_rpo.length () == m_vertices.length ());
6815 :
6816 : /* Calculate the strongly connected components of the graph. */
6817 678936 : auto_vec<int> scc_grouping;
6818 678936 : unsigned int num_sccs = graphds_scc (m_slpg, NULL, NULL, &scc_grouping);
6819 :
6820 : /* Create a new index order in which all nodes from the same SCC are
6821 : consecutive. Use scc_pos to record the index of the first node in
6822 : each SCC. */
6823 678936 : auto_vec<unsigned int> scc_pos (num_sccs);
6824 678936 : int last_component = -1;
6825 678936 : unsigned int node_count = 0;
6826 6972896 : for (unsigned int node_i : scc_grouping)
6827 : {
6828 4936088 : if (last_component != m_slpg->vertices[node_i].component)
6829 : {
6830 4808743 : last_component = m_slpg->vertices[node_i].component;
6831 9617486 : gcc_assert (last_component == int (scc_pos.length ()));
6832 4808743 : scc_pos.quick_push (node_count);
6833 : }
6834 4936088 : node_count += 1;
6835 : }
6836 1357872 : gcc_assert (node_count == initial_rpo.length ()
6837 : && last_component + 1 == int (num_sccs));
6838 :
6839 : /* Use m_partitioned_nodes to group nodes into SCC order, with the nodes
6840 : inside each SCC following the RPO we calculated above. The fact that
6841 : we ignored natural latch edges when calculating the RPO should ensure
6842 : that, for natural loop nests:
6843 :
6844 : - the first node that we encounter in a cfg loop is the loop header phi
6845 : - the loop header phis are in dominance order
6846 :
6847 : Arranging for this is an optimization (see below) rather than a
6848 : correctness issue. Unnatural loops with a tangled mess of backedges
6849 : will still work correctly, but might give poorer results.
6850 :
6851 : Also update scc_pos so that it gives 1 + the index of the last node
6852 : in the SCC. */
6853 678936 : m_partitioned_nodes.safe_grow (node_count);
6854 6293960 : for (unsigned int old_i = initial_rpo.length (); old_i-- > 0;)
6855 : {
6856 4936088 : unsigned int node_i = initial_rpo[old_i];
6857 4936088 : unsigned int new_i = scc_pos[m_slpg->vertices[node_i].component]++;
6858 4936088 : m_partitioned_nodes[new_i] = node_i;
6859 : }
6860 :
6861 : /* When optimizing for speed, partition each SCC based on the containing
6862 : cfg loop. The order we constructed above should ensure that, for natural
6863 : cfg loops, we'll create sub-SCC partitions for outer loops before
6864 : the corresponding sub-SCC partitions for inner loops. Similarly,
6865 : when one sibling loop A dominates another sibling loop B, we should
6866 : create a sub-SCC partition for A before a sub-SCC partition for B.
6867 :
6868 : As above, nothing depends for correctness on whether this achieves
6869 : a natural nesting, but we should get better results when it does. */
6870 1357872 : m_partitions.reserve (m_vertices.length ());
6871 678936 : unsigned int next_partition_i = 0;
6872 678936 : hash_map<struct loop *, int> loop_partitions;
6873 678936 : unsigned int rpo_begin = 0;
6874 678936 : unsigned int num_partitioned_nodes = 0;
6875 6845551 : for (unsigned int rpo_end : scc_pos)
6876 : {
6877 4808743 : loop_partitions.empty ();
6878 : unsigned int partition_i = next_partition_i;
6879 9744831 : for (unsigned int rpo_i = rpo_begin; rpo_i < rpo_end; ++rpo_i)
6880 : {
6881 : /* Handle externals and constants optimistically throughout.
6882 : But treat existing vectors as fixed since we do not handle
6883 : permuting them. */
6884 4936088 : unsigned int node_i = m_partitioned_nodes[rpo_i];
6885 4936088 : auto &vertex = m_vertices[node_i];
6886 4936088 : if ((SLP_TREE_DEF_TYPE (vertex.node) == vect_external_def
6887 496061 : && !SLP_TREE_VEC_DEFS (vertex.node).exists ())
6888 4938123 : || SLP_TREE_DEF_TYPE (vertex.node) == vect_constant_def)
6889 1465035 : vertex.partition = -1;
6890 : else
6891 : {
6892 3471053 : bool existed;
/* When optimizing for size, every partitioned node of the SCC
goes into a single partition: the first such node creates
partition PARTITION_I and the later ones reuse it. */
6893 3471053 : if (m_optimize_size)
6894 24362 : existed = next_partition_i > partition_i;
6895 : else
6896 : {
6897 3446691 : struct loop *loop = containing_loop (vertex.node);
6898 3446691 : auto &entry = loop_partitions.get_or_insert (loop, &existed);
6899 3446691 : if (!existed)
6900 3320393 : entry = next_partition_i;
6901 3446691 : partition_i = entry;
6902 : }
6903 3471053 : if (!existed)
6904 : {
6905 3344665 : m_partitions.quick_push (slpg_partition_info ());
6906 3344665 : next_partition_i += 1;
6907 : }
6908 3471053 : vertex.partition = partition_i;
6909 3471053 : num_partitioned_nodes += 1;
/* For now node_end just counts the nodes in the partition. */
6910 3471053 : m_partitions[partition_i].node_end += 1;
6911 : }
6912 : }
6913 4808743 : rpo_begin = rpo_end;
6914 : }
6915 :
6916 : /* Assign ranges of consecutive node indices to each partition,
6917 : in partition order. Start with node_end being the same as
6918 : node_begin so that the next loop can use it as a counter. */
6919 678936 : unsigned int node_begin = 0;
6920 5381473 : for (auto &partition : m_partitions)
6921 : {
6922 3344665 : partition.node_begin = node_begin;
6923 3344665 : node_begin += partition.node_end;
6924 3344665 : partition.node_end = partition.node_begin;
6925 : }
6926 678936 : gcc_assert (node_begin == num_partitioned_nodes);
6927 :
6928 : /* Finally build the list of nodes in partition order. */
6929 678936 : m_partitioned_nodes.truncate (num_partitioned_nodes);
6930 5615024 : for (unsigned int node_i = 0; node_i < m_vertices.length (); ++node_i)
6931 : {
6932 4936088 : int partition_i = m_vertices[node_i].partition;
6933 4936088 : if (partition_i >= 0)
6934 : {
6935 3471053 : unsigned int order_i = m_partitions[partition_i].node_end++;
6936 3471053 : m_partitioned_nodes[order_i] = node_i;
6937 : }
6938 : }
6939 678936 : }
6940 :
6941 : /* Look for edges from earlier partitions into node NODE_I and edges from
6942 : node NODE_I into later partitions. Call:
6943 :
6944 : FN (ud, other_node_i)
6945 :
6946 : for each such use-to-def edge ud, where other_node_i is the node at the
6947 : other end of the edge.

The test that the code applies is that the other end of the edge belongs
to a partition and that this partition differs from NODE_I's. Edges
whose other end has no partition (partition -1) are skipped. The
predecessor edges of NODE_I are visited before its successor edges. */
6948 :
6949 : template<typename T>
6950 : void
6951 3856908 : vect_optimize_slp_pass::for_each_partition_edge (unsigned int node_i, T fn)
6952 : {
6953 3856908 : int partition_i = m_vertices[node_i].partition;
6954 3856908 : for (graph_edge *pred = m_slpg->vertices[node_i].pred;
6955 6668551 : pred; pred = pred->pred_next)
6956 : {
6957 2811643 : int src_partition_i = m_vertices[pred->src].partition;
6958 2811643 : if (src_partition_i >= 0 && src_partition_i != partition_i)
6959 2488590 : fn (pred, pred->src);
6960 : }
6961 3856908 : for (graph_edge *succ = m_slpg->vertices[node_i].succ;
6962 8266592 : succ; succ = succ->succ_next)
6963 : {
6964 4409684 : int dest_partition_i = m_vertices[succ->dest].partition;
6965 4409684 : if (dest_partition_i >= 0 && dest_partition_i != partition_i)
6966 2516189 : fn (succ, succ->dest);
6967 : }
6968 3856908 : }
6969 :
6970 : /* Return true if layout LAYOUT_I is compatible with the number of SLP lanes
6971 : that NODE would operate on. This test is independent of NODE's actual
6972 : operation.
Layout 0 is compatible with every node. Any other layout is compatible
only if its permutation has exactly SLP_TREE_LANES (NODE) elements. */
6973 :
6974 : bool
6975 1503197 : vect_optimize_slp_pass::is_compatible_layout (slp_tree node,
6976 : unsigned int layout_i)
6977 : {
6978 1503197 : if (layout_i == 0)
6979 : return true;
6980 :
6981 848720 : if (SLP_TREE_LANES (node) != m_perms[layout_i].length ())
6982 14558 : return false;
6983 :
6984 : return true;
6985 : }
6986 :
6987 : /* Return true if layout LAYOUT_I is compatible with the number of SLP lanes
6988 : that NODE would operate on for each NODE in PARTITION.
6989 : This test is independent of NODE's actual operations.
The nodes of PARTITION are entries node_begin (inclusive) to node_end
(exclusive) of m_partitioned_nodes. A partition whose range is empty
is compatible with every layout. */
6990 :
6991 : bool
6992 16238 : vect_optimize_slp_pass::is_compatible_layout (const slpg_partition_info
6993 : &partition,
6994 : unsigned int layout_i)
6995 : {
6996 32748 : for (unsigned int order_i = partition.node_begin;
6997 32748 : order_i < partition.node_end; ++order_i)
6998 : {
6999 16577 : unsigned int node_i = m_partitioned_nodes[order_i];
7000 16577 : auto &vertex = m_vertices[node_i];
7001 :
7002 : /* The layout is incompatible if it is individually incompatible
7003 : with any node in the partition. */
7004 16577 : if (!is_compatible_layout (vertex.node, layout_i))
7005 : return false;
7006 : }
7007 : return true;
7008 : }
7009 :
7010 : /* Return the cost (in arbitrary units) of going from layout FROM_LAYOUT_I
7011 : to layout TO_LAYOUT_I for a node like NODE. Return -1 if either of the
7012 : layouts is incompatible with NODE or if the change is not possible for
7013 : some other reason.
7014 :
7015 : The properties taken from NODE include the number of lanes and the
7016 : vector type. The actual operation doesn't matter.

The cost is 0 if the two layouts are the same. Otherwise it is at
least 1, even if vectorizable_slp_permutation_1 returns a count of 0. */
7017 :
7018 : int
7019 638892 : vect_optimize_slp_pass::change_layout_cost (slp_tree node,
7020 : unsigned int from_layout_i,
7021 : unsigned int to_layout_i)
7022 : {
7023 638892 : if (!is_compatible_layout (node, from_layout_i)
7024 638892 : || !is_compatible_layout (node, to_layout_i))
7025 545 : return -1;
7026 :
7027 638347 : if (from_layout_i == to_layout_i)
7028 : return 0;
7029 :
/* Describe the change as a lane permutation that has NODE itself as its
only input (input number 0). */
7030 262952 : auto_vec<slp_tree, 1> children (1);
7031 262952 : children.quick_push (node);
7032 262952 : auto_lane_permutation_t perm (SLP_TREE_LANES (node));
/* Start with the permutation of FROM_LAYOUT_I, or with the identity
for layout 0... */
7033 262952 : if (from_layout_i > 0)
7034 750666 : for (unsigned int i : m_perms[from_layout_i])
7035 333747 : perm.quick_push ({ 0, i });
7036 : else
7037 404712 : for (unsigned int i = 0; i < SLP_TREE_LANES (node); ++i)
7038 280733 : perm.quick_push ({ 0, i });
/* ...and then apply the permutation of TO_LAYOUT_I in reverse. */
7039 262952 : if (to_layout_i > 0)
7040 124406 : vect_slp_permute (m_perms[to_layout_i], perm, true);
/* NOTE(review): a negative COUNT is treated as "not supported". COUNT is
presumably the number of permute operations needed; confirm against
vectorizable_slp_permutation_1. */
7041 262952 : auto count = vectorizable_slp_permutation_1 (m_vinfo, nullptr, node, perm,
7042 : children, false);
7043 262952 : if (count >= 0)
7044 258743 : return MAX (count, 1);
7045 :
7046 : /* ??? In principle we could try changing via layout 0, giving two
7047 : layout changes rather than 1. Doing that would require
7048 : corresponding support in get_result_with_layout. */
7049 : return -1;
7050 262952 : }
7051 :
7052 : /* Return the costs of assigning layout LAYOUT_I to partition PARTITION_I.
m_partition_layout_costs is indexed as a flattened two-dimensional array
with m_perms.length () consecutive entries for each partition. The
result is a reference, so callers can update the costs in place. */
7053 :
7054 : inline slpg_partition_layout_costs &
7055 927206 : vect_optimize_slp_pass::partition_layout_costs (unsigned int partition_i,
7056 : unsigned int layout_i)
7057 : {
7058 1854412 : return m_partition_layout_costs[partition_i * m_perms.length () + layout_i];
7059 : }
7060 :
7061 : /* Change PERM in one of two ways:
7062 :
7063 : - if IN_LAYOUT_I < 0, accept input operand I in the layout that has been
7064 : chosen for child I of NODE. Lanes taken from a child that has no
partition are left as they are.
7065 :
7066 : - if IN_LAYOUT_I >= 0, accept all input operands with that layout.
7067 :
7068 : In both cases, arrange for the output to have layout OUT_LAYOUT_I. */
7069 :
7070 : void
7071 30181 : vect_optimize_slp_pass::
7072 : change_vec_perm_layout (slp_tree node, lane_permutation_t &perm,
7073 : int in_layout_i, unsigned int out_layout_i)
7074 : {
7075 175721 : for (auto &entry : perm)
7076 : {
7077 85178 : int this_in_layout_i = in_layout_i;
7078 85178 : if (this_in_layout_i < 0)
7079 : {
7080 59111 : slp_tree in_node = SLP_TREE_CHILDREN (node)[entry.first];
7081 59111 : unsigned int in_partition_i = m_vertices[in_node->vertex].partition;
/* -1u is what partition -1 (no partition) becomes when converted
to unsigned. */
7082 59111 : if (in_partition_i == -1u)
7083 329 : continue;
7084 58782 : this_in_layout_i = m_partitions[in_partition_i].layout;
7085 : }
/* Map the lane number through the permutation of the input's layout.
Nothing needs to change for layout 0. */
7086 84849 : if (this_in_layout_i > 0)
7087 19065 : entry.second = m_perms[this_in_layout_i][entry.second];
7088 : }
7089 30181 : if (out_layout_i > 0)
7090 7031 : vect_slp_permute (m_perms[out_layout_i], perm, true);
7091 30181 : }
7092 :
7093 : /* Check whether the target allows NODE to be rearranged so that the node's
7094 : output has layout OUT_LAYOUT_I. Return the cost of the change if so,
7095 : in the same arbitrary units as for change_layout_cost. Return -1 otherwise.
7096 :
7097 : If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I < 0, also check whether
7098 : NODE can adapt to the layout changes that have (perhaps provisionally)
7099 : been chosen for NODE's children, so that no extra permutations are
7100 : needed on either the input or the output of NODE.
7101 :
7102 : If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I >= 0, instead assume
7103 : that all inputs will be forced into layout IN_LAYOUT_I beforehand.
7104 :
7105 : IN_LAYOUT_I has no meaning for other types of node.
7106 :
7107 : Keeping the node as-is is always valid. If the target doesn't appear
7108 : to support the node as-is, but might realistically support other layouts,
7109 : then layout 0 instead has the cost of a worst-case permutation. On the
7110 : one hand, this ensures that every node has at least one valid layout,
7111 : avoiding what would otherwise be an awkward special case. On the other,
7112 : it still encourages the pass to change an invalid pre-existing layout
7113 : choice into a valid one.

The value returned is always -1, 0 or 1. Nodes that are neither
permute nodes nor loads with a load permutation always cost 0. */
7114 :
7115 : int
7116 205074 : vect_optimize_slp_pass::internal_node_cost (slp_tree node, int in_layout_i,
7117 : unsigned int out_layout_i)
7118 : {
/* The cost given to layout 0 when the target rejects the node as-is.
It is the same as the cost returned below for a real permutation. */
7119 205074 : const int fallback_cost = 1;
7120 :
7121 205074 : if (SLP_TREE_PERMUTE_P (node))
7122 : {
/* Work on a copy so that NODE's own permutation is left untouched. */
7123 25071 : auto_lane_permutation_t tmp_perm;
7124 25071 : tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
7125 :
7126 : /* Check that the child nodes support the chosen layout. Checking
7127 : the first child is enough, since any second child would have the
7128 : same shape. */
7129 25071 : auto first_child = SLP_TREE_CHILDREN (node)[0];
7130 25071 : if (in_layout_i > 0
7131 25071 : && !is_compatible_layout (first_child, in_layout_i))
7132 : return -1;
7133 :
7134 24531 : change_vec_perm_layout (node, tmp_perm, in_layout_i, out_layout_i);
7135 49062 : int count = vectorizable_slp_permutation_1 (m_vinfo, nullptr,
7136 : node, tmp_perm,
7137 24531 : SLP_TREE_CHILDREN (node),
7138 : false);
7139 24531 : if (count < 0)
7140 : {
7141 1510 : if (in_layout_i == 0 && out_layout_i == 0)
7142 : {
7143 : /* Use the fallback cost if the node could in principle support
7144 : some nonzero layout for both the inputs and the outputs.
7145 : Otherwise assume that the node will be rejected later
7146 : and rebuilt from scalars. */
7147 367 : if (SLP_TREE_LANES (node) == SLP_TREE_LANES (first_child))
7148 : return fallback_cost;
7149 297 : return 0;
7150 : }
7151 : return -1;
7152 : }
7153 :
7154 : /* We currently have no way of telling whether the new layout is cheaper
7155 : or more expensive than the old one. But at least in principle,
7156 : it should be worth making zero permutations (whole-vector shuffles)
7157 : cheaper than real permutations, in case the pass is able to remove
7158 : the latter. */
7159 23021 : return count == 0 ? 0 : 1;
7160 25071 : }
7161 :
/* Handle loads that have a load permutation. */
7162 180003 : stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
7163 180003 : if (rep
7164 179169 : && STMT_VINFO_DATA_REF (rep)
7165 56756 : && DR_IS_READ (STMT_VINFO_DATA_REF (rep))
7166 219605 : && SLP_TREE_LOAD_PERMUTATION (node).exists ())
7167 : {
/* Work on a copy so that NODE's own permutation is left untouched. */
7168 32527 : auto_load_permutation_t tmp_perm;
7169 32527 : tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
7170 32527 : if (out_layout_i > 0)
7171 12252 : vect_slp_permute (m_perms[out_layout_i], tmp_perm, true);
7172 :
/* Use a vectorization factor of 1 unless we are vectorizing a loop. */
7173 32527 : poly_uint64 vf = 1;
7174 32527 : if (auto loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo))
7175 12152 : vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7176 32527 : unsigned int n_perms;
7177 32527 : if (!vect_transform_slp_perm_load_1 (m_vinfo, node, tmp_perm, vNULL,
7178 : nullptr, vf, true, false, &n_perms))
7179 : {
/* NOTE(review): this redeclares REP with the same value as the
outer variable of that name, which it shadows. */
7180 1503 : auto rep = SLP_TREE_REPRESENTATIVE (node);
7181 1503 : if (out_layout_i == 0)
7182 : {
7183 : /* Use the fallback cost if the load is an N-to-N permutation.
7184 : Otherwise assume that the node will be rejected later
7185 : and rebuilt from scalars. */
7186 1097 : if (STMT_VINFO_GROUPED_ACCESS (rep)
7187 2194 : && (DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (rep))
7188 1097 : == SLP_TREE_LANES (node)))
7189 595 : return fallback_cost;
7190 : return 0;
7191 : }
7192 : return -1;
7193 : }
7194 :
7195 : /* See the comment above the corresponding VEC_PERM_EXPR handling. */
7196 31024 : return n_perms == 0 ? 0 : 1;
7197 32527 : }
7198 :
/* Any other kind of node costs nothing, whatever the layout. */
7199 : return 0;
7200 : }
7201 :
7202 : /* Decide which element layouts we should consider using. Calculate the
7203 : weights associated with inserting layout changes on partition edges.
7204 : Also mark partitions that cannot change layout, by setting their
7205 : layout to zero.

This fills m_perms (whose entry 0 is the null "no change" layout),
sizes and zero-initializes m_partition_layout_costs, and sets the
weight, out_weight and out_degree of vertices and the in_degree and
out_degree of partitions.
NOTE(review): a partition layout of -1 presumably means that no layout
has been chosen yet; confirm against slpg_partition_info. */
7206 :
7207 : void
7208 678936 : vect_optimize_slp_pass::start_choosing_layouts ()
7209 : {
7210 : /* Used to assign unique permutation indices. */
7211 678936 : using perm_hash = unbounded_hashmap_traits<
7212 : vec_free_hash_base<int_hash_base<unsigned>>,
7213 : int_hash<int, -1, -2>
7214 : >;
7215 678936 : hash_map<vec<unsigned>, int, perm_hash> layout_ids;
7216 :
7217 : /* Layout 0 is "no change". */
7218 678936 : m_perms.safe_push (vNULL);
7219 :
7220 : /* Create layouts from existing permutations. */
7221 678936 : auto_load_permutation_t tmp_perm;
7222 5507861 : for (unsigned int node_i : m_partitioned_nodes)
7223 : {
7224 : /* Leafs also double as entries to the reverse graph. Allow the
7225 : layout of those to be changed. */
7226 3471053 : auto &vertex = m_vertices[node_i];
7227 3471053 : auto &partition = m_partitions[vertex.partition];
7228 3471053 : if (!m_slpg->vertices[node_i].succ)
7229 884166 : partition.layout = 0;
7230 :
7231 : /* Loads and VEC_PERM_EXPRs are the only things generating permutes. */
7232 3471053 : slp_tree node = vertex.node;
7233 3471053 : stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
7234 3471053 : slp_tree child;
/* IMIN is given a value by each of the two branches below that go on
to use it. The other paths continue with the next node. */
7235 3471053 : unsigned HOST_WIDE_INT imin, imax = 0;
7236 3471053 : bool any_permute = false;
7237 3471053 : tmp_perm.truncate (0);
7238 3471053 : if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
7239 : {
7240 : /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the node
7241 : unpermuted, record a layout that reverses this permutation.
7242 :
7243 : We would need more work to cope with loads that are internally
7244 : permuted and also have inputs (such as masks for
7245 : IFN_MASK_LOADs). */
7246 594481 : gcc_assert (partition.layout == 0 && !m_slpg->vertices[node_i].succ);
7247 594481 : if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
7248 : {
7249 422669 : partition.layout = -1;
7250 3454909 : continue;
7251 : }
7252 171812 : dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
/* Start above every possible index so that the MIN below finds the
smallest index in the permutation. */
7253 171812 : imin = DR_GROUP_SIZE (dr_stmt) + 1;
7254 171812 : tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
7255 : }
7256 5634878 : else if (SLP_TREE_PERMUTE_P (node)
7257 136783 : && SLP_TREE_CHILDREN (node).length () == 1
7258 118266 : && (child = SLP_TREE_CHILDREN (node)[0])
7259 2994838 : && (TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (child))
7260 118266 : .is_constant (&imin)))
7261 : {
7262 : /* If the child has the same vector size as this node,
7263 : reversing the permutation can make the permutation a no-op.
7264 : In other cases it can change a true permutation into a
7265 : full-vector extract. */
7266 118266 : tmp_perm.reserve (SLP_TREE_LANES (node));
7267 317572 : for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
7268 199306 : tmp_perm.quick_push (SLP_TREE_LANE_PERMUTATION (node)[j].second);
7269 : }
7270 : else
7271 2758306 : continue;
7272 :
/* Find the range of indices that TMP_PERM uses and check whether it is
anything other than a run of consecutive increasing indices. */
7273 764972 : for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
7274 : {
7275 474894 : unsigned idx = tmp_perm[j];
7276 474894 : imin = MIN (imin, idx);
7277 474894 : imax = MAX (imax, idx);
7278 474894 : if (idx - tmp_perm[0] != j)
7279 138639 : any_permute = true;
7280 : }
7281 : /* If the span doesn't match we'd disrupt VF computation, avoid
7282 : that for now. */
7283 290078 : if (imax - imin + 1 != SLP_TREE_LANES (node))
7284 82680 : continue;
7285 : /* If there's no permute no need to split one out. In this case
7286 : we can consider turning a load into a permuted load, if that
7287 : turns out to be cheaper than alternatives. */
7288 207398 : if (!any_permute)
7289 : {
7290 191105 : partition.layout = -1;
7291 191105 : continue;
7292 : }
7293 :
7294 : /* For now only handle true permutes, like
7295 : vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
7296 : when permuting constants and invariants keeping the permute
7297 : bijective. */
7298 16293 : auto_sbitmap load_index (SLP_TREE_LANES (node));
7299 16293 : bitmap_clear (load_index);
7300 63259 : for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
7301 46966 : bitmap_set_bit (load_index, tmp_perm[j] - imin);
7302 : unsigned j;
7303 62430 : for (j = 0; j < SLP_TREE_LANES (node); ++j)
7304 46286 : if (!bitmap_bit_p (load_index, j))
7305 : break;
7306 16293 : if (j != SLP_TREE_LANES (node))
7307 149 : continue;
7308 :
/* Rebase the permutation so that its smallest index is 0. */
7309 16144 : vec<unsigned> perm = vNULL;
7310 16144 : perm.safe_grow (SLP_TREE_LANES (node), true);
7311 62159 : for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
7312 46015 : perm[j] = tmp_perm[j] - imin;
7313 :
7314 32288 : if (int (m_perms.length ()) >= param_vect_max_layout_candidates)
7315 : {
7316 : /* Continue to use existing layouts, but don't add any more. */
7317 0 : int *entry = layout_ids.get (perm);
7318 0 : partition.layout = entry ? *entry : 0;
7319 0 : perm.release ();
7320 : }
7321 : else
7322 : {
/* Reuse the index of an identical permutation if there is one,
otherwise give PERM the next free index in m_perms. */
7323 16144 : bool existed;
7324 16144 : int &layout_i = layout_ids.get_or_insert (perm, &existed);
7325 16144 : if (existed)
7326 5553 : perm.release ();
7327 : else
7328 : {
7329 10591 : layout_i = m_perms.length ();
7330 10591 : m_perms.safe_push (perm);
7331 : }
7332 16144 : partition.layout = layout_i;
7333 : }
7334 16293 : }
7335 :
7336 : /* Initially assume that every layout is possible and has zero cost
7337 : in every partition. */
7338 678936 : m_partition_layout_costs.safe_grow_cleared (m_partitions.length ()
7339 1357872 : * m_perms.length ());
7340 :
7341 : /* We have to mark outgoing permutations facing non-associating-reduction
7342 : graph entries that are not represented as to be materialized.
7343 : slp_inst_kind_bb_reduc currently only covers associatable reductions.
In the code below, the partition of the instance root gets layout 0
for constructor instances and for reduction chains for which
needs_fold_left_reduction_p returns true. */
7344 3498289 : for (slp_instance instance : m_vinfo->slp_instances)
7345 1461481 : if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor)
7346 : {
7347 6409 : unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
7348 6409 : m_partitions[m_vertices[node_i].partition].layout = 0;
7349 : }
7350 1455072 : else if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_reduc_chain)
7351 : {
7352 2300 : stmt_vec_info stmt_info
7353 2300 : = SLP_TREE_REPRESENTATIVE (SLP_INSTANCE_TREE (instance));
7354 2300 : vect_reduc_info reduc_info
7355 2300 : = info_for_reduction (as_a <loop_vec_info> (m_vinfo),
7356 : SLP_INSTANCE_TREE (instance));
7357 2300 : if (needs_fold_left_reduction_p (TREE_TYPE
7358 : (gimple_get_lhs (stmt_info->stmt)),
7359 : VECT_REDUC_INFO_CODE (reduc_info)))
7360 : {
7361 97 : unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
7362 97 : m_partitions[m_vertices[node_i].partition].layout = 0;
7363 : }
7364 : }
7365 :
7366 : /* Check which layouts each node and partition can handle. Calculate the
7367 : weights associated with inserting layout changes on edges. */
7368 5507861 : for (unsigned int node_i : m_partitioned_nodes)
7369 : {
7370 3471053 : auto &vertex = m_vertices[node_i];
7371 3471053 : auto &partition = m_partitions[vertex.partition];
7372 3471053 : slp_tree node = vertex.node;
7373 :
7374 3471053 : if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
7375 : {
7376 3465920 : vertex.weight = vect_slp_node_weight (node);
7377 :
7378 : /* We do not handle stores with a permutation, so all
7379 : incoming permutations must have been materialized.
7380 :
7381 : We also don't handle masked grouped loads, which lack a
7382 : permutation vector. In this case the memory locations
7383 : form an implicit second input to the loads, on top of the
7384 : explicit mask input, and the memory input's layout cannot
7385 : be changed.
7386 :
7387 : On the other hand, we do support permuting gather loads and
7388 : masked gather loads, where each scalar load is independent
7389 : of the others. This can be useful if the address/index input
7390 : benefits from permutation. */
7391 3465920 : if (STMT_VINFO_DATA_REF (rep)
7392 1756908 : && STMT_VINFO_GROUPED_ACCESS (rep)
7393 4556285 : && !SLP_TREE_LOAD_PERMUTATION (node).exists ())
7394 918553 : partition.layout = 0;
7395 :
7396 : /* We cannot change the layout of an operation that is
7397 : not independent of lanes. Note this is an explicit
7398 : negative list since that's much shorter than the respective
7399 : positive one but it's critical to keep maintaining it. */
7400 3465920 : if (is_gimple_call (STMT_VINFO_STMT (rep)))
7401 31608 : switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep)))
7402 : {
7403 1091 : case CFN_COMPLEX_ADD_ROT90:
7404 1091 : case CFN_COMPLEX_ADD_ROT270:
7405 1091 : case CFN_COMPLEX_MUL:
7406 1091 : case CFN_COMPLEX_MUL_CONJ:
7407 1091 : case CFN_VEC_ADDSUB:
7408 1091 : case CFN_VEC_FMADDSUB:
7409 1091 : case CFN_VEC_FMSUBADD:
7410 1091 : partition.layout = 0;
/* There is no break here, but the default label that follows is
empty. */
7411 : default:;
7412 : }
7413 : }
7414 :
7415 7809521 : auto process_edge = [&](graph_edge *ud, unsigned int other_node_i)
7416 : {
7417 4338468 : auto &other_vertex = m_vertices[other_node_i];
7418 :
7419 : /* Count the number of edges from earlier partitions and the number
7420 : of edges to later partitions. */
7421 4338468 : if (other_vertex.partition < vertex.partition)
7422 2169234 : partition.in_degree += 1;
7423 : else
7424 2169234 : partition.out_degree += 1;
7425 :
7426 : /* If the current node uses the result of OTHER_NODE_I, accumulate
7427 : the effects of that. */
7428 4338468 : if (ud->src == int (node_i))
7429 : {
7430 2169234 : other_vertex.out_weight += vertex.weight;
7431 2169234 : other_vertex.out_degree += 1;
7432 : }
7433 7809521 : };
7434 3471053 : for_each_partition_edge (node_i, process_edge);
7435 : }
7436 678936 : }
7437 :
7438 : /* Return the incoming costs for node NODE_I, assuming that each input keeps
7439 : its current (provisional) choice of layout. The inputs do not necessarily
7440 : have the same layout as each other.

Only cross-partition edges whose other end is in an earlier partition
contribute. Each such edge contributes the in_cost of the other
partition for its current layout, followed serially by the
internal_cost for that layout, split by the other partition's
out_degree. The contributions of the individual edges are combined as
parallel costs. */
7441 :
7442 : slpg_layout_cost
7443 3076 : vect_optimize_slp_pass::total_in_cost (unsigned int node_i)
7444 : {
7445 3076 : auto &vertex = m_vertices[node_i];
7446 3076 : slpg_layout_cost cost;
7447 11238 : auto add_cost = [&](graph_edge *, unsigned int other_node_i)
7448 : {
7449 8162 : auto &other_vertex = m_vertices[other_node_i];
7450 8162 : if (other_vertex.partition < vertex.partition)
7451 : {
7452 5241 : auto &other_partition = m_partitions[other_vertex.partition];
7453 10482 : auto &other_costs = partition_layout_costs (other_vertex.partition,
7454 5241 : other_partition.layout);
/* Take a copy so that the recorded costs are not modified. */
7455 5241 : slpg_layout_cost this_cost = other_costs.in_cost;
7456 5241 : this_cost.add_serial_cost (other_costs.internal_cost);
7457 5241 : this_cost.split (other_partition.out_degree);
7458 5241 : cost.add_parallel_cost (this_cost);
7459 : }
7460 11238 : };
7461 3076 : for_each_partition_edge (node_i, add_cost);
7462 3076 : return cost;
7463 : }
7464 :
7465 : /* Return the cost of switching between layout LAYOUT1_I (at node NODE1_I)
7466 : and layout LAYOUT2_I on cross-partition use-to-def edge UD. Return
7467 : slpg_layout_cost::impossible () if the change isn't possible.

       NODE1_I may be either end of UD; LAYOUT2_I is the layout at the
       opposite end.  */
7468 :
7469 : slpg_layout_cost
7470 638892 : vect_optimize_slp_pass::
7471 : edge_layout_cost (graph_edge *ud, unsigned int node1_i, unsigned int layout1_i,
7472 : unsigned int layout2_i)
7473 : {
       /* UD->dest indexes the definition vertex and UD->src the use vertex.  */
7474 638892 : auto &def_vertex = m_vertices[ud->dest];
7475 638892 : auto &use_vertex = m_vertices[ud->src];
       /* Assign the two layouts to the definition and use ends of the edge:
          when NODE1_I is the definition, LAYOUT1_I is the definition's layout
          and LAYOUT2_I the use's; otherwise the roles are swapped.  */
7476 638892 : auto def_layout_i = ud->dest == int (node1_i) ? layout1_i : layout2_i;
7477 638892 : auto use_layout_i = ud->dest == int (node1_i) ? layout2_i : layout1_i;
7478 638892 : auto factor = change_layout_cost (def_vertex.node, def_layout_i,
7479 : use_layout_i);
       /* A negative factor is treated as "this layout change cannot be
          done".  */
7480 638892 : if (factor < 0)
7481 4754 : return slpg_layout_cost::impossible ();
7482 :
7483 : /* We have a choice of putting the layout change at the site of the
7484 : definition or at the site of the use. Prefer the former when
7485 : optimizing for size or when the execution frequency of the
7486 : definition is no greater than the combined execution frequencies of
7487 : the uses. When putting the layout change at the site of the definition,
7488 : divvy up the cost among all consumers. */
7489 634138 : if (m_optimize_size || def_vertex.weight <= def_vertex.out_weight)
7490 : {
7491 616444 : slpg_layout_cost cost = { def_vertex.weight * factor, m_optimize_size };
7492 616444 : cost.split (def_vertex.out_degree);
7493 616444 : return cost;
7494 : }
       /* Otherwise charge the change at the use, scaled by the use's weight
          and without any splitting.  */
7495 17694 : return { use_vertex.weight * factor, m_optimize_size };
7496 : }
7497 :
7498 : /* UD represents a use-def link between FROM_NODE_I and a node in a later
7499 : partition; FROM_NODE_I could be the definition node or the use node.
7500 : The node at the other end of the link wants to use layout TO_LAYOUT_I.
7501 : Return the cost of any necessary fix-ups on edge UD, or return
7502 : slpg_layout_cost::impossible () if the change isn't possible.
7503 :
7504 : At this point, FROM_NODE_I's partition has chosen the cheapest
7505 : layout based on the information available so far, but this choice
7506 : is only provisional.

       The result is the better of two alternatives: (a) FROM_NODE_I's
       partition keeps its current layout and a layout change is inserted
       on UD, or (b) that partition switches to TO_LAYOUT_I itself.  */
7507 :
7508 : slpg_layout_cost
7509 169523 : vect_optimize_slp_pass::forward_cost (graph_edge *ud, unsigned int from_node_i,
7510 : unsigned int to_layout_i)
7511 : {
7512 169523 : auto &from_vertex = m_vertices[from_node_i];
7513 169523 : unsigned int from_partition_i = from_vertex.partition;
7514 169523 : slpg_partition_info &from_partition = m_partitions[from_partition_i];
       /* The source partition must already have a layout recorded.  */
7515 169523 : gcc_assert (from_partition.layout >= 0);
7516 :
7517 : /* First calculate the cost on the assumption that FROM_PARTITION sticks
7518 : with its current layout preference. */
7519 169523 : slpg_layout_cost cost = slpg_layout_cost::impossible ();
7520 169523 : auto edge_cost = edge_layout_cost (ud, from_node_i,
7521 169523 : from_partition.layout, to_layout_i);
7522 169523 : if (edge_cost.is_possible ())
7523 : {
       /* Input cost plus internal cost of the source partition under its
          current layout, split by the partition's out-degree, followed in
          series by the fix-up on UD.  */
7524 334094 : auto &from_costs = partition_layout_costs (from_partition_i,
7525 167047 : from_partition.layout);
7526 167047 : cost = from_costs.in_cost;
7527 167047 : cost.add_serial_cost (from_costs.internal_cost);
7528 167047 : cost.split (from_partition.out_degree);
7529 167047 : cost.add_serial_cost (edge_cost);
7530 : }
7531 2476 : else if (from_partition.layout == 0)
7532 : /* We must allow the source partition to have layout 0 as a fallback,
7533 : in case all other options turn out to be impossible. */
7534 2476 : return cost;
       /* If the fix-up on UD is impossible but the source partition's layout
          is nonzero, COST is still impossible here and only the direct
          switch below can produce a possible result.  */
7535 :
7536 : /* Take the minimum of that cost and the cost that applies if
7537 : FROM_PARTITION instead switches to TO_LAYOUT_I. */
7538 167047 : auto &direct_layout_costs = partition_layout_costs (from_partition_i,
7539 : to_layout_i);
7540 167047 : if (direct_layout_costs.is_possible ())
7541 : {
       /* No fix-up on UD is added for this alternative.  */
7542 147586 : slpg_layout_cost direct_cost = direct_layout_costs.in_cost;
7543 147586 : direct_cost.add_serial_cost (direct_layout_costs.internal_cost);
7544 147586 : direct_cost.split (from_partition.out_degree);
7545 147586 : if (!cost.is_possible ()
7546 147586 : || direct_cost.is_better_than (cost, m_optimize_size))
7547 33660 : cost = direct_cost;
7548 : }
7549 :
7550 167047 : return cost;
7551 : }
7552 :
7553 : /* UD represents a use-def link between TO_NODE_I and a node in an earlier
7554 : partition; TO_NODE_I could be the definition node or the use node.
7555 : The node at the other end of the link wants to use layout FROM_LAYOUT_I;
7556 : return the cost of any necessary fix-ups on edge UD, or
7557 : slpg_layout_cost::impossible () if the choice cannot be made.
7558 :
7559 : At this point, TO_NODE_I's partition has a fixed choice of layout. */
7560 :
7561 : slpg_layout_cost
7562 154094 : vect_optimize_slp_pass::backward_cost (graph_edge *ud, unsigned int to_node_i,
7563 : unsigned int from_layout_i)
7564 : {
7565 154094 : auto &to_vertex = m_vertices[to_node_i];
7566 154094 : unsigned int to_partition_i = to_vertex.partition;
7567 154094 : slpg_partition_info &to_partition = m_partitions[to_partition_i];
       /* The destination partition must already have a layout recorded.  */
7568 154094 : gcc_assert (to_partition.layout >= 0);
7569 :
7570 : /* If TO_NODE_I is a VEC_PERM_EXPR consumer, see whether it can be
7571 : adjusted for this input having layout FROM_LAYOUT_I. Assume that
7572 : any other inputs keep their current choice of layout. */
7573 154094 : auto &to_costs = partition_layout_costs (to_partition_i,
7574 : to_partition.layout);
7575 154094 : if (ud->src == int (to_node_i)
7576 153892 : && SLP_TREE_PERMUTE_P (to_vertex.node))
7577 : {
       /* Temporarily record FROM_LAYOUT_I as the layout of the defining
          node's partition while querying the node cost, then restore the
          previous value.
          NOTE(review): the -1 input-layout argument presumably makes
          internal_node_cost read the inputs' recorded partition layouts,
          which is why the override is needed -- confirm against the
          definition of internal_node_cost.  */
7578 9377 : auto &from_partition = m_partitions[m_vertices[ud->dest].partition];
7579 9377 : auto old_layout = from_partition.layout;
7580 9377 : from_partition.layout = from_layout_i;
7581 18754 : int factor = internal_node_cost (to_vertex.node, -1,
7582 9377 : to_partition.layout);
7583 9377 : from_partition.layout = old_layout;
7584 9377 : if (factor >= 0)
7585 : {
       /* Use the freshly computed node cost in place of the recorded
          internal cost, and add no separate fix-up on UD.  */
7586 8747 : slpg_layout_cost cost = to_costs.out_cost;
7587 17494 : cost.add_serial_cost ({ to_vertex.weight * factor,
7588 8747 : m_optimize_size });
7589 8747 : cost.split (to_partition.in_degree);
7590 8747 : return cost;
7591 : }
       /* A negative factor falls through to the generic edge fix-up
          below.  */
7592 : }
7593 :
7594 : /* Compute the cost if we insert any necessary layout change on edge UD. */
7595 145347 : auto edge_cost = edge_layout_cost (ud, to_node_i,
7596 145347 : to_partition.layout, from_layout_i);
7597 145347 : if (edge_cost.is_possible ())
7598 : {
       /* Output cost plus internal cost of the destination partition, split
          by its in-degree, followed in series by the fix-up on UD.  */
7599 145347 : slpg_layout_cost cost = to_costs.out_cost;
7600 145347 : cost.add_serial_cost (to_costs.internal_cost);
7601 145347 : cost.split (to_partition.in_degree);
7602 145347 : cost.add_serial_cost (edge_cost);
7603 145347 : return cost;
7604 : }
7605 :
7606 0 : return slpg_layout_cost::impossible ();
7607 : }
7608 :
7609 : /* Make a forward pass through the partitions, accumulating input costs.
7610 : Make a tentative (provisional) choice of layout for each partition,
7611 : ensuring that this choice still allows later partitions to keep
7612 : their original layout.

       Partitions are visited in increasing index order.  For each partition
       every layout index is examined: layouts that are rejected are marked
       impossible in the per-partition cost table, and the cheapest remaining
       layout is stored in the partition's LAYOUT field.  */
7613 :
7614 : void
7615 5219 : vect_optimize_slp_pass::forward_pass ()
7616 : {
7617 108197 : for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
7618 : ++partition_i)
7619 : {
7620 102978 : auto &partition = m_partitions[partition_i];
7621 :
7622 : /* If the partition consists of a single VEC_PERM_EXPR, precompute
7623 : the incoming cost that would apply if every predecessor partition
7624 : keeps its current layout. This is used within the loop below. */
7625 102978 : slpg_layout_cost in_cost;
7626 102978 : slp_tree single_node = nullptr;
       /* SINGLE_NODE is set for any one-node partition; IN_COST is only
          computed when that node is a permute node.  */
7627 102978 : if (partition.node_end == partition.node_begin + 1)
7628 : {
7629 96752 : unsigned int node_i = m_partitioned_nodes[partition.node_begin];
7630 96752 : single_node = m_vertices[node_i].node;
7631 96752 : if (SLP_TREE_PERMUTE_P (single_node))
7632 3076 : in_cost = total_in_cost (node_i);
7633 : }
7634 :
7635 : /* Go through the possible layouts. Decide which ones are valid
7636 : for this partition and record which of the valid layouts has
7637 : the lowest cost. */
7638 102978 : unsigned int min_layout_i = 0;
7639 102978 : slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
7640 314787 : for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
7641 : {
7642 211809 : auto &layout_costs = partition_layout_costs (partition_i, layout_i);
       /* Skip layouts that have already been ruled out.  */
7643 211809 : if (!layout_costs.is_possible ())
7644 50722 : continue;
7645 :
7646 : /* If the recorded layout is already 0 then the layout cannot
7647 : change. */
7648 211809 : if (partition.layout == 0 && layout_i != 0)
7649 : {
7650 34200 : layout_costs.mark_impossible ();
7651 34200 : continue;
7652 : }
7653 :
       /* Check LAYOUT_I against every node of the partition, accumulating
          costs into LAYOUT_COSTS as we go.  */
7654 177609 : bool is_possible = true;
7655 366951 : for (unsigned int order_i = partition.node_begin;
7656 366951 : order_i < partition.node_end; ++order_i)
7657 : {
7658 203764 : unsigned int node_i = m_partitioned_nodes[order_i];
7659 203764 : auto &vertex = m_vertices[node_i];
7660 :
7661 : /* Reject the layout if it is individually incompatible
7662 : with any node in the partition. */
7663 203764 : if (!is_compatible_layout (vertex.node, layout_i))
7664 : {
7665 13406 : is_possible = false;
7666 14422 : break;
7667 : }
7668 :
       /* Edge callback.  It only clears IS_POSSIBLE; it does not stop the
          edge walk, so the remaining edges of NODE_I are still visited.  */
7669 516967 : auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
7670 : {
7671 326609 : auto &other_vertex = m_vertices[other_node_i];
7672 326609 : if (other_vertex.partition < vertex.partition)
7673 : {
7674 : /* Accumulate the incoming costs from earlier
7675 : partitions, plus the cost of any layout changes
7676 : on UD itself. */
7677 169523 : auto cost = forward_cost (ud, other_node_i, layout_i);
7678 169523 : if (!cost.is_possible ())
7679 2476 : is_possible = false;
7680 : else
7681 167047 : layout_costs.in_cost.add_parallel_cost (cost);
7682 : }
7683 : else
7684 : /* Reject the layout if it would make layout 0 impossible
7685 : for later partitions. This amounts to testing that the
7686 : target supports reversing the layout change on edges
7687 : to later partitions.
7688 :
7689 : In principle, it might be possible to push a layout
7690 : change all the way down a graph, so that it never
7691 : needs to be reversed and so that the target doesn't
7692 : need to support the reverse operation. But it would
7693 : be awkward to bail out if we hit a partition that
7694 : does not support the new layout, especially since
7695 : we are not dealing with a lattice. */
7696 157086 : is_possible &= edge_layout_cost (ud, other_node_i, 0,
7697 157086 : layout_i).is_possible ();
7698 516967 : };
7699 190358 : for_each_partition_edge (node_i, add_cost);
7700 :
7701 : /* Accumulate the cost of using LAYOUT_I within NODE,
7702 : both for the inputs and the outputs. */
7703 190358 : int factor = internal_node_cost (vertex.node, layout_i,
7704 : layout_i);
       /* A negative factor rejects the layout; a zero factor adds no
          internal cost.  */
7705 190358 : if (factor < 0)
7706 : {
7707 1016 : is_possible = false;
7708 1016 : break;
7709 : }
7710 189342 : else if (factor)
7711 30436 : layout_costs.internal_cost.add_serial_cost
7712 30436 : ({ vertex.weight * factor, m_optimize_size });
7713 : }
7714 177609 : if (!is_possible)
7715 : {
7716 16522 : layout_costs.mark_impossible ();
7717 16522 : continue;
7718 : }
7719 :
7720 : /* Combine the incoming and partition-internal costs. */
7721 161087 : slpg_layout_cost combined_cost = layout_costs.in_cost;
7722 161087 : combined_cost.add_serial_cost (layout_costs.internal_cost);
7723 :
7724 : /* If this partition consists of a single VEC_PERM_EXPR, see
7725 : if the VEC_PERM_EXPR can be changed to support output layout
7726 : LAYOUT_I while keeping all the provisional choices of input
7727 : layout. */
7728 161087 : if (single_node && SLP_TREE_PERMUTE_P (single_node))
7729 : {
7730 5339 : int factor = internal_node_cost (single_node, -1, layout_i);
7731 5339 : if (factor >= 0)
7732 : {
7733 4896 : auto weight = m_vertices[single_node->vertex].weight;
7734 4896 : slpg_layout_cost internal_cost
7735 4896 : = { weight * factor, m_optimize_size };
7736 :
       /* Alternative: the precomputed IN_COST (inputs keep their current
          layouts) followed by the adjusted permute cost.  If it wins, it
          replaces both recorded cost components for this layout.  */
7737 4896 : slpg_layout_cost alt_cost = in_cost;
7738 4896 : alt_cost.add_serial_cost (internal_cost);
7739 4896 : if (alt_cost.is_better_than (combined_cost, m_optimize_size))
7740 : {
7741 1531 : combined_cost = alt_cost;
7742 1531 : layout_costs.in_cost = in_cost;
7743 1531 : layout_costs.internal_cost = internal_cost;
7744 : }
7745 : }
7746 : }
7747 :
7748 : /* Record the layout with the lowest cost. Prefer layout 0 in
7749 : the event of a tie between it and another layout. */
7750 161087 : if (!min_layout_cost.is_possible ()
7751 58109 : || combined_cost.is_better_than (min_layout_cost,
7752 58109 : m_optimize_size))
7753 : {
7754 115342 : min_layout_i = layout_i;
7755 115342 : min_layout_cost = combined_cost;
7756 : }
7757 : }
7758 :
7759 : /* This loop's handling of earlier partitions should ensure that
7760 : choosing the original layout for the current partition is no
7761 : less valid than it was in the original graph, even with the
7762 : provisional layout choices for those earlier partitions. */
7763 102978 : gcc_assert (min_layout_cost.is_possible ());
7764 102978 : partition.layout = min_layout_i;
7765 : }
7766 5219 : }
7767 :
7768 : /* Make a backward pass through the partitions, accumulating output costs.
7769 : Make a final choice of layout for each partition.

       Partitions are visited from the highest index down to 0.  Only
       layouts that are still marked possible are considered.  */
7770 :
7771 : void
7772 5219 : vect_optimize_slp_pass::backward_pass ()
7773 : {
       /* The post-decrement in the condition makes PARTITION_I run from
          length () - 1 down to 0 without unsigned wrap-around problems.  */
7774 113416 : for (unsigned int partition_i = m_partitions.length (); partition_i-- > 0;)
7775 : {
7776 102978 : auto &partition = m_partitions[partition_i];
7777 :
7778 102978 : unsigned int min_layout_i = 0;
7779 102978 : slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
7780 314787 : for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
7781 : {
7782 211809 : auto &layout_costs = partition_layout_costs (partition_i, layout_i);
7783 211809 : if (!layout_costs.is_possible ())
7784 50722 : continue;
7785 :
7786 : /* Accumulate the costs from successor partitions. */
7787 161087 : bool is_possible = true;
7788 348292 : for (unsigned int order_i = partition.node_begin;
7789 348292 : order_i < partition.node_end; ++order_i)
7790 : {
7791 187205 : unsigned int node_i = m_partitioned_nodes[order_i];
7792 187205 : auto &vertex = m_vertices[node_i];
       /* Edge callback.  It only clears IS_POSSIBLE; the edge walk itself
          always runs to completion.  */
7793 508235 : auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
7794 : {
7795 321030 : auto &other_vertex = m_vertices[other_node_i];
7796 321030 : auto &other_partition = m_partitions[other_vertex.partition];
7797 321030 : if (other_vertex.partition > vertex.partition)
7798 : {
7799 : /* Accumulate the incoming costs from later
7800 : partitions, plus the cost of any layout changes
7801 : on UD itself. */
7802 154094 : auto cost = backward_cost (ud, other_node_i, layout_i);
7803 154094 : if (!cost.is_possible ())
7804 0 : is_possible = false;
7805 : else
7806 154094 : layout_costs.out_cost.add_parallel_cost (cost);
7807 : }
7808 : else
7809 : /* Make sure that earlier partitions can (if necessary
7810 : or beneficial) keep the layout that they chose in
7811 : the forward pass. This ensures that there is at
7812 : least one valid choice of layout. */
7813 166936 : is_possible &= edge_layout_cost (ud, other_node_i,
7814 166936 : other_partition.layout,
7815 166936 : layout_i).is_possible ();
7816 508235 : };
7817 187205 : for_each_partition_edge (node_i, add_cost);
7818 : }
7819 161087 : if (!is_possible)
7820 : {
7821 0 : layout_costs.mark_impossible ();
7822 0 : continue;
7823 : }
7824 :
7825 : /* Locally combine the costs from the forward and backward passes.
7826 : (This combined cost is not passed on, since that would lead
7827 : to double counting.) */
7828 161087 : slpg_layout_cost combined_cost = layout_costs.in_cost;
7829 161087 : combined_cost.add_serial_cost (layout_costs.internal_cost);
7830 161087 : combined_cost.add_serial_cost (layout_costs.out_cost);
7831 :
7832 : /* Record the layout with the lowest cost. Prefer layout 0 in
7833 : the event of a tie between it and another layout. */
7834 161087 : if (!min_layout_cost.is_possible ()
7835 58109 : || combined_cost.is_better_than (min_layout_cost,
7836 58109 : m_optimize_size))
7837 : {
7838 108446 : min_layout_i = layout_i;
7839 108446 : min_layout_cost = combined_cost;
7840 : }
7841 : }
7842 :
       /* At least one layout must have survived; record the final choice.  */
7843 102978 : gcc_assert (min_layout_cost.is_possible ());
7844 102978 : partition.layout = min_layout_i;
7845 : }
7846 5219 : }
7847 :
7848 : /* Return a node that applies layout TO_LAYOUT_I to the original form of NODE.
7849 : NODE already has the layout that was selected for its partition.

       Results are cached in m_node_layouts, indexed by the pair
       (NODE->vertex, TO_LAYOUT_I), so a later request for the same pair
       returns the node created here.  The result may be NODE itself, a new
       node holding permuted scalar operands (for constant and external
       nodes), or a new VEC_PERM_EXPR node.  */
7850 :
7851 : slp_tree
7852 146078 : vect_optimize_slp_pass::get_result_with_layout (slp_tree node,
7853 : unsigned int to_layout_i)
7854 : {
       /* One cache slot per (vertex, layout) combination.  */
7855 146078 : unsigned int result_i = node->vertex * m_perms.length () + to_layout_i;
7856 146078 : slp_tree result = m_node_layouts[result_i];
7857 146078 : if (result)
7858 : return result;
7859 :
7860 145545 : if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
7861 145545 : || (SLP_TREE_DEF_TYPE (node) == vect_external_def
7862 : /* We can't permute vector defs in place. */
7863 16133 : && SLP_TREE_VEC_DEFS (node).is_empty ()))
7864 : {
7865 : /* If the vector is uniform or unchanged, there's nothing to do. */
7866 32600 : if (to_layout_i == 0 || vect_slp_tree_uniform_p (node))
7867 : result = node;
7868 : else
7869 : {
       /* Build a new node from a copy of the scalar operands and permute
          that copy, leaving NODE's own operands untouched.  */
7870 1450 : auto scalar_ops = SLP_TREE_SCALAR_OPS (node).copy ();
7871 1450 : result = vect_create_new_slp_node (scalar_ops);
7872 1450 : vect_slp_permute (m_perms[to_layout_i], scalar_ops, true);
7873 : }
7874 : }
7875 : else
7876 : {
7877 112945 : unsigned int partition_i = m_vertices[node->vertex].partition;
7878 112945 : unsigned int from_layout_i = m_partitions[partition_i].layout;
       /* NODE already has the requested layout.  Note that this early
          return does not store anything in the cache.  */
7879 112945 : if (from_layout_i == to_layout_i)
7880 112388 : return node;
7881 :
7882 : /* If NODE is itself a VEC_PERM_EXPR, try to create a parallel
7883 : permutation instead of a serial one. Leave the new permutation
7884 : in TMP_PERM on success. */
7885 557 : auto_lane_permutation_t tmp_perm;
7886 557 : unsigned int num_inputs = 1;
7887 557 : if (SLP_TREE_PERMUTE_P (node))
7888 : {
7889 7 : tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
7890 7 : if (from_layout_i != 0)
7891 7 : vect_slp_permute (m_perms[from_layout_i], tmp_perm, false);
7892 7 : if (to_layout_i != 0)
7893 4 : vect_slp_permute (m_perms[to_layout_i], tmp_perm, true);
       /* A negative result means the combined permutation is not usable;
          emptying TMP_PERM selects the serial fallback below.  */
7894 7 : if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
7895 : tmp_perm,
7896 7 : SLP_TREE_CHILDREN (node),
7897 : false) >= 0)
7898 7 : num_inputs = SLP_TREE_CHILDREN (node).length ();
7899 : else
7900 0 : tmp_perm.truncate (0);
7901 : }
7902 :
7903 557 : if (dump_enabled_p ())
7904 : {
7905 70 : if (tmp_perm.length () > 0)
7906 6 : dump_printf_loc (MSG_NOTE, vect_location,
7907 : "duplicating permutation node %p with"
7908 : " layout %d\n",
7909 : (void *) node, to_layout_i);
7910 : else
7911 64 : dump_printf_loc (MSG_NOTE, vect_location,
7912 : "inserting permutation node in place of %p\n",
7913 : (void *) node);
7914 : }
7915 :
       /* Create the new VEC_PERM_EXPR node.  Its scalar statements (when NODE
          has any) are NODE's statements rearranged from FROM_LAYOUT_I to
          TO_LAYOUT_I.  */
7916 557 : unsigned int num_lanes = SLP_TREE_LANES (node);
7917 557 : result = vect_create_new_slp_node (num_inputs, VEC_PERM_EXPR);
7918 557 : if (SLP_TREE_SCALAR_STMTS (node).length ())
7919 : {
7920 556 : auto &stmts = SLP_TREE_SCALAR_STMTS (result);
7921 556 : stmts.safe_splice (SLP_TREE_SCALAR_STMTS (node));
7922 556 : if (from_layout_i != 0)
7923 299 : vect_slp_permute (m_perms[from_layout_i], stmts, false);
7924 556 : if (to_layout_i != 0)
7925 261 : vect_slp_permute (m_perms[to_layout_i], stmts, true);
7926 : }
7927 557 : SLP_TREE_REPRESENTATIVE (result) = SLP_TREE_REPRESENTATIVE (node);
7928 557 : SLP_TREE_LANES (result) = num_lanes;
7929 557 : SLP_TREE_VECTYPE (result) = SLP_TREE_VECTYPE (node);
       /* NOTE(review): -1 presumably marks the new node as having no entry
          in m_vertices -- confirm.  */
7930 557 : result->vertex = -1;
7931 :
7932 557 : auto &lane_perm = SLP_TREE_LANE_PERMUTATION (result);
7933 557 : if (tmp_perm.length ())
7934 : {
       /* Parallel form: reuse NODE's inputs with the adjusted permutation.  */
7935 7 : lane_perm.safe_splice (tmp_perm);
7936 7 : SLP_TREE_CHILDREN (result).safe_splice (SLP_TREE_CHILDREN (node));
7937 : }
7938 : else
7939 : {
       /* Serial form: start from the identity selection of input 0 (lane J
          of input 0 for each J), apply the layout change to it, and make
          NODE the only input.  */
7940 550 : lane_perm.create (num_lanes);
7941 1714 : for (unsigned j = 0; j < num_lanes; ++j)
7942 1164 : lane_perm.quick_push ({ 0, j });
7943 550 : if (from_layout_i != 0)
7944 292 : vect_slp_permute (m_perms[from_layout_i], lane_perm, false);
7945 550 : if (to_layout_i != 0)
7946 258 : vect_slp_permute (m_perms[to_layout_i], lane_perm, true);
7947 550 : SLP_TREE_CHILDREN (result).safe_push (node);
7948 : }
       /* The new node holds a reference to each of its children.  */
7949 2232 : for (slp_tree child : SLP_TREE_CHILDREN (result))
7950 561 : child->refcnt++;
7951 557 : }
7952 33157 : m_node_layouts[result_i] = result;
7953 33157 : return result;
7954 : }
7955 :
7956 : /* Apply the chosen vector layouts to the SLP graph.

       This is done in three steps: rearrange each partitioned node's scalar
       statements and its load or lane permutation for the layout chosen for
       its partition; remove load permutations that have become redundant;
       then replace each child of the remaining nodes with a version that has
       the parent's layout.  */
7957 :
7958 : void
7959 10174 : vect_optimize_slp_pass::materialize ()
7960 : {
7961 : /* We no longer need the costs, so avoid having two O(N * P) arrays
7962 : live at the same time. */
7963 10174 : m_partition_layout_costs.release ();
7964 30522 : m_node_layouts.safe_grow_cleared (m_vertices.length () * m_perms.length ());
7965 :
       /* Records the permute nodes that absorbed their inputs' layouts and
          therefore need no child replacement in the final loop.  */
7966 20348 : auto_sbitmap fully_folded (m_vertices.length ());
7967 10174 : bitmap_clear (fully_folded);
7968 156254 : for (unsigned int node_i : m_partitioned_nodes)
7969 : {
7970 125732 : auto &vertex = m_vertices[node_i];
7971 125732 : slp_tree node = vertex.node;
7972 125732 : int layout_i = m_partitions[vertex.partition].layout;
7973 125732 : gcc_assert (layout_i >= 0);
7974 :
7975 : /* Rearrange the scalar statements to match the chosen layout. */
7976 125732 : if (layout_i > 0)
7977 13340 : vect_slp_permute (m_perms[layout_i],
7978 13340 : SLP_TREE_SCALAR_STMTS (node), true);
7979 :
7980 : /* Update load and lane permutations. */
7981 125732 : if (SLP_TREE_PERMUTE_P (node))
7982 : {
7983 : /* First try to absorb the input vector layouts. If that fails,
7984 : force the inputs to have layout LAYOUT_I too. We checked that
7985 : that was possible before deciding to use nonzero output layouts.
7986 : (Note that at this stage we don't really have any guarantee that
7987 : the target supports the original VEC_PERM_EXPR.) */
7988 5283 : auto &perm = SLP_TREE_LANE_PERMUTATION (node);
       /* Try the change on a scratch copy so that PERM is only overwritten
          when the adjusted permutation is accepted.  */
7989 5283 : auto_lane_permutation_t tmp_perm;
7990 5283 : tmp_perm.safe_splice (perm);
7991 5283 : change_vec_perm_layout (node, tmp_perm, -1, layout_i);
7992 5283 : if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
7993 : tmp_perm,
7994 5283 : SLP_TREE_CHILDREN (node),
7995 : false) >= 0)
7996 : {
7997 4916 : if (dump_enabled_p ()
7998 5836 : && !std::equal (tmp_perm.begin (), tmp_perm.end (),
7999 : perm.begin ()))
8000 58 : dump_printf_loc (MSG_NOTE, vect_location,
8001 : "absorbing input layouts into %p\n",
8002 : (void *) node);
8003 27719 : std::copy (tmp_perm.begin (), tmp_perm.end (), perm.begin ());
8004 4916 : bitmap_set_bit (fully_folded, node_i);
8005 : }
8006 : else
8007 : {
8008 : /* Not MSG_MISSED because it would make no sense to users. */
8009 367 : if (dump_enabled_p ())
8010 46 : dump_printf_loc (MSG_NOTE, vect_location,
8011 : "failed to absorb input layouts into %p\n",
8012 : (void *) node);
       /* Fall back to using LAYOUT_I for both the inputs and the output;
          the children are fixed up in the final loop below.  */
8013 367 : change_vec_perm_layout (nullptr, perm, layout_i, layout_i);
8014 : }
8015 5283 : }
8016 : else
8017 : {
8018 120449 : gcc_assert (!SLP_TREE_LANE_PERMUTATION (node).exists ());
8019 120449 : auto &load_perm = SLP_TREE_LOAD_PERMUTATION (node);
8020 120449 : if (layout_i > 0)
8021 : /* ??? When we handle non-bijective permutes the idea
8022 : is that we can force the load-permutation to be
8023 : { min, min + 1, min + 2, ... max }. But then the
8024 : scalar defs might no longer match the lane content
8025 : which means wrong-code with live lane vectorization.
8026 : So we possibly have to have NULL entries for those. */
8027 13237 : vect_slp_permute (m_perms[layout_i], load_perm, true);
8028 : }
8029 : }
8030 :
8031 : /* Do this before any nodes disappear, since it involves a walk
8032 : over the leaves. */
8033 10174 : remove_redundant_permutations ();
8034 :
8035 : /* Replace each child with a correctly laid-out version. */
8036 156254 : for (unsigned int node_i : m_partitioned_nodes)
8037 : {
8038 : /* Skip nodes that have already been handled above. */
8039 125732 : if (bitmap_bit_p (fully_folded, node_i))
8040 4916 : continue;
8041 :
8042 120816 : auto &vertex = m_vertices[node_i];
8043 120816 : int in_layout_i = m_partitions[vertex.partition].layout;
8044 120816 : gcc_assert (in_layout_i >= 0);
8045 :
8046 : unsigned j;
8047 : slp_tree child;
8048 365075 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (vertex.node), j, child)
8049 : {
       /* Null child slots are left as they are.  */
8050 152094 : if (!child)
8051 6016 : continue;
8052 :
8053 146078 : slp_tree new_child = get_result_with_layout (child, in_layout_i);
8054 146078 : if (new_child != child)
8055 : {
       /* Swap the child: NOTE(review): vect_free_slp_tree presumably drops
          this node's reference to the old child -- confirm; the new child
          then gains a reference from this node.  */
8056 2248 : vect_free_slp_tree (child);
8057 2248 : SLP_TREE_CHILDREN (vertex.node)[j] = new_child;
8058 2248 : new_child->refcnt += 1;
8059 : }
8060 : }
8061 : }
8062 10174 : }
8063 :
8064 : /* Elide load permutations that are not necessary. Such permutations might
8065 : be pre-existing, rather than created by the layout optimizations.

       Walks the nodes listed in m_leafs and releases the load permutation of
       each node for which the checks below show it to be unnecessary.  Nodes
       without a load permutation are skipped.  */
8066 :
8067 : void
8068 678936 : vect_optimize_slp_pass::remove_redundant_permutations ()
8069 : {
8070 4476581 : for (unsigned int node_i : m_leafs)
8071 : {
8072 2439773 : slp_tree node = m_vertices[node_i].node;
8073 2439773 : if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
8074 1845292 : continue;
8075 :
8076 : /* In basic block vectorization we allow any subchain of an interleaving
8077 : chain.
8078 : FORNOW: not in loop SLP because of realignment complications. */
8079 594481 : if (is_a <bb_vec_info> (m_vinfo))
8080 : {
       /* The scalar statements form a subchain when every statement after
          the first is non-null, is the DR_GROUP_NEXT_ELEMENT of its
          predecessor and has a DR_GROUP_GAP of 1.  */
8081 154543 : bool subchain_p = true;
8082 : stmt_vec_info next_load_info = NULL;
8083 : stmt_vec_info load_info;
8084 : unsigned j;
8085 154543 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
8086 : {
8087 125999 : if (j != 0
8088 125999 : && (next_load_info != load_info
8089 60303 : || ! load_info
8090 60303 : || DR_GROUP_GAP (load_info) != 1))
8091 : {
8092 : subchain_p = false;
8093 : break;
8094 : }
8095 103788 : next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
8096 : }
8097 50755 : if (subchain_p)
8098 : {
8099 28544 : SLP_TREE_LOAD_PERMUTATION (node).release ();
8100 28544 : continue;
8101 : }
8102 : }
8103 : else
8104 : {
8105 543726 : loop_vec_info loop_vinfo = as_a<loop_vec_info> (m_vinfo);
       /* The load counts as permuted unless vect_load_perm_consecutive_p
          accepts its permutation with a start index of 0.  */
8106 543726 : bool this_load_permuted = !vect_load_perm_consecutive_p (node, 0);
8107 : /* When this isn't a grouped access we know it's single element
8108 : and contiguous. */
8109 543726 : if (!STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (node)[0]))
8110 : {
       /* Non-grouped accesses are never examined further: the permutation
          is released only when the load is not permuted and either the
          vectorization factor is known to be 1 or the node has one lane.  */
8111 422669 : if (!this_load_permuted
8112 422669 : && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
8113 421914 : || SLP_TREE_LANES (node) == 1))
8114 421916 : SLP_TREE_LOAD_PERMUTATION (node).release ();
8115 422669 : continue;
8116 : }
8117 121057 : stmt_vec_info first_stmt_info
8118 121057 : = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
8119 121562 : if (!this_load_permuted
8120 : /* The load requires permutation when unrolling exposes
8121 : a gap either because the group is larger than the SLP
8122 : group-size or because there is a gap between the groups. */
8123 121057 : && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
8124 98665 : || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
8125 140 : && DR_GROUP_GAP (first_stmt_info) == 0)))
8126 : {
8127 505 : SLP_TREE_LOAD_PERMUTATION (node).release ();
8128 505 : continue;
8129 : }
8130 : }
8131 : }
8132 678936 : }
8133 :
8134 : /* Print the partition graph and layout information to the dump file.

       The output lists every layout permutation other than layout 0, then,
       for each partition: its chosen layout, its nodes (with weights), its
       edges to other partitions, and the costs of each candidate layout.  */
8135 :
8136 : void
8137 679 : vect_optimize_slp_pass::dump ()
8138 : {
8139 679 : dump_printf_loc (MSG_NOTE, vect_location,
8140 : "SLP optimize permutations:\n");
       /* Layout 0 is not printed; the listing starts at index 1.  */
8141 1371 : for (unsigned int layout_i = 1; layout_i < m_perms.length (); ++layout_i)
8142 : {
8143 692 : dump_printf_loc (MSG_NOTE, vect_location, " %d: { ", layout_i);
8144 692 : const char *sep = "";
8145 5909 : for (unsigned int idx : m_perms[layout_i])
8146 : {
8147 3833 : dump_printf (MSG_NOTE, "%s%d", sep, idx);
8148 3833 : sep = ", ";
8149 : }
8150 692 : dump_printf (MSG_NOTE, " }\n");
8151 : }
8152 679 : dump_printf_loc (MSG_NOTE, vect_location,
8153 : "SLP optimize partitions:\n");
8154 5659 : for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
8155 : ++partition_i)
8156 : {
8157 4980 : auto &partition = m_partitions[partition_i];
8158 4980 : dump_printf_loc (MSG_NOTE, vect_location, " -------------\n");
8159 4980 : dump_printf_loc (MSG_NOTE, vect_location,
8160 : " partition %d (layout %d):\n",
8161 : partition_i, partition.layout);
       /* First section: the nodes that make up the partition.  */
8162 4980 : dump_printf_loc (MSG_NOTE, vect_location, " nodes:\n");
8163 10196 : for (unsigned int order_i = partition.node_begin;
8164 10196 : order_i < partition.node_end; ++order_i)
8165 : {
8166 5216 : auto &vertex = m_vertices[m_partitioned_nodes[order_i]];
8167 10432 : dump_printf_loc (MSG_NOTE, vect_location, " - %p:\n",
8168 5216 : (void *) vertex.node);
8169 5216 : dump_printf_loc (MSG_NOTE, vect_location,
8170 : " weight: %f\n",
8171 : vertex.weight.to_double ());
       /* The out weight is only printed for vertices with a nonzero
          out-degree.  */
8172 5216 : if (vertex.out_degree)
8173 4083 : dump_printf_loc (MSG_NOTE, vect_location,
8174 : " out weight: %f (degree %d)\n",
8175 : vertex.out_weight.to_double (),
8176 : vertex.out_degree);
8177 5216 : if (SLP_TREE_PERMUTE_P (vertex.node))
8178 506 : dump_printf_loc (MSG_NOTE, vect_location,
8179 : " op: VEC_PERM_EXPR\n");
8180 4710 : else if (auto rep = SLP_TREE_REPRESENTATIVE (vertex.node))
8181 4692 : dump_printf_loc (MSG_NOTE, vect_location,
8182 : " op template: %G", rep->stmt);
8183 : }
       /* Second section: the edges reported by for_each_partition_edge for
          each node.  The bracketed number is the partition index of the
          vertex at the other end of the edge.  */
8184 4980 : dump_printf_loc (MSG_NOTE, vect_location, " edges:\n");
8185 10196 : for (unsigned int order_i = partition.node_begin;
8186 10196 : order_i < partition.node_end; ++order_i)
8187 : {
8188 5216 : unsigned int node_i = m_partitioned_nodes[order_i];
8189 5216 : auto &vertex = m_vertices[node_i];
8190 15726 : auto print_edge = [&](graph_edge *, unsigned int other_node_i)
8191 : {
8192 10510 : auto &other_vertex = m_vertices[other_node_i];
8193 10510 : if (other_vertex.partition < vertex.partition)
8194 5255 : dump_printf_loc (MSG_NOTE, vect_location,
8195 : " - %p [%d] --> %p\n",
8196 5255 : (void *) other_vertex.node,
8197 : other_vertex.partition,
8198 5255 : (void *) vertex.node);
8199 : else
8200 5255 : dump_printf_loc (MSG_NOTE, vect_location,
8201 : " - %p --> [%d] %p\n",
8202 5255 : (void *) vertex.node,
8203 : other_vertex.partition,
8204 5255 : (void *) other_vertex.node);
8205 15726 : };
8206 5216 : for_each_partition_edge (node_i, print_edge);
8207 : }
8208 :
       /* Third section: per-layout costs.  The layout recorded for the
          partition is flagged with "(*)"; layouts marked impossible are
          reported as rejected.  */
8209 15139 : for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
8210 : {
8211 10159 : auto &layout_costs = partition_layout_costs (partition_i, layout_i);
8212 10159 : if (layout_costs.is_possible ())
8213 : {
8214 8380 : dump_printf_loc (MSG_NOTE, vect_location,
8215 : " layout %d:%s\n", layout_i,
8216 8380 : partition.layout == int (layout_i)
8217 : ? " (*)" : "");
       /* The total is recomputed here for display only: input, internal
          and output costs added in series.  */
8218 8380 : slpg_layout_cost combined_cost = layout_costs.in_cost;
8219 8380 : combined_cost.add_serial_cost (layout_costs.internal_cost);
8220 8380 : combined_cost.add_serial_cost (layout_costs.out_cost);
8221 : #define TEMPLATE "{depth: %f, total: %f}"
8222 8380 : dump_printf_loc (MSG_NOTE, vect_location,
8223 : " " TEMPLATE "\n",
8224 : layout_costs.in_cost.depth.to_double (),
8225 : layout_costs.in_cost.total.to_double ());
8226 8380 : dump_printf_loc (MSG_NOTE, vect_location,
8227 : " + " TEMPLATE "\n",
8228 : layout_costs.internal_cost.depth.to_double (),
8229 : layout_costs.internal_cost.total.to_double ());
8230 8380 : dump_printf_loc (MSG_NOTE, vect_location,
8231 : " + " TEMPLATE "\n",
8232 : layout_costs.out_cost.depth.to_double (),
8233 : layout_costs.out_cost.total.to_double ());
8234 8380 : dump_printf_loc (MSG_NOTE, vect_location,
8235 : " = " TEMPLATE "\n",
8236 : combined_cost.depth.to_double (),
8237 : combined_cost.total.to_double ());
8238 : #undef TEMPLATE
8239 : }
8240 : else
8241 1779 : dump_printf_loc (MSG_NOTE, vect_location,
8242 : " layout %d: rejected\n", layout_i);
8243 : }
8244 : }
8245 679 : }
8246 :
8247 : /* Masked load lanes discovery. */
/* Scans every vertex for an internal, non-permute node whose representative
   is a grouped IFN_MASK_LOAD call.  The group leader must not be strided,
   compare_step_with_zero must be positive and vect_load_lanes_supported
   must not return IFN_LAST.  The node's first child (the mask) then has to
   be a single-child permute whose lane permutation entries are all (0, 0),
   and every predecessor of the node has to be a single-lane permute.  When
   all of that holds, ldst_lanes is set on the node and on its predecessors
   and the mask permute is replaced by its only child.  */
8248 :
8249 : void
8250 678936 : vect_optimize_slp_pass::decide_masked_load_lanes ()
8251 : {
8252 6973451 : for (auto v : m_vertices)
8253 : {
8254 4936643 : slp_tree node = v.node;
/* Only internal, non-permute nodes are candidates.  */
8255 4936643 : if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
8256 3469568 : || SLP_TREE_PERMUTE_P (node))
8257 1604413 : continue;
8258 3332230 : stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
8259 1637392 : if (! STMT_VINFO_GROUPED_ACCESS (stmt_info)
8260 : /* The mask has to be uniform. */
8261 971442 : || STMT_VINFO_SLP_VECT_ONLY (stmt_info)
8262 971301 : || ! is_a <gcall *> (STMT_VINFO_STMT (stmt_info))
8263 3332315 : || ! gimple_call_internal_p (STMT_VINFO_STMT (stmt_info),
8264 : IFN_MASK_LOAD))
8265 3332197 : continue;
/* The remaining checks are made on the first element of the group.  */
8266 33 : stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
8267 66 : if (STMT_VINFO_STRIDED_P (stmt_info)
8268 33 : || compare_step_with_zero (m_vinfo, stmt_info) <= 0
8269 63 : || vect_load_lanes_supported (SLP_TREE_VECTYPE (node),
8270 30 : DR_GROUP_SIZE (stmt_info),
8271 : true) == IFN_LAST)
8272 33 : continue;
8273 :
8274 : /* Uniform masks need to be suitably represented. */
8275 0 : slp_tree mask = SLP_TREE_CHILDREN (node)[0];
8276 0 : if (!SLP_TREE_PERMUTE_P (mask)
8277 0 : || SLP_TREE_CHILDREN (mask).length () != 1)
8278 0 : continue;
/* Every lane of the mask permute must select lane 0 of operand 0.  */
8279 0 : bool match = true;
8280 0 : for (auto perm : SLP_TREE_LANE_PERMUTATION (mask))
8281 0 : if (perm.first != 0 || perm.second != 0)
8282 : {
8283 : match = false;
8284 : break;
8285 : }
8286 0 : if (!match)
8287 0 : continue;
8288 :
8289 : /* Now see if the consumer side matches. */
8290 0 : for (graph_edge *pred = m_slpg->vertices[node->vertex].pred;
8291 0 : pred; pred = pred->pred_next)
8292 : {
8293 0 : slp_tree pred_node = m_vertices[pred->src].node;
8294 : /* All consumers should be a permute with a single outgoing lane. */
8295 0 : if (!SLP_TREE_PERMUTE_P (pred_node)
8296 0 : || SLP_TREE_LANES (pred_node) != 1)
8297 : {
8298 : match = false;
8299 : break;
8300 : }
8301 0 : gcc_assert (SLP_TREE_CHILDREN (pred_node).length () == 1);
8302 : }
8303 0 : if (!match)
8304 0 : continue;
8305 : /* Now we can mark the nodes as to use load lanes. */
8306 0 : node->ldst_lanes = true;
8307 0 : for (graph_edge *pred = m_slpg->vertices[node->vertex].pred;
8308 0 : pred; pred = pred->pred_next)
8309 0 : m_vertices[pred->src].node->ldst_lanes = true;
8310 : /* The catch is we have to massage the mask. We have arranged
8311 : analyzed uniform masks to be represented by a splat VEC_PERM
8312 : which we can now simply elide as we cannot easily re-do SLP
8313 : discovery here. */
8314 0 : slp_tree new_mask = SLP_TREE_CHILDREN (mask)[0];
/* Take a reference on the replacement before the old mask node is
   freed.  */
8315 0 : SLP_TREE_REF_COUNT (new_mask)++;
8316 0 : SLP_TREE_CHILDREN (node)[0] = new_mask;
8317 0 : vect_free_slp_tree (mask);
8318 : }
8319 678936 : }
8320 :
8321 : /* Perform legitimizing attempts. This is intended to improve the
8322 : situation when layout 0 is not valid which is a situation the cost
8323 : based propagation does not handle well.
8324 : Return true if further layout optimization is possible, false if
8325 : the layout configuration should be considered final. */
/* In terms of the code below: true is returned as soon as the common
   layout candidate becomes 0 or is_compatible_layout rejects the candidate
   for some partition, and also when no candidate greater than 0 was found.
   False is returned only after every partition's layout has been set to
   the single candidate.  */
8326 :
8327 : bool
8328 10174 : vect_optimize_slp_pass::legitimize ()
8329 : {
8330 : /* Perform a very simple legitimizing attempt by attempting to choose
8331 : a single layout for all partitions that will make all permutations
8332 : a noop. That should also be the optimal layout choice in case
8333 : layout zero is legitimate.
8334 : ??? Disconnected components of the SLP graph could have distinct
8335 : single layouts. */
/* SINGLE_LAYOUT_I is the candidate common layout; it stays -1 while only
   partitions with layout -1 have been seen.  DEFERRED_UP_TO is the index
   of the partition visited last while the candidate was still -1;
   partitions before it were not passed to is_compatible_layout in the
   first loop and are checked separately afterwards.  */
8336 10174 : int single_layout_i = -1;
8337 10174 : unsigned deferred_up_to = -1U;
8338 29607 : for (unsigned partition_i = 0; partition_i < m_partitions.length ();
8339 : ++partition_i)
8340 : {
8341 24649 : auto &partition = m_partitions[partition_i];
8342 24649 : if (single_layout_i == -1)
8343 : {
8344 13544 : single_layout_i = partition.layout;
8345 13544 : deferred_up_to = partition_i;
8346 : }
8347 11105 : else if (partition.layout == single_layout_i || partition.layout == -1)
8348 : ;
8349 : else
/* Two partitions disagree on a layout; fall back to candidate 0.  */
8350 : single_layout_i = 0;
8351 21900 : if (single_layout_i == 0)
8352 : return true;
8353 :
8354 19497 : if (single_layout_i != -1
8355 19497 : && !is_compatible_layout (partition, single_layout_i))
8356 : return true;
8357 : }
8358 :
8359 4958 : if (single_layout_i <= 0)
8360 : return true;
8361 :
/* Check the partitions that were visited before a candidate existed.  */
8362 5066 : for (unsigned partition_i = 0; partition_i < deferred_up_to; ++partition_i)
8363 111 : if (!is_compatible_layout (m_partitions[partition_i],
8364 : single_layout_i))
8365 : return true;
8366 :
/* Commit the single layout to every partition.  */
8367 12503 : for (unsigned partition_i = 0; partition_i < m_partitions.length ();
8368 : ++partition_i)
8369 : {
8370 7548 : auto &partition = m_partitions[partition_i];
8371 7548 : partition.layout = single_layout_i;
8372 : }
8373 :
8374 : return false;
8375 : }
8376 :
8377 : /* Main entry point for the SLP graph optimization pass. */
/* Builds the graph, creates partitions and starts choosing layouts.  When
   m_perms holds more than one entry, legitimize () decides whether the
   forward and backward passes run before the result is materialized and
   all entries of m_perms are released; otherwise only
   remove_redundant_permutations () runs.  Afterwards the graph is freed,
   rebuilt for decide_masked_load_lanes () and freed again.  */
8378 :
8379 : void
8380 678936 : vect_optimize_slp_pass::run ()
8381 : {
8382 678936 : build_graph ();
8383 678936 : create_partitions ();
8384 678936 : start_choosing_layouts ();
8385 678936 : if (m_perms.length () > 1)
8386 : {
/* legitimize () returning false means the layouts are final and the
   propagation passes are skipped.  */
8387 10174 : if (legitimize ())
8388 : {
8389 5219 : forward_pass ();
8390 5219 : backward_pass ();
8391 : }
8392 10174 : if (dump_enabled_p ())
8393 679 : dump ();
8394 10174 : materialize ();
/* Release the storage of every recorded permutation.  */
8395 41113 : while (!m_perms.is_empty ())
8396 20765 : m_perms.pop ().release ();
8397 : }
8398 : else
8399 668762 : remove_redundant_permutations ();
/* Rebuild the graph before looking for masked load-lanes candidates.  */
8400 678936 : free_graph (m_slpg);
8401 678936 : build_graph ();
8402 678936 : decide_masked_load_lanes ();
8403 678936 : free_graph (m_slpg);
8404 678936 : }
8405 :
8406 : /* Apply CSE to NODE and its children using BST_MAP. */
/* NODE is taken by reference: when BST_MAP already holds a different node
   for the same scalar stmts, NODE is freed and replaced by that leader,
   whose reference count is incremented.  Only internal nodes with a
   non-empty scalar stmt vector take part in the lookup; all other nodes
   are merely recursed into.  */
8407 :
8408 : static void
8409 5334030 : vect_cse_slp_nodes (scalar_stmts_to_slp_tree_map_t *bst_map, slp_tree& node)
8410 : {
/* PUT_P records that a placeholder entry for NODE was added to BST_MAP and
   has to be filled in after the recursion.  */
8411 5334030 : bool put_p = false;
8412 5334030 : if (SLP_TREE_DEF_TYPE (node) == vect_internal_def
8413 : /* Besides some VEC_PERM_EXPR, two-operator nodes also
8414 : lack scalar stmts and thus CSE doesn't work via bst_map. Ideally
8415 : we'd have sth that works for all internal and external nodes. */
8416 5334030 : && !SLP_TREE_SCALAR_STMTS (node).is_empty ())
8417 : {
8418 3841154 : slp_tree *leader = bst_map->get (SLP_TREE_SCALAR_STMTS (node));
8419 3841154 : if (leader)
8420 : {
8421 : /* We've visited this node already. */
/* A null entry is the placeholder stored below before the recursion.  */
8422 399947 : if (!*leader || *leader == node)
8423 : return;
8424 :
8425 2809 : if (dump_enabled_p ())
8426 924 : dump_printf_loc (MSG_NOTE, vect_location,
8427 : "re-using SLP tree %p for %p\n",
8428 : (void *)*leader, (void *)node);
8429 2809 : vect_free_slp_tree (node);
8430 2809 : (*leader)->refcnt += 1;
8431 2809 : node = *leader;
8432 2809 : return;
8433 : }
8434 :
8435 : /* Avoid creating a cycle by populating the map only after recursion. */
8436 3441207 : bst_map->put (SLP_TREE_SCALAR_STMTS (node).copy (), nullptr);
/* NOTE(review): presumably this reference belongs to the map entry and is
   dropped when the map is released - confirm against
   release_scalar_stmts_to_slp_tree_map.  */
8437 3441207 : node->refcnt += 1;
8438 3441207 : put_p = true;
8439 : /* And recurse. */
8440 : }
8441 :
8442 14746424 : for (slp_tree &child : SLP_TREE_CHILDREN (node))
8443 4303917 : if (child)
8444 3872549 : vect_cse_slp_nodes (bst_map, child);
8445 :
8446 : /* Now record the node for CSE in other siblings. */
8447 4934083 : if (put_p)
8448 3441207 : *bst_map->get (SLP_TREE_SCALAR_STMTS (node)) = node;
8449 : }
8450 :
8451 : /* Optimize the SLP graph of VINFO. */
/* Does nothing when VINFO has no SLP instances.  Otherwise runs
   vect_optimize_slp_pass on VINFO and then applies vect_cse_slp_nodes to
   the tree of every instance, sharing one freshly allocated map that is
   released before returning.  */
8452 :
8453 : void
8454 1025600 : vect_optimize_slp (vec_info *vinfo)
8455 : {
8456 1025600 : if (vinfo->slp_instances.is_empty ())
8457 : return;
8458 678936 : vect_optimize_slp_pass (vinfo).run ();
8459 :
8460 : /* Apply CSE again to nodes after permute optimization. */
8461 678936 : scalar_stmts_to_slp_tree_map_t *bst_map
8462 678936 : = new scalar_stmts_to_slp_tree_map_t ();
8463 :
8464 3498289 : for (auto inst : vinfo->slp_instances)
8465 1461481 : vect_cse_slp_nodes (bst_map, SLP_INSTANCE_TREE (inst));
8466 :
8467 678936 : release_scalar_stmts_to_slp_tree_map (bst_map);
8468 : }
8469 :
8470 : /* Gather loads reachable from the individual SLP graph entries. */
/* For each SLP instance of VINFO this calls the three-argument
   vect_gather_slp_loads overload on the instance's tree, passing the
   instance's SLP_INSTANCE_LOADS and a visited set that is created afresh
   for every instance.  */
8471 :
8472 : void
8473 1025600 : vect_gather_slp_loads (vec_info *vinfo)
8474 : {
8475 1025600 : unsigned i;
8476 1025600 : slp_instance instance;
8477 2487081 : FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
8478 : {
8479 1461481 : hash_set<slp_tree> visited;
8480 1461481 : vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
8481 : SLP_INSTANCE_TREE (instance), visited);
8482 1461481 : }
8483 1025600 : }
8484 :
8485 : /* For NODE update VF based on the number of lanes and the vector types
8486 : used. */
/* Null nodes, nodes that are not vect_internal_def and nodes already in
   VISITED are skipped.  Children are processed before NODE itself.  VF is
   updated in place to the force_common_multiple of its previous value and
   the unrolling factor computed from NODE's max_nunits and lane count.  */
8487 :
8488 : static void
8489 4228395 : vect_update_slp_vf_for_node (slp_tree node, poly_uint64 &vf,
8490 : hash_set<slp_tree> &visited)
8491 : {
8492 4228395 : if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
8493 1519723 : return;
8494 3072818 : if (visited.add (node))
8495 : return;
8496 :
8497 10272516 : for (slp_tree child : SLP_TREE_CHILDREN (node))
8498 3468638 : vect_update_slp_vf_for_node (child, vf, visited);
8499 :
8500 : /* We do not visit SLP nodes for constants or externals - those neither
8501 : have a vector type set yet (vectorizable_* does this) nor do they
8502 : have max_nunits set. Instead we rely on internal nodes max_nunit
8503 : to cover constant/external operands.
8504 : Note that when we stop using fixed size vectors externs and constants
8505 : shouldn't influence the (minimum) vectorization factor, instead
8506 : vectorizable_* should honor the vectorization factor when trying to
8507 : assign vector types to constants and externals and cause iteration
8508 : to a higher vectorization factor when required. */
8509 2708672 : poly_uint64 node_vf
8510 2708672 : = calculate_unrolling_factor (node->max_nunits, SLP_TREE_LANES (node));
8511 2708672 : vf = force_common_multiple (vf, node_vf);
8512 :
8513 : /* For permute nodes that are fed from externs or constants we have to
8514 : consider their number of lanes as well. Likewise for store-lanes. */
/* The child's lane count is combined with NODE's max_nunits here, not with
   a value taken from the child.  */
8515 2708672 : if (SLP_TREE_PERMUTE_P (node) || node->ldst_lanes)
8516 710178 : for (slp_tree child : SLP_TREE_CHILDREN (node))
8517 191136 : if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
8518 : {
8519 3585 : poly_uint64 child_vf
8520 3585 : = calculate_unrolling_factor (node->max_nunits,
8521 : SLP_TREE_LANES (child));
8522 3585 : vf = force_common_multiple (vf, child_vf);
8523 : }
8524 : }
8525 :
8526 : /* For each possible SLP instance decide whether to SLP it and calculate overall
8527 : unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
8528 : least one instance. */
/* The unrolling factor is accumulated over all instances with
   vect_update_slp_vf_for_node, sharing one visited set, and is stored in
   LOOP_VINFO_VECT_FACTOR regardless of the return value.  An instance
   counts as decided only when its root has a vector type with more than
   one subpart.  */
8529 :
8530 : bool
8531 473646 : vect_make_slp_decision (loop_vec_info loop_vinfo)
8532 : {
8533 473646 : unsigned int i;
8534 473646 : poly_uint64 unrolling_factor = 1;
8535 473646 : const vec<slp_instance> &slp_instances
8536 : = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
8537 473646 : slp_instance instance;
8538 473646 : int decided_to_slp = 0;
8539 :
8540 473646 : DUMP_VECT_SCOPE ("vect_make_slp_decision");
8541 :
8542 473646 : hash_set<slp_tree> visited;
8543 1233403 : FOR_EACH_VEC_ELT (slp_instances, i, instance)
8544 : {
8545 759757 : slp_tree root = SLP_INSTANCE_TREE (instance);
8546 :
8547 : /* All unroll factors have the form:
8548 :
8549 : GET_MODE_SIZE (vinfo->vector_mode) * X
8550 :
8551 : for some rational X, so they must have a common multiple. */
8552 759757 : vect_update_slp_vf_for_node (root, unrolling_factor, visited);
8553 :
8554 : /* If all instances ended up with vector(1) T roots make sure to
8555 : not vectorize. RVV for example relies on loop vectorization
8556 : when some instances are essentially kept scalar. See PR121048. */
8557 759757 : if (SLP_TREE_VECTYPE (root)
8558 759757 : && known_gt (TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (root)), 1U))
8559 621581 : decided_to_slp++;
8560 : }
8561 :
8562 473646 : LOOP_VINFO_VECT_FACTOR (loop_vinfo) = unrolling_factor;
8563 :
8564 473646 : if (decided_to_slp && dump_enabled_p ())
8565 : {
8566 19116 : dump_printf_loc (MSG_NOTE, vect_location,
8567 : "Decided to SLP %d instances. Unrolling factor ",
8568 : decided_to_slp);
8569 19116 : dump_dec (MSG_NOTE, unrolling_factor);
8570 19116 : dump_printf (MSG_NOTE, "\n");
8571 : }
8572 :
8573 473646 : return (decided_to_slp > 0);
8574 473646 : }
8575 :
8576 : /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
/* The block array is taken from _BBS.address () without copying.  Every
   PHI of the blocks other than bbs[0] and every statement of all blocks
   gets its UID set to 0; PHIs and non-debug statements are additionally
   registered with add_stmt.  */
8577 :
8578 2183721 : _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
8579 : : vec_info (vec_info::bb, shared),
8580 2183721 : roots (vNULL)
8581 : {
8582 : /* The region we are operating on. bbs[0] is the entry, excluding
8583 : its PHI nodes. In the future we might want to track an explicit
8584 : entry edge to cover bbs[0] PHI nodes and have a region entry
8585 : insert location. */
8586 2183721 : bbs = _bbs.address ();
8587 2183721 : nbbs = _bbs.length ();
8588 :
8589 17491773 : for (unsigned i = 0; i < nbbs; ++i)
8590 : {
/* PHIs of the entry block bbs[0] are not part of the region.  */
8591 15308052 : if (i != 0)
8592 19926052 : for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
8593 6801721 : gsi_next (&si))
8594 : {
8595 6801721 : gphi *phi = si.phi ();
8596 6801721 : gimple_set_uid (phi, 0);
8597 6801721 : add_stmt (phi);
8598 : }
8599 30616104 : for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
8600 134643934 : !gsi_end_p (gsi); gsi_next (&gsi))
8601 : {
8602 119335882 : gimple *stmt = gsi_stmt (gsi);
8603 119335882 : gimple_set_uid (stmt, 0);
/* Debug statements get the UID but are not added.  */
8604 119335882 : if (is_gimple_debug (stmt))
8605 74311225 : continue;
8606 45024657 : add_stmt (stmt);
8607 : }
8608 : }
8609 2183721 : }
8610 :
8611 :
8612 : /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
8613 : stmts in the basic block. */
/* In this body: the UID of every PHI (blocks other than bbs[0]) and of
   every statement in the region is set to -1, then the stmts, roots and
   remain vectors of each entry in ROOTS and ROOTS itself are released.  */
8614 :
8615 2183721 : _bb_vec_info::~_bb_vec_info ()
8616 : {
8617 : /* Reset region marker. */
8618 17491773 : for (unsigned i = 0; i < nbbs; ++i)
8619 : {
8620 15308052 : if (i != 0)
8621 19941839 : for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
8622 6817508 : gsi_next (&si))
8623 : {
8624 6817508 : gphi *phi = si.phi ();
8625 6817508 : gimple_set_uid (phi, -1);
8626 : }
8627 30616104 : for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
8628 134586827 : !gsi_end_p (gsi); gsi_next (&gsi))
8629 : {
8630 119278775 : gimple *stmt = gsi_stmt (gsi);
8631 119278775 : gimple_set_uid (stmt, -1);
8632 : }
8633 : }
8634 :
/* Release the per-root vectors before the ROOTS vector itself.  */
8635 3428778 : for (unsigned i = 0; i < roots.length (); ++i)
8636 : {
8637 1245057 : roots[i].stmts.release ();
8638 1245057 : roots[i].roots.release ();
8639 1245057 : roots[i].remain.release ();
8640 : }
8641 2183721 : roots.release ();
8642 2183721 : }
8643 :
8644 : /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
8645 : given then that child nodes have already been processed, and that
8646 : their def types currently match their SLP node's def type. */
/* Permute nodes are checked with vectorizable_slp_permutation, then each
   non-null live scalar stmt of NODE is checked with
   vectorizable_live_operation; on success SLP_TREE_TYPE is set to
   permute_info_type.  All other nodes are handed to vect_analyze_stmt.
   Returns false as soon as one of these checks fails.  */
8647 :
8648 : static bool
8649 2784748 : vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
8650 : slp_instance node_instance,
8651 : stmt_vector_for_cost *cost_vec)
8652 : {
8653 : /* Handle purely internal nodes. */
8654 2784748 : if (SLP_TREE_PERMUTE_P (node))
8655 : {
8656 121226 : if (!vectorizable_slp_permutation (vinfo, NULL, node, cost_vec))
8657 : return false;
8658 :
8659 : stmt_vec_info slp_stmt_info;
8660 : unsigned int i;
8661 319278 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
8662 : {
/* Entries of the scalar stmt vector may be null; those are skipped.  */
8663 199339 : if (slp_stmt_info
8664 193775 : && STMT_VINFO_LIVE_P (slp_stmt_info)
8665 199339 : && !vectorizable_live_operation (vinfo, slp_stmt_info, node,
8666 : node_instance, i,
8667 : false, cost_vec))
8668 : return false;
8669 : }
8670 119939 : SLP_TREE_TYPE (node) = permute_info_type;
8671 119939 : return true;
8672 : }
8673 :
8674 2663522 : return vect_analyze_stmt (vinfo, node, node_instance, cost_vec);
8675 : }
8676 :
/* qsort-style comparison callback for ints.  A_ and B_ each point to an
   int.  Returns a negative value, zero or a positive value when *A_ is
   less than, equal to or greater than *B_ respectively.  The result is
   built from two comparisons rather than from the difference A - B,
   because that subtraction is undefined behavior when it overflows
   (for example INT_MIN - 1).  */

static int
sort_ints (const void *a_, const void *b_)
{
  int a = *(const int *)a_;
  int b = *(const int *)b_;
  return (a > b) - (a < b);
}
8684 :
8685 : /* Verify if we can externalize a set of internal defs. */
/* Returns false if any entry of STMTS is null and true if all stmts are in
   the same basic block.  Otherwise the distinct block indexes are sorted:
   with exactly two blocks the dominated_by_p query must hold in one of the
   two directions, with more than two it must hold for each block against
   its predecessor in index order.  */
8686 :
8687 : static bool
8688 371746 : vect_slp_can_convert_to_external (const vec<stmt_vec_info> &stmts)
8689 : {
8690 : /* Constant generation uses get_later_stmt which can only handle
8691 : defs from the same BB or a set of defs that can be ordered
8692 : with a dominance query. */
8693 371746 : basic_block bb = NULL;
8694 371746 : bool all_same = true;
8695 371746 : auto_vec<int> bbs;
8696 743492 : bbs.reserve_exact (stmts.length ());
8697 2004470 : for (stmt_vec_info stmt : stmts)
8698 : {
8699 889232 : if (!stmt)
8700 : return false;
8701 889232 : else if (!bb)
8702 371746 : bb = gimple_bb (stmt->stmt);
8703 517486 : else if (gimple_bb (stmt->stmt) != bb)
8704 172873 : all_same = false;
8705 889232 : bbs.quick_push (gimple_bb (stmt->stmt)->index);
8706 : }
8707 371746 : if (all_same)
8708 : return true;
8709 :
8710 : /* Produce a vector of unique BB indexes for the defs. */
/* After sorting, equal indexes are adjacent; compact them in place.  J is
   the number of unique indexes kept so far.  */
8711 129106 : bbs.qsort (sort_ints);
8712 : unsigned i, j;
8713 315038 : for (i = 1, j = 1; i < bbs.length (); ++i)
8714 185932 : if (bbs[i] != bbs[j-1])
8715 137766 : bbs[j++] = bbs[i];
8716 129106 : gcc_assert (j >= 2);
8717 129106 : bbs.truncate (j);
8718 :
8719 258212 : if (bbs.length () == 2)
8720 125690 : return (dominated_by_p (CDI_DOMINATORS,
8721 125690 : BASIC_BLOCK_FOR_FN (cfun, bbs[0]),
8722 125690 : BASIC_BLOCK_FOR_FN (cfun, bbs[1]))
8723 245767 : || dominated_by_p (CDI_DOMINATORS,
8724 120077 : BASIC_BLOCK_FOR_FN (cfun, bbs[1]),
8725 120077 : BASIC_BLOCK_FOR_FN (cfun, bbs[0])));
8726 :
8727 : /* ??? For more than two BBs we can sort the vector and verify the
8728 : result is a total order. But we can't use vec::qsort with a
8729 : compare function using a dominance query since there's no way to
8730 : signal failure and any fallback for an unordered pair would
8731 : fail qsort_chk later.
8732 : For now simply hope that ordering after BB index provides the
8733 : best candidate total order. If required we can implement our
8734 : own mergesort or export an entry without checking. */
8735 387185 : for (unsigned i = 1; i < bbs.length (); ++i)
8736 12052 : if (!dominated_by_p (CDI_DOMINATORS,
8737 12052 : BASIC_BLOCK_FOR_FN (cfun, bbs[i]),
8738 12052 : BASIC_BLOCK_FOR_FN (cfun, bbs[i-1])))
8739 : return false;
8740 :
8741 : return true;
8742 371746 : }
8743 :
8744 : /* Try to build NODE from scalars, returning true on success.
8745 : NODE_INSTANCE is the SLP instance that contains NODE. */
/* The conversion is only attempted when VINFO is a bb_vec_info, NODE is not
   the root of NODE_INSTANCE, NODE has a scalar stmt vector for which
   vect_contains_pattern_stmt_p is false, its vector type is not a vector
   boolean type and vect_slp_can_convert_to_external accepts the stmts.
   On success NODE becomes vect_external_def, its vector type is cleared,
   its load permutation is released and SLP_TREE_SCALAR_OPS is filled with
   the lhs of each original scalar stmt.  */
8746 :
8747 : static bool
8748 559858 : vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
8749 : slp_instance node_instance)
8750 : {
8751 559858 : stmt_vec_info stmt_info;
8752 559858 : unsigned int i;
8753 :
8754 559858 : if (!is_a <bb_vec_info> (vinfo)
8755 69178 : || node == SLP_INSTANCE_TREE (node_instance)
8756 20394 : || !SLP_TREE_SCALAR_STMTS (node).exists ()
8757 20353 : || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node))
8758 : /* Force the mask use to be built from scalars instead. */
8759 18326 : || VECTOR_BOOLEAN_TYPE_P (SLP_TREE_VECTYPE (node))
8760 578037 : || !vect_slp_can_convert_to_external (SLP_TREE_SCALAR_STMTS (node)))
8761 541679 : return false;
8762 :
8763 18179 : if (dump_enabled_p ())
8764 74 : dump_printf_loc (MSG_NOTE, vect_location,
8765 : "Building vector operands of %p from scalars instead\n",
8766 : (void *) node);
8767 :
8768 : /* Don't remove and free the child nodes here, since they could be
8769 : referenced by other structures. The analysis and scheduling phases
8770 : (need to) ignore child nodes of anything that isn't vect_internal_def. */
8771 18179 : unsigned int group_size = SLP_TREE_LANES (node);
8772 18179 : SLP_TREE_DEF_TYPE (node) = vect_external_def;
8773 : /* Invariants get their vector type from the uses. */
8774 18179 : SLP_TREE_VECTYPE (node) = NULL_TREE;
8775 18179 : SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
8776 18179 : SLP_TREE_LOAD_PERMUTATION (node).release ();
/* Each scalar operand is the lhs of the original (vect_orig_stmt) scalar
   stmt of the corresponding lane.  */
8777 63951 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
8778 : {
8779 45772 : tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
8780 45772 : SLP_TREE_SCALAR_OPS (node)[i] = lhs;
8781 : }
8782 : return true;
8783 : }
8784 :
8785 : /* Return true if all elements of the slice are the same. */
/* Elements 1 .. length - 1 are compared against element 0 with
   operand_equal_p, so a slice of length 0 or 1 yields true.  */
8786 : bool
8787 472141 : vect_scalar_ops_slice::all_same_p () const
8788 : {
8789 519104 : for (unsigned int i = 1; i < length; ++i)
8790 437519 : if (!operand_equal_p (op (0), op (i)))
8791 : return false;
8792 : return true;
8793 : }
8794 :
/* Hash the slice S by folding iterative_hash_expr over its elements in
   order, starting from 0.  An empty slice hashes to 0.  */
8795 : hashval_t
8796 403915 : vect_scalar_ops_slice_hash::hash (const value_type &s)
8797 : {
8798 403915 : hashval_t hash = 0;
8799 1553131 : for (unsigned i = 0; i < s.length; ++i)
8800 1149216 : hash = iterative_hash_expr (s.op (i), hash);
8801 403915 : return hash;
8802 : }
8803 :
/* Return true if slices S1 and S2 have the same length and operand_equal_p
   holds for every pair of elements at the same position.  */
8804 : bool
8805 219272 : vect_scalar_ops_slice_hash::equal (const value_type &s1,
8806 : const compare_type &s2)
8807 : {
8808 219272 : if (s1.length != s2.length)
8809 : return false;
8810 383515 : for (unsigned i = 0; i < s1.length; ++i)
8811 333855 : if (!operand_equal_p (s1.op (i), s2.op (i)))
8812 : return false;
8813 : return true;
8814 : }
8815 :
8816 : /* Compute the prologue cost for invariant or constant operands represented
8817 : by NODE. */
/* One cost entry is recorded in COST_VEC (as vect_prologue) per start
   offset collected in STARTS: vector_load for constant nodes,
   scalar_to_vec when all scalar operands of the slice are the same and
   vec_construct otherwise.  Nothing is recorded for a node without scalar
   operands that already has vector defs.  */
8818 :
8819 : static void
8820 1099348 : vect_prologue_cost_for_slp (vec_info *vinfo, slp_tree node,
8821 : stmt_vector_for_cost *cost_vec)
8822 : {
8823 : /* There's a special case of an existing vector, that costs nothing. */
8824 1099348 : if (SLP_TREE_SCALAR_OPS (node).length () == 0
8825 1099348 : && !SLP_TREE_VEC_DEFS (node).is_empty ())
8826 1425 : return;
8827 : /* Without looking at the actual initializer a vector of
8828 : constants can be implemented as load from the constant pool.
8829 : When all elements are the same we can use a splat. */
8830 1097923 : tree vectype = SLP_TREE_VECTYPE (node);
8831 1097923 : unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
8832 1097923 : unsigned HOST_WIDE_INT const_nunits;
8833 1097923 : unsigned nelt_limit;
8834 1097923 : unsigned nvectors = vect_get_num_copies (vinfo, node);
8835 1097923 : auto ops = &SLP_TREE_SCALAR_OPS (node);
/* STARTS holds the first scalar-operand index of each vector that gets
   costed; NELT_LIMIT is the number of operands per costed slice.  */
8836 1097923 : auto_vec<unsigned int> starts (nvectors);
8837 1097923 : if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
8838 1097923 : && ! multiple_p (const_nunits, group_size))
8839 : {
8840 64056 : nelt_limit = const_nunits;
/* Only slices for which vector_ops.add returns false are costed -
   presumably those not seen before in this node.  */
8841 64056 : hash_set<vect_scalar_ops_slice_hash> vector_ops;
8842 266282 : for (unsigned int i = 0; i < nvectors; ++i)
8843 202226 : if (!vector_ops.add ({ ops, i * nelt_limit, nelt_limit }))
8844 152566 : starts.quick_push (i * nelt_limit);
8845 64056 : }
8846 : else
8847 : {
8848 : /* If either the vector has variable length or the vectors
8849 : are composed of repeated whole groups we only need to
8850 : cost construction once. All vectors will be the same. */
8851 1033867 : nelt_limit = group_size;
8852 1033867 : starts.quick_push (0);
8853 : }
8854 : /* ??? We're just tracking whether vectors in a single node are the same.
8855 : Ideally we'd do something more global. */
/* PASSED becomes true once NODE has been handed to record_stmt_cost for a
   non-vector_load entry.  */
8856 1097923 : bool passed = false;
8857 4480202 : for (unsigned int start : starts)
8858 : {
8859 1186433 : vect_cost_for_stmt kind;
8860 1186433 : if (SLP_TREE_DEF_TYPE (node) == vect_constant_def)
8861 : kind = vector_load;
8862 472141 : else if (vect_scalar_ops_slice { ops, start, nelt_limit }.all_same_p ())
8863 : kind = scalar_to_vec;
8864 : else
8865 390556 : kind = vec_construct;
8866 : /* The target cost hook has no idea which part of the SLP node
8867 : we are costing so avoid passing it down more than once. Pass
8868 : it to the first vec_construct or scalar_to_vec part since for those
8869 : the x86 backend tries to account for GPR to XMM register moves. */
8870 1186433 : record_stmt_cost (cost_vec, 1, kind, nullptr,
8871 1186433 : (kind != vector_load && !passed) ? node : nullptr,
8872 : vectype, 0, vect_prologue);
8873 1186433 : if (kind != vector_load)
8874 472141 : passed = true;
8875 : }
8876 1097923 : }
8877 :
8878 : /* Analyze statements contained in SLP tree NODE after recursively analyzing
8879 : the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
8880 :
8881 : Return true if the operations are supported. */
/* VISITED_SET caches nodes that were already analyzed and VISITED_VEC
   records the order in which they were added.  When the analysis of NODE
   fails, every entry pushed to VISITED_VEC since NODE itself (inclusive)
   is removed from VISITED_SET again and COST_VEC is truncated to the
   length it had on entry.  A failed node can still yield true when
   vect_slp_convert_to_external manages to turn it into an external
   def.  */
8882 :
8883 : static bool
8884 5149053 : vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
8885 : slp_instance node_instance,
8886 : hash_set<slp_tree> &visited_set,
8887 : vec<slp_tree> &visited_vec,
8888 : stmt_vector_for_cost *cost_vec)
8889 : {
8890 5149053 : int i, j;
8891 5149053 : slp_tree child;
8892 :
8893 : /* Assume we can code-generate all invariants. */
8894 5149053 : if (!node
8895 4774933 : || SLP_TREE_DEF_TYPE (node) == vect_constant_def
8896 4008320 : || SLP_TREE_DEF_TYPE (node) == vect_external_def)
8897 : return true;
8898 :
8899 3465867 : if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
8900 : {
8901 5 : if (dump_enabled_p ())
8902 0 : dump_printf_loc (MSG_NOTE, vect_location,
8903 : "Failed cyclic SLP reference in %p\n", (void *) node);
8904 5 : return false;
8905 : }
8906 3465862 : gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
8907 :
8908 : /* If we already analyzed the exact same set of scalar stmts we're done.
8909 : We share the generated vector stmts for those. */
8910 3465862 : if (visited_set.add (node))
8911 : return true;
8912 3096249 : visited_vec.safe_push (node);
8913 :
8914 3096249 : bool res = true;
/* Both marks are taken after NODE was pushed, so the rollback loop below
   (which uses >=) also pops NODE itself.  */
8915 3096249 : unsigned visited_rec_start = visited_vec.length ();
8916 3096249 : unsigned cost_vec_rec_start = cost_vec->length ();
8917 3096249 : bool seen_non_constant_child = false;
8918 6631867 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
8919 : {
8920 3846905 : res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
8921 : visited_set, visited_vec,
8922 : cost_vec);
8923 3846905 : if (!res)
8924 : break;
8925 3535618 : if (child && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
8926 3535618 : seen_non_constant_child = true;
8927 : }
8928 : /* We're having difficulties scheduling nodes with just constant
8929 : operands and no scalar stmts since we then cannot compute a stmt
8930 : insertion place. */
8931 3096249 : if (res
8932 3096249 : && !seen_non_constant_child
8933 3096249 : && SLP_TREE_SCALAR_STMTS (node).is_empty ())
8934 : {
8935 214 : if (dump_enabled_p ())
8936 6 : dump_printf_loc (MSG_NOTE, vect_location,
8937 : "Cannot vectorize all-constant op node %p\n",
8938 : (void *) node);
8939 : res = false;
8940 : }
8941 :
8942 3096035 : if (res)
8943 2784748 : res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
8944 : cost_vec);
8945 : /* If analysis failed we have to pop all recursive visited nodes
8946 : plus ourselves. */
8947 3096249 : if (!res)
8948 : {
8949 2803354 : while (visited_vec.length () >= visited_rec_start)
8950 841819 : visited_set.remove (visited_vec.pop ());
8951 559858 : cost_vec->truncate (cost_vec_rec_start);
8952 : }
8953 :
8954 : /* When the node can be vectorized cost invariant nodes it references.
8955 : This is not done in DFS order to allow the referring node
8956 : vectorizable_* calls to nail down the invariant nodes vector type
8957 : and possibly unshare it if it needs a different vector type than
8958 : other referrers. */
8959 3096249 : if (res)
8960 5759733 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
8961 3223342 : if (child
8962 2915581 : && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
8963 2915581 : || SLP_TREE_DEF_TYPE (child) == vect_external_def)
8964 : /* Perform usual caching, note code-generation still
8965 : code-gens these nodes multiple times but we expect
8966 : to CSE them later. */
8967 4409206 : && !visited_set.add (child))
8968 : {
8969 1142461 : visited_vec.safe_push (child);
8970 : /* ??? After auditing more code paths make a "default"
8971 : and push the vector type from NODE to all children
8972 : if it is not already set. */
8973 : /* Compute the number of vectors to be generated. */
8974 1142461 : tree vector_type = SLP_TREE_VECTYPE (child);
/* Children without a vector type are not costed: either the masked-load
   else operand case or the scalar shift argument case below.  */
8975 1142461 : if (!vector_type)
8976 : {
8977 : /* Masked loads can have an undefined (default SSA definition)
8978 : else operand. We do not need to cost it. */
8979 43113 : vec<tree> ops = SLP_TREE_SCALAR_OPS (child);
8980 44548 : if (SLP_TREE_TYPE (node) == load_vec_info_type
8981 44548 : && ((ops.length ()
8982 1435 : && TREE_CODE (ops[0]) == SSA_NAME
8983 0 : && SSA_NAME_IS_DEFAULT_DEF (ops[0])
8984 0 : && VAR_P (SSA_NAME_VAR (ops[0])))
8985 1435 : || SLP_TREE_DEF_TYPE (child) == vect_constant_def))
8986 1435 : continue;
8987 :
8988 : /* For shifts with a scalar argument we don't need
8989 : to cost or code-generate anything.
8990 : ??? Represent this more explicitly. */
8991 41678 : gcc_assert (SLP_TREE_TYPE (node) == shift_vec_info_type
8992 : && j == 1);
8993 41678 : continue;
8994 41678 : }
8995 :
8996 : /* And cost them. */
8997 1099348 : vect_prologue_cost_for_slp (vinfo, child, cost_vec);
8998 : }
8999 :
9000 : /* If this node or any of its children can't be vectorized, try pruning
9001 : the tree here rather than felling the whole thing. */
9002 559858 : if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
9003 : {
9004 : /* We'll need to revisit this for invariant costing and number
9005 : of vectorized stmt setting. */
9006 : res = true;
9007 : }
9008 :
9009 : return res;
9010 : }
9011 :
9012 : /* Mark lanes of NODE that are live outside of the basic-block vectorized
9013 : region and that can be vectorized using vectorizable_live_operation
9014 : with STMT_VINFO_LIVE_P. Not handled live operations will cause the
9015 : scalar code computing it to be retained. */
9016 :
9017 : static void
9018 909127 : vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
9019 : slp_instance instance,
9020 : stmt_vector_for_cost *cost_vec,
9021 : hash_set<stmt_vec_info> &svisited,
9022 : hash_set<slp_tree> &visited)
9023 : {
9024 909127 : if (visited.add (node))
9025 42449 : return;
9026 :
9027 866678 : unsigned i;
9028 866678 : stmt_vec_info stmt_info;
9029 866678 : stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
9030 3140826 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
9031 : {
9032 2274148 : if (!stmt_info || svisited.contains (stmt_info))
9033 48153 : continue;
9034 2249816 : stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
9035 2249816 : if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
9036 11999 : && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
9037 : /* Only the pattern root stmt computes the original scalar value. */
9038 8933 : continue;
9039 2240883 : if (!PURE_SLP_STMT (orig_stmt_info))
9040 : /* Iff the stmt is not part of the vector coverage because it or
9041 : uses of it are used by SLP graph leafs as extern input there is
9042 : no point in trying to live code-generate from a vector stmt as
9043 : the scalar stmt will survive anyway. */
9044 14888 : continue;
9045 2225995 : bool mark_visited = true;
9046 2225995 : gimple *orig_stmt = orig_stmt_info->stmt;
9047 2225995 : ssa_op_iter op_iter;
9048 2225995 : def_operand_p def_p;
9049 4940717 : FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
9050 : {
9051 : /* We have to verify whether we can insert the lane extract
9052 : before all uses. The following is a conservative approximation.
9053 : We cannot put this into vectorizable_live_operation because
9054 : iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
9055 : doesn't work.
9056 : Note that while the fact that we emit code for loads at the
9057 : first load should make this a non-problem leafs we construct
9058 : from scalars are vectorized after the last scalar def.
9059 : ??? If we'd actually compute the insert location during
9060 : analysis we could use sth less conservative than the last
9061 : scalar stmt in the node for the dominance check. */
9062 : /* ??? What remains is "live" uses in vector CTORs in the same
9063 : SLP graph which is where those uses can end up code-generated
9064 : right after their definition instead of close to their original
9065 : use. But that would restrict us to code-generate lane-extracts
9066 : from the latest stmt in a node. So we compensate for this
9067 : during code-generation, simply not replacing uses for those
9068 : hopefully rare cases. */
9069 488727 : imm_use_iterator use_iter;
9070 488727 : gimple *use_stmt;
9071 488727 : stmt_vec_info use_stmt_info;
9072 :
9073 488727 : bool live_p = false;
9074 488727 : bool can_insert = true;
9075 1889544 : FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
9076 927962 : if (!is_gimple_debug (use_stmt)
9077 927962 : && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
9078 690868 : || !PURE_SLP_STMT (use_stmt_info)))
9079 : {
9080 146178 : live_p = true;
9081 146178 : if (!vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
9082 : {
9083 15872 : if (dump_enabled_p ())
9084 34 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9085 : "Cannot determine insertion place for "
9086 : "lane extract\n");
9087 : can_insert = false;
9088 : break;
9089 : }
9090 488727 : }
9091 488727 : if (live_p && can_insert)
9092 : {
9093 : /* Only record a live stmt when we can replace all uses. We
9094 : record from which SLP tree we vectorize the uses, so we'll
9095 : cost once and can deal with the case that not all SLP nodes
9096 : may be suitable for code-generation of all live uses.
9097 : ??? But we never split up the work between multiple SLP
9098 : nodes. */
9099 64904 : STMT_VINFO_LIVE_P (stmt_info) = true;
9100 64904 : if (!vectorizable_live_operation (bb_vinfo, stmt_info, node,
9101 : instance, i, false, cost_vec))
9102 : {
9103 0 : STMT_VINFO_LIVE_P (stmt_info) = false;
9104 0 : mark_visited = false;
9105 : }
9106 : }
9107 : }
9108 2225995 : if (mark_visited)
9109 2225995 : svisited.add (stmt_info);
9110 : }
9111 :
9112 : slp_tree child;
9113 2499491 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9114 872499 : if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
9115 228578 : vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance, cost_vec,
9116 : svisited, visited);
9117 : }
9118 :
9119 : /* Traverse all slp instances of BB_VINFO, and mark lanes of every node that
9120 : are live outside of the basic-block vectorized region and that can be
9121 : vectorized using vectorizable_live_operation with STMT_VINFO_LIVE_P. */
9122 :
9123 : static void
9124 234430 : vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo)
9125 : {
9126 234430 : if (bb_vinfo->slp_instances.is_empty ())
9127 0 : return;
9128 :
9129 234430 : hash_set<slp_tree> visited;
9130 234430 : hash_set<stmt_vec_info> svisited;
9131 1383839 : for (slp_instance instance : bb_vinfo->slp_instances)
9132 : {
9133 680549 : if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
9134 28766 : STMT_VINFO_LIVE_P (SLP_INSTANCE_ROOT_STMTS (instance)[0]) = true;
9135 680549 : vect_location = instance->location ();
9136 680549 : vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
9137 : instance, &instance->cost_vec,
9138 : svisited, visited);
9139 : }
9140 234430 : }
9141 :
9142 : /* Determine whether we can vectorize the reduction epilogue for INSTANCE. */
9143 :
9144 : static bool
9145 73941 : vectorizable_bb_reduc_epilogue (slp_instance instance,
9146 : stmt_vector_for_cost *cost_vec)
9147 : {
9148 73941 : gassign *stmt = as_a <gassign *> (instance->root_stmts[0]->stmt);
9149 73941 : enum tree_code reduc_code = gimple_assign_rhs_code (stmt);
9150 73941 : if (reduc_code == MINUS_EXPR)
9151 0 : reduc_code = PLUS_EXPR;
9152 73941 : internal_fn reduc_fn;
9153 73941 : tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
9154 73941 : if (!vectype
9155 73929 : || !reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
9156 73929 : || reduc_fn == IFN_LAST
9157 73929 : || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH)
9158 108280 : || !useless_type_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
9159 34339 : TREE_TYPE (vectype)))
9160 : {
9161 50006 : if (dump_enabled_p ())
9162 309 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9163 : "not vectorized: basic block reduction epilogue "
9164 : "operation unsupported.\n");
9165 50006 : return false;
9166 : }
9167 :
9168 : /* There's no way to cost a horizontal vector reduction via REDUC_FN so
9169 : cost log2 vector operations plus shuffles and one extraction. */
9170 23935 : unsigned steps = floor_log2 (vect_nunits_for_cost (vectype));
9171 23935 : record_stmt_cost (cost_vec, steps, vector_stmt, instance->root_stmts[0],
9172 : vectype, 0, vect_body);
9173 23935 : record_stmt_cost (cost_vec, steps, vec_perm, instance->root_stmts[0],
9174 : vectype, 0, vect_body);
9175 23935 : record_stmt_cost (cost_vec, 1, vec_to_scalar, instance->root_stmts[0],
9176 : vectype, 0, vect_body);
9177 :
9178 : /* Since we replace all stmts of a possibly longer scalar reduction
9179 : chain account for the extra scalar stmts for that. */
9180 23935 : if (!instance->remain_defs.is_empty ())
9181 19100 : record_stmt_cost (cost_vec, instance->remain_defs.length (), scalar_stmt,
9182 9550 : instance->root_stmts[0], 0, vect_body);
9183 : return true;
9184 : }
9185 :
9186 : /* Prune from ROOTS all stmts that are computed as part of lanes of NODE
9187 : and recurse to children. */
9188 :
9189 : static void
9190 166296 : vect_slp_prune_covered_roots (slp_tree node, hash_set<stmt_vec_info> &roots,
9191 : hash_set<slp_tree> &visited)
9192 : {
9193 166296 : if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
9194 166296 : || visited.add (node))
9195 72731 : return;
9196 :
9197 : stmt_vec_info stmt;
9198 : unsigned i;
9199 321756 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
9200 228191 : if (stmt)
9201 233512 : roots.remove (vect_orig_stmt (stmt));
9202 :
9203 : slp_tree child;
9204 199761 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9205 106196 : if (child)
9206 104790 : vect_slp_prune_covered_roots (child, roots, visited);
9207 : }
9208 :
9209 : /* Hand over COST_VEC to the target COSTS grouped by SLP node. */
9210 :
9211 : static void
9212 941325 : add_slp_costs (vector_costs *costs, stmt_vector_for_cost& cost_vec)
9213 : {
9214 3561388 : for (unsigned start = 0; start < cost_vec.length ();)
9215 : {
9216 2620063 : unsigned end = start + 1;
9217 3199102 : while (end < cost_vec.length ()
9218 5465435 : && cost_vec[start].node == cost_vec[end].node)
9219 579039 : end++;
9220 2620063 : costs->add_slp_cost (cost_vec[start].node,
9221 2620063 : array_slice<stmt_info_for_cost>
9222 2620063 : (cost_vec.begin () + start, end - start));
9223 2620063 : start = end;
9224 : }
9225 941325 : }
9226 :
9227 : /* Analyze statements in SLP instances of VINFO. Return true if the
9228 : operations are supported. */
9229 :
9230 : bool
9231 660238 : vect_slp_analyze_operations (vec_info *vinfo)
9232 : {
9233 660238 : slp_instance instance;
9234 660238 : int i;
9235 :
9236 660238 : DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
9237 :
9238 660238 : hash_set<slp_tree> visited;
9239 1722068 : for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
9240 : {
9241 1302148 : auto_vec<slp_tree> visited_vec;
9242 1302148 : stmt_vector_for_cost cost_vec;
9243 1302148 : cost_vec.create (2);
9244 1302148 : if (is_a <bb_vec_info> (vinfo))
9245 780206 : vect_location = instance->location ();
9246 1302148 : if (!vect_slp_analyze_node_operations (vinfo,
9247 : SLP_INSTANCE_TREE (instance),
9248 : instance, visited, visited_vec,
9249 : &cost_vec)
9250 : /* CTOR instances require vectorized defs for the SLP tree root. */
9251 1071751 : || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor
9252 5698 : && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
9253 : != vect_internal_def
9254 : /* Make sure we vectorized with the expected type. */
9255 5698 : || !useless_type_conversion_p
9256 5698 : (TREE_TYPE (TREE_TYPE (gimple_assign_rhs1
9257 : (instance->root_stmts[0]->stmt))),
9258 5698 : TREE_TYPE (SLP_TREE_VECTYPE
9259 : (SLP_INSTANCE_TREE (instance))))))
9260 : /* Check we can vectorize the reduction. */
9261 1071736 : || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc
9262 73941 : && !vectorizable_bb_reduc_epilogue (instance, &cost_vec))
9263 : /* Check we can vectorize the gcond. */
9264 2323878 : || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_gcond
9265 61309 : && !vectorizable_early_exit (as_a <loop_vec_info> (vinfo),
9266 61309 : SLP_INSTANCE_ROOT_STMTS (instance)[0],
9267 : NULL,
9268 : SLP_INSTANCE_TREE (instance),
9269 : &cost_vec)))
9270 : {
9271 339123 : cost_vec.release ();
9272 339123 : slp_tree node = SLP_INSTANCE_TREE (instance);
9273 339123 : stmt_vec_info stmt_info;
9274 339123 : if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
9275 256210 : stmt_info = SLP_INSTANCE_ROOT_STMTS (instance)[0];
9276 82913 : else if (!SLP_TREE_SCALAR_STMTS (node).is_empty ()
9277 82913 : && SLP_TREE_SCALAR_STMTS (node)[0])
9278 : stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
9279 : else
9280 0 : stmt_info = SLP_TREE_REPRESENTATIVE (node);
9281 339123 : if (is_a <loop_vec_info> (vinfo))
9282 : {
9283 240318 : if (dump_enabled_p ())
9284 6493 : dump_printf_loc (MSG_NOTE, vect_location,
9285 : "unsupported SLP instance starting from: %G",
9286 : stmt_info->stmt);
9287 240318 : return false;
9288 : }
9289 98805 : if (dump_enabled_p ())
9290 363 : dump_printf_loc (MSG_NOTE, vect_location,
9291 : "removing SLP instance operations starting from: %G",
9292 : stmt_info->stmt);
9293 522806 : while (!visited_vec.is_empty ())
9294 : {
9295 424001 : slp_tree node = visited_vec.pop ();
9296 424001 : SLP_TREE_TYPE (node) = undef_vec_info_type;
9297 424001 : if (node->data)
9298 : {
9299 9969 : delete node->data;
9300 9969 : node->data = nullptr;
9301 : }
9302 424001 : visited.remove (node);
9303 : }
9304 98805 : vect_free_slp_instance (instance);
9305 98805 : vinfo->slp_instances.ordered_remove (i);
9306 : }
9307 : else
9308 : {
9309 963025 : i++;
9310 963025 : if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
9311 : {
9312 281624 : add_slp_costs (loop_vinfo->vector_costs, cost_vec);
9313 281624 : cost_vec.release ();
9314 : }
9315 : else
9316 : /* For BB vectorization remember the SLP graph entry
9317 : cost for later. */
9318 681401 : instance->cost_vec = cost_vec;
9319 : }
9320 1302148 : }
9321 :
9322 : /* Now look for SLP instances with a root that are covered by other
9323 : instances and remove them. */
9324 419920 : hash_set<stmt_vec_info> roots;
9325 1734427 : for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
9326 926797 : if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
9327 32210 : roots.add (SLP_INSTANCE_ROOT_STMTS (instance)[0]);
9328 419920 : if (!roots.is_empty ())
9329 : {
9330 13079 : visited.empty ();
9331 74585 : for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
9332 61506 : vect_slp_prune_covered_roots (SLP_INSTANCE_TREE (instance), roots,
9333 : visited);
9334 74585 : for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
9335 61506 : if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()
9336 32210 : && !roots.contains (SLP_INSTANCE_ROOT_STMTS (instance)[0]))
9337 : {
9338 852 : stmt_vec_info root = SLP_INSTANCE_ROOT_STMTS (instance)[0];
9339 852 : if (dump_enabled_p ())
9340 17 : dump_printf_loc (MSG_NOTE, vect_location,
9341 : "removing SLP instance operations starting "
9342 : "from: %G", root->stmt);
9343 852 : vect_free_slp_instance (instance);
9344 852 : vinfo->slp_instances.ordered_remove (i);
9345 : }
9346 : else
9347 60654 : ++i;
9348 : }
9349 :
9350 839840 : return !vinfo->slp_instances.is_empty ();
9351 1080158 : }
9352 :
9353 : /* Get the SLP instance leader from INSTANCE_LEADER thereby transitively
9354 : closing the eventual chain. */
9355 :
9356 : static slp_instance
9357 750343 : get_ultimate_leader (slp_instance instance,
9358 : hash_map<slp_instance, slp_instance> &instance_leader)
9359 : {
9360 750343 : auto_vec<slp_instance *, 8> chain;
9361 750343 : slp_instance *tem;
9362 834494 : while (*(tem = instance_leader.get (instance)) != instance)
9363 : {
9364 84151 : chain.safe_push (tem);
9365 84151 : instance = *tem;
9366 : }
9367 834494 : while (!chain.is_empty ())
9368 84151 : *chain.pop () = instance;
9369 750343 : return instance;
9370 750343 : }
9371 :
9372 : namespace {
9373 : /* Subroutine of vect_bb_partition_graph_r. Map KEY to INSTANCE in
9374 : KEY_TO_INSTANCE, making INSTANCE the leader of any previous mapping
9375 : for KEY. Return true if KEY was already in KEY_TO_INSTANCE.
9376 :
9377 : INSTANCE_LEADER is as for get_ultimate_leader. */
9378 :
9379 : template<typename T>
9380 : bool
9381 3286873 : vect_map_to_instance (slp_instance instance, T key,
9382 : hash_map<T, slp_instance> &key_to_instance,
9383 : hash_map<slp_instance, slp_instance> &instance_leader)
9384 : {
9385 : bool existed_p;
9386 3286873 : slp_instance &key_instance = key_to_instance.get_or_insert (key, &existed_p);
9387 3286873 : if (!existed_p)
9388 : ;
9389 173598 : else if (key_instance != instance)
9390 : {
9391 : /* If we're running into a previously marked key make us the
9392 : leader of the current ultimate leader. This keeps the
9393 : leader chain acyclic and works even when the current instance
9394 : connects two previously independent graph parts. */
9395 69794 : slp_instance key_leader
9396 69794 : = get_ultimate_leader (key_instance, instance_leader);
9397 69794 : if (key_leader != instance)
9398 21107 : instance_leader.put (key_leader, instance);
9399 : }
9400 3286873 : key_instance = instance;
9401 3286873 : return existed_p;
9402 : }
9403 : }
9404 :
9405 : /* Worker of vect_bb_partition_graph, recurse on NODE. */
9406 :
9407 : static void
9408 909127 : vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
9409 : slp_instance instance, slp_tree node,
9410 : hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
9411 : hash_map<slp_tree, slp_instance> &node_to_instance,
9412 : hash_map<slp_instance, slp_instance> &instance_leader)
9413 : {
9414 909127 : stmt_vec_info stmt_info;
9415 909127 : unsigned i;
9416 :
9417 3286873 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
9418 2377746 : if (stmt_info)
9419 2377746 : vect_map_to_instance (instance, stmt_info, stmt_to_instance,
9420 : instance_leader);
9421 :
9422 909127 : if (vect_map_to_instance (instance, node, node_to_instance,
9423 : instance_leader))
9424 909127 : return;
9425 :
9426 : slp_tree child;
9427 1739177 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9428 872499 : if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
9429 228578 : vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
9430 : node_to_instance, instance_leader);
9431 : }
9432 :
9433 : /* Partition the SLP graph into pieces that can be costed independently. */
9434 :
9435 : static void
9436 234430 : vect_bb_partition_graph (bb_vec_info bb_vinfo)
9437 : {
9438 234430 : DUMP_VECT_SCOPE ("vect_bb_partition_graph");
9439 :
9440 : /* First walk the SLP graph assigning each involved scalar stmt a
9441 : corresponding SLP graph entry and upon visiting a previously
9442 : marked stmt, make the stmts leader the current SLP graph entry. */
9443 234430 : hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
9444 234430 : hash_map<slp_tree, slp_instance> node_to_instance;
9445 234430 : hash_map<slp_instance, slp_instance> instance_leader;
9446 234430 : slp_instance instance;
9447 914979 : for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
9448 : {
9449 680549 : instance_leader.put (instance, instance);
9450 680549 : vect_bb_partition_graph_r (bb_vinfo,
9451 : instance, SLP_INSTANCE_TREE (instance),
9452 : stmt_to_instance, node_to_instance,
9453 : instance_leader);
9454 : }
9455 :
9456 : /* Then collect entries to each independent subgraph. */
9457 1149409 : for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
9458 : {
9459 680549 : slp_instance leader = get_ultimate_leader (instance, instance_leader);
9460 680549 : leader->subgraph_entries.safe_push (instance);
9461 680549 : if (dump_enabled_p ()
9462 680549 : && leader != instance)
9463 71 : dump_printf_loc (MSG_NOTE, vect_location,
9464 : "instance %p is leader of %p\n",
9465 : (void *) leader, (void *) instance);
9466 : }
9467 234430 : }
9468 :
9469 : /* Compute the scalar cost of the SLP node NODE and its children
9470 : and return it. Do not account defs that are marked in LIFE and
9471 : update LIFE according to uses of NODE. */
9472 :
9473 : static void
9474 677059 : vect_bb_slp_scalar_cost (bb_vec_info vinfo,
9475 : vec<stmt_vec_info> &worklist,
9476 : stmt_vector_for_cost *cost_vec,
9477 : hash_set<stmt_vec_info> &visited)
9478 : {
9479 3132315 : while (!worklist.is_empty ())
9480 : {
9481 2455256 : stmt_vec_info stmt = worklist.pop ();
9482 2740030 : if (!PURE_SLP_STMT (stmt))
9483 300321 : continue;
9484 :
9485 : /* When the stmt is live but not actually vectorized we have
9486 : to keep the feeding scalar defs. */
9487 2173369 : if (!STMT_VINFO_LIVE_P (vect_stmt_to_vectorize (stmt)))
9488 : {
9489 2107370 : bool live_p = false;
9490 2107370 : ssa_op_iter op_iter;
9491 2107370 : def_operand_p def_p;
9492 4607660 : FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt->stmt, op_iter, SSA_OP_DEF)
9493 : {
9494 392920 : imm_use_iterator use_iter;
9495 392920 : gimple *use_stmt;
9496 1422610 : FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
9497 636770 : if (!is_gimple_debug (use_stmt))
9498 : {
9499 469269 : stmt_vec_info use_stmt_info = vinfo->lookup_stmt (use_stmt);
9500 469269 : if (!use_stmt_info || !PURE_SLP_STMT (use_stmt_info))
9501 : {
9502 24218 : if (dump_enabled_p ())
9503 : {
9504 67 : dump_printf_loc (MSG_NOTE, vect_location,
9505 : "stmt considered live: %G",
9506 : stmt->stmt);
9507 67 : dump_printf_loc (MSG_NOTE, vect_location,
9508 : "because of use in: %G",
9509 : use_stmt);
9510 : }
9511 : live_p = true;
9512 : }
9513 392920 : }
9514 : }
9515 2107370 : if (live_p)
9516 15547 : continue;
9517 : }
9518 :
9519 : /* The following assert verifies that vect_bb_partition_graph
9520 : partitions the SLP graph in a way that each scalar stmt of
9521 : the coverage of the SLP graph belongs to exactly one subgraph.
9522 : ??? This is currently not guaranteed since the function
9523 : works purely on SLP_TREE_SCALAR_STMTS, resulting in the assert
9524 : tripping or scalar stmts costed multiple times, making vectorization
9525 : more profitable than it really is. */
9526 : /* gcc_checking_assert (!gimple_visited_p (stmt->stmt)); */
9527 :
9528 2154935 : if (vect_nop_conversion_p (stmt))
9529 : ;
9530 : /* For single-argument PHIs assume coalescing which means zero
9531 : cost for the scalar and the vector PHIs. This avoids
9532 : artificially favoring the vector path (but may pessimize it
9533 : in some cases). */
9534 2133796 : else if (is_a <gphi *> (stmt->stmt)
9535 2133796 : && gimple_phi_num_args (as_a <gphi *> (stmt->stmt)) == 1)
9536 : ;
9537 : else
9538 : {
9539 2124963 : vect_cost_for_stmt kind;
9540 2124963 : if (STMT_VINFO_DATA_REF (stmt))
9541 : {
9542 1956637 : data_reference_p dr = STMT_VINFO_DATA_REF (stmt);
9543 1956637 : tree base = get_base_address (DR_REF (dr));
9544 : /* When the scalar access is to a non-global not
9545 : address-taken decl that is not BLKmode assume we can
9546 : access it with a single non-load/store instruction. */
9547 1956637 : if (DECL_P (base)
9548 1509433 : && !is_global_var (base)
9549 1434830 : && !TREE_ADDRESSABLE (base)
9550 2505237 : && DECL_MODE (base) != BLKmode)
9551 : kind = scalar_stmt;
9552 1813524 : else if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt)))
9553 : kind = scalar_load;
9554 : else
9555 1587462 : kind = scalar_store;
9556 : }
9557 : else
9558 : kind = scalar_stmt;
9559 : /* Cost each scalar stmt only once. */
9560 2124963 : gimple_set_visited (stmt->stmt, true);
9561 2124963 : record_stmt_cost (cost_vec, 1, kind, stmt, NULL_TREE, 0, vect_body);
9562 : }
9563 :
9564 : /* Now walk relevant parts of the SSA use-def graph. */
9565 2154935 : slp_oprnds child_ops (stmt);
9566 4515580 : for (unsigned i = 0; i < child_ops.num_slp_children; ++i)
9567 : {
9568 2360645 : tree op = child_ops.get_op_for_slp_child (stmt, i);
9569 2360645 : stmt_vec_info def = vinfo->lookup_def (op);
9570 2360645 : if (def && !visited.add (def))
9571 683518 : worklist.safe_push (def);
9572 : }
9573 : }
9574 677059 : }
9575 :
9576 :
9577 : /* Comparator for the loop-index sorted cost vectors. */
9578 :
9579 : static int
9580 16819469 : li_cost_vec_cmp (const void *a_, const void *b_, void *)
9581 : {
9582 16819469 : auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
9583 16819469 : auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
9584 16819469 : if (a->first < b->first)
9585 : return -1;
9586 16005914 : else if (a->first == b->first)
9587 15327298 : return 0;
9588 : return 1;
9589 : }
9590 :
9591 : /* Check if vectorization of the basic block is profitable for the
9592 : subgraph denoted by SLP_INSTANCES. */
9593 :
9594 : static bool
9595 656098 : vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
9596 : vec<slp_instance> slp_instances,
9597 : loop_p orig_loop)
9598 : {
9599 656098 : slp_instance instance;
9600 656098 : int i;
9601 656098 : unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
9602 656098 : unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
9603 :
9604 656098 : if (dump_enabled_p ())
9605 : {
9606 105 : dump_printf_loc (MSG_NOTE, vect_location, "Costing subgraph: \n");
9607 105 : hash_set<slp_tree> visited;
9608 425 : FOR_EACH_VEC_ELT (slp_instances, i, instance)
9609 110 : vect_print_slp_graph (MSG_NOTE, vect_location,
9610 : SLP_INSTANCE_TREE (instance), visited);
9611 105 : }
9612 :
9613 : /* Then DFS walk scalar stmts, performing costing and handling
9614 : still live scalar stmts via the previously computed vector coverage. */
9615 656098 : stmt_vector_for_cost scalar_costs = vNULL;
9616 656098 : stmt_vector_for_cost vector_costs = vNULL;
9617 656098 : hash_set<slp_tree> visited;
9618 656098 : hash_set<stmt_vec_info> svisited;
9619 1333157 : FOR_EACH_VEC_ELT (slp_instances, i, instance)
9620 : {
9621 677059 : auto_vec<stmt_vec_info> worklist;
9622 677059 : if (SLP_INSTANCE_ROOT_STMTS (instance).exists ())
9623 56994 : record_stmt_cost (&scalar_costs,
9624 28497 : SLP_INSTANCE_ROOT_STMTS (instance).length (),
9625 : scalar_stmt,
9626 28497 : SLP_INSTANCE_ROOT_STMTS (instance)[0], 0, vect_body);
9627 3811609 : for (auto stmt : SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance)))
9628 : {
9629 1780432 : stmt = vect_orig_stmt (stmt);
9630 1780432 : if (!svisited.add (stmt))
9631 1771738 : worklist.safe_push (stmt);
9632 : }
9633 677059 : vect_bb_slp_scalar_cost (bb_vinfo, worklist, &scalar_costs, svisited);
9634 677059 : vector_costs.safe_splice (instance->cost_vec);
9635 677059 : instance->cost_vec.release ();
9636 677059 : }
9637 :
9638 656098 : if (dump_enabled_p ())
9639 105 : dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
9640 :
9641 : /* When costing non-loop vectorization we need to consider each covered
9642 : loop independently and make sure vectorization is profitable. For
9643 : now we assume a loop may be not entered or executed an arbitrary
9644 : number of iterations (??? static information can provide more
9645 : precise info here) which means we can simply cost each containing
9646 : loops stmts separately. */
9647 :
9648 : /* First produce cost vectors sorted by loop index. */
9649 656098 : auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
9650 656098 : li_scalar_costs (scalar_costs.length ());
9651 656098 : auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
9652 656098 : li_vector_costs (vector_costs.length ());
9653 656098 : stmt_info_for_cost *cost;
9654 2809558 : FOR_EACH_VEC_ELT (scalar_costs, i, cost)
9655 : {
9656 2153460 : unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
9657 2153460 : li_scalar_costs.quick_push (std::make_pair (l, cost));
9658 : }
9659 : /* Use a random used loop as fallback in case the first vector_costs
9660 : entry does not have a stmt_info associated with it. */
9661 656098 : unsigned l = li_scalar_costs[0].first;
9662 2389563 : FOR_EACH_VEC_ELT (vector_costs, i, cost)
9663 : {
9664 : /* We inherit from the previous COST, invariants, externals and
9665 : extracts immediately follow the cost for the related stmt. */
9666 1733465 : if (cost->stmt_info)
9667 1017261 : l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
9668 1733465 : li_vector_costs.quick_push (std::make_pair (l, cost));
9669 : }
9670 656098 : li_scalar_costs.stablesort (li_cost_vec_cmp, NULL);
9671 656098 : li_vector_costs.stablesort (li_cost_vec_cmp, NULL);
9672 :
9673 : /* Now cost the portions individually. */
9674 : unsigned vi = 0;
9675 : unsigned si = 0;
9676 1139297 : bool profitable = true;
9677 1139297 : while (si < li_scalar_costs.length ()
9678 1799960 : && vi < li_vector_costs.length ())
9679 : {
9680 660651 : unsigned sl = li_scalar_costs[si].first;
9681 660651 : unsigned vl = li_vector_costs[vi].first;
9682 660651 : if (sl != vl)
9683 : {
9684 950 : if (dump_enabled_p ())
9685 0 : dump_printf_loc (MSG_NOTE, vect_location,
9686 : "Scalar %d and vector %d loop part do not "
9687 : "match up, skipping scalar part\n", sl, vl);
9688 : /* Skip the scalar part, assuming zero cost on the vector side. */
9689 1590 : do
9690 : {
9691 1590 : si++;
9692 : }
9693 1590 : while (si < li_scalar_costs.length ()
9694 3251 : && li_scalar_costs[si].first == sl);
9695 950 : continue;
9696 : }
9697 :
9698 659701 : class vector_costs *scalar_target_cost_data = init_cost (bb_vinfo, true);
9699 2135281 : do
9700 : {
9701 2135281 : add_stmt_cost (scalar_target_cost_data, li_scalar_costs[si].second);
9702 2135281 : si++;
9703 : }
9704 2135281 : while (si < li_scalar_costs.length ()
9705 4278070 : && li_scalar_costs[si].first == sl);
9706 659701 : scalar_target_cost_data->finish_cost (nullptr);
9707 659701 : scalar_cost = scalar_target_cost_data->body_cost ();
9708 :
9709 : /* Complete the target-specific vector cost calculation. */
9710 659701 : class vector_costs *vect_target_cost_data = init_cost (bb_vinfo, false);
9711 659701 : auto_vec<stmt_info_for_cost> tem;
9712 1707395 : do
9713 : {
9714 1707395 : tem.safe_push (*li_vector_costs[vi].second);
9715 1707395 : vi++;
9716 : }
9717 1707395 : while (vi < li_vector_costs.length ()
9718 3423453 : && li_vector_costs[vi].first == vl);
9719 659701 : add_slp_costs (vect_target_cost_data, tem);
9720 659701 : vect_target_cost_data->finish_cost (scalar_target_cost_data);
9721 659701 : vec_prologue_cost = vect_target_cost_data->prologue_cost ();
9722 659701 : vec_inside_cost = vect_target_cost_data->body_cost ();
9723 659701 : vec_epilogue_cost = vect_target_cost_data->epilogue_cost ();
9724 659701 : delete scalar_target_cost_data;
9725 659701 : delete vect_target_cost_data;
9726 :
9727 659701 : vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
9728 :
9729 659701 : if (dump_enabled_p ())
9730 : {
9731 112 : dump_printf_loc (MSG_NOTE, vect_location,
9732 : "Cost model analysis for part in loop %d:\n", sl);
9733 112 : dump_printf (MSG_NOTE, " Vector cost: %d\n",
9734 : vec_inside_cost + vec_outside_cost);
9735 112 : dump_printf (MSG_NOTE, " Scalar cost: %d\n", scalar_cost);
9736 : }
9737 :
9738 : /* Vectorization is profitable if its cost is more than the cost of scalar
9739 : version. Note that we err on the vector side for equal cost because
9740 : the cost estimate is otherwise quite pessimistic (constant uses are
9741 : free on the scalar side but cost a load on the vector side for
9742 : example). */
9743 659701 : if (vec_outside_cost + vec_inside_cost > scalar_cost)
9744 : {
9745 177452 : profitable = false;
9746 177452 : break;
9747 : }
9748 482249 : }
9749 656098 : if (profitable && vi < li_vector_costs.length ())
9750 : {
9751 1062 : if (dump_enabled_p ())
9752 0 : dump_printf_loc (MSG_NOTE, vect_location,
9753 : "Excess vector cost for part in loop %d:\n",
9754 0 : li_vector_costs[vi].first);
9755 : profitable = false;
9756 : }
9757 :
9758 : /* Unset visited flag. This is delayed when the subgraph is profitable
9759 : and we process the loop for remaining unvectorized if-converted code. */
9760 656098 : if (!orig_loop || !profitable)
9761 2808139 : FOR_EACH_VEC_ELT (scalar_costs, i, cost)
9762 2152144 : gimple_set_visited (cost->stmt_info->stmt, false);
9763 :
9764 656098 : scalar_costs.release ();
9765 656098 : vector_costs.release ();
9766 :
9767 656098 : return profitable;
9768 656098 : }
9769 :
9770 : /* qsort comparator for lane defs. */
9771 :
9772 : static int
9773 40 : vld_cmp (const void *a_, const void *b_)
9774 : {
9775 40 : auto *a = (const std::pair<unsigned, tree> *)a_;
9776 40 : auto *b = (const std::pair<unsigned, tree> *)b_;
9777 40 : return a->first - b->first;
9778 : }
9779 :
9780 : /* Return true if USE_STMT is a vector lane insert into VEC and set
9781 : *THIS_LANE to the lane number that is set. */
9782 :
9783 : static bool
9784 248 : vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
9785 : {
9786 248 : gassign *use_ass = dyn_cast <gassign *> (use_stmt);
9787 91 : if (!use_ass
9788 91 : || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
9789 22 : || (vec
9790 22 : ? gimple_assign_rhs1 (use_ass) != vec
9791 24 : : ((vec = gimple_assign_rhs1 (use_ass)), false))
9792 46 : || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
9793 46 : TREE_TYPE (gimple_assign_rhs2 (use_ass)))
9794 46 : || !constant_multiple_p
9795 46 : (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
9796 92 : tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
9797 : this_lane))
9798 202 : return false;
9799 : return true;
9800 : }
9801 :
9802 : /* Find any vectorizable constructors and add them to the grouped_store
9803 : array. */
9804 :
9805 : static void
9806 2183721 : vect_slp_check_for_roots (bb_vec_info bb_vinfo)
9807 : {
9808 17491773 : for (unsigned i = 0; i < bb_vinfo->nbbs; ++i)
9809 30616104 : for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
9810 134643934 : !gsi_end_p (gsi); gsi_next (&gsi))
9811 : {
9812 119335882 : gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
9813 : /* This can be used to start SLP discovery for early breaks for BB early breaks
9814 : when we get that far. */
9815 119335882 : if (!assign)
9816 178883907 : continue;
9817 :
9818 30810641 : tree rhs = gimple_assign_rhs1 (assign);
9819 30810641 : enum tree_code code = gimple_assign_rhs_code (assign);
9820 30810641 : use_operand_p use_p;
9821 30810641 : gimple *use_stmt;
9822 30810641 : if (code == CONSTRUCTOR)
9823 : {
9824 1563583 : if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
9825 64308 : || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
9826 93668 : CONSTRUCTOR_NELTS (rhs))
9827 43274 : || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
9828 1606853 : || uniform_vector_p (rhs))
9829 1550488 : continue;
9830 :
9831 : unsigned j;
9832 : tree val;
9833 64272 : FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
9834 51177 : if (TREE_CODE (val) != SSA_NAME
9835 51177 : || !bb_vinfo->lookup_def (val))
9836 : break;
9837 32262 : if (j != CONSTRUCTOR_NELTS (rhs))
9838 3036 : continue;
9839 :
9840 13095 : vec<stmt_vec_info> roots = vNULL;
9841 13095 : roots.safe_push (bb_vinfo->lookup_stmt (assign));
9842 13095 : vec<stmt_vec_info> stmts;
9843 13095 : stmts.create (CONSTRUCTOR_NELTS (rhs));
9844 72670 : FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
9845 46480 : stmts.quick_push
9846 46480 : (vect_stmt_to_vectorize (bb_vinfo->lookup_def (val)));
9847 13095 : bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
9848 13095 : stmts, roots));
9849 : }
9850 29247058 : else if (code == BIT_INSERT_EXPR
9851 927 : && VECTOR_TYPE_P (TREE_TYPE (rhs))
9852 605 : && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
9853 605 : && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
9854 602 : && integer_zerop (gimple_assign_rhs3 (assign))
9855 336 : && useless_type_conversion_p
9856 336 : (TREE_TYPE (TREE_TYPE (rhs)),
9857 336 : TREE_TYPE (gimple_assign_rhs2 (assign)))
9858 29247670 : && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
9859 : {
9860 : /* We start to match on insert to lane zero but since the
9861 : inserts need not be ordered we'd have to search both
9862 : the def and the use chains. */
9863 215 : tree vectype = TREE_TYPE (rhs);
9864 215 : unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
9865 215 : auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
9866 215 : auto_sbitmap lanes (nlanes);
9867 215 : bitmap_clear (lanes);
9868 215 : bitmap_set_bit (lanes, 0);
9869 215 : tree def = gimple_assign_lhs (assign);
9870 215 : lane_defs.quick_push
9871 215 : (std::make_pair (0, gimple_assign_rhs2 (assign)));
9872 215 : unsigned lanes_found = 1;
9873 : /* Start with the use chains, the last stmt will be the root. */
9874 215 : stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
9875 215 : vec<stmt_vec_info> roots = vNULL;
9876 215 : roots.safe_push (last);
9877 217 : do
9878 : {
9879 217 : use_operand_p use_p;
9880 217 : gimple *use_stmt;
9881 217 : if (!single_imm_use (def, &use_p, &use_stmt))
9882 : break;
9883 211 : unsigned this_lane;
9884 211 : if (!bb_vinfo->lookup_stmt (use_stmt)
9885 211 : || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
9886 233 : || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
9887 : break;
9888 22 : if (bitmap_bit_p (lanes, this_lane))
9889 : break;
9890 2 : lanes_found++;
9891 2 : bitmap_set_bit (lanes, this_lane);
9892 2 : gassign *use_ass = as_a <gassign *> (use_stmt);
9893 2 : lane_defs.quick_push (std::make_pair
9894 2 : (this_lane, gimple_assign_rhs2 (use_ass)));
9895 2 : last = bb_vinfo->lookup_stmt (use_ass);
9896 2 : roots.safe_push (last);
9897 2 : def = gimple_assign_lhs (use_ass);
9898 : }
9899 2 : while (lanes_found < nlanes);
9900 215 : if (roots.length () > 1)
9901 2 : std::swap(roots[0], roots[roots.length () - 1]);
9902 215 : if (lanes_found < nlanes)
9903 : {
9904 : /* Now search the def chain. */
9905 215 : def = gimple_assign_rhs1 (assign);
9906 217 : do
9907 : {
9908 217 : if (TREE_CODE (def) != SSA_NAME
9909 217 : || !has_single_use (def))
9910 : break;
9911 56 : gimple *def_stmt = SSA_NAME_DEF_STMT (def);
9912 56 : unsigned this_lane;
9913 56 : if (!bb_vinfo->lookup_stmt (def_stmt)
9914 37 : || !vect_slp_is_lane_insert (def_stmt,
9915 : NULL_TREE, &this_lane)
9916 80 : || !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
9917 : break;
9918 24 : if (bitmap_bit_p (lanes, this_lane))
9919 : break;
9920 4 : lanes_found++;
9921 4 : bitmap_set_bit (lanes, this_lane);
9922 8 : lane_defs.quick_push (std::make_pair
9923 4 : (this_lane,
9924 4 : gimple_assign_rhs2 (def_stmt)));
9925 4 : roots.safe_push (bb_vinfo->lookup_stmt (def_stmt));
9926 4 : def = gimple_assign_rhs1 (def_stmt);
9927 : }
9928 4 : while (lanes_found < nlanes);
9929 : }
9930 215 : if (lanes_found == nlanes)
9931 : {
9932 : /* Sort lane_defs after the lane index and register the root. */
9933 2 : lane_defs.qsort (vld_cmp);
9934 2 : vec<stmt_vec_info> stmts;
9935 2 : stmts.create (nlanes);
9936 10 : for (unsigned i = 0; i < nlanes; ++i)
9937 8 : stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
9938 2 : bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
9939 2 : stmts, roots));
9940 : }
9941 : else
9942 213 : roots.release ();
9943 215 : }
9944 29246843 : else if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
9945 28257903 : && (associative_tree_code (code) || code == MINUS_EXPR)
9946 : /* ??? This pessimizes a two-element reduction. PR54400.
9947 : ??? In-order reduction could be handled if we only
9948 : traverse one operand chain in vect_slp_linearize_chain. */
9949 33172418 : && !needs_fold_left_reduction_p (TREE_TYPE (rhs), code)
9950 : /* Ops with constants at the tail can be stripped here. */
9951 5809934 : && TREE_CODE (rhs) == SSA_NAME
9952 5743939 : && TREE_CODE (gimple_assign_rhs2 (assign)) == SSA_NAME
9953 : /* Should be the chain end. */
9954 31540326 : && (!single_imm_use (gimple_assign_lhs (assign),
9955 : &use_p, &use_stmt)
9956 1766607 : || !is_gimple_assign (use_stmt)
9957 1212710 : || (gimple_assign_rhs_code (use_stmt) != code
9958 902842 : && ((code != PLUS_EXPR && code != MINUS_EXPR)
9959 500780 : || (gimple_assign_rhs_code (use_stmt)
9960 500780 : != (code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR))))))
9961 : {
9962 : /* We start the match at the end of a possible association
9963 : chain. */
9964 1884359 : auto_vec<chain_op_t> chain;
9965 1884359 : auto_vec<std::pair<tree_code, gimple *> > worklist;
9966 1884359 : auto_vec<gimple *> chain_stmts;
9967 1884359 : gimple *code_stmt = NULL, *alt_code_stmt = NULL;
9968 1884359 : if (code == MINUS_EXPR)
9969 304477 : code = PLUS_EXPR;
9970 1884359 : internal_fn reduc_fn;
9971 2167296 : if (!reduction_fn_for_scalar_code (code, &reduc_fn)
9972 1884359 : || reduc_fn == IFN_LAST)
9973 282937 : continue;
9974 1601422 : vect_slp_linearize_chain (bb_vinfo, worklist, chain, code, assign,
9975 : /* ??? */
9976 : code_stmt, alt_code_stmt, &chain_stmts,
9977 : false);
9978 3202844 : if (chain.length () > 1)
9979 : {
9980 : /* Sort the chain according to def_type and operation. */
9981 1601422 : chain.sort (dt_sort_cmp, bb_vinfo);
9982 : /* ??? Now we'd want to strip externals and constants
9983 : but record those to be handled in the epilogue. */
9984 : /* ??? For now do not allow mixing ops or externs/constants. */
9985 1601422 : bool invalid = false;
9986 1601422 : unsigned remain_cnt = 0;
9987 1601422 : unsigned last_idx = 0;
9988 4834683 : for (unsigned i = 0; i < chain.length (); ++i)
9989 : {
9990 3537738 : if (chain[i].code != code)
9991 : {
9992 : invalid = true;
9993 : break;
9994 : }
9995 3233261 : if (chain[i].dt != vect_internal_def
9996 : /* Avoid stmts where the def is not the LHS, like
9997 : ASMs. */
9998 6254473 : || (gimple_get_lhs (bb_vinfo->lookup_def
9999 3021212 : (chain[i].op)->stmt)
10000 3021212 : != chain[i].op))
10001 214993 : remain_cnt++;
10002 : else
10003 : last_idx = i;
10004 : }
10005 : /* Make sure to have an even number of lanes as we later do
10006 : all-or-nothing discovery, not trying to split further. */
10007 1601422 : if ((chain.length () - remain_cnt) & 1)
10008 169260 : remain_cnt++;
10009 1601422 : if (!invalid && chain.length () - remain_cnt > 1)
10010 : {
10011 1231960 : vec<stmt_vec_info> stmts;
10012 1231960 : vec<tree> remain = vNULL;
10013 1231960 : stmts.create (chain.length ());
10014 1231960 : if (remain_cnt > 0)
10015 114810 : remain.create (remain_cnt);
10016 3956281 : for (unsigned i = 0; i < chain.length (); ++i)
10017 : {
10018 2724321 : stmt_vec_info stmt_info;
10019 2724321 : if (chain[i].dt == vect_internal_def
10020 2684366 : && ((stmt_info = bb_vinfo->lookup_def (chain[i].op)),
10021 2684366 : gimple_get_lhs (stmt_info->stmt) == chain[i].op)
10022 5408603 : && (i != last_idx
10023 1231960 : || (stmts.length () & 1)))
10024 2598176 : stmts.quick_push (stmt_info);
10025 : else
10026 126145 : remain.quick_push (chain[i].op);
10027 : }
10028 1231960 : vec<stmt_vec_info> roots;
10029 1231960 : roots.create (chain_stmts.length ());
10030 2724321 : for (unsigned i = 0; i < chain_stmts.length (); ++i)
10031 1492361 : roots.quick_push (bb_vinfo->lookup_stmt (chain_stmts[i]));
10032 1231960 : bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_bb_reduc,
10033 1231960 : stmts, roots, remain));
10034 : }
10035 : }
10036 1884359 : }
10037 : }
10038 2183721 : }
10039 :
10040 : /* Walk the grouped store chains and replace entries with their
10041 : pattern variant if any. */
10042 :
10043 : static void
10044 613233 : vect_fixup_store_groups_with_patterns (vec_info *vinfo)
10045 : {
10046 613233 : stmt_vec_info first_element;
10047 613233 : unsigned i;
10048 :
10049 1504702 : FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
10050 : {
10051 : /* We also have CTORs in this array. */
10052 891469 : if (!STMT_VINFO_GROUPED_ACCESS (first_element))
10053 0 : continue;
10054 891469 : if (STMT_VINFO_IN_PATTERN_P (first_element))
10055 : {
10056 252 : stmt_vec_info orig = first_element;
10057 252 : first_element = STMT_VINFO_RELATED_STMT (first_element);
10058 252 : DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
10059 252 : DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
10060 252 : DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
10061 252 : DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
10062 252 : vinfo->grouped_stores[i] = first_element;
10063 : }
10064 891469 : stmt_vec_info prev = first_element;
10065 2503675 : while (DR_GROUP_NEXT_ELEMENT (prev))
10066 : {
10067 1612206 : stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
10068 1612206 : if (STMT_VINFO_IN_PATTERN_P (elt))
10069 : {
10070 849 : stmt_vec_info orig = elt;
10071 849 : elt = STMT_VINFO_RELATED_STMT (elt);
10072 849 : DR_GROUP_NEXT_ELEMENT (prev) = elt;
10073 849 : DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
10074 849 : DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
10075 : }
10076 1612206 : DR_GROUP_FIRST_ELEMENT (elt) = first_element;
10077 1612206 : prev = elt;
10078 : }
10079 : }
10080 613233 : }
10081 :
10082 : /* Check if the region described by BB_VINFO can be vectorized, returning
10083 : true if so. When returning false, set FATAL to true if the same failure
10084 : would prevent vectorization at other vector sizes, false if it is still
10085 : worth trying other sizes. N_STMTS is the number of statements in the
10086 : region. */
10087 :
10088 : static bool
10089 2183721 : vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
10090 : vec<int> *dataref_groups)
10091 : {
10092 2183721 : DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
10093 :
10094 2183721 : slp_instance instance;
10095 2183721 : int i;
10096 :
10097 : /* The first group of checks is independent of the vector size. */
10098 2183721 : fatal = true;
10099 :
10100 : /* Analyze the data references. */
10101 :
10102 2183721 : if (!vect_analyze_data_refs (bb_vinfo, NULL))
10103 : {
10104 0 : if (dump_enabled_p ())
10105 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10106 : "not vectorized: unhandled data-ref in basic "
10107 : "block.\n");
10108 0 : return false;
10109 : }
10110 :
10111 2183721 : if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
10112 : {
10113 0 : if (dump_enabled_p ())
10114 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10115 : "not vectorized: unhandled data access in "
10116 : "basic block.\n");
10117 0 : return false;
10118 : }
10119 :
10120 2183721 : vect_slp_check_for_roots (bb_vinfo);
10121 :
10122 : /* If there are no grouped stores and no constructors in the region
10123 : there is no need to continue with pattern recog as vect_analyze_slp
10124 : will fail anyway. */
10125 2183721 : if (bb_vinfo->grouped_stores.is_empty ()
10126 1841762 : && bb_vinfo->roots.is_empty ())
10127 : {
10128 1570488 : if (dump_enabled_p ())
10129 1024 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10130 : "not vectorized: no grouped stores in "
10131 : "basic block.\n");
10132 1570488 : return false;
10133 : }
10134 :
10135 : /* While the rest of the analysis below depends on it in some way. */
10136 613233 : fatal = false;
10137 :
10138 613233 : vect_pattern_recog (bb_vinfo);
10139 :
10140 : /* Update store groups from pattern processing. */
10141 613233 : vect_fixup_store_groups_with_patterns (bb_vinfo);
10142 :
10143 : /* Check the SLP opportunities in the basic block, analyze and build SLP
10144 : trees. */
10145 613233 : if (!vect_analyze_slp (bb_vinfo, n_stmts, false))
10146 : {
10147 0 : if (dump_enabled_p ())
10148 : {
10149 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10150 : "Failed to SLP the basic block.\n");
10151 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10152 : "not vectorized: failed to find SLP opportunities "
10153 : "in basic block.\n");
10154 : }
10155 0 : return false;
10156 : }
10157 :
10158 : /* Optimize permutations. */
10159 613233 : vect_optimize_slp (bb_vinfo);
10160 :
10161 : /* Gather the loads reachable from the SLP graph entries. */
10162 613233 : vect_gather_slp_loads (bb_vinfo);
10163 :
10164 613233 : vect_record_base_alignments (bb_vinfo);
10165 :
10166 : /* Analyze and verify the alignment of data references and the
10167 : dependence in the SLP instances. */
10168 1401917 : for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
10169 : {
10170 788684 : vect_location = instance->location ();
10171 788684 : if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
10172 788684 : || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
10173 : {
10174 8478 : slp_tree node = SLP_INSTANCE_TREE (instance);
10175 8478 : stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
10176 8478 : if (dump_enabled_p ())
10177 4 : dump_printf_loc (MSG_NOTE, vect_location,
10178 : "removing SLP instance operations starting from: %G",
10179 : stmt_info->stmt);
10180 8478 : vect_free_slp_instance (instance);
10181 8478 : BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
10182 8478 : continue;
10183 8478 : }
10184 :
10185 : /* Mark all the statements that we want to vectorize as relevant. */
10186 780206 : vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
10187 :
10188 780206 : i++;
10189 : }
10190 2213942 : if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
10191 : return false;
10192 :
10193 264651 : if (!vect_slp_analyze_operations (bb_vinfo))
10194 : {
10195 30221 : if (dump_enabled_p ())
10196 87 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10197 : "not vectorized: bad operation in basic block.\n");
10198 30221 : return false;
10199 : }
10200 :
10201 : /* Mark all the statements that we vectorize. */
10202 234430 : vect_bb_slp_mark_stmts_vectorized (bb_vinfo);
10203 :
10204 : /* Compute vectorizable live stmts. */
10205 234430 : vect_bb_slp_mark_live_stmts (bb_vinfo);
10206 :
10207 234430 : vect_bb_partition_graph (bb_vinfo);
10208 :
10209 234430 : return true;
10210 : }
10211 :
10212 : /* Subroutine of vect_slp_bb. Try to vectorize the statements for all
10213 : basic blocks in BBS, returning true on success.
10214 : The region has N_STMTS statements and has the datarefs given by DATAREFS. */
10215 :
10216 : static bool
10217 1862158 : vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
10218 : vec<int> *dataref_groups, unsigned int n_stmts,
10219 : loop_p orig_loop)
10220 : {
10221 1862158 : bb_vec_info bb_vinfo;
10222 1862158 : auto_vector_modes vector_modes;
10223 :
10224 : /* Autodetect first vector size we try. */
10225 1862158 : machine_mode next_vector_mode = VOIDmode;
10226 1862158 : targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
10227 1862158 : unsigned int mode_i = 0;
10228 :
10229 1862158 : vec_info_shared shared;
10230 :
10231 1862158 : machine_mode autodetected_vector_mode = VOIDmode;
10232 2505284 : while (1)
10233 : {
10234 2183721 : bool vectorized = false;
10235 2183721 : bool fatal = false;
10236 2183721 : bb_vinfo = new _bb_vec_info (bbs, &shared);
10237 :
10238 2183721 : bool first_time_p = shared.datarefs.is_empty ();
10239 2183721 : BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
10240 2183721 : if (first_time_p)
10241 1885156 : bb_vinfo->shared->save_datarefs ();
10242 : else
10243 298565 : bb_vinfo->shared->check_datarefs ();
10244 2183721 : bb_vinfo->vector_mode = next_vector_mode;
10245 :
10246 2183721 : if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
10247 : {
10248 234430 : if (dump_enabled_p ())
10249 : {
10250 1522 : dump_printf_loc (MSG_NOTE, vect_location,
10251 : "***** Analysis succeeded with vector mode"
10252 761 : " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
10253 761 : dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
10254 : }
10255 :
10256 234430 : bb_vinfo->shared->check_datarefs ();
10257 :
10258 234430 : bool force_clear = false;
10259 234430 : auto_vec<slp_instance> profitable_subgraphs;
10260 1383839 : for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo))
10261 : {
10262 680549 : if (instance->subgraph_entries.is_empty ())
10263 220728 : continue;
10264 :
10265 659442 : dump_user_location_t saved_vect_location = vect_location;
10266 659442 : vect_location = instance->location ();
10267 659442 : if (!unlimited_cost_model (NULL)
10268 656103 : && !param_vect_allow_possibly_not_worthwhile_vectorizations
10269 1315540 : && !vect_bb_vectorization_profitable_p
10270 656098 : (bb_vinfo, instance->subgraph_entries, orig_loop))
10271 : {
10272 178514 : if (dump_enabled_p ())
10273 32 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10274 : "not vectorized: vectorization is not "
10275 : "profitable.\n");
10276 178514 : vect_location = saved_vect_location;
10277 178514 : continue;
10278 : }
10279 :
10280 480928 : vect_location = saved_vect_location;
10281 480928 : if (!dbg_cnt (vect_slp))
10282 : {
10283 0 : force_clear = true;
10284 0 : continue;
10285 : }
10286 :
10287 480928 : profitable_subgraphs.safe_push (instance);
10288 : }
10289 :
10290 : /* When we're vectorizing an if-converted loop body make sure
10291 : we vectorized all if-converted code. */
10292 392904 : if ((!profitable_subgraphs.is_empty () || force_clear) && orig_loop)
10293 : {
10294 106 : gcc_assert (bb_vinfo->nbbs == 1);
10295 212 : for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]);
10296 4390 : !gsi_end_p (gsi); gsi_next (&gsi))
10297 : {
10298 : /* The costing above left us with DCEable vectorized scalar
10299 : stmts having the visited flag set on profitable
10300 : subgraphs. Do the delayed clearing of the flag here. */
10301 4284 : if (gimple_visited_p (gsi_stmt (gsi)))
10302 : {
10303 1260 : gimple_set_visited (gsi_stmt (gsi), false);
10304 1260 : continue;
10305 : }
10306 3024 : if (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED)
10307 813 : continue;
10308 :
10309 6338 : if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi)))
10310 2670 : if (gimple_assign_rhs_code (ass) == COND_EXPR)
10311 : {
10312 69 : if (!profitable_subgraphs.is_empty ()
10313 31 : && dump_enabled_p ())
10314 0 : dump_printf_loc (MSG_NOTE, vect_location,
10315 : "not profitable because of "
10316 : "unprofitable if-converted scalar "
10317 : "code\n");
10318 38 : profitable_subgraphs.truncate (0);
10319 : }
10320 : }
10321 : }
10322 :
10323 : /* Finally schedule the profitable subgraphs. */
10324 1032260 : for (slp_instance instance : profitable_subgraphs)
10325 : {
10326 480882 : if (!vectorized && dump_enabled_p ())
10327 735 : dump_printf_loc (MSG_NOTE, vect_location,
10328 : "Basic block will be vectorized "
10329 : "using SLP\n");
10330 480882 : vectorized = true;
10331 :
10332 : /* Dump before scheduling as store vectorization will remove
10333 : the original stores and mess with the instance tree
10334 : so querying its location will eventually ICE. */
10335 480882 : if (flag_checking)
10336 1934682 : for (slp_instance sub : instance->subgraph_entries)
10337 492036 : gcc_assert (SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub)));
10338 480882 : unsigned HOST_WIDE_INT bytes;
10339 480882 : if (dump_enabled_p ())
10340 3493 : for (slp_instance sub : instance->subgraph_entries)
10341 : {
10342 925 : tree vtype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub));
10343 1850 : if (GET_MODE_SIZE (TYPE_MODE (vtype)).is_constant (&bytes))
10344 925 : dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
10345 925 : sub->location (),
10346 : "basic block part vectorized using %wu "
10347 : "byte vectors\n", bytes);
10348 : else
10349 : dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
10350 : sub->location (),
10351 : "basic block part vectorized using "
10352 : "variable length vectors\n");
10353 : }
10354 :
10355 480882 : dump_user_location_t saved_vect_location = vect_location;
10356 480882 : vect_location = instance->location ();
10357 :
10358 480882 : vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
10359 :
10360 480882 : vect_location = saved_vect_location;
10361 : }
10362 :
10363 :
10364 : /* Generate the invariant statements. */
10365 234430 : if (!gimple_seq_empty_p (bb_vinfo->inv_pattern_def_seq))
10366 : {
10367 23 : if (dump_enabled_p ())
10368 0 : dump_printf_loc (MSG_NOTE, vect_location,
10369 : "------>generating invariant statements\n");
10370 :
10371 23 : bb_vinfo->insert_seq_on_entry (NULL,
10372 : bb_vinfo->inv_pattern_def_seq);
10373 : }
10374 234430 : }
10375 : else
10376 : {
10377 1949291 : if (dump_enabled_p ())
10378 1316 : dump_printf_loc (MSG_NOTE, vect_location,
10379 : "***** Analysis failed with vector mode %s\n",
10380 1316 : GET_MODE_NAME (bb_vinfo->vector_mode));
10381 : }
10382 :
10383 2183721 : if (mode_i == 0)
10384 1862158 : autodetected_vector_mode = bb_vinfo->vector_mode;
10385 :
10386 2183721 : if (!fatal)
10387 3131092 : while (mode_i < vector_modes.length ()
10388 1761470 : && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
10389 : {
10390 334138 : if (dump_enabled_p ())
10391 1672 : dump_printf_loc (MSG_NOTE, vect_location,
10392 : "***** The result for vector mode %s would"
10393 : " be the same\n",
10394 836 : GET_MODE_NAME (vector_modes[mode_i]));
10395 334138 : mode_i += 1;
10396 : }
10397 :
10398 2183721 : delete bb_vinfo;
10399 :
10400 2183721 : if (mode_i < vector_modes.length ()
10401 2005968 : && VECTOR_MODE_P (autodetected_vector_mode)
10402 1987878 : && (related_vector_mode (vector_modes[mode_i],
10403 : GET_MODE_INNER (autodetected_vector_mode))
10404 993939 : == autodetected_vector_mode)
10405 4189689 : && (related_vector_mode (autodetected_vector_mode,
10406 513530 : GET_MODE_INNER (vector_modes[mode_i]))
10407 1027060 : == vector_modes[mode_i]))
10408 : {
10409 513530 : if (dump_enabled_p ())
10410 205 : dump_printf_loc (MSG_NOTE, vect_location,
10411 : "***** Skipping vector mode %s, which would"
10412 : " repeat the analysis for %s\n",
10413 205 : GET_MODE_NAME (vector_modes[mode_i]),
10414 205 : GET_MODE_NAME (autodetected_vector_mode));
10415 513530 : mode_i += 1;
10416 : }
10417 :
10418 2183721 : if (vectorized
10419 2025278 : || mode_i == vector_modes.length ()
10420 1847570 : || autodetected_vector_mode == VOIDmode
10421 : /* If vect_slp_analyze_bb_1 signaled that analysis for all
10422 : vector sizes will fail do not bother iterating. */
10423 3019262 : || fatal)
10424 3724316 : return vectorized;
10425 :
10426 : /* Try the next biggest vector size. */
10427 321563 : next_vector_mode = vector_modes[mode_i++];
10428 321563 : if (dump_enabled_p ())
10429 219 : dump_printf_loc (MSG_NOTE, vect_location,
10430 : "***** Re-trying analysis with vector mode %s\n",
10431 219 : GET_MODE_NAME (next_vector_mode));
10432 321563 : }
10433 1862158 : }
10434 :
10435 :
10436 : /* Main entry for the BB vectorizer. Analyze and transform BBS, returns
10437 : true if anything in the basic-block was vectorized. */
10438 :
10439 : static bool
10440 1862158 : vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop)
10441 : {
10442 1862158 : vec<data_reference_p> datarefs = vNULL;
10443 1862158 : auto_vec<int> dataref_groups;
10444 1862158 : int insns = 0;
10445 1862158 : int current_group = 0;
10446 :
10447 12338273 : for (unsigned i = 0; i < bbs.length (); i++)
10448 : {
10449 10476115 : basic_block bb = bbs[i];
10450 87740178 : for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
10451 77264063 : gsi_next (&gsi))
10452 : {
10453 77264063 : gimple *stmt = gsi_stmt (gsi);
10454 77264063 : if (is_gimple_debug (stmt))
10455 47780501 : continue;
10456 :
10457 29483562 : insns++;
10458 :
10459 29483562 : if (gimple_location (stmt) != UNKNOWN_LOCATION)
10460 26448219 : vect_location = stmt;
10461 :
10462 29483562 : if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
10463 : &dataref_groups, current_group))
10464 5061225 : ++current_group;
10465 : }
10466 : /* New BBs always start a new DR group. */
10467 10476115 : ++current_group;
10468 : }
10469 :
10470 1862158 : return vect_slp_region (bbs, datarefs, &dataref_groups, insns, orig_loop);
10471 1862158 : }
10472 :
10473 : /* Special entry for the BB vectorizer. Analyze and transform a single
10474 : if-converted BB with ORIG_LOOPs body being the not if-converted
10475 : representation. Returns true if anything in the basic-block was
10476 : vectorized. */
10477 :
10478 : bool
10479 19359 : vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop)
10480 : {
10481 19359 : auto_vec<basic_block> bbs;
10482 19359 : bbs.safe_push (bb);
10483 19359 : return vect_slp_bbs (bbs, orig_loop);
10484 19359 : }
10485 :
10486 : /* Main entry for the BB vectorizer. Analyze and transform BB, returns
10487 : true if anything in the basic-block was vectorized. */
10488 :
10489 : bool
10490 905907 : vect_slp_function (function *fun)
10491 : {
10492 905907 : bool r = false;
10493 905907 : int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
10494 905907 : auto_bitmap exit_bbs;
10495 905907 : bitmap_set_bit (exit_bbs, EXIT_BLOCK);
10496 905907 : edge entry = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (fun));
10497 905907 : unsigned n = rev_post_order_and_mark_dfs_back_seme (fun, entry, exit_bbs,
10498 905907 : true, rpo, NULL);
10499 :
10500 : /* For the moment split the function into pieces to avoid making
10501 : the iteration on the vector mode moot. Split at points we know
10502 : to not handle well which is CFG merges (SLP discovery doesn't
10503 : handle non-loop-header PHIs) and loop exits. Since pattern
10504 : recog requires reverse iteration to visit uses before defs
10505 : simply chop RPO into pieces. */
10506 905907 : auto_vec<basic_block> bbs;
10507 11393593 : for (unsigned i = 0; i < n; i++)
10508 : {
10509 10487686 : basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
10510 10487686 : bool split = false;
10511 :
10512 : /* Split when a BB is not dominated by the first block. */
10513 19767128 : if (!bbs.is_empty ()
10514 9279442 : && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
10515 : {
10516 654390 : if (dump_enabled_p ())
10517 146 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10518 : "splitting region at dominance boundary bb%d\n",
10519 : bb->index);
10520 : split = true;
10521 : }
10522 : /* Split when the loop determined by the first block
10523 : is exited. This is because we eventually insert
10524 : invariants at region begin. */
10525 18458348 : else if (!bbs.is_empty ()
10526 8625052 : && bbs[0]->loop_father != bb->loop_father
10527 2267412 : && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
10528 : {
10529 3827 : if (dump_enabled_p ())
10530 6 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10531 : "splitting region at loop %d exit at bb%d\n",
10532 3 : bbs[0]->loop_father->num, bb->index);
10533 : split = true;
10534 : }
10535 9829469 : else if (!bbs.is_empty ()
10536 8621225 : && bb->loop_father->header == bb
10537 468143 : && bb->loop_father->dont_vectorize)
10538 : {
10539 7268 : if (dump_enabled_p ())
10540 72 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10541 : "splitting region at dont-vectorize loop %d "
10542 : "entry at bb%d\n",
10543 : bb->loop_father->num, bb->index);
10544 : split = true;
10545 : }
10546 :
10547 11153171 : if (split && !bbs.is_empty ())
10548 : {
10549 665485 : r |= vect_slp_bbs (bbs, NULL);
10550 665485 : bbs.truncate (0);
10551 : }
10552 :
10553 10487686 : if (bbs.is_empty ())
10554 : {
10555 : /* We need to be able to insert at the head of the region which
10556 : we cannot for region starting with a returns-twice call. */
10557 1873729 : if (gcall *first = safe_dyn_cast <gcall *> (first_stmt (bb)))
10558 398869 : if (gimple_call_flags (first) & ECF_RETURNS_TWICE)
10559 : {
10560 306 : if (dump_enabled_p ())
10561 2 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10562 : "skipping bb%d as start of region as it "
10563 : "starts with returns-twice call\n",
10564 : bb->index);
10565 30930 : continue;
10566 : }
10567 : /* If the loop this BB belongs to is marked as not to be vectorized
10568 : honor that also for BB vectorization. */
10569 1873423 : if (bb->loop_father->dont_vectorize)
10570 30624 : continue;
10571 : }
10572 :
10573 10456756 : bbs.safe_push (bb);
10574 :
10575 : /* When we have a stmt ending this block and defining a
10576 : value we have to insert on edges when inserting after it for
10577 : a vector containing its definition. Avoid this for now. */
10578 20913512 : if (gimple *last = *gsi_last_bb (bb))
10579 8475831 : if (gimple_get_lhs (last)
10580 8475831 : && is_ctrl_altering_stmt (last))
10581 : {
10582 271414 : if (dump_enabled_p ())
10583 2 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10584 : "splitting region at control altering "
10585 : "definition %G", last);
10586 271414 : r |= vect_slp_bbs (bbs, NULL);
10587 271414 : bbs.truncate (0);
10588 : }
10589 : }
10590 :
10591 905907 : if (!bbs.is_empty ())
10592 905900 : r |= vect_slp_bbs (bbs, NULL);
10593 :
10594 905907 : free (rpo);
10595 :
10596 905907 : return r;
10597 905907 : }
10598 :
10599 : /* Build a variable-length vector in which the elements in ELTS are repeated
10600 : to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
10601 : RESULTS and add any new instructions to SEQ.
10602 :
10603 : The approach we use is:
10604 :
10605 : (1) Find a vector mode VM with integer elements of mode IM.
10606 :
10607 : (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
10608 : ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
10609 : from small vectors to IM.
10610 :
10611 : (3) Duplicate each ELTS'[I] into a vector of mode VM.
10612 :
10613 : (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
10614 : correct byte contents.
10615 :
10616 : (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
10617 :
10618 : We try to find the largest IM for which this sequence works, in order
10619 : to cut down on the number of interleaves. */
10620 :
10621 : void
10622 0 : duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
10623 : const vec<tree> &elts, unsigned int nresults,
10624 : vec<tree> &results)
10625 : {
10626 0 : unsigned int nelts = elts.length ();
10627 0 : tree element_type = TREE_TYPE (vector_type);
10628 :
10629 : /* (1) Find a vector mode VM with integer elements of mode IM. */
10630 0 : unsigned int nvectors = 1;
10631 0 : tree new_vector_type;
10632 0 : tree permutes[2];
10633 0 : if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
10634 : &nvectors, &new_vector_type,
10635 : permutes))
10636 0 : gcc_unreachable ();
10637 :
10638 : /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
10639 0 : unsigned int partial_nelts = nelts / nvectors;
10640 0 : tree partial_vector_type = build_vector_type (element_type, partial_nelts);
10641 :
10642 0 : tree_vector_builder partial_elts;
10643 0 : auto_vec<tree, 32> pieces (nvectors * 2);
10644 0 : pieces.quick_grow_cleared (nvectors * 2);
10645 0 : for (unsigned int i = 0; i < nvectors; ++i)
10646 : {
10647 : /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
10648 : ELTS' has mode IM. */
10649 0 : partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
10650 0 : for (unsigned int j = 0; j < partial_nelts; ++j)
10651 0 : partial_elts.quick_push (elts[i * partial_nelts + j]);
10652 0 : tree t = gimple_build_vector (seq, &partial_elts);
10653 0 : t = gimple_build (seq, VIEW_CONVERT_EXPR,
10654 0 : TREE_TYPE (new_vector_type), t);
10655 :
10656 : /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
10657 0 : pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
10658 : }
10659 :
10660 : /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
10661 : correct byte contents.
10662 :
10663 : Conceptually, we need to repeat the following operation log2(nvectors)
10664 : times, where hi_start = nvectors / 2:
10665 :
10666 : out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
10667 : out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
10668 :
10669 : However, if each input repeats every N elements and the VF is
10670 : a multiple of N * 2, the HI result is the same as the LO result.
10671 : This will be true for the first N1 iterations of the outer loop,
10672 : followed by N2 iterations for which both the LO and HI results
10673 : are needed. I.e.:
10674 :
10675 : N1 + N2 = log2(nvectors)
10676 :
10677 : Each "N1 iteration" doubles the number of redundant vectors and the
10678 : effect of the process as a whole is to have a sequence of nvectors/2**N1
10679 : vectors that repeats 2**N1 times. Rather than generate these redundant
10680 : vectors, we halve the number of vectors for each N1 iteration. */
10681 : unsigned int in_start = 0;
10682 : unsigned int out_start = nvectors;
10683 : unsigned int new_nvectors = nvectors;
10684 0 : for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
10685 : {
10686 0 : unsigned int hi_start = new_nvectors / 2;
10687 0 : unsigned int out_i = 0;
10688 0 : for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i)
10689 : {
10690 0 : if ((in_i & 1) != 0
10691 0 : && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
10692 : 2 * in_repeat))
10693 0 : continue;
10694 :
10695 0 : tree output = make_ssa_name (new_vector_type);
10696 0 : tree input1 = pieces[in_start + (in_i / 2)];
10697 0 : tree input2 = pieces[in_start + (in_i / 2) + hi_start];
10698 0 : gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
10699 : input1, input2,
10700 : permutes[in_i & 1]);
10701 0 : gimple_seq_add_stmt (seq, stmt);
10702 0 : pieces[out_start + out_i] = output;
10703 0 : out_i += 1;
10704 : }
10705 0 : std::swap (in_start, out_start);
10706 0 : new_nvectors = out_i;
10707 : }
10708 :
10709 : /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
10710 0 : results.reserve (nresults);
10711 0 : for (unsigned int i = 0; i < nresults; ++i)
10712 0 : if (i < new_nvectors)
10713 0 : results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
10714 0 : pieces[in_start + i]));
10715 : else
10716 0 : results.quick_push (results[i - new_nvectors]);
10717 0 : }
10718 :
10719 :
10720 : /* For constant and loop invariant defs in OP_NODE this function creates
10721 : vector defs that will be used in the vectorized stmts and stores them
10722 : to SLP_TREE_VEC_DEFS of OP_NODE. */
10723 :
/* Implementation overview: the scalar operands of OP_NODE are walked
   NUMBER_OF_COPIES times, each time from the last operand to the first,
   and stored into ELTS from its last slot downwards.  Whenever ELTS is
   full, one vector def is built from it (as a splat, as a plain vector
   build, or via duplicate_and_interleave), any statements created for it
   are inserted into the IL, and the def is queued in VOPRNDS.  VOPRNDS is
   reversed into SLP_TREE_VEC_DEFS at the end.  */
10724 : static void
10725 490404 : vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
10726 : {
10727 490404 : unsigned HOST_WIDE_INT nunits;
10728 490404 : tree vec_cst;
10729 490404 : unsigned j, number_of_places_left_in_vector;
10730 490404 : tree vector_type;
10731 490404 : tree vop;
10732 490404 : int group_size = op_node->ops.length ();
10733 490404 : unsigned int vec_num, i;
10734 490404 : unsigned number_of_copies = 1;
10735 490404 : bool constant_p;
/* Statements needed to build the vector currently being assembled.  */
10736 490404 : gimple_seq ctor_seq = NULL;
/* Filled at most once, by the first duplicate_and_interleave call.  */
10737 490404 : auto_vec<tree, 16> permute_results;
10738 :
10739 : /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
10740 490404 : vector_type = SLP_TREE_VECTYPE (op_node);
10741 :
10742 490404 : unsigned int number_of_vectors = vect_get_num_copies (vinfo, op_node);
10743 490404 : SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
10744 490404 : auto_vec<tree> voprnds (number_of_vectors);
10745 :
10746 : /* NUMBER_OF_COPIES is the number of times we need to use the same values in
10747 : created vectors. It is greater than 1 if unrolling is performed.
10748 :
10749 : For example, we have two scalar operands, s1 and s2 (e.g., group of
10750 : strided accesses of size two), while NUNITS is four (i.e., four scalars
10751 : of this type can be packed in a vector). The output vector will contain
10752 : two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
10753 : will be 2).
10754 :
10755 : If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
10756 : containing the operands.
10757 :
10758 : For example, NUNITS is four as before, and the group size is 8
10759 : (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
10760 : {s5, s6, s7, s8}. */
10761 :
10762 : /* When using duplicate_and_interleave, we just need one element for
10763 : each scalar statement. */
10764 490404 : if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
10765 : nunits = group_size;
10766 :
10767 490404 : number_of_copies = nunits * number_of_vectors / group_size;
10768 :
10769 490404 : number_of_places_left_in_vector = nunits;
10770 490404 : constant_p = true;
/* UNIFORM_ELT stays non-null only while every operand placed into the
   current vector so far is equal to the first one placed.  */
10771 490404 : tree uniform_elt = NULL_TREE;
10772 490404 : tree_vector_builder elts (vector_type, nunits, 1);
10773 490404 : elts.quick_grow (nunits);
10774 490404 : stmt_vec_info insert_after = NULL;
10775 1463553 : for (j = 0; j < number_of_copies; j++)
10776 : {
10777 973149 : tree op;
/* I is unsigned: the loop relies on ops.iterate () failing once I has
   been decremented past zero and wrapped around.  */
10778 3731000 : for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
10779 : {
10780 : /* Create 'vect_ = {op0,op1,...,opn}'. */
10781 1784702 : tree orig_op = op;
10782 1784702 : if (number_of_places_left_in_vector == nunits)
10783 : uniform_elt = op;
10784 1164880 : else if (uniform_elt && operand_equal_p (uniform_elt, op))
/* Same value as the previously placed operand: reuse the element that was
   stored for it (the slot filled last), which already has the vector
   element type, so the conversion below is skipped.  */
10785 741174 : op = elts[number_of_places_left_in_vector];
10786 : else
10787 : uniform_elt = NULL_TREE;
10788 1784702 : number_of_places_left_in_vector--;
10789 1784702 : if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
10790 : {
/* Constants are converted by folding; anything else needs a conversion
   statement, which is queued on CTOR_SEQ.  */
10791 276022 : if (CONSTANT_CLASS_P (op))
10792 : {
10793 100629 : if (VECTOR_BOOLEAN_TYPE_P (vector_type))
10794 : {
10795 : /* Can't use VIEW_CONVERT_EXPR for booleans because
10796 : of possibly different sizes of scalar value and
10797 : vector element. */
10798 51 : if (integer_zerop (op))
10799 51 : op = build_int_cst (TREE_TYPE (vector_type), 0);
10800 0 : else if (integer_onep (op))
10801 0 : op = build_all_ones_cst (TREE_TYPE (vector_type));
10802 : else
10803 0 : gcc_unreachable ();
10804 : }
10805 : else
10806 100578 : op = fold_unary (VIEW_CONVERT_EXPR,
10807 : TREE_TYPE (vector_type), op);
10808 100629 : gcc_assert (op && CONSTANT_CLASS_P (op));
10809 : }
10810 : else
10811 : {
10812 175393 : tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
10813 175393 : gimple *init_stmt;
10814 175393 : if (VECTOR_BOOLEAN_TYPE_P (vector_type))
10815 : {
/* For boolean vectors select between the all-ones and the zero element
   value depending on OP.  */
10816 418 : tree true_val
10817 418 : = build_all_ones_cst (TREE_TYPE (vector_type));
10818 418 : tree false_val
10819 418 : = build_zero_cst (TREE_TYPE (vector_type));
10820 418 : gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
10821 418 : init_stmt = gimple_build_assign (new_temp, COND_EXPR,
10822 : op, true_val,
10823 : false_val);
10824 : }
10825 : else
10826 : {
10827 174975 : op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
10828 : op);
10829 174975 : init_stmt
10830 174975 : = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
10831 : op);
10832 : }
10833 175393 : gimple_seq_add_stmt (&ctor_seq, init_stmt);
10834 175393 : op = new_temp;
10835 : }
10836 : }
10837 1784702 : elts[number_of_places_left_in_vector] = op;
10838 1784702 : if (!CONSTANT_CLASS_P (op))
10839 314767 : constant_p = false;
10840 : /* For BB vectorization we have to compute an insert location
10841 : when a def is inside the analyzed region since we cannot
10842 : simply insert at the BB start in this case. */
10843 1784702 : stmt_vec_info opdef;
10844 1784702 : if (TREE_CODE (orig_op) == SSA_NAME
10845 179637 : && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
10846 159703 : && is_a <bb_vec_info> (vinfo)
10847 1886772 : && (opdef = vinfo->lookup_def (orig_op)))
10848 : {
/* INSERT_AFTER accumulates, via get_later_stmt, over the defining
   statements of all operands of the current vector.  */
10849 83065 : if (!insert_after)
10850 : insert_after = opdef;
10851 : else
10852 45681 : insert_after = get_later_stmt (insert_after, opdef);
10853 : }
10854 :
10855 1784702 : if (number_of_places_left_in_vector == 0)
10856 : {
/* ELTS is full.  Build the vector def: a splat when all elements were
   equal; a direct vector build when the vector length is a multiple of
   NUNITS (constant elements) or equal to NUNITS (otherwise); else fall
   back to duplicate_and_interleave.  */
10857 619822 : auto type_nunits = TYPE_VECTOR_SUBPARTS (vector_type);
10858 619822 : if (uniform_elt)
10859 646322 : vec_cst = gimple_build_vector_from_val (&ctor_seq, vector_type,
10860 323161 : elts[0]);
10861 593322 : else if (constant_p
10862 593322 : ? multiple_p (type_nunits, nunits)
10863 108627 : : known_eq (type_nunits, nunits))
10864 296661 : vec_cst = gimple_build_vector (&ctor_seq, &elts);
10865 : else
10866 : {
/* All NUMBER_OF_VECTORS results are produced by the first call and then
   picked back to front, presumably because VOPRNDS is reversed at the
   end of this function.  */
10867 0 : if (permute_results.is_empty ())
10868 0 : duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
10869 : elts, number_of_vectors,
10870 : permute_results);
10871 0 : vec_cst = permute_results[number_of_vectors - j - 1];
10872 : }
10873 619822 : if (!gimple_seq_empty_p (ctor_seq))
10874 : {
/* Emit the queued statements: after the latest in-region operand def if
   there is one, otherwise on entry.  */
10875 135922 : if (insert_after)
10876 : {
10877 37384 : gimple_stmt_iterator gsi;
10878 37384 : if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
10879 : {
10880 624 : gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
10881 624 : gsi_insert_seq_before (&gsi, ctor_seq,
10882 : GSI_CONTINUE_LINKING);
10883 : }
10884 36760 : else if (!stmt_ends_bb_p (insert_after->stmt))
10885 : {
10886 36760 : gsi = gsi_for_stmt (insert_after->stmt);
10887 36760 : gsi_insert_seq_after (&gsi, ctor_seq,
10888 : GSI_CONTINUE_LINKING);
10889 : }
10890 : else
10891 : {
10892 : /* When we want to insert after a def where the
10893 : defining stmt throws then insert on the fallthru
10894 : edge. */
10895 0 : edge e = find_fallthru_edge
10896 0 : (gimple_bb (insert_after->stmt)->succs);
10897 0 : basic_block new_bb
10898 0 : = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
10899 0 : gcc_assert (!new_bb);
10900 : }
10901 : }
10902 : else
10903 98538 : vinfo->insert_seq_on_entry (NULL, ctor_seq);
10904 135922 : ctor_seq = NULL;
10905 : }
/* Queue the def and reset the per-vector state for the next vector.  */
10906 619822 : voprnds.quick_push (vec_cst);
10907 619822 : insert_after = NULL;
10908 619822 : number_of_places_left_in_vector = nunits;
10909 619822 : constant_p = true;
10910 619822 : elts.new_vector (vector_type, nunits, 1);
10911 619822 : elts.quick_grow (nunits);
10912 : }
10913 : }
10914 : }
10915 :
10916 : /* Since the vectors are created in the reverse order, we should invert
10917 : them. */
10918 490404 : vec_num = voprnds.length ();
10919 1110226 : for (j = vec_num; j != 0; j--)
10920 : {
10921 619822 : vop = voprnds[j - 1];
10922 619822 : SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
10923 : }
10924 :
10925 : /* In case that VF is greater than the unrolling factor needed for the SLP
10926 : group of stmts, NUMBER_OF_VECTORS to be created is greater than
10927 : NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
10928 : to replicate the vectors. */
/* Each pass of the inner loop appends another copy of the first VEC_NUM
   defs.  */
10929 490404 : while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
10930 490404 : for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
10931 : i++)
10932 0 : SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
10933 490404 : }
10934 :
10935 : /* Get the scalar definition of the Nth lane from SLP_NODE or NULL_TREE
10936 : if there is no definition for it in the scalar IL or it is not known. */
10937 :
10938 : tree
10939 2665 : vect_get_slp_scalar_def (slp_tree slp_node, unsigned n)
10940 : {
10941 2665 : if (SLP_TREE_DEF_TYPE (slp_node) == vect_internal_def)
10942 : {
10943 2653 : if (!SLP_TREE_SCALAR_STMTS (slp_node).exists ())
10944 : return NULL_TREE;
10945 2653 : stmt_vec_info def = SLP_TREE_SCALAR_STMTS (slp_node)[n];
10946 2653 : if (!def)
10947 : return NULL_TREE;
10948 2653 : return gimple_get_lhs (STMT_VINFO_STMT (def));
10949 : }
10950 : else
10951 12 : return SLP_TREE_SCALAR_OPS (slp_node)[n];
10952 : }
10953 :
10954 : /* Get the Ith vectorized definition from SLP_NODE. */
10955 :
10956 : tree
10957 146086 : vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
10958 : {
10959 146086 : return SLP_TREE_VEC_DEFS (slp_node)[i];
10960 : }
10961 :
10962 : /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
10963 :
10964 : void
10965 931394 : vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
10966 : {
10967 1862788 : vec_defs->create (SLP_TREE_VEC_DEFS (slp_node).length ());
10968 931394 : vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
10969 931394 : }
10970 :
10971 : /* Get N vectorized definitions for SLP_NODE. */
10972 :
10973 : void
10974 2943 : vect_get_slp_defs (vec_info *,
10975 : slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
10976 : {
10977 2943 : if (n == -1U)
10978 2943 : n = SLP_TREE_CHILDREN (slp_node).length ();
10979 :
10980 10619 : for (unsigned i = 0; i < n; ++i)
10981 : {
10982 7676 : slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
10983 7676 : vec<tree> vec_defs = vNULL;
10984 7676 : vect_get_slp_defs (child, &vec_defs);
10985 7676 : vec_oprnds->quick_push (vec_defs);
10986 : }
10987 2943 : }
10988 :
10989 : /* A subroutine of vect_transform_slp_perm_load with two extra arguments:
10990 : - PERM gives the permutation that the caller wants to use for NODE,
10991 : which might be different from SLP_LOAD_PERMUTATION.
10992 : - DUMP_P controls whether the function dumps information. */
10993 :
/* Summary of the remaining parameters, as used below:
   DR_CHAIN     - the vector defs that serve as permute inputs, indexed by
                  the vector numbers computed from the input lanes.
   VF           - vectorization factor; it and the number of vector units
                  must be constant unless the "repeating" path is taken.
   ANALYZE_ONLY - when true no statements are created; only *N_PERMS and
                  *N_LOADS are computed and support is checked.
   N_PERMS      - always written: number of permutes required.
   N_LOADS      - if nonnull, receives the number of input vectors that
                  are actually referenced.
   DCE_CHAIN    - when true, DR_CHAIN defs that were never used as a
                  permute input are removed at the end.
   Returns false if the permutation cannot be carried out; the asserts
   below expect that to happen only when ANALYZE_ONLY.  */
10994 : static bool
10995 121916 : vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node,
10996 : load_permutation_t &perm,
10997 : const vec<tree> &dr_chain,
10998 : gimple_stmt_iterator *gsi, poly_uint64 vf,
10999 : bool analyze_only, bool dump_p,
11000 : unsigned *n_perms, unsigned int *n_loads,
11001 : bool dce_chain)
11002 : {
11003 121916 : stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
11004 121916 : int vec_index = 0;
11005 121916 : tree vectype = SLP_TREE_VECTYPE (node);
11006 121916 : unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
11007 121916 : unsigned int mask_element;
11008 121916 : unsigned dr_group_size;
11009 121916 : machine_mode mode;
11010 :
11011 121916 : if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
11012 : {
11013 : /* We have both splats of the same non-grouped load and groups
11014 : of distinct invariant loads entering here. */
/* Without a DR group, use the largest index in PERM plus one as the
   group size.  */
11015 1483 : unsigned max_idx = 0;
11016 8219 : for (auto idx : perm)
11017 3770 : max_idx = idx > max_idx ? idx : max_idx;
11018 1483 : dr_group_size = max_idx + 1;
11019 : }
11020 : else
11021 : {
11022 120433 : stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
11023 120433 : dr_group_size = DR_GROUP_SIZE (stmt_info);
11024 : }
11025 :
11026 121916 : mode = TYPE_MODE (vectype);
11027 121916 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
11028 121916 : unsigned int nstmts = vect_get_num_copies (vinfo, node);
11029 :
11030 : /* Initialize the vect stmts of NODE to properly insert the generated
11031 : stmts later. */
11032 121916 : if (! analyze_only)
11033 58108 : for (unsigned i = SLP_TREE_VEC_DEFS (node).length (); i < nstmts; i++)
11034 22374 : SLP_TREE_VEC_DEFS (node).quick_push (NULL_TREE);
11035 :
11036 : /* Generate permutation masks for every NODE. Number of masks for each NODE
11037 : is equal to GROUP_SIZE.
11038 : E.g., we have a group of three nodes with three loads from the same
11039 : location in each node, and the vector size is 4. I.e., we have a
11040 : a0b0c0a1b1c1... sequence and we need to create the following vectors:
11041 : for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
11042 : for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
11043 : ...
11044 :
11045 : The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
11046 : The last mask is illegal since we assume two operands for permute
11047 : operation, and the mask element values can't be outside that range.
11048 : Hence, the last mask must be converted into {2,5,5,5}.
11049 : For the first two permutations we need the first and the second input
11050 : vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
11051 : we need the second and the third vectors: {b1,c1,a2,b2} and
11052 : {c2,a3,b3,c3}. */
11053 :
/* State of the mask currently being assembled: INDEX is the next mask
   slot, FIRST_VEC_INDEX / SECOND_VEC_INDEX are the input vectors it
   refers to (-1 while unset) and NOOP_P stays true while the mask is the
   identity.  All four are reset after each completed mask.  */
11054 121916 : int vect_stmts_counter = 0;
11055 121916 : unsigned int index = 0;
11056 121916 : int first_vec_index = -1;
11057 121916 : int second_vec_index = -1;
11058 121916 : bool noop_p = true;
11059 121916 : *n_perms = 0;
11060 :
11061 121916 : vec_perm_builder mask;
11062 121916 : unsigned int nelts_to_build;
11063 121916 : unsigned int nvectors_per_build;
11064 121916 : unsigned int in_nlanes;
11065 121916 : bool repeating_p = (group_size == dr_group_size
11066 154682 : && multiple_p (nunits, group_size));
11067 121916 : if (repeating_p)
11068 : {
11069 : /* A single vector contains a whole number of copies of the node, so:
11070 : (a) all permutes can use the same mask; and
11071 : (b) the permutes only need a single vector input. */
11072 30533 : mask.new_vector (nunits, group_size, 3);
11073 30533 : nelts_to_build = mask.encoded_nelts ();
11074 : /* It's possible to obtain zero nstmts during analyze_only, so make
11075 : it at least one to ensure the later computation for n_perms
11076 : proceed. */
11077 30533 : nvectors_per_build = nstmts > 0 ? nstmts : 1;
11078 30533 : in_nlanes = dr_group_size * 3;
11079 : }
11080 : else
11081 : {
11082 : /* We need to construct a separate mask for each vector statement. */
11083 91383 : unsigned HOST_WIDE_INT const_nunits, const_vf;
11084 91383 : if (!nunits.is_constant (&const_nunits)
11085 91383 : || !vf.is_constant (&const_vf))
11086 : return false;
11087 91383 : mask.new_vector (const_nunits, const_nunits, 1);
11088 91383 : nelts_to_build = const_vf * group_size;
11089 91383 : nvectors_per_build = 1;
11090 91383 : in_nlanes = const_vf * dr_group_size;
11091 : }
/* USED_IN_LANES records which input lanes are read (for *N_LOADS);
   USED_DEFS records which DR_CHAIN entries are used (for DCE_CHAIN).  */
11092 121916 : auto_sbitmap used_in_lanes (in_nlanes);
11093 121916 : bitmap_clear (used_in_lanes);
11094 121916 : auto_bitmap used_defs;
11095 :
11096 121916 : unsigned int count = mask.encoded_nelts ();
11097 121916 : mask.quick_grow (count);
11098 121916 : vec_perm_indices indices;
11099 :
11100 658613 : for (unsigned int j = 0; j < nelts_to_build; j++)
11101 : {
/* I is the input lane feeding output element J: PERM applied to the lane
   within the group, offset by whole groups for later iterations.  */
11102 546341 : unsigned int iter_num = j / group_size;
11103 546341 : unsigned int stmt_num = j % group_size;
11104 546341 : unsigned int i = (iter_num * dr_group_size + perm[stmt_num]);
11105 546341 : bitmap_set_bit (used_in_lanes, i);
11106 546341 : if (repeating_p)
11107 : {
11108 : first_vec_index = 0;
11109 : mask_element = i;
11110 : }
11111 : else
11112 : {
11113 : /* Enforced before the loop when !repeating_p. */
11114 348647 : unsigned int const_nunits = nunits.to_constant ();
11115 348647 : vec_index = i / const_nunits;
11116 348647 : mask_element = i % const_nunits;
/* Bind the lane's input vector to the first or second permute operand;
   lanes of the second operand are numbered from CONST_NUNITS.  A third
   distinct input vector cannot be handled.  */
11117 348647 : if (vec_index == first_vec_index
11118 348647 : || first_vec_index == -1)
11119 : {
11120 : first_vec_index = vec_index;
11121 : }
11122 140107 : else if (vec_index == second_vec_index
11123 140107 : || second_vec_index == -1)
11124 : {
11125 134004 : second_vec_index = vec_index;
11126 134004 : mask_element += const_nunits;
11127 : }
11128 : else
11129 : {
11130 6103 : if (dump_p)
11131 280 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11132 : "permutation requires at "
11133 : "least three vectors %G",
11134 : stmt_info->stmt);
/* Failing here is only expected during analysis.  */
11135 6103 : gcc_assert (analyze_only);
11136 : return false;
11137 : }
11138 :
11139 342544 : gcc_assert (mask_element < 2 * const_nunits);
11140 : }
11141 :
11142 540238 : if (mask_element != index)
11143 351299 : noop_p = false;
11144 540238 : mask[index++] = mask_element;
11145 :
11146 540238 : if (index == count)
11147 : {
/* A complete mask has been collected.  */
11148 145237 : if (!noop_p)
11149 : {
11150 199782 : indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
11151 117990 : if (!can_vec_perm_const_p (mode, mode, indices))
11152 : {
11153 3541 : if (dump_p)
11154 : {
11155 79 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11156 : "unsupported vect permute { ");
/* Reuses the loop-local I as a scratch index; we return right after.  */
11157 669 : for (i = 0; i < count; ++i)
11158 : {
11159 590 : dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
11160 590 : dump_printf (MSG_MISSED_OPTIMIZATION, " ");
11161 : }
11162 79 : dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
11163 : }
/* Failing here is only expected during analysis.  */
11164 3541 : gcc_assert (analyze_only);
11165 : return false;
11166 : }
11167 :
11168 114449 : tree mask_vec = NULL_TREE;
11169 114449 : if (!analyze_only)
11170 20684 : mask_vec = vect_gen_perm_mask_checked (vectype, indices);
11171 :
/* Single-input permute: use the same vector for both operands.  */
11172 114449 : if (second_vec_index == -1)
11173 34248 : second_vec_index = first_vec_index;
11174 :
11175 231772 : for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
11176 : {
11177 117323 : ++*n_perms;
11178 117323 : if (analyze_only)
11179 96356 : continue;
11180 : /* Generate the permute statement if necessary. */
11181 20967 : tree first_vec = dr_chain[first_vec_index + ri];
11182 20967 : tree second_vec = dr_chain[second_vec_index + ri];
11183 20967 : gassign *stmt = as_a<gassign *> (stmt_info->stmt);
11184 20967 : tree perm_dest
11185 20967 : = vect_create_destination_var (gimple_assign_lhs (stmt),
11186 : vectype);
11187 20967 : perm_dest = make_ssa_name (perm_dest);
11188 20967 : gimple *perm_stmt
11189 20967 : = gimple_build_assign (perm_dest, VEC_PERM_EXPR, first_vec,
11190 : second_vec, mask_vec);
11191 20967 : vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
11192 : gsi);
11193 20967 : if (dce_chain)
11194 : {
11195 20044 : bitmap_set_bit (used_defs, first_vec_index + ri);
11196 20044 : bitmap_set_bit (used_defs, second_vec_index + ri);
11197 : }
11198 :
11199 : /* Store the vector statement in NODE. */
11200 20967 : SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = perm_dest;
11201 : }
11202 : }
11203 27247 : else if (!analyze_only)
11204 : {
/* Identity mask: no permute statement is needed (and *N_PERMS is not
   incremented); the input vector is forwarded as the result.  */
11205 2814 : for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
11206 : {
11207 1407 : tree first_vec = dr_chain[first_vec_index + ri];
11208 : /* If mask was NULL_TREE generate the requested
11209 : identity transform. */
11210 1407 : if (dce_chain)
11211 1400 : bitmap_set_bit (used_defs, first_vec_index + ri);
11212 :
11213 : /* Store the vector statement in NODE. */
11214 1407 : SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = first_vec;
11215 : }
11216 : }
11217 :
/* Start collecting the next mask.  */
11218 : index = 0;
11219 : first_vec_index = -1;
11220 : second_vec_index = -1;
11221 : noop_p = true;
11222 : }
11223 : }
11224 :
11225 112272 : if (n_loads)
11226 : {
11227 80455 : if (repeating_p)
11228 10468 : *n_loads = nstmts;
11229 : else
11230 : {
11231 : /* Enforced above when !repeating_p. */
11232 69987 : unsigned int const_nunits = nunits.to_constant ();
11233 69987 : *n_loads = 0;
11234 69987 : bool load_seen = false;
/* Count the CONST_NUNITS-sized chunks of input lanes that contain at
   least one used lane.  */
11235 979763 : for (unsigned i = 0; i < in_nlanes; ++i)
11236 : {
11237 909776 : if (i % const_nunits == 0)
11238 : {
11239 383863 : if (load_seen)
11240 109879 : *n_loads += 1;
11241 : load_seen = false;
11242 : }
11243 909776 : if (bitmap_bit_p (used_in_lanes, i))
11244 252311 : load_seen = true;
11245 : }
11246 69987 : if (load_seen)
11247 48312 : *n_loads += 1;
11248 : }
11249 : }
11250 :
/* Remove the defining statements of DR_CHAIN entries that were never
   used above.  When such a statement is a VIEW_CONVERT_EXPR or
   CONSTRUCTOR assignment, continue with the def returned for it by
   single_ssa_tree_operand and remove that one's definition too.  */
11251 112272 : if (dce_chain)
11252 212493 : for (unsigned i = 0; i < dr_chain.length (); ++i)
11253 73503 : if (!bitmap_bit_p (used_defs, i))
11254 : {
11255 40635 : tree def = dr_chain[i];
11256 41018 : do
11257 : {
11258 41018 : gimple *stmt = SSA_NAME_DEF_STMT (def);
11259 41018 : if (is_gimple_assign (stmt)
11260 41018 : && (gimple_assign_rhs_code (stmt) == VIEW_CONVERT_EXPR
11261 41018 : || gimple_assign_rhs_code (stmt) == CONSTRUCTOR))
11262 4952 : def = single_ssa_tree_operand (stmt, SSA_OP_USE);
11263 : else
11264 : def = NULL;
11265 41018 : gimple_stmt_iterator rgsi = gsi_for_stmt (stmt);
11266 41018 : gsi_remove (&rgsi, true);
11267 41018 : release_defs (stmt);
11268 : }
11269 41018 : while (def);
11270 : }
11271 :
11272 : return true;
11273 121916 : }
11274 :
11275 : /* Generate vector permute statements from a list of loads in DR_CHAIN.
11276 : If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
11277 : permute statements for the SLP node NODE. Store the number of vector
11278 : permute instructions in *N_PERMS and the number of vector load
11279 : instructions in *N_LOADS. If DCE_CHAIN is true, remove all definitions
11280 : that were not needed. */
11281 :
11282 : bool
11283 89389 : vect_transform_slp_perm_load (vec_info *vinfo,
11284 : slp_tree node, const vec<tree> &dr_chain,
11285 : gimple_stmt_iterator *gsi, poly_uint64 vf,
11286 : bool analyze_only, unsigned *n_perms,
11287 : unsigned int *n_loads, bool dce_chain)
11288 : {
11289 89389 : return vect_transform_slp_perm_load_1 (vinfo, node,
11290 89389 : SLP_TREE_LOAD_PERMUTATION (node),
11291 : dr_chain, gsi, vf, analyze_only,
11292 : dump_enabled_p (), n_perms, n_loads,
11293 89389 : dce_chain);
11294 : }
11295 :
11296 : /* Produce the next vector result for SLP permutation NODE by adding a vector
11297 : statement at GSI. If MASK_VEC is nonnull, add:
11298 :
11299 : <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC>
11300 :
11301 : otherwise add:
11302 :
11303 : <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF,
11304 : { N, N+1, N+2, ... }>
11305 :
11306 : where N == IDENTITY_OFFSET which is either zero or equal to the
11307 : number of elements of the result. */
11308 :
11309 : static void
11310 31191 : vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
11311 : slp_tree node, tree first_def, tree second_def,
11312 : tree mask_vec, poly_uint64 identity_offset)
11313 : {
11314 31191 : tree vectype = SLP_TREE_VECTYPE (node);
11315 :
11316 : /* ??? We SLP match existing vector element extracts but
11317 : allow punning which we need to re-instantiate at uses
11318 : but have no good way of explicitly representing. */
11319 31191 : if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)), TYPE_SIZE (vectype))
11320 31191 : && !types_compatible_p (TREE_TYPE (first_def), vectype))
11321 : {
11322 14 : gassign *conv_stmt
11323 14 : = gimple_build_assign (make_ssa_name (vectype),
11324 : build1 (VIEW_CONVERT_EXPR, vectype, first_def));
11325 14 : vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
11326 14 : first_def = gimple_assign_lhs (conv_stmt);
11327 : }
11328 31191 : gassign *perm_stmt;
11329 31191 : tree perm_dest = make_ssa_name (vectype);
11330 31191 : if (mask_vec)
11331 : {
11332 27975 : if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)),
11333 27975 : TYPE_SIZE (vectype))
11334 27975 : && !types_compatible_p (TREE_TYPE (second_def), vectype))
11335 : {
11336 8 : gassign *conv_stmt
11337 8 : = gimple_build_assign (make_ssa_name (vectype),
11338 : build1 (VIEW_CONVERT_EXPR,
11339 : vectype, second_def));
11340 8 : vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
11341 8 : second_def = gimple_assign_lhs (conv_stmt);
11342 : }
11343 27975 : perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
11344 : first_def, second_def,
11345 : mask_vec);
11346 : }
11347 : else
11348 : {
11349 3216 : auto def_nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (first_def));
11350 3216 : unsigned HOST_WIDE_INT vecno;
11351 3216 : poly_uint64 eltno;
11352 3216 : if (!can_div_trunc_p (poly_uint64 (identity_offset), def_nunits,
11353 : &vecno, &eltno))
11354 : gcc_unreachable ();
11355 3216 : tree def = vecno & 1 ? second_def : first_def;
11356 3216 : if (!types_compatible_p (TREE_TYPE (def), vectype))
11357 : {
11358 : /* For identity permutes we still need to handle the case
11359 : of offsetted extracts or concats. */
11360 219 : unsigned HOST_WIDE_INT c;
11361 219 : if (known_le (TYPE_VECTOR_SUBPARTS (vectype), def_nunits))
11362 : {
11363 215 : unsigned HOST_WIDE_INT elsz
11364 215 : = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (def))));
11365 430 : tree lowpart = build3 (BIT_FIELD_REF, vectype, def,
11366 215 : TYPE_SIZE (vectype),
11367 215 : bitsize_int (eltno * elsz));
11368 215 : perm_stmt = gimple_build_assign (perm_dest, lowpart);
11369 : }
11370 4 : else if (constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype),
11371 4 : def_nunits, &c) && c == 2)
11372 : {
11373 4 : gcc_assert (known_eq (identity_offset, 0U));
11374 4 : tree ctor = build_constructor_va (vectype, 2,
11375 : NULL_TREE, first_def,
11376 : NULL_TREE, second_def);
11377 4 : perm_stmt = gimple_build_assign (perm_dest, ctor);
11378 : }
11379 : else
11380 0 : gcc_unreachable ();
11381 : }
11382 : else
11383 : {
11384 : /* We need a copy here in case the def was external. */
11385 2997 : gcc_assert (known_eq (eltno, 0U));
11386 2997 : perm_stmt = gimple_build_assign (perm_dest, def);
11387 : }
11388 : }
11389 31191 : vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
11390 : /* Store the vector statement in NODE. */
11391 31191 : node->push_vec_def (perm_stmt);
11392 31191 : }
11393 :
11394 : /* Subroutine of vectorizable_slp_permutation. Check whether the target
11395 : can perform permutation PERM on the (1 or 2) input nodes in CHILDREN.
11396 : If GSI is nonnull, emit the permutation there.
11397 :
11398 : When GSI is null, the only purpose of NODE is to give properties
11399 : of the result, such as the vector type and number of SLP lanes.
11400 : The node does not need to be a VEC_PERM_EXPR.
11401 :
11402 : If the target supports the operation, return the number of individual
11403 : VEC_PERM_EXPRs needed, otherwise return -1. Print information to the
11404 : dump file if DUMP_P is true. */
11405 :
11406 : static int
11407 430332 : vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
11408 : slp_tree node, lane_permutation_t &perm,
11409 : vec<slp_tree> &children, bool dump_p)
11410 : {
11411 430332 : tree vectype = SLP_TREE_VECTYPE (node);
11412 :
11413 : /* ??? We currently only support all same vector input types
11414 : while the SLP IL should really do a concat + select and thus accept
11415 : arbitrary mismatches. */
11416 430332 : slp_tree child;
11417 430332 : unsigned i;
11418 430332 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
11419 430332 : bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node));
11420 : /* True if we're permuting a single input of 2N vectors down
11421 : to N vectors. This case doesn't generalize beyond 2 since
11422 : VEC_PERM_EXPR only takes 2 inputs. */
11423 430332 : bool pack_p = false;
11424 : /* If we're permuting inputs of N vectors each into X*N outputs,
11425 : this is the value of X, otherwise it is 1. */
11426 430332 : unsigned int unpack_factor = 1;
11427 430332 : tree op_vectype = NULL_TREE;
11428 431890 : FOR_EACH_VEC_ELT (children, i, child)
11429 431818 : if (SLP_TREE_VECTYPE (child))
11430 : {
11431 : op_vectype = SLP_TREE_VECTYPE (child);
11432 : break;
11433 : }
11434 430332 : if (!op_vectype)
11435 72 : op_vectype = vectype;
11436 943470 : FOR_EACH_VEC_ELT (children, i, child)
11437 : {
11438 513138 : if ((SLP_TREE_DEF_TYPE (child) != vect_internal_def
11439 9430 : && !vect_maybe_update_slp_op_vectype (child, op_vectype))
11440 513138 : || !types_compatible_p (SLP_TREE_VECTYPE (child), op_vectype)
11441 1026276 : || !types_compatible_p (TREE_TYPE (vectype), TREE_TYPE (op_vectype)))
11442 : {
11443 0 : if (dump_p)
11444 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11445 : "Unsupported vector types in lane permutation\n");
11446 0 : return -1;
11447 : }
11448 513138 : auto op_nunits = TYPE_VECTOR_SUBPARTS (op_vectype);
11449 513138 : unsigned int this_unpack_factor;
11450 : /* Detect permutations of external, pre-existing vectors. The external
11451 : node's SLP_TREE_LANES stores the total number of units in the vector,
11452 : or zero if the vector has variable length.
11453 :
11454 : We are expected to keep the original VEC_PERM_EXPR for such cases.
11455 : There is no repetition to model. */
11456 513138 : if (SLP_TREE_DEF_TYPE (child) == vect_external_def
11457 513138 : && SLP_TREE_SCALAR_OPS (child).is_empty ())
11458 : repeating_p = false;
11459 : /* Check whether the input has twice as many lanes per vector. */
11460 506256 : else if (children.length () == 1
11461 506256 : && known_eq (SLP_TREE_LANES (child) * nunits,
11462 : SLP_TREE_LANES (node) * op_nunits * 2))
11463 : pack_p = true;
11464 : /* Check whether the output has N times as many lanes per vector. */
11465 513138 : else if (constant_multiple_p (SLP_TREE_LANES (node) * op_nunits,
11466 462397 : SLP_TREE_LANES (child) * nunits,
11467 : &this_unpack_factor)
11468 427407 : && (i == 0 || unpack_factor == this_unpack_factor))
11469 : unpack_factor = this_unpack_factor;
11470 : else
11471 : repeating_p = false;
11472 : }
11473 :
11474 860664 : gcc_assert (perm.length () == SLP_TREE_LANES (node));
11475 :
11476 : /* Load-lanes permute. This permute only acts as a forwarder to
11477 : select the correct vector def of the load-lanes load which
11478 : has the permuted vectors in its vector defs like
11479 : { v0, w0, r0, v1, w1, r1 ... } for a ld3. All costs are
11480 : accounted for in the costing for the actual load so we
11481 : return zero here. */
11482 430332 : if (node->ldst_lanes)
11483 : {
11484 0 : gcc_assert (children.length () == 1);
11485 0 : if (!gsi)
11486 : /* This is a trivial op always supported. */
11487 : return 0;
11488 0 : slp_tree child = children[0];
11489 0 : unsigned vec_idx = (SLP_TREE_LANE_PERMUTATION (node)[0].second
11490 0 : / SLP_TREE_LANES (node));
11491 0 : unsigned vec_num = SLP_TREE_LANES (child) / SLP_TREE_LANES (node);
11492 0 : unsigned nvectors = vect_get_num_copies (vinfo, node);
11493 0 : for (unsigned i = 0; i < nvectors; ++i)
11494 : {
11495 0 : tree def = SLP_TREE_VEC_DEFS (child)[i * vec_num + vec_idx];
11496 0 : node->push_vec_def (def);
11497 : }
11498 : return 0;
11499 : }
11500 :
11501 : /* Set REPEATING_P to true if the permutations are cyclical wrt UNPACK_FACTOR
11502 : and if we can generate the vectors in a vector-length agnostic way.
11503 : This requires UNPACK_STEP == NUNITS / UNPACK_FACTOR to be known at
11504 : compile time.
11505 :
11506 : The significance of UNPACK_STEP is that, when PACK_P is false,
11507 : output vector I operates on a window of UNPACK_STEP elements from each
11508 : input, starting at lane UNPACK_STEP * (I % UNPACK_FACTOR). For example,
11509 : when UNPACK_FACTOR is 2, the first output vector operates on lanes
11510 : [0, NUNITS / 2 - 1] of each input vector and the second output vector
11511 : operates on lanes [NUNITS / 2, NUNITS - 1] of each input vector.
11512 :
11513 : When REPEATING_P is true, NOUTPUTS holds the total number of outputs
11514 : that we actually need to generate. */
11515 430332 : uint64_t noutputs = 0;
11516 430332 : poly_uint64 unpack_step = 0;
11517 430332 : loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo);
11518 183103 : if (!linfo
11519 469515 : || !multiple_p (nunits, unpack_factor, &unpack_step)
11520 182161 : || !constant_multiple_p (LOOP_VINFO_VECT_FACTOR (linfo)
11521 182161 : * SLP_TREE_LANES (node), nunits, &noutputs))
11522 : repeating_p = false;
11523 :
11524 : /* We can handle the conditions described for REPEATING_P above for
11525 : both variable- and constant-length vectors. The fallback requires
11526 : us to generate every element of every permute vector explicitly,
11527 : which is only possible for constant-length permute vectors.
11528 :
11529 : Set:
11530 :
11531 : - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
11532 : mask vectors that we want to build.
11533 :
11534 : - NCOPIES to the number of copies of PERM that we need in order
11535 : to build the necessary permute mask vectors. */
11536 182161 : uint64_t npatterns;
11537 182161 : unsigned nelts_per_pattern;
11538 182161 : uint64_t ncopies;
11539 182161 : if (repeating_p)
11540 : {
11541 : /* We need permute mask vectors that have the form:
11542 :
11543 : { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }
11544 :
11545 : In other words, the original n-element permute in PERM is
11546 : "unrolled" to fill a full vector. The stepped vector encoding
11547 : that we use for permutes requires 3n elements. */
11548 142978 : npatterns = SLP_TREE_LANES (node);
11549 142978 : nelts_per_pattern = ncopies = 3;
11550 : }
11551 : else
11552 : {
11553 : /* Calculate every element of every permute mask vector explicitly,
11554 : instead of relying on the pattern described above. */
11555 287354 : if (!nunits.is_constant (&npatterns)
11556 287354 : || !TYPE_VECTOR_SUBPARTS (op_vectype).is_constant ())
11557 : {
11558 : if (dump_p)
11559 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11560 : "unsupported permutation %p on variable-length"
11561 : " vectors\n", (void *) node);
11562 : return -1;
11563 : }
11564 287354 : nelts_per_pattern = ncopies = 1;
11565 287354 : if (linfo && !LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
11566 : {
11567 : if (dump_p)
11568 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11569 : "unsupported permutation %p for variable VF\n",
11570 : (void *) node);
11571 : return -1;
11572 : }
11573 : pack_p = false;
11574 : unpack_factor = 1;
11575 : }
11576 430332 : unsigned olanes = unpack_factor * ncopies * SLP_TREE_LANES (node);
11577 430332 : gcc_assert (repeating_p || multiple_p (olanes, nunits));
11578 :
11579 : /* Compute the { { SLP operand, vector index}, lane } permutation sequence
11580 : from the { SLP operand, scalar lane } permutation as recorded in the
11581 : SLP node as intermediate step. This part should already work
11582 : with SLP children with arbitrary number of lanes. */
11583 430332 : auto_vec<std::pair<std::pair<unsigned, unsigned>, poly_uint64>> vperm;
11584 430332 : auto_vec<poly_uint64> active_lane;
11585 430332 : vperm.create (olanes);
11586 430332 : active_lane.safe_grow_cleared (children.length (), true);
11587 868922 : for (unsigned int ui = 0; ui < unpack_factor; ++ui)
11588 : {
11589 1936440 : for (unsigned j = 0; j < children.length (); ++j)
11590 529630 : active_lane[j] = ui * unpack_step;
11591 1279748 : for (unsigned i = 0; i < ncopies; ++i)
11592 : {
11593 5251236 : for (unsigned pi = 0; pi < perm.length (); ++pi)
11594 : {
11595 1784460 : std::pair<unsigned, unsigned> p = perm[pi];
11596 1784460 : tree vtype = SLP_TREE_VECTYPE (children[p.first]);
11597 1784460 : if (repeating_p)
11598 833508 : vperm.quick_push ({{p.first, 0},
11599 833508 : p.second + active_lane[p.first]});
11600 : else
11601 : {
11602 : /* We checked above that the vectors are constant-length. */
11603 950952 : unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype)
11604 950952 : .to_constant ();
11605 950952 : unsigned lane = active_lane[p.first].to_constant ();
11606 950952 : unsigned vi = (lane + p.second) / vnunits;
11607 950952 : unsigned vl = (lane + p.second) % vnunits;
11608 950952 : vperm.quick_push ({{p.first, vi}, vl});
11609 : }
11610 : }
11611 : /* Advance to the next group. */
11612 1837700 : for (unsigned j = 0; j < children.length (); ++j)
11613 996542 : active_lane[j] += SLP_TREE_LANES (children[j]);
11614 : }
11615 : }
11616 :
11617 430332 : if (dump_p)
11618 : {
11619 8975 : dump_printf_loc (MSG_NOTE, vect_location,
11620 : "vectorizing permutation %p", (void *)node);
11621 32494 : for (unsigned i = 0; i < perm.length (); ++i)
11622 23519 : dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
11623 8975 : if (repeating_p)
11624 7574 : dump_printf (MSG_NOTE, " (repeat %d)", SLP_TREE_LANES (node));
11625 8975 : dump_printf (MSG_NOTE, "\n");
11626 8975 : dump_printf_loc (MSG_NOTE, vect_location, "as");
11627 90432 : for (unsigned i = 0; i < vperm.length (); ++i)
11628 : {
11629 81457 : if (i != 0
11630 81457 : && (repeating_p
11631 55237 : ? multiple_p (i, npatterns)
11632 60615 : : multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype))))
11633 24347 : dump_printf (MSG_NOTE, ",");
11634 81457 : dump_printf (MSG_NOTE, " vops%u[%u][",
11635 81457 : vperm[i].first.first, vperm[i].first.second);
11636 81457 : dump_dec (MSG_NOTE, vperm[i].second);
11637 81457 : dump_printf (MSG_NOTE, "]");
11638 : }
11639 8975 : dump_printf (MSG_NOTE, "\n");
11640 : }
11641 :
11642 : /* We can only handle two-vector permutes, everything else should
11643 : be lowered on the SLP level. The following is closely inspired
11644 : by vect_transform_slp_perm_load and is supposed to eventually
11645 : replace it.
11646 : ??? As intermediate step do code-gen in the SLP tree representation
11647 : somehow? */
11648 430332 : std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
11649 430332 : std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
11650 430332 : unsigned int index = 0;
11651 430332 : poly_uint64 mask_element;
11652 430332 : vec_perm_builder mask;
11653 430332 : mask.new_vector (nunits, npatterns, nelts_per_pattern);
11654 430332 : unsigned int count = mask.encoded_nelts ();
11655 430332 : mask.quick_grow (count);
11656 430332 : vec_perm_indices indices;
11657 430332 : unsigned nperms = 0;
11658 : /* When REPEATING_P is true, we only have UNPACK_FACTOR unique permute
11659 : vectors to check during analysis, but we need to generate NOUTPUTS
11660 : vectors during transformation. */
11661 430332 : unsigned total_nelts = olanes;
11662 430332 : unsigned process_nelts = olanes;
11663 430332 : if (repeating_p)
11664 : {
11665 142978 : total_nelts = (total_nelts / unpack_factor) * noutputs;
11666 142978 : if (gsi)
11667 9808 : process_nelts = total_nelts;
11668 : }
11669 430332 : unsigned last_ei = (total_nelts - 1) % process_nelts;
11670 2224049 : for (unsigned i = 0; i < process_nelts; ++i)
11671 : {
11672 : /* VI is the input vector index when generating code for REPEATING_P. */
11673 1801090 : unsigned vi = i / olanes * (pack_p ? 2 : 1);
11674 1801090 : unsigned ei = i % olanes;
11675 1801090 : mask_element = vperm[ei].second;
11676 1801090 : if (pack_p)
11677 : {
11678 : /* In this case, we have N outputs and the single child provides 2N
11679 : inputs. Output X permutes inputs 2X and 2X+1.
11680 :
11681 : The mask indices are taken directly from the SLP permutation node.
11682 : Index X selects from the first vector if (X / NUNITS) % 2 == 0;
11683 : X selects from the second vector otherwise. These conditions
11684 : are only known at compile time for constant-length vectors. */
11685 : first_vec = std::make_pair (0, 0);
11686 : second_vec = std::make_pair (0, 1);
11687 : }
11688 1632019 : else if (first_vec.first == -1U
11689 1632019 : || first_vec == vperm[ei].first)
11690 1400333 : first_vec = vperm[ei].first;
11691 231686 : else if (second_vec.first == -1U
11692 231686 : || second_vec == vperm[ei].first)
11693 : {
11694 231289 : second_vec = vperm[ei].first;
11695 231289 : mask_element += nunits;
11696 : }
11697 : else
11698 : {
11699 397 : if (dump_p)
11700 7 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11701 : "permutation requires at "
11702 : "least three vectors\n");
11703 397 : gcc_assert (!gsi);
11704 : return -1;
11705 : }
11706 :
11707 1800693 : mask[index++] = mask_element;
11708 :
11709 1800693 : if (index == count)
11710 : {
11711 746085 : indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
11712 : TYPE_VECTOR_SUBPARTS (op_vectype));
11713 573002 : bool identity_p = (indices.series_p (0, 1, mask[0], 1)
11714 850417 : && constant_multiple_p (mask[0], nunits));
11715 573002 : machine_mode vmode = TYPE_MODE (vectype);
11716 573002 : machine_mode op_vmode = TYPE_MODE (op_vectype);
11717 573002 : unsigned HOST_WIDE_INT c;
11718 573002 : if ((!identity_p
11719 530339 : && !can_vec_perm_const_p (vmode, op_vmode, indices))
11720 573002 : || (identity_p
11721 42663 : && !known_le (nunits,
11722 : TYPE_VECTOR_SUBPARTS (op_vectype))
11723 6984 : && (!constant_multiple_p (nunits,
11724 8 : TYPE_VECTOR_SUBPARTS (op_vectype),
11725 8 : &c) || c != 2)))
11726 : {
11727 6976 : if (dump_p)
11728 : {
11729 152 : dump_printf_loc (MSG_MISSED_OPTIMIZATION,
11730 : vect_location,
11731 : "unsupported vect permute { ");
11732 1586 : for (i = 0; i < count; ++i)
11733 : {
11734 1434 : dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
11735 1434 : dump_printf (MSG_MISSED_OPTIMIZATION, " ");
11736 : }
11737 152 : dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
11738 : }
11739 6976 : gcc_assert (!gsi);
11740 7373 : return -1;
11741 : }
11742 :
11743 566026 : if (!identity_p)
11744 523363 : nperms += CEIL (total_nelts, process_nelts) - (ei > last_ei);
11745 566026 : if (gsi)
11746 : {
11747 31191 : if (second_vec.first == -1U)
11748 6973 : second_vec = first_vec;
11749 :
11750 31191 : slp_tree
11751 31191 : first_node = children[first_vec.first],
11752 31191 : second_node = children[second_vec.first];
11753 :
11754 31191 : tree mask_vec = NULL_TREE;
11755 31191 : if (!identity_p)
11756 27975 : mask_vec = vect_gen_perm_mask_checked (vectype, indices);
11757 :
11758 31191 : tree first_def
11759 31191 : = vect_get_slp_vect_def (first_node, first_vec.second + vi);
11760 31191 : tree second_def
11761 31191 : = vect_get_slp_vect_def (second_node, second_vec.second + vi);
11762 31191 : vect_add_slp_permutation (vinfo, gsi, node, first_def,
11763 31191 : second_def, mask_vec, mask[0]);
11764 : }
11765 :
11766 : index = 0;
11767 : first_vec = std::make_pair (-1U, -1U);
11768 : second_vec = std::make_pair (-1U, -1U);
11769 : }
11770 : }
11771 :
11772 422959 : return nperms;
11773 430332 : }
11774 :
11775 : /* Vectorize the SLP permutations in NODE as specified
11776 : in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
11777 : child number and lane number.
11778 : Interleaving of two two-lane two-child SLP subtrees (not supported):
11779 : [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
11780 : A blend of two four-lane two-child SLP subtrees:
11781 : [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
11782 : Highpart of a four-lane one-child SLP subtree (not supported):
11783 : [ { 0, 2 }, { 0, 3 } ]
11784 : Where currently only a subset is supported by the code generation in
   vectorizable_slp_permutation_1.

   This is a thin wrapper: the lane permutation and the children of NODE
   are handed to vectorizable_slp_permutation_1 together with VINFO and
   GSI.  Return false when that helper returns a negative value and true
   otherwise.  When GSI is null and the helper returned a nonzero count,
   that many vec_perm operations are recorded in COST_VEC as vect_body
   cost for the vector type of NODE.  */
11785 :
11786 : bool
11787 137559 : vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
11788 : slp_tree node, stmt_vector_for_cost *cost_vec)
11789 : {
11790 137559 : tree vectype = SLP_TREE_VECTYPE (node);
11791 137559 : lane_permutation_t &perm = SLP_TREE_LANE_PERMUTATION (node);
11792 137559 : int nperms = vectorizable_slp_permutation_1 (vinfo, gsi, node, perm,
11793 137559 : SLP_TREE_CHILDREN (node),
11794 : dump_enabled_p ());
11795 137559 : if (nperms < 0)
11796 : return false; /* The helper signalled an unsupported permutation. */
11797 :
/* Only account for the permutes when there is no insertion point, and
   skip the cost record entirely when no permute is needed.  */
11798 136272 : if (!gsi && nperms != 0)
11799 114552 : record_stmt_cost (cost_vec, nperms, vec_perm, node, vectype, 0, vect_body);
11800 :
11801 : return true;
11802 : }
11803 :
11804 : /* Vectorize SLP NODE, passing INSTANCE through to vect_transform_stmt.

   Constant and external nodes only get their vector defs created by
   vect_create_constant_vectors, unless they have no vector type or the
   defs already exist.  For every other node the vector-def array is
   allocated (when the node has a vector type), an insertion iterator SI is
   computed and vect_transform_stmt is called with it.  SI is chosen as
   follows:
     - non-permute nodes whose representative has a data reference use the
       first scalar stmt of the node for reads and the last scalar stmt
       for writes;
     - non-permute cycle-PHI, induction and PHI nodes use gsi_none ();
     - all other nodes are placed after the latest def found among their
       children, with adjustments for region and loop boundaries.  */
11805 :
11806 : static void
11807 1471918 : vect_schedule_slp_node (vec_info *vinfo,
11808 : slp_tree node, slp_instance instance)
11809 : {
11810 1471918 : gimple_stmt_iterator si;
11811 1471918 : int i;
11812 1471918 : slp_tree child;
11813 :
11814 : /* Vectorize externals and constants. */
11815 1471918 : if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
11816 1471918 : || SLP_TREE_DEF_TYPE (node) == vect_external_def)
11817 : {
11818 : /* ??? vectorizable_shift can end up using a scalar operand which is
11819 : currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
11820 : node in this case. */
11821 498121 : if (!SLP_TREE_VECTYPE (node))
11822 498121 : return;
11823 :
11824 : /* There are two reasons vector defs might already exist. The first
11825 : is that we are vectorizing an existing vector def. The second is
11826 : when performing BB vectorization shared constant/external nodes
11827 : are not split apart during partitioning so during the code-gen
11828 : DFS walk we can end up visiting them twice. */
11829 491087 : if (! SLP_TREE_VEC_DEFS (node).exists ())
11830 490404 : vect_create_constant_vectors (vinfo, node);
11831 491087 : return;
11832 : }
11833 :
11834 973797 : stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
11835 :
/* Internal nodes must not have been scheduled before; reserve room for
   the number of vector copies when the node has a vector type.  */
11836 973797 : gcc_assert (SLP_TREE_VEC_DEFS (node).is_empty ());
11837 973797 : if (SLP_TREE_VECTYPE (node))
11838 973791 : SLP_TREE_VEC_DEFS (node).create (vect_get_num_copies (vinfo, node));
11839 :
11840 973797 : if (!SLP_TREE_PERMUTE_P (node) && STMT_VINFO_DATA_REF (stmt_info))
11841 : {
11842 : /* Vectorized loads go before the first scalar load to make it
11843 : ready early, vectorized stores go before the last scalar
11844 : stmt which is where all uses are ready. */
11845 712399 : stmt_vec_info last_stmt_info = NULL;
11846 712399 : if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
11847 166677 : last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
11848 : else /* DR_IS_WRITE */
11849 545722 : last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
11850 712399 : si = gsi_for_stmt (last_stmt_info->stmt);
11851 712399 : }
11852 261398 : else if (!SLP_TREE_PERMUTE_P (node)
11853 245065 : && (SLP_TREE_TYPE (node) == cycle_phi_info_type
11854 : || SLP_TREE_TYPE (node) == induc_vec_info_type
11855 : || SLP_TREE_TYPE (node) == phi_info_type))
11856 : {
11857 : /* For PHI node vectorization we do not use the insertion iterator. */
11858 54184 : si = gsi_none ();
11859 : }
11860 : else
11861 : {
11862 : /* Emit other stmts after the children vectorized defs which is
11863 : earliest possible.  LAST_STMT tracks the latest (most dominated)
   def stmt seen so far; SEEN_VECTOR_DEF records that a pre-existing
   vector def from outside the region was encountered. */
11864 : gimple *last_stmt = NULL;
11865 : bool seen_vector_def = false;
11866 576424 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
11867 369210 : if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
11868 : {
11869 : /* For fold-left reductions we are retaining the scalar
11870 : reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
11871 : set so the representation isn't perfect. Resort to the
11872 : last scalar def here. */
11873 296213 : if (SLP_TREE_VEC_DEFS (child).is_empty ())
11874 : {
11875 925 : gcc_assert (SLP_TREE_TYPE (child) == cycle_phi_info_type);
11876 925 : gphi *phi = as_a <gphi *>
11877 925 : (vect_find_last_scalar_stmt_in_slp (child)->stmt);
11878 925 : if (!last_stmt)
11879 : last_stmt = phi;
11880 705 : else if (vect_stmt_dominates_stmt_p (last_stmt, phi))
11881 : last_stmt = phi;
11882 694 : else if (vect_stmt_dominates_stmt_p (phi, last_stmt))
11883 : ;
11884 : else
11885 0 : gcc_unreachable ();
11886 : }
11887 : /* We are emitting all vectorized stmts in the same place and
11888 : the last one is the last.
11889 : ??? Unless we have a load permutation applied and that
11890 : figures to re-use an earlier generated load. */
11891 : unsigned j;
11892 : tree vdef;
11893 700342 : FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
11894 : {
11895 404129 : gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
11896 404129 : if (!last_stmt)
11897 : last_stmt = vstmt;
11898 207488 : else if (vect_stmt_dominates_stmt_p (last_stmt, vstmt))
11899 : last_stmt = vstmt;
11900 45656 : else if (vect_stmt_dominates_stmt_p (vstmt, last_stmt))
11901 : ;
11902 : else
11903 0 : gcc_unreachable ();
11904 : }
11905 : }
11906 72997 : else if (!SLP_TREE_VECTYPE (child))
11907 : {
11908 : /* For externals without a vector type, i.e. those used
   unvectorized, we look at all scalar defs. */
11909 : unsigned j;
11910 : tree def;
11911 14941 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
11912 8543 : if (TREE_CODE (def) == SSA_NAME
11913 8543 : && !SSA_NAME_IS_DEFAULT_DEF (def))
11914 : {
11915 295 : gimple *stmt = SSA_NAME_DEF_STMT (def);
11916 295 : if (gimple_uid (stmt) == -1u)
11917 : /* If the stmt is not inside the region do not
11918 : use it as possible insertion point. */
11919 : ;
11920 285 : else if (!last_stmt)
11921 : last_stmt = stmt;
11922 261 : else if (vect_stmt_dominates_stmt_p (last_stmt, stmt))
11923 : last_stmt = stmt;
11924 159 : else if (vect_stmt_dominates_stmt_p (stmt, last_stmt))
11925 : ;
11926 : else
11927 0 : gcc_unreachable ();
11928 : }
11929 : }
11930 : else
11931 : {
11932 : /* For externals we have to look at all defs since their
11933 : insertion place is decided per vector. But beware
11934 : of pre-existing vectors where we need to make sure
11935 : we do not insert before the region boundary. */
11936 66599 : if (SLP_TREE_SCALAR_OPS (child).is_empty ()
11937 512 : && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
11938 : seen_vector_def = true;
11939 : else
11940 : {
11941 : unsigned j;
11942 : tree vdef;
11943 530091 : FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
11944 94399 : if (TREE_CODE (vdef) == SSA_NAME
11945 94399 : && !SSA_NAME_IS_DEFAULT_DEF (vdef))
11946 : {
11947 19452 : gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
11948 19452 : if (!last_stmt)
11949 : last_stmt = vstmt;
11950 10846 : else if (vect_stmt_dominates_stmt_p (last_stmt, vstmt))
11951 : last_stmt = vstmt;
11952 8721 : else if (vect_stmt_dominates_stmt_p (vstmt, last_stmt))
11953 : ;
11954 : else
11955 0 : gcc_unreachable ();
11956 : }
11957 : }
11958 : }
11959 : /* This can happen when all children are pre-existing vectors or
11960 : constants.  Fall back to the first scalar stmt of NODE, which may
   itself be null. */
11961 207214 : if (!last_stmt)
11962 1723 : last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
11963 1723 : if (!last_stmt)
11964 : {
11965 0 : gcc_assert (seen_vector_def);
11966 0 : si = gsi_after_labels (vinfo->bbs[0]);
11967 : }
11968 207214 : else if (is_ctrl_altering_stmt (last_stmt))
11969 : {
11970 : /* We split regions to vectorize at control altering stmts
11971 : with a definition so this must be an external which
11972 : we can insert at the start of the region. */
11973 0 : si = gsi_after_labels (vinfo->bbs[0]);
11974 : }
11975 207214 : else if (is_a <bb_vec_info> (vinfo)
11976 17733 : && !SLP_TREE_PERMUTE_P (node)
11977 16394 : && gimple_bb (last_stmt) != gimple_bb (stmt_info->stmt)
11978 208568 : && gimple_could_trap_p (stmt_info->stmt))
11979 : {
11980 : /* We've constrained possibly trapping operations to all come
11981 : from the same basic-block, if vectorized defs would allow earlier
11982 : scheduling still force vectorized stmts to the original block.
11983 : This is only necessary for BB vectorization since for loop vect
11984 : all operations are in a single BB and scalar stmt based
11985 : placement doesn't play well with epilogue vectorization. */
11986 54 : gcc_assert (dominated_by_p (CDI_DOMINATORS,
11987 : gimple_bb (stmt_info->stmt),
11988 : gimple_bb (last_stmt)));
11989 54 : si = gsi_after_labels (gimple_bb (stmt_info->stmt));
11990 : }
11991 207160 : else if (is_a <gphi *> (last_stmt))
11992 14496 : si = gsi_after_labels (gimple_bb (last_stmt));
11993 : else
11994 : {
11995 192664 : si = gsi_for_stmt (last_stmt);
11996 192664 : gsi_next (&si); /* Step past LAST_STMT. */
11997 :
11998 192664 : if (auto loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
11999 : {
12000 : /* Avoid scheduling stmts to random places in the CFG, any
12001 : stmt dominance check we performed is possibly wrong as UIDs
12002 : are not initialized for all of the function for loop
12003 : vectorization. Instead append to the loop preheader. */
12004 175201 : if ((LOOP_VINFO_LOOP (loop_vinfo)->header
12005 175201 : != gimple_bb (last_stmt))
12006 178416 : && dominated_by_p (CDI_DOMINATORS,
12007 : LOOP_VINFO_LOOP (loop_vinfo)->header,
12008 3215 : gimple_bb (last_stmt)))
12009 1402 : si = gsi_end_bb (loop_preheader_edge
12010 701 : (LOOP_VINFO_LOOP (loop_vinfo))->src);
12011 : /* Avoid scheduling internal defs outside of the loop when
12012 : we might have only implicitly tracked loop mask/len defs.
   SI2 is the position after the labels of the loop header; it
   replaces SI when LAST_STMT is in a different block that dominates
   the header (empty SI2) or when LAST_STMT dominates the stmt at
   SI2. */
12013 74 : if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
12014 175201 : || LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
12015 : {
12016 74 : gimple_stmt_iterator si2
12017 74 : = gsi_after_labels (LOOP_VINFO_LOOP (loop_vinfo)->header);
12018 74 : if ((gsi_end_p (si2)
12019 0 : && (LOOP_VINFO_LOOP (loop_vinfo)->header
12020 0 : != gimple_bb (last_stmt))
12021 0 : && dominated_by_p (CDI_DOMINATORS,
12022 : LOOP_VINFO_LOOP (loop_vinfo)->header,
12023 0 : gimple_bb (last_stmt)))
12024 74 : || (!gsi_end_p (si2)
12025 74 : && last_stmt != *si2
12026 72 : && vect_stmt_dominates_stmt_p (last_stmt, *si2)))
12027 3 : si = si2;
12028 : }
12029 : }
12030 : }
12031 : }
12032 :
/* Dump what is about to be vectorized: the representative stmt when
   there is one, otherwise the whole SLP tree of NODE.  */
12033 973797 : if (dump_enabled_p ())
12034 : {
12035 71697 : if (stmt_info)
12036 71644 : dump_printf_loc (MSG_NOTE, vect_location,
12037 : "------>vectorizing SLP node starting from: %G",
12038 : stmt_info->stmt);
12039 : else
12040 : {
12041 53 : dump_printf_loc (MSG_NOTE, vect_location,
12042 : "------>vectorizing SLP node:\n");
12043 53 : vect_print_slp_tree (MSG_NOTE, vect_location, node);
12044 : }
12045 : }
12046 973797 : vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
12047 : }
12048 :
12049 : /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
12050 : For loop vectorization this is done in vectorizable_call, but for SLP
12051 : it needs to be deferred until end of vect_schedule_slp, because multiple
12052 : SLP instances may refer to the same scalar stmt.

   Calls without a lhs are replaced by a GIMPLE nop instead.  The walk
   recurses into the children of NODE before handling NODE's own scalar
   stmts.  VISITED collects the nodes already walked so that each node is
   processed only once.  Null nodes and nodes that are not
   vect_internal_def are ignored. */
12053 :
12054 : static void
12055 602294 : vect_remove_slp_scalar_calls (vec_info *vinfo,
12056 : slp_tree node, hash_set<slp_tree> &visited)
12057 : {
12058 602294 : gimple *new_stmt;
12059 602294 : gimple_stmt_iterator gsi;
12060 602294 : int i;
12061 602294 : slp_tree child;
12062 602294 : tree lhs;
12063 602294 : stmt_vec_info stmt_info;
12064 :
12065 602294 : if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
12066 188573 : return;
12067 :
12068 457325 : if (visited.add (node))
12069 : return; /* NODE was already in VISITED. */
12070 :
12071 925930 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
12072 512209 : vect_remove_slp_scalar_calls (vinfo, child, visited);
12073 :
12074 1309696 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
12075 : {
12076 486387 : if (!stmt_info)
12077 3976 : continue;
/* Work on the original stmt as returned by vect_orig_stmt.  */
12078 482411 : stmt_info = vect_orig_stmt (stmt_info);
12079 482411 : gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
12080 5237 : if (!stmt || gimple_bb (stmt) == NULL)
12081 477218 : continue; /* Not a call, or a call that is in no basic block. */
12082 5193 : lhs = gimple_call_lhs (stmt);
12083 5193 : if (lhs)
12084 4585 : new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
12085 : else
12086 608 : new_stmt = gimple_build_nop ();
12087 5193 : unlink_stmt_vdef (stmt_info->stmt);
12088 5193 : gsi = gsi_for_stmt (stmt);
12089 5193 : vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
/* The replacement assignment is now the defining stmt of LHS.  */
12090 5193 : if (lhs)
12091 4585 : SSA_NAME_DEF_STMT (lhs) = new_stmt;
12092 : }
12093 : }
12094 :
/* Convenience overload of vect_remove_slp_scalar_calls: process the SLP
   tree rooted at NODE using a fresh, local visited set.  */
12095 : static void
12096 90085 : vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
12097 : {
12098 90085 : hash_set<slp_tree> visited;
12099 90085 : vect_remove_slp_scalar_calls (vinfo, node, visited);
12100 90085 : }
12101 :
12102 : /* Vectorize the instance root of INSTANCE using the vector defs of NODE.

   The handling depends on the kind of INSTANCE:
     - slp_inst_kind_ctor: the first root stmt is replaced by an
       assignment of the single vector def of NODE (wrapped in a
       VIEW_CONVERT_EXPR if its type is not trivially convertible to the
       type of the root lhs), or of a CONSTRUCTOR built from all vector
       defs when there is more than one.
     - slp_inst_kind_bb_reduc: the vector defs are combined with the
       reduction code, reduced to a scalar through the internal function
       given by reduction_fn_for_scalar_code, combined with the remaining
       scalar defs of INSTANCE if any, and the result becomes the rhs of
       the first root stmt.
     - slp_inst_kind_gcond: the single root stmt is handed to
       vectorizable_early_exit, which is asserted to succeed.
   Any other kind is unreachable. */
12103 :
12104 : void
12105 10978 : vectorize_slp_instance_root_stmt (vec_info *vinfo, slp_tree node, slp_instance instance)
12106 : {
12107 10978 : gassign *rstmt = NULL;
12108 :
12109 10978 : if (instance->kind == slp_inst_kind_ctor)
12110 : {
12111 5295 : if (SLP_TREE_VEC_DEFS (node).length () == 1)
12112 : {
12113 5256 : tree vect_lhs = SLP_TREE_VEC_DEFS (node)[0];
12114 5256 : tree root_lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
12115 5256 : if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
12116 5256 : TREE_TYPE (vect_lhs)))
12117 0 : vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
12118 : vect_lhs);
12119 5256 : rstmt = gimple_build_assign (root_lhs, vect_lhs);
12120 : }
12121 : else
12122 : {
12123 39 : gcc_assert (SLP_TREE_VEC_DEFS (node).length () > 1);
12124 39 : tree child_def;
12125 39 : int j;
12126 39 : vec<constructor_elt, va_gc> *v;
12127 39 : vec_alloc (v, SLP_TREE_VEC_DEFS (node).length ());
12128 :
12129 : /* A CTOR can handle V16HI composition from VNx8HI so we
12130 : do not need to convert vector elements if the types
12131 : do not match. */
12132 117 : FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (node), j, child_def)
12133 78 : CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, child_def);
12134 39 : tree lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
12135 39 : tree rtype
12136 39 : = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmts[0]->stmt));
12137 39 : tree r_constructor = build_constructor (rtype, v);
12138 39 : rstmt = gimple_build_assign (lhs, r_constructor);
12139 : }
12140 : }
12141 5683 : else if (instance->kind == slp_inst_kind_bb_reduc)
12142 : {
12143 : /* Largely inspired by reduction chain epilogue handling in
12144 : vect_create_epilog_for_reduction. */
12145 4113 : vec<tree> vec_defs = vNULL;
12146 4113 : vect_get_slp_defs (node, &vec_defs);
12147 4113 : enum tree_code reduc_code
12148 4113 : = gimple_assign_rhs_code (instance->root_stmts[0]->stmt);
12149 : /* ??? We actually have to reflect signs somewhere. */
12150 4113 : if (reduc_code == MINUS_EXPR)
12151 0 : reduc_code = PLUS_EXPR;
12152 4113 : gimple_seq epilogue = NULL;
12153 : /* We may end up with more than one vector result, reduce them
12154 : to one vector.  When the vector type is an integral type with
   undefined overflow and the reduction operation can overflow, the
   computation is done in the corresponding unsigned vector type
   (PUN_FOR_OVERFLOW_P). */
12155 4113 : tree vec_def = vec_defs[0];
12156 4113 : tree vectype = TREE_TYPE (vec_def);
12157 4113 : tree compute_vectype = vectype;
12158 4113 : bool pun_for_overflow_p = (ANY_INTEGRAL_TYPE_P (vectype)
12159 3918 : && TYPE_OVERFLOW_UNDEFINED (vectype)
12160 6874 : && operation_can_overflow (reduc_code));
12161 2618 : if (pun_for_overflow_p)
12162 : {
12163 2618 : compute_vectype = unsigned_type_for (vectype);
12164 2618 : vec_def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
12165 : compute_vectype, vec_def);
12166 : }
12167 6491 : for (unsigned i = 1; i < vec_defs.length (); ++i)
12168 : {
12169 2378 : tree def = vec_defs[i];
12170 2378 : if (pun_for_overflow_p)
12171 2275 : def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
12172 : compute_vectype, def);
12173 2378 : vec_def = gimple_build (&epilogue, reduc_code, compute_vectype,
12174 : vec_def, def);
12175 : }
12176 4113 : vec_defs.release ();
12177 : /* ??? Support other schemes than direct internal fn. */
12178 4113 : internal_fn reduc_fn;
12179 4113 : if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
12180 4113 : || reduc_fn == IFN_LAST)
12181 0 : gcc_unreachable ();
12182 4113 : tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
12183 4113 : TREE_TYPE (compute_vectype), vec_def);
/* Fold the remaining scalar defs of the instance into one value and
   combine it with the reduced vector result.  */
12184 4113 : if (!SLP_INSTANCE_REMAIN_DEFS (instance).is_empty ())
12185 : {
12186 2557 : tree rem_def = NULL_TREE;
12187 11891 : for (auto def : SLP_INSTANCE_REMAIN_DEFS (instance))
12188 : {
12189 9334 : def = gimple_convert (&epilogue, TREE_TYPE (scalar_def), def);
12190 9334 : if (!rem_def)
12191 : rem_def = def;
12192 : else
12193 6777 : rem_def = gimple_build (&epilogue, reduc_code,
12194 6777 : TREE_TYPE (scalar_def),
12195 : rem_def, def);
12196 : }
12197 2557 : scalar_def = gimple_build (&epilogue, reduc_code,
12198 2557 : TREE_TYPE (scalar_def),
12199 : scalar_def, rem_def);
12200 : }
/* Convert back to the element type of the original vector type, emit
   the epilogue before the root stmt and make the result its rhs.  */
12201 4113 : scalar_def = gimple_convert (&epilogue,
12202 4113 : TREE_TYPE (vectype), scalar_def);
12203 4113 : gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
12204 4113 : gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT);
12205 4113 : gimple_assign_set_rhs_from_tree (&rgsi, scalar_def);
12206 4113 : update_stmt (gsi_stmt (rgsi));
12207 4113 : return;
12208 : }
12209 1570 : else if (instance->kind == slp_inst_kind_gcond)
12210 : {
12211 : /* Only support a single root for now as we can't codegen CFG yet and so we
12212 : can't support lanes > 1 at this time. */
12213 1570 : gcc_assert (instance->root_stmts.length () == 1);
12214 1570 : auto root_stmt_info = instance->root_stmts[0];
12215 1570 : auto last_stmt = STMT_VINFO_STMT (vect_orig_stmt (root_stmt_info));
12216 1570 : gimple_stmt_iterator rgsi = gsi_for_stmt (last_stmt);
12217 1570 : gcc_assert (!SLP_TREE_VEC_DEFS (node).is_empty ());
12218 1570 : bool res = vectorizable_early_exit (as_a <loop_vec_info> (vinfo),
12219 : root_stmt_info, &rgsi, node, NULL);
12220 1570 : gcc_assert (res);
12221 1570 : return;
12222 : }
12223 : else
12224 0 : gcc_unreachable ();
12225 :
/* Only the constructor case reaches this point; replace the root stmt
   with the assignment built above.  */
12226 5295 : gcc_assert (rstmt);
12227 :
12228 5295 : gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
12229 5295 : gsi_replace (&rgsi, rstmt, true);
12230 : }
12231 :
/* Per-node state of the SCC-collecting DFS walk done by vect_schedule_scc.
   One entry is kept per visited SLP node in a hash_map keyed by the node.  */
12232 : struct slp_scc_info
12233 : {
/* True from the moment an internal node is pushed on the DFS stack until
   it has been scheduled; always false for leaf nodes.  */
12234 : bool on_stack;
/* DFS number, assigned from the running counter on the first visit.  */
12235 : int dfs;
/* Starts equal to DFS and is lowered to the minimum of the lowlinks of
   freshly visited children and the DFS numbers of children still on the
   stack.  A node with LOWLINK == DFS is the root of its SCC.  */
12236 : int lowlink;
12237 : };
12238 :
12239 : /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */
/* NODE is the SLP node to visit; it must not have been visited before.
   SCC_INFO maps every visited node to its DFS state, MAXDFS is the next
   DFS number to hand out and STACK holds the visited internal nodes that
   have not been scheduled yet.  The actual code generation is delegated
   to vect_schedule_slp_node; this function decides the order in which
   nodes are scheduled and afterwards adds the vector PHI arguments that
   come from internal-def children of the PHIs it scheduled.  */
12240 :
12241 : static void
12242 1471918 : vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
12243 : hash_map<slp_tree, slp_scc_info> &scc_info,
12244 : int &maxdfs, vec<slp_tree> &stack)
12245 : {
12246 1471918 : bool existed_p;
12247 1471918 : slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
/* Callers look NODE up in SCC_INFO before calling, so it is new here.  */
12248 1471918 : gcc_assert (!existed_p);
/* Number the node in visiting order; LOWLINK starts as its own number.  */
12249 1471918 : info->dfs = maxdfs;
12250 1471918 : info->lowlink = maxdfs;
12251 1471918 : maxdfs++;
12252 :
12253 : /* Leaf. */
/* Nodes that are not internal defs are scheduled right away and are never
   pushed on STACK; their children are not walked.  */
12254 1471918 : if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
12255 : {
12256 498121 : info->on_stack = false;
12257 498121 : vect_schedule_slp_node (vinfo, node, instance);
12258 1028005 : return;
12259 : }
12260 :
12261 973797 : info->on_stack = true;
12262 973797 : stack.safe_push (node);
12263 :
12264 973797 : unsigned i;
12265 973797 : slp_tree child;
12266 : /* DFS recurse. */
12267 2009345 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
12268 : {
/* Child slots may be empty; there is nothing to visit for those.  */
12269 1035548 : if (!child)
12270 55377 : continue;
12271 980171 : slp_scc_info *child_info = scc_info.get (child);
/* No entry yet means the child was never visited: recurse into it.  */
12272 980171 : if (!child_info)
12273 : {
12274 889908 : vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
12275 : /* Recursion might have re-allocated the node. */
/* Hence both INFO and CHILD_INFO are looked up again before use.  */
12276 889908 : info = scc_info.get (node);
12277 889908 : child_info = scc_info.get (child);
12278 889908 : info->lowlink = MIN (info->lowlink, child_info->lowlink);
12279 : }
/* A visited child that is still on the stack is not scheduled yet and
   lowers LOWLINK to its DFS number.  Visited children that are off the
   stack need no further action.  */
12280 90263 : else if (child_info->on_stack)
12281 25529 : info->lowlink = MIN (info->lowlink, child_info->dfs);
12282 : }
/* NODE is not the root of its SCC: leave it on STACK; the invocation for
   the SCC root schedules all stack entries from the root upwards.  */
12283 973797 : if (info->lowlink != info->dfs)
12284 : return;
12285 :
/* PHI nodes scheduled below; their vector PHI arguments from internal-def
   children are added in the fixup loop at the end of this function.  */
12286 942034 : auto_vec<slp_tree, 4> phis_to_fixup;
12287 :
12288 : /* Singleton. */
/* NODE is the topmost stack entry, so the SCC consists of NODE only.  */
12289 942034 : if (stack.last () == node)
12290 : {
12291 918179 : stack.pop ();
12292 918179 : info->on_stack = false;
12293 918179 : vect_schedule_slp_node (vinfo, node, instance);
12294 918179 : if (!SLP_TREE_PERMUTE_P (node)
12295 918179 : && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
12296 30458 : phis_to_fixup.quick_push (node);
12297 : }
12298 : else
12299 : {
12300 : /* SCC. */
/* Search downwards from the top of STACK for NODE; the entries from that
   index up to the top form the SCC.  */
12301 23855 : int last_idx = stack.length () - 1;
12302 55618 : while (stack[last_idx] != node)
12303 31763 : last_idx--;
12304 : /* We can break the cycle at PHIs who have at least one child
12305 : code generated. Then we could re-start the DFS walk until
12306 : all nodes in the SCC are covered (we might have new entries
12307 : for only back-reachable nodes). But it's simpler to just
12308 : iterate and schedule those that are ready. */
/* TODO counts the SCC members that still have to be scheduled.  */
12309 23855 : unsigned todo = stack.length () - last_idx;
12310 24194 : do
12311 : {
/* Sweep the SCC part of the stack from the top down to NODE.  */
12312 105790 : for (int idx = stack.length () - 1; idx >= last_idx; --idx)
12313 : {
12314 57402 : slp_tree entry = stack[idx];
/* Entries scheduled in an earlier sweep were replaced by NULL.  */
12315 57402 : if (!entry)
12316 956 : continue;
12317 56446 : bool phi = (!SLP_TREE_PERMUTE_P (entry)
12318 56446 : && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
/* A non-PHI entry is ready unless one of its children is still on the
   stack.  A PHI entry is ready as soon as one of its children is missing
   or is no longer on the stack.  */
12319 56446 : bool ready = !phi;
12320 142866 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
12321 111519 : if (!child)
12322 : {
/* Only PHI entries are expected to have empty child slots here.  */
12323 22979 : gcc_assert (phi);
12324 : ready = true;
12325 : break;
12326 : }
12327 88540 : else if (scc_info.get (child)->on_stack)
12328 : {
12329 24055 : if (!phi)
12330 : {
12331 : ready = false;
12332 : break;
12333 : }
12334 : }
12335 : else
12336 : {
12337 64485 : if (phi)
12338 : {
12339 : ready = true;
12340 : break;
12341 : }
12342 : }
12343 33467 : if (ready)
12344 : {
/* Schedule the entry, mark it as off the stack and clear its slot so
   later sweeps skip it.  */
12345 55618 : vect_schedule_slp_node (vinfo, entry, instance);
12346 55618 : scc_info.get (entry)->on_stack = false;
12347 55618 : stack[idx] = NULL;
12348 55618 : todo--;
12349 55618 : if (phi)
12350 24301 : phis_to_fixup.safe_push (entry);
12351 : }
12352 : }
12353 : }
12354 24194 : while (todo != 0);
12355 :
12356 : /* Pop the SCC. */
12357 23855 : stack.truncate (last_idx);
12358 : }
12359 :
12360 : /* Now fixup the backedge def of the vectorized PHIs in this SCC. */
12361 : slp_tree phi_node;
12362 1938827 : FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
12363 : {
/* PHI is the scalar PHI statement the SLP node represents.  */
12364 54759 : gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
12365 54759 : edge_iterator ei;
12366 54759 : edge e;
/* Walk the incoming edges of the PHI's block; the child for edge E is
   the one at index E->dest_idx.  */
12367 172943 : FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
12368 : {
12369 118184 : unsigned dest_idx = e->dest_idx;
12370 118184 : child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
/* Only children that are internal defs are handled here.  */
12371 118184 : if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
12372 66423 : continue;
12373 51761 : unsigned n = SLP_TREE_VEC_DEFS (phi_node).length ();
12374 : /* Simply fill all args. */
12375 51761 : if (STMT_VINFO_DEF_TYPE (SLP_TREE_REPRESENTATIVE (phi_node))
12376 : != vect_first_order_recurrence)
12377 111276 : for (unsigned i = 0; i < n; ++i)
12378 : {
/* Note that this PHI is the vector PHI defining the I-th vector def; it
   shadows the scalar PHI declared in the enclosing loop.  */
12379 59560 : tree phidef = SLP_TREE_VEC_DEFS (phi_node)[i];
12380 59560 : gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
12381 59560 : add_phi_arg (phi, vect_get_slp_vect_def (child, i),
12382 : e, gimple_phi_arg_location (phi, dest_idx));
12383 : }
12384 : else
12385 : {
12386 : /* Unless it is a first order recurrence which needs
12387 : args filled in for both the PHI node and the permutes. */
/* The vector defs of PHI_NODE are defined by permute assignments here.
   The first operand of the first permute is defined by the vector PHI,
   which receives the last vector def of CHILD as its argument on E.  */
12388 45 : gimple *perm
12389 45 : = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[0]);
12390 45 : gimple *rphi = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (perm));
12391 45 : add_phi_arg (as_a <gphi *> (rphi),
12392 : vect_get_slp_vect_def (child, n - 1),
12393 : e, gimple_phi_arg_location (phi, dest_idx));
/* Permute I gets vector def I of CHILD as second operand and, for I > 0,
   vector def I - 1 of CHILD as first operand; the first operand of
   permute 0 is left as is.  */
12394 127 : for (unsigned i = 0; i < n; ++i)
12395 : {
12396 82 : gimple *perm
12397 82 : = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[i]);
12398 82 : if (i > 0)
12399 37 : gimple_assign_set_rhs1 (perm,
12400 : vect_get_slp_vect_def (child, i - 1));
12401 82 : gimple_assign_set_rhs2 (perm,
12402 : vect_get_slp_vect_def (child, i));
12403 82 : update_stmt (perm);
12404 : }
12405 : }
12406 : }
12407 : }
12408 942034 : }
12409 :
12410 : /* Generate vector code for SLP_INSTANCES in the loop/basic block. */
/* This works in two passes over SLP_INSTANCES.  The first pass schedules
   the SLP tree of each instance via vect_schedule_scc and vectorizes the
   instance root stmts, if there are any.  The second pass removes scalar
   call stmts (loop vectorization only) and the scalar stores at the root
   of each instance.  */
12411 :
12412 : void
12413 542685 : vect_schedule_slp (vec_info *vinfo, const vec<slp_instance> &slp_instances)
12414 : {
12415 542685 : slp_instance instance;
12416 542685 : unsigned int i;
12417 :
/* The DFS state is shared by all instances, so a tree root that was
   already visited for an earlier instance is not scheduled again.  */
12418 542685 : hash_map<slp_tree, slp_scc_info> scc_info;
12419 542685 : int maxdfs = 0;
12420 1124806 : FOR_EACH_VEC_ELT (slp_instances, i, instance)
12421 : {
12422 582121 : slp_tree node = SLP_INSTANCE_TREE (instance);
12423 582121 : if (dump_enabled_p ())
12424 : {
12425 16071 : dump_printf_loc (MSG_NOTE, vect_location,
12426 : "Vectorizing SLP tree:\n");
12427 : /* ??? Dump all? */
/* Only the first root stmt is printed.  */
12428 16071 : if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
12429 467 : dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
12430 467 : SLP_INSTANCE_ROOT_STMTS (instance)[0]->stmt);
12431 16071 : vect_print_slp_graph (MSG_NOTE, vect_location,
12432 : SLP_INSTANCE_TREE (instance));
12433 : }
12434 : /* Schedule the tree of INSTANCE, scheduling SCCs in a way to
12435 : have a PHI be the node breaking the cycle. */
/* The DFS stack is local to each instance.  */
12436 582121 : auto_vec<slp_tree> stack;
12437 582121 : if (!scc_info.get (node))
12438 582010 : vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
12439 :
/* Root stmts are vectorized after the tree has been scheduled.  */
12440 582121 : if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
12441 10978 : vectorize_slp_instance_root_stmt (vinfo, node, instance);
12442 :
12443 582121 : if (dump_enabled_p ())
12444 16071 : dump_printf_loc (MSG_NOTE, vect_location,
12445 : "vectorizing stmts using SLP.\n");
12446 582121 : }
12447 :
/* Second pass: clean up scalar stmts now that all instances are done.  */
12448 1667491 : FOR_EACH_VEC_ELT (slp_instances, i, instance)
12449 : {
12450 582121 : slp_tree root = SLP_INSTANCE_TREE (instance);
12451 582121 : stmt_vec_info store_info;
12452 582121 : unsigned int j;
12453 :
12454 : /* Remove scalar call stmts. Do not do this for basic-block
12455 : vectorization as not all uses may be vectorized.
12456 : ??? Why should this be necessary? DCE should be able to
12457 : remove the stmts itself.
12458 : ??? For BB vectorization we can as well remove scalar
12459 : stmts starting from the SLP tree root if they have no
12460 : uses. */
12461 582121 : if (is_a <loop_vec_info> (vinfo))
12462 90085 : vect_remove_slp_scalar_calls (vinfo, root);
12463 :
12464 : /* Remove vectorized stores original scalar stmts. */
12465 2598092 : for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
12466 : {
/* Stop at the first scalar stmt that is missing, has no data reference
   or is not a write; later lanes are not inspected.  */
12467 1470249 : if (!store_info
12468 1470235 : || !STMT_VINFO_DATA_REF (store_info)
12469 1442514 : || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
12470 : break;
12471 :
/* The stmt info returned by vect_orig_stmt is the one that is removed.  */
12472 1433850 : store_info = vect_orig_stmt (store_info);
12473 : /* Free the attached stmt_vec_info and remove the stmt. */
12474 1433850 : vinfo->remove_stmt (store_info);
12475 :
12476 : /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it
12477 : to not crash in vect_free_slp_tree later. */
12478 1433850 : if (SLP_TREE_REPRESENTATIVE (root) == store_info)
12479 545391 : SLP_TREE_REPRESENTATIVE (root) = NULL;
12480 : }
12481 : }
12482 542685 : }
|