Line data Source code
1 : /* SLP - Basic Block Vectorization
2 : Copyright (C) 2007-2026 Free Software Foundation, Inc.
3 : Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 : and Ira Rosen <irar@il.ibm.com>
5 :
6 : This file is part of GCC.
7 :
8 : GCC is free software; you can redistribute it and/or modify it under
9 : the terms of the GNU General Public License as published by the Free
10 : Software Foundation; either version 3, or (at your option) any later
11 : version.
12 :
13 : GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 : WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 : FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 : for more details.
17 :
18 : You should have received a copy of the GNU General Public License
19 : along with GCC; see the file COPYING3. If not see
20 : <http://www.gnu.org/licenses/>. */
21 :
22 : #include "config.h"
23 : #define INCLUDE_ALGORITHM
24 : #include "system.h"
25 : #include "coretypes.h"
26 : #include "backend.h"
27 : #include "target.h"
28 : #include "rtl.h"
29 : #include "tree.h"
30 : #include "gimple.h"
31 : #include "tree-pass.h"
32 : #include "ssa.h"
33 : #include "optabs-tree.h"
34 : #include "insn-config.h"
35 : #include "recog.h" /* FIXME: for insn_data */
36 : #include "fold-const.h"
37 : #include "stor-layout.h"
38 : #include "gimple-iterator.h"
39 : #include "cfgloop.h"
40 : #include "tree-vectorizer.h"
41 : #include "langhooks.h"
42 : #include "gimple-walk.h"
43 : #include "dbgcnt.h"
44 : #include "tree-vector-builder.h"
45 : #include "vec-perm-indices.h"
46 : #include "gimple-fold.h"
47 : #include "internal-fn.h"
48 : #include "dump-context.h"
49 : #include "cfganal.h"
50 : #include "tree-eh.h"
51 : #include "tree-cfg.h"
52 : #include "alloc-pool.h"
53 : #include "sreal.h"
54 : #include "predict.h"
55 :
/* Expand to (S)->first_element, after gcc_checking_assert has verified that
   S carries no data reference (i.e. (S)->dr_aux.dr is null).  */
56 : #define REDUC_GROUP_FIRST_ELEMENT(S) \
57 : (gcc_checking_assert (!(S)->dr_aux.dr), (S)->first_element)
58 :
/* Forward declarations of file-local (static) functions.  */
59 : static bool vect_transform_slp_perm_load_1 (vec_info *, slp_tree,
60 : load_permutation_t &,
61 : const vec<tree> &,
62 : gimple_stmt_iterator *,
63 : poly_uint64, bool, bool,
64 : unsigned *,
65 : unsigned * = nullptr,
66 : bool = false);
67 : static int vectorizable_slp_permutation_1 (vec_info *, gimple_stmt_iterator *,
68 : slp_tree, lane_permutation_t &,
69 : vec<slp_tree> &, bool);
70 : static void vect_print_slp_tree (dump_flags_t, dump_location_t, slp_tree);
71 : static bool vect_slp_can_convert_to_external (const vec<stmt_vec_info> &);
72 :
/* slp_tree_pool is the object pool that _slp_tree::operator new and
   operator delete draw from and return to.  slp_first_node is the head of
   the doubly-linked list (prev_node/next_node) onto which the _slp_tree
   constructor pushes every node and from which the destructor unlinks it.  */
73 : static object_allocator<_slp_tree> *slp_tree_pool;
74 : static slp_tree slp_first_node;
75 :
/* Create the object pool (slp_tree_pool) that _slp_tree::operator new
   allocates from.  */
76 : void
77 1119179 : vect_slp_init (void)
78 : {
79 1119179 : slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
80 1119179 : }
81 :
/* Delete every SLP node that is still on the slp_first_node list, then
   destroy the node pool and reset slp_tree_pool to NULL.  */
82 : void
83 1119179 : vect_slp_fini (void)
84 : {
/* The _slp_tree destructor unlinks the node it destroys and, for the list
   head, advances slp_first_node, so this loop ends when the list is empty.  */
85 1781007 : while (slp_first_node)
86 661828 : delete slp_first_node;
87 2238358 : delete slp_tree_pool;
88 1119179 : slp_tree_pool = NULL;
89 1119179 : }
90 :
/* Class-specific allocation: obtain raw storage for one _slp_tree from
   slp_tree_pool.  N must be exactly sizeof (_slp_tree); this is asserted.  */
91 : void *
92 7724265 : _slp_tree::operator new (size_t n)
93 : {
94 7724265 : gcc_assert (n == sizeof (_slp_tree));
95 7724265 : return slp_tree_pool->allocate_raw ();
96 : }
97 :
/* Class-specific deallocation: hand the storage at NODE back to
   slp_tree_pool.  N must be exactly sizeof (_slp_tree); this is asserted.  */
98 : void
99 7724265 : _slp_tree::operator delete (void *node, size_t n)
100 : {
101 7724265 : gcc_assert (n == sizeof (_slp_tree));
102 7724265 : slp_tree_pool->remove_raw (node);
103 7724265 : }
104 :
105 :
106 : /* Initialize a SLP node.  The node is linked in at the head of the
   doubly-linked list rooted at slp_first_node and every field receives a
   neutral default: vNULL vectors, vect_uninitialized_def as def type,
   ERROR_MARK as code, no vector type, no representative, a reference
   count of 1, zero lanes and max_nunits of 1.  */
107 :
108 7724265 : _slp_tree::_slp_tree ()
109 : {
/* Push this node onto the front of the global node list.  */
110 7724265 : this->prev_node = NULL;
111 7724265 : if (slp_first_node)
112 6761217 : slp_first_node->prev_node = this;
113 7724265 : this->next_node = slp_first_node;
114 7724265 : slp_first_node = this;
115 7724265 : SLP_TREE_SCALAR_STMTS (this) = vNULL;
116 7724265 : SLP_TREE_SCALAR_OPS (this) = vNULL;
117 7724265 : SLP_TREE_LIVE_LANES (this) = vNULL;
118 7724265 : SLP_TREE_VEC_DEFS (this) = vNULL;
119 7724265 : SLP_TREE_CHILDREN (this) = vNULL;
120 7724265 : SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
121 7724265 : SLP_TREE_LANE_PERMUTATION (this) = vNULL;
122 7724265 : SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
123 7724265 : SLP_TREE_CODE (this) = ERROR_MARK;
124 7724265 : SLP_TREE_GS_SCALE (this) = 0;
125 7724265 : SLP_TREE_GS_BASE (this) = NULL_TREE;
126 7724265 : this->ldst_lanes = false;
127 7724265 : this->avoid_stlf_fail = false;
128 7724265 : SLP_TREE_VECTYPE (this) = NULL_TREE;
129 7724265 : SLP_TREE_REPRESENTATIVE (this) = NULL;
/* -1 marks both cycle_info fields as unset.  */
130 7724265 : this->cycle_info.id = -1;
131 7724265 : this->cycle_info.reduc_idx = -1;
/* The creator holds the first reference; vect_free_slp_tree drops it.  */
132 7724265 : SLP_TREE_REF_COUNT (this) = 1;
133 7724265 : this->failed = NULL;
134 7724265 : this->max_nunits = 1;
135 7724265 : this->lanes = 0;
136 7724265 : SLP_TREE_TYPE (this) = undef_vec_info_type;
137 7724265 : this->data = NULL;
138 7724265 : }
139 :
140 : /* Tear down a SLP node: unlink it from the list rooted at slp_first_node,
   release all vectors it holds, free the FAILED array and delete DATA.
   Child nodes are not freed here; only the CHILDREN vector itself is
   released.  */
141 :
142 7724265 : _slp_tree::~_slp_tree ()
143 : {
/* Unlink from the doubly-linked node list; a node without predecessor is
   the list head, so the head pointer moves to its successor.  */
144 7724265 : if (this->prev_node)
145 4671856 : this->prev_node->next_node = this->next_node;
146 : else
147 3052409 : slp_first_node = this->next_node;
148 7724265 : if (this->next_node)
149 5824743 : this->next_node->prev_node = this->prev_node;
150 7724265 : SLP_TREE_CHILDREN (this).release ();
151 7724265 : SLP_TREE_SCALAR_STMTS (this).release ();
152 7724265 : SLP_TREE_SCALAR_OPS (this).release ();
153 7724265 : SLP_TREE_LIVE_LANES (this).release ();
154 7724265 : SLP_TREE_VEC_DEFS (this).release ();
155 7724265 : SLP_TREE_LOAD_PERMUTATION (this).release ();
156 7724265 : SLP_TREE_LANE_PERMUTATION (this).release ();
/* FAILED is released with free, DATA with delete.  */
157 7724265 : if (this->failed)
158 2013173 : free (failed);
159 7724265 : if (this->data)
160 1243347 : delete this->data;
161 7724265 : }
162 :
163 : /* Push the single SSA definition in DEF to the vector of vector defs.
   For a PHI the pushed value is the PHI result; for any other statement it
   is the one def operand found by single_ssa_def_operand with
   SSA_OP_ALL_DEFS.  The value is appended with quick_push.
   NOTE(review): presumably the caller has already reserved space in
   vec_defs -- confirm.  */
164 :
165 : void
166 526592 : _slp_tree::push_vec_def (gimple *def)
167 : {
168 526592 : if (gphi *phi = dyn_cast <gphi *> (def))
169 58656 : vec_defs.quick_push (gimple_phi_result (phi));
170 : else
171 : {
172 467936 : def_operand_p defop = single_ssa_def_operand (def, SSA_OP_ALL_DEFS);
173 467936 : vec_defs.quick_push (get_def_from_ptr (defop));
174 : }
175 526592 : }
176 :
177 : /* Recursively free the memory allocated for the SLP tree rooted at NODE.
   Each call drops one reference to NODE.  Only when the reference count
   reaches zero are the non-null children released (recursively, one
   reference each) and NODE itself deleted.  */
178 :
179 : void
180 14671281 : vect_free_slp_tree (slp_tree node)
181 : {
182 14671281 : int i;
183 14671281 : slp_tree child;
184 :
/* Other references remain: nothing more to do.  */
185 14671281 : if (--SLP_TREE_REF_COUNT (node) != 0)
186 14671281 : return;
187 :
/* The children vector may contain NULL entries; skip those.  */
188 10976927 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
189 3914490 : if (child)
190 3557140 : vect_free_slp_tree (child);
191 :
192 7062437 : delete node;
193 : }
194 :
195 : /* Return a location suitable for dumps related to the SLP instance.
   This is the statement of the first root stmt when the instance has root
   stmts, otherwise the statement of the first scalar stmt of the root
   node.  */
196 :
197 : dump_user_location_t
198 3404189 : _slp_instance::location () const
199 : {
200 3404189 : if (!root_stmts.is_empty ())
201 319218 : return root_stmts[0]->stmt;
202 : else
203 3084971 : return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
204 : }
205 :
206 :
207 : /* Free the memory allocated for the SLP instance: drop the instance's
   reference to its SLP tree, release the vectors the instance holds and
   finally free INSTANCE itself with free.  */
208 :
209 : void
210 1555778 : vect_free_slp_instance (slp_instance instance)
211 : {
212 1555778 : vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
213 1555778 : SLP_INSTANCE_LOADS (instance).release ();
214 1555778 : SLP_INSTANCE_ROOT_STMTS (instance).release ();
215 1555778 : SLP_INSTANCE_REMAIN_DEFS (instance).release ();
216 1555778 : instance->subgraph_entries.release ();
217 1555778 : instance->cost_vec.release ();
218 1555778 : free (instance);
219 1555778 : }
220 :
221 :
222 : /* Create a SLP node with NOPS children with CODE, either VEC_PERM_EXPR
223 : for a permute node or else ERROR_MARK.
   Any other CODE trips the assertion.  The node is a vect_internal_def
   without scalar stmts; space for NOPS children is reserved but no child
   is pushed.  */
224 :
225 : slp_tree
226 95248 : vect_create_new_slp_node (unsigned nops, tree_code code)
227 : {
228 95248 : gcc_assert (code == ERROR_MARK || code == VEC_PERM_EXPR);
229 95248 : slp_tree node = new _slp_tree;
/* The constructor already set the scalar stmts to vNULL; this restates
   it.  */
230 95248 : SLP_TREE_SCALAR_STMTS (node) = vNULL;
231 95248 : SLP_TREE_CHILDREN (node).create (nops);
232 95248 : SLP_TREE_DEF_TYPE (node) = vect_internal_def;
233 95248 : SLP_TREE_CODE (node) = code;
234 95248 : return node;
235 : }
236 :
237 : /* Create a SLP node inplace at NODE for SCALAR_STMTS and NOPS children.
   NODE stores the SCALAR_STMTS vector handle itself (no copy is made
   here), becomes a vect_internal_def, uses SCALAR_STMTS[0] as its
   representative and has one lane per scalar stmt.  SCALAR_STMTS must be
   non-empty because element 0 is read.  Returns NODE.  */
238 :
239 : static slp_tree
240 3765227 : vect_create_new_slp_node (slp_tree node,
241 : vec<stmt_vec_info> scalar_stmts, unsigned nops)
242 : {
243 3765227 : SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
244 3765227 : SLP_TREE_CHILDREN (node).create (nops);
245 3765227 : SLP_TREE_DEF_TYPE (node) = vect_internal_def;
246 3765227 : SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
247 3765227 : SLP_TREE_LANES (node) = scalar_stmts.length ();
248 3765227 : return node;
249 : }
250 :
251 : /* Create an SLP node for SCALAR_STMTS and NOPS children.  This
   allocates a fresh _slp_tree and initializes it through the in-place
   overload.  */
252 :
253 : static slp_tree
254 8037 : vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
255 : {
256 8037 : return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
257 : }
258 :
259 : /* Create a vect_external_def SLP node inplace at NODE for scalar
260 : operands OPS.
   NODE stores the OPS vector handle itself (no copy is made here) and has
   one lane per operand.  Returns NODE.  */
261 :
262 : static slp_tree
263 1839969 : vect_create_new_slp_node (slp_tree node, vec<tree> ops)
264 : {
265 1839969 : SLP_TREE_SCALAR_OPS (node) = ops;
266 1839969 : SLP_TREE_DEF_TYPE (node) = vect_external_def;
267 0 : SLP_TREE_LANES (node) = ops.length ();
268 1839969 : return node;
269 : }
270 :
271 : /* Create a vect_external_def SLP node for scalar operands OPS.  This
   allocates a fresh _slp_tree and initializes it through the in-place
   overload.  */
272 :
273 : static slp_tree
274 1839969 : vect_create_new_slp_node (vec<tree> ops)
275 : {
276 1839969 : return vect_create_new_slp_node (new _slp_tree, ops);
277 : }
278 :
279 :
280 : /* This structure is used in creation of an SLP tree. Each instance
281 : corresponds to the same operand in a group of scalar stmts in an SLP
282 : node.
   DEF_STMTS and OPS are parallel vectors with one entry per scalar stmt
   of the group; instances are allocated by vect_create_oprnd_info and
   released by vect_free_oprnd_info.  */
283 : typedef struct _slp_oprnd_info
284 : {
285 : /* Def-stmts for the operands. */
286 : vec<stmt_vec_info> def_stmts;
287 : /* Operands. */
288 : vec<tree> ops;
289 : /* Information about the first statement, its vector def-type, type, the
290 : operand itself in case it's constant, and an indication if it's a pattern
291 : stmt and gather/scatter info. */
292 : tree first_op_type;
293 : enum vect_def_type first_dt;
/* Set when any recorded def stmt is a pattern stmt that is not the
   related stmt of its original stmt.  */
294 : bool any_pattern;
/* FIRST_GS_P is set when the first stmt's operand is a gather/scatter
   operand; FIRST_GS_INFO is only filled in in that case.  */
295 : bool first_gs_p;
296 : gather_scatter_info first_gs_info;
297 : } *slp_oprnd_info;
298 :
299 :
300 : /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
301 : operand.
   Returns a vector of NOPS freshly allocated entries.  The caller owns
   the result and releases it with vect_free_oprnd_info.  */
302 : static vec<slp_oprnd_info>
303 3332559 : vect_create_oprnd_info (int nops, int group_size)
304 : {
305 3332559 : int i;
306 3332559 : slp_oprnd_info oprnd_info;
307 3332559 : vec<slp_oprnd_info> oprnds_info;
308 :
309 3332559 : oprnds_info.create (nops);
310 11954468 : for (i = 0; i < nops; i++)
311 : {
312 5289350 : oprnd_info = XNEW (struct _slp_oprnd_info);
/* Reserve room for one def stmt and one operand per group member.  */
313 5289350 : oprnd_info->def_stmts.create (group_size);
314 5289350 : oprnd_info->ops.create (group_size);
315 5289350 : oprnd_info->first_dt = vect_uninitialized_def;
316 5289350 : oprnd_info->first_op_type = NULL_TREE;
317 5289350 : oprnd_info->any_pattern = false;
/* first_gs_info is not initialized here; first_gs_p == false marks it as
   not yet valid.  */
318 5289350 : oprnd_info->first_gs_p = false;
319 5289350 : oprnds_info.quick_push (oprnd_info);
320 : }
321 :
322 3332559 : return oprnds_info;
323 : }
324 :
325 :
326 : /* Free operands info: release the two vectors of every entry, free the
   entry itself and finally release OPRNDS_INFO.  */
327 :
328 : static void
329 3332559 : vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
330 : {
331 3332559 : int i;
332 3332559 : slp_oprnd_info oprnd_info;
333 :
334 8621909 : FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
335 : {
336 5289350 : oprnd_info->def_stmts.release ();
337 5289350 : oprnd_info->ops.release ();
338 5289350 : XDELETE (oprnd_info);
339 : }
340 :
341 3332559 : oprnds_info.release ();
342 3332559 : }
343 :
344 : /* Return the execution frequency of NODE (so that a higher value indicates
345 : a "more important" node when optimizing for speed).
   The value is the profile count of the basic block that contains the
   original (non-pattern) stmt of NODE's representative, scaled by the
   count of the function's entry block.  NODE must have a representative,
   since it is dereferenced unconditionally.  */
346 :
347 : static sreal
348 3489700 : vect_slp_node_weight (slp_tree node)
349 : {
350 3489700 : stmt_vec_info stmt_info = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (node));
351 3489700 : basic_block bb = gimple_bb (stmt_info->stmt);
352 3489700 : return bb->count.to_sreal_scale (ENTRY_BLOCK_PTR_FOR_FN (cfun)->count);
353 : }
354 :
355 : /* Return true if STMTS contains a pattern statement.  NULL entries in
   STMTS are skipped.  */
356 :
357 : static bool
358 22258 : vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
359 : {
360 22258 : stmt_vec_info stmt_info;
361 22258 : unsigned int i;
362 71982 : FOR_EACH_VEC_ELT (stmts, i, stmt_info)
363 51960 : if (stmt_info && is_pattern_stmt_p (stmt_info))
364 : return true;
365 : return false;
366 : }
367 :
368 : /* Return true when all lanes in the external or constant NODE have
369 : the same value.
   NODE must be a vect_constant_def or vect_external_def (asserted).  A
   node without scalar operands is never considered uniform.  */
370 :
371 : static bool
372 594228 : vect_slp_tree_uniform_p (slp_tree node)
373 : {
374 594228 : gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
375 : || SLP_TREE_DEF_TYPE (node) == vect_external_def);
376 :
377 : /* Pre-existing vectors. */
378 1045727 : if (SLP_TREE_SCALAR_OPS (node).is_empty ())
379 : return false;
380 :
/* Compare every operand against the first one with operand_equal_p.  */
381 : unsigned i;
382 : tree op, first = NULL_TREE;
383 1361082 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
384 1218353 : if (!first)
385 : first = op;
386 624125 : else if (!operand_equal_p (first, op, 0))
387 : return false;
388 :
389 : return true;
390 : }
391 :
392 : /* Find the place of the data-ref in STMT_INFO in the interleaving chain
393 : that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
394 : of the chain.
   The returned place is the sum of DR_GROUP_GAP over the chain elements
   that follow FIRST_STMT_INFO up to and including STMT_INFO, so it is 0
   for FIRST_STMT_INFO itself and is not simply the element's ordinal.  */
395 :
396 : int
397 701848 : vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
398 : stmt_vec_info first_stmt_info)
399 : {
400 701848 : stmt_vec_info next_stmt_info = first_stmt_info;
401 701848 : int result = 0;
402 :
/* STMT_INFO belongs to a different group: not part of this chain.  */
403 701848 : if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
404 : return -1;
405 :
/* Walk the chain, accumulating the gap of each element stepped onto.  */
406 1753522 : do
407 : {
408 1753522 : if (next_stmt_info == stmt_info)
409 : return result;
410 1051674 : next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
411 1051674 : if (next_stmt_info)
412 1051674 : result += DR_GROUP_GAP (next_stmt_info);
413 : }
414 1051674 : while (next_stmt_info);
415 :
416 : return -1;
417 : }
418 :
419 : /* Check whether it is possible to load COUNT elements of type ELT_TYPE
420 : using the method implemented by duplicate_and_interleave. Return true
421 : if so, returning the number of intermediate vectors in *NVECTORS_OUT
422 : (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
423 : (if nonnull).
   If PERMUTES is nonnull, PERMUTES[0] and PERMUTES[1] additionally
   receive the two permute masks built from the selectors checked below,
   so PERMUTES must have room for two trees.  None of the outputs is
   written when the function returns false.  */
424 :
425 : bool
426 0 : can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
427 : tree elt_type, unsigned int *nvectors_out,
428 : tree *vector_type_out,
429 : tree *permutes)
430 : {
431 0 : tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
432 0 : if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
433 0 : return false;
434 :
435 0 : machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
/* Start by trying to fuse all COUNT elements into one integer element of
   ELT_BYTES bytes; each failed iteration halves ELT_BYTES and doubles
   NVECTORS.  */
436 0 : poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
437 0 : unsigned int nvectors = 1;
438 0 : for (;;)
439 : {
440 0 : scalar_int_mode int_mode;
441 0 : poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
442 0 : if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
443 : {
444 : /* Get the natural vector type for this SLP group size. */
445 0 : tree int_type = build_nonstandard_integer_type
446 0 : (GET_MODE_BITSIZE (int_mode), 1);
447 0 : tree vector_type
448 0 : = get_vectype_for_scalar_type (vinfo, int_type, count);
449 0 : poly_int64 half_nelts;
/* The integer vector must have a vector mode of the same size as the
   base vector and an even number of elements.  */
450 0 : if (vector_type
451 0 : && VECTOR_MODE_P (TYPE_MODE (vector_type))
452 0 : && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
453 : GET_MODE_SIZE (base_vector_mode))
454 0 : && multiple_p (GET_MODE_NUNITS (TYPE_MODE (vector_type)),
455 : 2, &half_nelts))
456 : {
457 : /* Try fusing consecutive sequences of COUNT / NVECTORS elements
458 : together into elements of type INT_TYPE and using the result
459 : to build NVECTORS vectors. */
460 0 : poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
461 0 : vec_perm_builder sel1 (nelts, 2, 3);
462 0 : vec_perm_builder sel2 (nelts, 2, 3);
463 :
/* SEL1 starts { 0, NELTS, 1, NELTS + 1, 2, NELTS + 2 }: it interleaves
   the two inputs from element 0 on.  SEL2 does the same starting at
   element HALF_NELTS.  */
464 0 : for (unsigned int i = 0; i < 3; ++i)
465 : {
466 0 : sel1.quick_push (i);
467 0 : sel1.quick_push (i + nelts);
468 0 : sel2.quick_push (half_nelts + i);
469 0 : sel2.quick_push (half_nelts + i + nelts);
470 : }
471 0 : vec_perm_indices indices1 (sel1, 2, nelts);
472 0 : vec_perm_indices indices2 (sel2, 2, nelts);
473 0 : machine_mode vmode = TYPE_MODE (vector_type);
/* Succeed only if the target can do both constant permutes.  */
474 0 : if (can_vec_perm_const_p (vmode, vmode, indices1)
475 0 : && can_vec_perm_const_p (vmode, vmode, indices2))
476 : {
477 0 : if (nvectors_out)
478 0 : *nvectors_out = nvectors;
479 0 : if (vector_type_out)
480 0 : *vector_type_out = vector_type;
481 0 : if (permutes)
482 : {
483 0 : permutes[0] = vect_gen_perm_mask_checked (vector_type,
484 : indices1);
485 0 : permutes[1] = vect_gen_perm_mask_checked (vector_type,
486 : indices2);
487 : }
488 0 : return true;
489 : }
490 0 : }
491 : }
/* Retry with integer elements of half the size; give up when ELT_BYTES
   cannot be halved.  */
492 0 : if (!multiple_p (elt_bytes, 2, &elt_bytes))
493 : return false;
494 0 : nvectors *= 2;
495 : /* We need to be able to fuse COUNT / NVECTORS elements together. */
496 0 : if (!multiple_p (count, nvectors))
497 : return false;
498 : }
499 : }
500 :
501 : /* Return true if DTA and DTB match.  They match when they are equal or
   when each of them is vect_external_def or vect_constant_def, i.e.
   external and constant defs are interchangeable here.  */
502 :
503 : static bool
504 17006516 : vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
505 : {
506 17006516 : return (dta == dtb
507 350522 : || ((dta == vect_external_def || dta == vect_constant_def)
508 217337 : && (dtb == vect_external_def || dtb == vect_constant_def)));
509 : }
510 :
/* Special operand-map index that stands for the offset of a gather/scatter
   access rather than for a GIMPLE argument position.  */
511 : #define GATHER_SCATTER_OFFSET (-3)
512 :
513 : /* For most SLP statements, there is a one-to-one mapping between
514 : gimple arguments and child nodes. If that is not true for STMT,
515 : return an array that contains:
516 :
517 : - the number of child nodes, followed by
518 : - for each child node, the index of the argument associated with that node.
519 : The special index -1 is the first operand of an embedded comparison and
520 : the special index -2 is the second operand of an embedded comparison.
521 : The special index -3 is the offset of a gather as analyzed by
522 : vect_check_gather_scatter.
523 :
524 : SWAP is as for vect_get_and_check_slp_defs.

   Return nullptr when no map applies; callers in this file then use the
   one-to-one mapping.  nullptr is also returned for an IFN_MASK_CALL
   whose argument count is outside the range 2..7.  GATHER_SCATTER_P
   selects the maps that start with GATHER_SCATTER_OFFSET.  The returned
   arrays are function-local static constants and must not be modified.

   For VIEW_CONVERT_EXPR assignments the map { 1, -1 } is returned;
   consumers in this file resolve a negative index I as operand -1 - I of
   GIMPLE argument 0, so -1 selects operand 0 of the VIEW_CONVERT_EXPR
   tree.  */
525 :
526 : static const int *
527 24281791 : vect_get_operand_map (const gimple *stmt, bool gather_scatter_p,
528 : unsigned char swap)
529 : {
/* Layout of every map: { number of children, index for child 0,
   index for child 1, ... }.  */
530 24281791 : static const int no_arg_map[] = { 0 };
531 24281791 : static const int arg0_map[] = { 1, 0 };
532 24281791 : static const int arg2_map[] = { 1, 2 };
533 24281791 : static const int arg2_arg3_map[] = { 2, 2, 3 };
534 24281791 : static const int arg2_arg4_map[] = { 2, 2, 4 };
535 24281791 : static const int arg2_arg5_arg6_map[] = { 3, 2, 5, 6 };
536 24281791 : static const int arg2_arg4_arg5_map[] = { 3, 2, 4, 5 };
537 24281791 : static const int arg3_arg2_map[] = { 2, 3, 2 };
538 24281791 : static const int op00_map[] = { 1, -1 };
539 24281791 : static const int op1_op0_map[] = { 2, 1, 0 };
540 24281791 : static const int off_map[] = { 1, GATHER_SCATTER_OFFSET };
541 24281791 : static const int off_op0_map[] = { 2, GATHER_SCATTER_OFFSET, 0 };
542 24281791 : static const int off_arg2_arg3_map[] = { 3, GATHER_SCATTER_OFFSET, 2, 3 };
543 24281791 : static const int off_arg3_arg2_map[] = { 3, GATHER_SCATTER_OFFSET, 3, 2 };
/* Row K serves an IFN_MASK_CALL with K + 2 arguments and maps child J to
   argument J + 1, i.e. argument 0 never becomes a child.  */
544 24281791 : static const int mask_call_maps[6][7] = {
545 : { 1, 1, },
546 : { 2, 1, 2, },
547 : { 3, 1, 2, 3, },
548 : { 4, 1, 2, 3, 4, },
549 : { 5, 1, 2, 3, 4, 5, },
550 : { 6, 1, 2, 3, 4, 5, 6 },
551 : };
552 :
/* A non-zero SWAP is only valid for calls/PHIs or for assignments whose
   code is a comparison or commutative.  */
553 24281791 : gcc_checking_assert (!swap
554 : || !is_gimple_assign (stmt)
555 : || TREE_CODE_CLASS
556 : (gimple_assign_rhs_code (stmt)) == tcc_comparison
557 : || commutative_tree_code
558 : (gimple_assign_rhs_code (stmt)));
559 :
560 24281791 : if (auto assign = dyn_cast<const gassign *> (stmt))
561 : {
562 22823475 : tree_code code = gimple_assign_rhs_code (assign);
/* A COND_EXPR with a comparison embedded in rhs1 is not expected here.  */
563 22823475 : if (code == COND_EXPR
564 22823475 : && COMPARISON_CLASS_P (gimple_assign_rhs1 (assign)))
565 0 : gcc_unreachable ();
/* Swapped comparison or commutative operation: exchange operands 0 and 1.  */
566 22823475 : else if ((TREE_CODE_CLASS (code) == tcc_comparison
567 21485444 : || commutative_tree_code (code))
568 31761078 : && swap)
569 : return op1_op0_map;
570 22782730 : else if (code == VIEW_CONVERT_EXPR)
571 : return op00_map;
/* Gather/scatter expressed as an assignment: when the lhs is not an
   SSA name (presumably a store) rhs operand 0 is a child next to the
   offset, otherwise the offset is the only child.  */
572 22774579 : else if (gather_scatter_p)
573 43349 : return (TREE_CODE (gimple_assign_lhs (assign)) != SSA_NAME
574 43349 : ? off_op0_map : off_map);
575 : }
576 1458316 : else if (auto call = dyn_cast<const gcall *> (stmt))
577 : {
578 161573 : if (gimple_call_internal_p (call))
579 92080 : switch (gimple_call_internal_fn (call))
580 : {
581 15940 : case IFN_MASK_LOAD:
582 27186 : return gather_scatter_p ? off_arg2_arg3_map : arg2_arg3_map;
583 :
584 : case IFN_GATHER_LOAD:
585 : return arg2_map;
586 :
587 0 : case IFN_MASK_GATHER_LOAD:
588 0 : case IFN_MASK_LEN_GATHER_LOAD:
589 0 : return arg2_arg5_arg6_map;
590 :
591 0 : case IFN_SCATTER_STORE:
592 0 : return arg2_arg4_map;
593 :
594 0 : case IFN_MASK_SCATTER_STORE:
595 0 : case IFN_MASK_LEN_SCATTER_STORE:
596 0 : return arg2_arg4_arg5_map;
597 :
598 9481 : case IFN_MASK_STORE:
599 17540 : return gather_scatter_p ? off_arg3_arg2_map : arg3_arg2_map;
600 :
601 988 : case IFN_MASK_CALL:
602 988 : {
/* Only 2 to 7 arguments have a row in mask_call_maps.  */
603 988 : unsigned nargs = gimple_call_num_args (call);
604 988 : if (nargs >= 2 && nargs <= 7)
605 988 : return mask_call_maps[nargs-2];
606 : else
607 : return nullptr;
608 : }
609 :
610 278 : case IFN_CLZ:
611 278 : case IFN_CTZ:
612 278 : return arg0_map;
613 :
614 6306 : case IFN_GOMP_SIMD_LANE:
615 6306 : return no_arg_map;
616 :
617 : default:
618 : break;
619 : }
620 : }
/* Everything else uses the one-to-one mapping.  */
621 : return nullptr;
622 : }
623 :
/* Convenience overload: return the operand map for the GIMPLE statement of
   STMT, taking the gather/scatter flag from STMT_VINFO_GATHER_SCATTER_P.
   SWAP defaults to 0 (no operand swapping).  */
624 : static const int *
625 24265806 : vect_get_operand_map (const stmt_vec_info stmt, unsigned char swap = 0)
626 : {
627 0 : return vect_get_operand_map (stmt->stmt, STMT_VINFO_GATHER_SCATTER_P (stmt),
628 0 : swap);
629 : }
630 :
631 : /* Return the SLP node child index for operand OP of STMT.  Without an
   operand map the index is OP itself; otherwise it is the position of OP
   among the map's argument indices.  OP must occur in the map, or
   gcc_unreachable is hit.  */
632 :
633 : int
634 1375777 : vect_slp_child_index_for_operand (const stmt_vec_info stmt, int op)
635 : {
636 1375777 : const int *opmap = vect_get_operand_map (stmt);
637 1375777 : if (!opmap)
638 : return op;
/* opmap[0] is the child count; the entries start at index 1.  */
639 21863 : for (int i = 1; i < 1 + opmap[0]; ++i)
640 21863 : if (opmap[i] == op)
641 12246 : return i - 1;
642 0 : gcc_unreachable ();
643 : }
644 :
645 : /* Helper class for mapping of GIMPLE operands to SLP children. */
646 : /* ??? Add vect_slp_child_index_for_operand here and amend opmaps
647 : with the full reverse mapping and indicating the position of the
648 : first commutative operand index, eliding the swap_p argument from
649 : vect_get_operand_map. Adjust all consumers. */
650 :
651 : struct slp_oprnds {
652 : slp_oprnds (stmt_vec_info);
653 : tree get_op_for_slp_child (stmt_vec_info, unsigned);
/* Operand map of the statement, or null for the one-to-one mapping.  Must
   stay declared before num_slp_children, whose initializer reads it.  */
654 : const int *opmap;
/* Number of SLP children of the statement.  */
655 : const unsigned int num_slp_children;
656 : };
657 :
/* Look up the operand map of STMT_INFO (without swapping).  With a map the
   child count is its first element, otherwise there is one child per
   GIMPLE argument.  OPMAP is declared before NUM_SLP_CHILDREN in the
   class, so it is already initialized when the second initializer reads
   it.  */
658 4414551 : slp_oprnds::slp_oprnds (stmt_vec_info stmt_info)
659 4414551 : : opmap (vect_get_operand_map (stmt_info)),
660 4414551 : num_slp_children (opmap ? opmap[0] : gimple_num_args (stmt_info->stmt))
661 : {
662 4414551 : }
663 :
664 : /* For SLP child number N get the corresponding tree operand from GIMPLE
665 : statement described by STMT_INFO.
   N must be smaller than num_slp_children (asserted).  Gather/scatter
   offset children are not supported yet and hit gcc_unreachable.  */
666 :
667 : tree
668 4867996 : slp_oprnds::get_op_for_slp_child (stmt_vec_info stmt_info, unsigned n)
669 : {
670 4867996 : gcc_assert (n < num_slp_children);
/* Map entries start at index 1; without a map child N is argument N.  */
671 4867996 : int opno = opmap ? opmap[n + 1] : (int) n;
672 4867996 : if (opno == GATHER_SCATTER_OFFSET)
673 0 : gcc_unreachable (); // TODO
/* Other negative indices select an operand of the tree in GIMPLE argument
   0: -1 is operand 0, -2 is operand 1.  */
674 4867996 : else if (opno < 0)
675 1934 : return TREE_OPERAND (gimple_arg (stmt_info->stmt, 0), -1 - opno);
676 : else
677 4866062 : return gimple_arg (stmt_info->stmt, opno);
678 : }
679 :
680 : /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
681 : they are of a valid type and that they match the defs of the first stmt of
682 : the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
683 : by swapping operands of STMTS[STMT_NUM] when possible. Non-zero SWAP
684 : indicates swap is required for cond_expr stmts. Specifically, SWAP
685 : is 1 if STMT is cond and operands of comparison need to be swapped;
686 : SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
687 :
688 : If there was a fatal error return -1; if the error could be corrected by
689 : swapping operands of father node of this one, return 1; if everything is
690 : ok return 0. */
691 : static int
692 12708576 : vect_get_and_check_slp_defs (vec_info *vinfo, tree vectype, unsigned char swap,
693 : bool *skip_args,
694 : vec<stmt_vec_info> stmts, unsigned stmt_num,
695 : vec<slp_oprnd_info> *oprnds_info)
696 : {
697 12708576 : stmt_vec_info stmt_info = stmts[stmt_num];
698 12708576 : tree oprnd;
699 12708576 : unsigned int i, number_of_oprnds;
700 12708576 : enum vect_def_type dt = vect_uninitialized_def;
701 12708576 : slp_oprnd_info oprnd_info;
702 12708576 : gather_scatter_info gs_info;
703 12708576 : unsigned int gs_op = -1u;
704 12708576 : unsigned int commutative_op = -1U;
705 12708576 : bool first = stmt_num == 0;
706 :
707 12708576 : if (!stmt_info)
708 : {
709 0 : for (auto oi : *oprnds_info)
710 : {
711 0 : oi->def_stmts.quick_push (NULL);
712 0 : oi->ops.quick_push (NULL_TREE);
713 : }
714 : return 0;
715 : }
716 :
717 12708576 : if (!is_a<gcall *> (stmt_info->stmt)
718 : && !is_a<gassign *> (stmt_info->stmt)
719 : && !is_a<gphi *> (stmt_info->stmt))
720 : return -1;
721 :
722 12708576 : number_of_oprnds = gimple_num_args (stmt_info->stmt);
723 12708576 : const int *map = vect_get_operand_map (stmt_info, swap);
724 12708576 : if (map)
725 75915 : number_of_oprnds = *map++;
726 12708576 : if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
727 : {
728 49322 : if (gimple_call_internal_p (stmt))
729 : {
730 32584 : internal_fn ifn = gimple_call_internal_fn (stmt);
731 32584 : commutative_op = first_commutative_argument (ifn);
732 32584 : if (internal_gather_scatter_fn_p (ifn))
733 : {
734 0 : vect_describe_gather_scatter_call
735 0 : (stmt_info,
736 0 : first ? &(*oprnds_info)[0]->first_gs_info : &gs_info);
737 0 : if (first)
738 0 : (*oprnds_info)[0]->first_gs_p = true;
739 : gs_op = 0;
740 : }
741 : }
742 : }
743 12659254 : else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
744 : {
745 14778099 : if (commutative_tree_code (gimple_assign_rhs_code (stmt)))
746 8367446 : commutative_op = 0;
747 : }
748 :
749 12708576 : bool swapped = (swap != 0);
750 12708576 : bool backedge = false;
751 12708576 : enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
752 35153419 : for (i = 0; i < number_of_oprnds; i++)
753 : {
754 22446056 : oprnd_info = (*oprnds_info)[i];
755 22446056 : int opno = map ? map[i] : int (i);
756 22446056 : if (opno == GATHER_SCATTER_OFFSET)
757 : {
758 22752 : gcc_assert (STMT_VINFO_GATHER_SCATTER_P (stmt_info));
759 22752 : if (!is_a <loop_vec_info> (vinfo)
760 22752 : || !vect_check_gather_scatter (stmt_info, vectype,
761 : as_a <loop_vec_info> (vinfo),
762 : first ? &oprnd_info->first_gs_info
763 : : &gs_info))
764 1213 : return -1;
765 :
766 22752 : if (first)
767 : {
768 22501 : oprnd_info->first_gs_p = true;
769 22501 : oprnd = oprnd_info->first_gs_info.offset;
770 : }
771 : else
772 : {
773 251 : gs_op = i;
774 251 : oprnd = gs_info.offset;
775 : }
776 : }
777 22423304 : else if (opno < 0)
778 2842 : oprnd = TREE_OPERAND (gimple_arg (stmt_info->stmt, 0), -1 - opno);
779 : else
780 : {
781 22420462 : oprnd = gimple_arg (stmt_info->stmt, opno);
782 22420462 : if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
783 : {
784 1219502 : edge e = gimple_phi_arg_edge (stmt, opno);
785 2439004 : backedge = (is_a <bb_vec_info> (vinfo)
786 1880031 : ? e->flags & EDGE_DFS_BACK
787 660529 : : dominated_by_p (CDI_DOMINATORS, e->src,
788 660529 : gimple_bb (stmt_info->stmt)));
789 : }
790 : }
791 :
792 22446056 : stmt_vec_info def_stmt_info;
793 22446056 : if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
794 : {
795 995 : if (dump_enabled_p ())
796 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
797 : "Build SLP failed: can't analyze def for %T\n",
798 : oprnd);
799 :
800 995 : return -1;
801 : }
802 :
803 22445061 : if (skip_args[i])
804 : {
805 526482 : oprnd_info->def_stmts.quick_push (NULL);
806 526482 : oprnd_info->ops.quick_push (NULL_TREE);
807 526482 : oprnd_info->first_dt = vect_uninitialized_def;
808 526482 : continue;
809 : }
810 :
811 21918579 : oprnd_info->def_stmts.quick_push (def_stmt_info);
812 21918579 : oprnd_info->ops.quick_push (oprnd);
813 :
814 21918579 : if (def_stmt_info
815 21918579 : && is_pattern_stmt_p (def_stmt_info))
816 : {
817 396402 : if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
818 : != def_stmt_info)
819 279971 : oprnd_info->any_pattern = true;
820 : else
821 : /* If we promote this to external use the original stmt def. */
822 116431 : oprnd_info->ops.last ()
823 232862 : = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
824 : }
825 :
826 : /* If there's a extern def on a backedge make sure we can
827 : code-generate at the region start.
828 : ??? This is another case that could be fixed by adjusting
829 : how we split the function but at the moment we'd have conflicting
830 : goals there. */
831 21918579 : if (backedge
832 167638 : && dts[i] == vect_external_def
833 239 : && is_a <bb_vec_info> (vinfo)
834 239 : && TREE_CODE (oprnd) == SSA_NAME
835 218 : && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
836 21918797 : && !dominated_by_p (CDI_DOMINATORS, vinfo->bbs[0],
837 218 : gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
838 : {
839 218 : if (dump_enabled_p ())
840 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
841 : "Build SLP failed: extern def %T only defined "
842 : "on backedge\n", oprnd);
843 218 : return -1;
844 : }
845 :
846 21918361 : if (first)
847 : {
848 4800622 : tree type = TREE_TYPE (oprnd);
849 4800622 : dt = dts[i];
850 :
851 : /* For the swapping logic below force vect_reduction_def
852 : for the reduction op in a SLP reduction group. */
853 4800622 : if (!STMT_VINFO_DATA_REF (stmt_info)
854 3629700 : && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
855 5210 : && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
856 4803199 : && def_stmt_info)
857 2577 : dts[i] = dt = vect_reduction_def;
858 :
859 : /* Check the types of the definition. */
860 4800622 : switch (dt)
861 : {
862 4800622 : case vect_external_def:
863 4800622 : case vect_constant_def:
864 4800622 : case vect_internal_def:
865 4800622 : case vect_reduction_def:
866 4800622 : case vect_double_reduction_def:
867 4800622 : case vect_induction_def:
868 4800622 : case vect_nested_cycle:
869 4800622 : case vect_first_order_recurrence:
870 4800622 : break;
871 :
872 0 : default:
873 : /* FORNOW: Not supported. */
874 0 : if (dump_enabled_p ())
875 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
876 : "Build SLP failed: illegal type of def %T\n",
877 : oprnd);
878 0 : return -1;
879 : }
880 :
881 4800622 : oprnd_info->first_dt = dt;
882 4800622 : oprnd_info->first_op_type = type;
883 : }
884 : }
885 12707363 : if (first)
886 : return 0;
887 :
888 : /* Now match the operand definition types to that of the first stmt. */
889 26240542 : for (i = 0; i < number_of_oprnds;)
890 : {
891 17131596 : if (skip_args[i])
892 : {
893 43202 : ++i;
894 43202 : continue;
895 : }
896 :
897 17088394 : oprnd_info = (*oprnds_info)[i];
898 17088394 : dt = dts[i];
899 17088394 : stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
900 17088394 : oprnd = oprnd_info->ops[stmt_num];
901 17088394 : tree type = TREE_TYPE (oprnd);
902 :
903 17088394 : if (!types_compatible_p (oprnd_info->first_op_type, type))
904 : {
905 87752 : if (dump_enabled_p ())
906 109 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
907 : "Build SLP failed: different operand types\n");
908 87752 : return 1;
909 : }
910 :
911 17000642 : if ((gs_op == i) != oprnd_info->first_gs_p)
912 : {
913 0 : if (dump_enabled_p ())
914 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
915 : "Build SLP failed: mixed gather and non-gather\n");
916 0 : return 1;
917 : }
918 17000642 : else if (gs_op == i)
919 : {
920 221 : if (!operand_equal_p (oprnd_info->first_gs_info.base,
921 221 : gs_info.base))
922 : {
923 16 : if (dump_enabled_p ())
924 6 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
925 : "Build SLP failed: different gather base\n");
926 16 : return 1;
927 : }
928 205 : if (oprnd_info->first_gs_info.scale != gs_info.scale)
929 : {
930 8 : if (dump_enabled_p ())
931 2 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
932 : "Build SLP failed: different gather scale\n");
933 8 : return 1;
934 : }
935 : }
936 :
937 : /* Not first stmt of the group, check that the def-stmt/s match
938 : the def-stmt/s of the first stmt. Allow different definition
939 : types for reduction chains: the first stmt must be a
940 : vect_reduction_def (a phi node), and the rest
941 : end in the reduction chain. */
942 17000618 : if ((!vect_def_types_match (oprnd_info->first_dt, dt)
943 293639 : && !(oprnd_info->first_dt == vect_reduction_def
944 4535 : && !STMT_VINFO_DATA_REF (stmt_info)
945 4535 : && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
946 4509 : && def_stmt_info
947 4509 : && !STMT_VINFO_DATA_REF (def_stmt_info)
948 4509 : && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
949 : == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
950 16711488 : || (!STMT_VINFO_DATA_REF (stmt_info)
951 15400078 : && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
952 9386 : && ((!def_stmt_info
953 9217 : || STMT_VINFO_DATA_REF (def_stmt_info)
954 16906 : || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
955 : != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
956 9386 : != (oprnd_info->first_dt != vect_reduction_def))))
957 : {
958 : /* Try swapping operands if we got a mismatch. For BB
959 : vectorization only in case it will clearly improve things. */
960 291579 : if (i == commutative_op && !swapped
961 289130 : && (!is_a <bb_vec_info> (vinfo)
962 4620 : || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
963 4620 : dts[i+1])
964 1122 : && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
965 : || vect_def_types_match
966 156 : ((*oprnds_info)[i+1]->first_dt, dts[i])))))
967 : {
968 2449 : if (dump_enabled_p ())
969 152 : dump_printf_loc (MSG_NOTE, vect_location,
970 : "trying swapped operands\n");
971 2449 : std::swap (dts[i], dts[i+1]);
972 2449 : std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
973 2449 : (*oprnds_info)[i+1]->def_stmts[stmt_num]);
974 2449 : std::swap ((*oprnds_info)[i]->ops[stmt_num],
975 2449 : (*oprnds_info)[i+1]->ops[stmt_num]);
976 : /* After swapping some operands we lost track whether an
977 : operand has any pattern defs so be conservative here. */
978 2449 : if ((*oprnds_info)[i]->any_pattern
979 2449 : || (*oprnds_info)[i+1]->any_pattern)
980 36 : (*oprnds_info)[i]->any_pattern
981 18 : = (*oprnds_info)[i+1]->any_pattern = true;
982 2449 : swapped = true;
983 2449 : continue;
984 : }
985 :
986 286681 : if (is_a <bb_vec_info> (vinfo)
987 271285 : && !oprnd_info->any_pattern
988 557728 : && number_of_oprnds > 1)
989 : {
990 : /* Now for commutative ops we should see whether we can
991 : make the other operand matching. */
992 103532 : if (dump_enabled_p ())
993 203 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
994 : "treating operand as external\n");
995 103532 : oprnd_info->first_dt = dt = vect_external_def;
996 : }
997 : else
998 : {
999 183149 : if (dump_enabled_p ())
1000 407 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1001 : "Build SLP failed: different types\n");
1002 183149 : return 1;
1003 : }
1004 : }
1005 :
1006 : /* Make sure to demote the overall operand to external. */
1007 16815020 : if (dt == vect_external_def)
1008 333818 : oprnd_info->first_dt = vect_external_def;
1009 : /* For a SLP reduction chain we want to duplicate the reduction to
1010 : each of the chain members. That gets us a sane SLP graph (still
1011 : the stmts are not 100% correct wrt the initial values). */
1012 16481202 : else if ((dt == vect_internal_def
1013 16481202 : || dt == vect_reduction_def)
1014 15556904 : && oprnd_info->first_dt == vect_reduction_def
1015 100868 : && !STMT_VINFO_DATA_REF (stmt_info)
1016 100868 : && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
1017 4509 : && !STMT_VINFO_DATA_REF (def_stmt_info)
1018 16485711 : && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
1019 : == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
1020 : {
1021 4509 : oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
1022 4509 : oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
1023 : }
1024 :
1025 16815020 : ++i;
1026 : }
1027 :
1028 : /* Swap operands. */
1029 9108946 : if (swapped)
1030 : {
1031 40814 : if (dump_enabled_p ())
1032 438 : dump_printf_loc (MSG_NOTE, vect_location,
1033 : "swapped operands to match def types in %G",
1034 : stmt_info->stmt);
1035 : }
1036 :
1037 : return 0;
1038 : }
1039 :
1040 : /* Return true if call statements CALL1 and CALL2 are similar enough
1041 : to be combined into the same SLP group.

The calls must have the same number of arguments and the same combined
function.  When ALLOW_TWO_OPERATORS is true, differing combined functions
are still accepted if each of them is CFN_FMA or CFN_FMS.  Calls to
internal functions additionally need compatible types for the lhs and
for every argument; all other calls need an equal callee
(operand_equal_p on gimple_call_fn) and the identical gimple_call_fntype.
Finally, every argument whose index is not listed in the operand map
returned by vect_get_operand_map for CALL1 must be equal in both calls.  */
1042 :
1043 : bool
1044 21243 : compatible_calls_p (gcall *call1, gcall *call2, bool allow_two_operators)
1045 : {
1046 21243 : unsigned int nargs = gimple_call_num_args (call1);
1047 21243 : if (nargs != gimple_call_num_args (call2))
1048 : return false;
1049 :
1050 19292 : auto cfn1 = gimple_call_combined_fn (call1);
1051 19292 : auto cfn2 = gimple_call_combined_fn (call2);
1052 19292 : if (cfn1 != cfn2
1053 2 : && (!allow_two_operators
1054 2 : || !((cfn1 == CFN_FMA || cfn1 == CFN_FMS)
1055 2 : && (cfn2 == CFN_FMA || cfn2 == CFN_FMS))))
1056 : return false;
1057 :
1058 19292 : if (gimple_call_internal_p (call1))
1059 : {
1060 7009 : if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
1061 7009 : TREE_TYPE (gimple_call_lhs (call2))))
1062 : return false;
1063 14432 : for (unsigned int i = 0; i < nargs; ++i)
1064 7423 : if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
1065 7423 : TREE_TYPE (gimple_call_arg (call2, i))))
1066 : return false;
1067 : }
1068 : else
1069 : {
1070 12283 : if (!operand_equal_p (gimple_call_fn (call1),
1071 12283 : gimple_call_fn (call2), 0))
1072 : return false;
1073 :
1074 26928 : if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
1075 : return false;
1076 : }
1077 :
1078 : /* Check that any unvectorized arguments are equal. */
1079 15985 : if (const int *map = vect_get_operand_map (call1, false, false))
1080 : {
1081 15 : unsigned int nkept = *map++; /* The first map entry is the number of argument indices that follow.  */
1082 15 : unsigned int mapi = 0;
1083 57 : for (unsigned int i = 0; i < nargs; ++i)
1084 42 : if (mapi < nkept && map[mapi] == int (i))
1085 27 : mapi += 1; /* Argument I is listed in the map, so it is not compared here.  */
1086 15 : else if (!operand_equal_p (gimple_call_arg (call1, i),
1087 15 : gimple_call_arg (call2, i)))
1088 : return false;
1089 : }
1090 :
1091 : return true;
1092 : }
1093 :
1094 : /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
1095 : caller's attempt to find the vector type in STMT_INFO with the narrowest
1096 : element type. Return true if VECTYPE is nonnull and if it is valid
1097 : for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
1098 : number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
1099 : vect_build_slp_tree.

Two cases make this function return false, each reported to the dump
file when dumping is enabled: VECTYPE is null, or VINFO is a bb_vec_info
and GROUP_SIZE is not a multiple of the number of units in VECTYPE.
*MAX_NUNITS is left untouched in both cases.  */
1100 :
1101 : static bool
1102 5497333 : vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
1103 : unsigned int group_size,
1104 : tree vectype, poly_uint64 *max_nunits)
1105 : {
1106 5497333 : if (!vectype)
1107 : {
1108 3925 : if (dump_enabled_p ())
1109 7 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1110 : "Build SLP failed: unsupported data-type in %G\n",
1111 : stmt_info->stmt);
1112 : /* Fatal mismatch. */
1113 3925 : return false;
1114 : }
1115 :
1116 : /* If populating the vector type requires unrolling then fail
1117 : before adjusting *max_nunits for basic-block vectorization. */
1118 5493408 : if (is_a <bb_vec_info> (vinfo)
1119 5493408 : && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
1120 : {
1121 142018 : if (dump_enabled_p ())
1122 34 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1123 : "Build SLP failed: unrolling required "
1124 : "in basic block SLP\n");
1125 : /* Fatal mismatch. */
1126 142018 : return false;
1127 : }
1128 :
1129 : /* In case of multiple types we need to detect the smallest type. */
1130 5351390 : vect_update_max_nunits (max_nunits, vectype);
1131 5351390 : return true;
1132 : }
1133 :
1134 : /* Verify if the scalar stmts STMTS are isomorphic, require data
1135 : permutation or are of unsupported types of operation. Return
1136 : true if they are, otherwise return false and indicate in *MATCHES
1137 : which stmts are not isomorphic to the first one. If MATCHES[0]
1138 : is false then this indicates the comparison could not be
1139 : carried out or the stmts will never be vectorized by SLP.
1140 :
1141 : Note COND_EXPR is possibly isomorphic to another one after swapping its
1142 : operands. Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
1143 : the first stmt by swapping the two operands of comparison; set SWAP[i]
1144 : to 2 if stmt I is isomorphic to the first stmt by inverting the code
1145 : of comparison. Take A1 >= B1 ? X1 : Y1 as an example, it can be swapped
1146 : to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1.

SWAP[i] is also set to 1 for a non-memory stmt I whose tree code is the
swapped form of the comparison code of the first stmt, and for one whose
reduction index mirrors that of the first stmt under a commutative tree
code.

*NODE_VECTYPE is set to the vector type computed for the first stmt; this
happens before the per-stmt checks, so it can be set even when the
function later returns false.  *TWO_OPERATORS is set to true when a
second operation code was recorded next to that of the first stmt and it
is not a reference or comparison tree code.  *MAX_NUNITS is updated via
vect_record_max_nunits.  When that call fails the per-stmt analysis still
runs; if all stmts match, the function then returns false with either
MATCHES[0] cleared or the tail of MATCHES cleared at the point where the
group needs to be split.  */
1147 :
1148 : static bool
1149 5762971 : vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
1150 : vec<stmt_vec_info> stmts, unsigned int group_size,
1151 : poly_uint64 *max_nunits, bool *matches,
1152 : bool *two_operators, tree *node_vectype)
1153 : {
1154 5762971 : unsigned int i;
1155 5762971 : stmt_vec_info first_stmt_info = stmts[0];
1156 5762971 : code_helper first_stmt_code = ERROR_MARK;
1157 5762971 : code_helper alt_stmt_code = ERROR_MARK;
1158 5762971 : code_helper first_cond_code = ERROR_MARK;
1159 5762971 : bool need_same_oprnds = false;
1160 5762971 : tree first_lhs = NULL_TREE;
1161 5762971 : tree first_op1 = NULL_TREE;
1162 5762971 : stmt_vec_info first_load = NULL, prev_first_load = NULL;
1163 5762971 : bool first_stmt_ldst_p = false, first_stmt_ldst_masklen_p = false;
1164 5762971 : bool first_stmt_phi_p = false;
1165 5762971 : int first_reduc_idx = -1;
1166 5762971 : bool maybe_soft_fail = false;
1167 5762971 : tree soft_fail_nunits_vectype = NULL_TREE;
1168 :
1169 5762971 : tree vectype, nunits_vectype;
1170 5762971 : if (!vect_get_vector_types_for_stmt (vinfo, first_stmt_info, &vectype,
1171 : &nunits_vectype, group_size))
1172 : {
1173 : /* Fatal mismatch. */
1174 207234 : matches[0] = false;
1175 207234 : return false;
1176 : }
1177 5555737 : if (is_a <bb_vec_info> (vinfo)
1178 5555737 : && known_le (TYPE_VECTOR_SUBPARTS (vectype), 1U))
1179 : {
1180 358361 : if (dump_enabled_p ())
1181 296 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1182 : "Build SLP failed: not using single lane "
1183 : "vector type %T\n", vectype);
1184 358361 : matches[0] = false;
1185 358361 : return false;
1186 : }
1187 : /* Record nunits required but continue analysis, producing matches[]
1188 : as if nunits was not an issue. This allows splitting of groups
1189 : to happen. */
1190 5197376 : if (nunits_vectype
1191 5197376 : && !vect_record_max_nunits (vinfo, first_stmt_info, group_size,
1192 : nunits_vectype, max_nunits))
1193 : {
1194 142018 : gcc_assert (is_a <bb_vec_info> (vinfo));
1195 142018 : maybe_soft_fail = true;
1196 142018 : soft_fail_nunits_vectype = nunits_vectype;
1197 : }
1198 :
1199 5197376 : gcc_assert (vectype || !gimple_get_lhs (first_stmt_info->stmt));
1200 5197376 : *node_vectype = vectype;
1201 :
1202 : /* For every stmt in NODE find its def stmt/s. */
1203 5197376 : stmt_vec_info stmt_info;
1204 22179446 : FOR_EACH_VEC_ELT (stmts, i, stmt_info)
1205 : {
1206 17145263 : bool ldst_p = false;
1207 17145263 : bool ldst_masklen_p = false;
1208 17145263 : bool phi_p = false;
1209 17145263 : code_helper rhs_code = ERROR_MARK;
1210 :
1211 17145263 : swap[i] = 0;
1212 17145263 : matches[i] = false;
1213 17145263 : if (!stmt_info)
1214 : {
1215 40246 : matches[i] = true; /* A null entry in STMTS counts as a match.  */
1216 17022316 : continue;
1217 : }
1218 :
1219 17105017 : gimple *stmt = stmt_info->stmt;
1220 17105017 : if (dump_enabled_p ())
1221 218452 : dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
1222 :
1223 : /* Fail to vectorize statements marked as unvectorizable, throw
1224 : or are volatile. */
1225 17105017 : if (!STMT_VINFO_VECTORIZABLE (stmt_info)
1226 16914600 : || stmt_can_throw_internal (cfun, stmt)
1227 33229063 : || gimple_has_volatile_ops (stmt))
1228 : {
1229 195916 : if (dump_enabled_p ())
1230 199 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1231 : "Build SLP failed: unvectorizable statement %G",
1232 : stmt);
1233 : /* ??? For BB vectorization we want to commutate operands in a way
1234 : to shuffle all unvectorizable defs into one operand and have
1235 : the other still vectorized. The following doesn't reliably
1236 : work for this though but it's the easiest we can do here. */
1237 195916 : if (is_a <bb_vec_info> (vinfo) && i != 0)
1238 64372 : continue;
1239 : /* Fatal mismatch. */
1240 131544 : matches[0] = false;
1241 131544 : return false;
1242 : }
1243 :
1244 16909101 : gcall *call_stmt = dyn_cast <gcall *> (stmt);
1245 16909101 : tree lhs = gimple_get_lhs (stmt);
1246 16909101 : if (lhs == NULL_TREE && !call_stmt)
1247 : {
1248 36 : if (dump_enabled_p ())
1249 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1250 : "Build SLP failed: not GIMPLE_ASSIGN nor "
1251 : "GIMPLE_CALL %G", stmt);
1252 36 : if (is_a <bb_vec_info> (vinfo) && i != 0)
1253 36 : continue;
1254 : /* Fatal mismatch. */
1255 0 : matches[0] = false;
1256 0 : return false;
1257 : }
1258 :
1259 16909065 : if (call_stmt)
1260 : {
1261 102597 : combined_fn cfn = gimple_call_combined_fn (call_stmt);
1262 102597 : if (cfn != CFN_LAST && cfn != CFN_MASK_CALL)
1263 58630 : rhs_code = cfn;
1264 : else
1265 : rhs_code = CALL_EXPR;
1266 :
1267 102597 : if (cfn == CFN_GATHER_LOAD
1268 102597 : || cfn == CFN_SCATTER_STORE)
1269 : ldst_p = true;
1270 : else if (cfn == CFN_MASK_LOAD
1271 : || cfn == CFN_MASK_GATHER_LOAD
1272 : || cfn == CFN_MASK_LEN_GATHER_LOAD
1273 : || cfn == CFN_MASK_SCATTER_STORE
1274 : || cfn == CFN_MASK_LEN_SCATTER_STORE)
1275 : {
1276 : ldst_p = true;
1277 : ldst_masklen_p = true;
1278 : }
1279 : else if (cfn == CFN_MASK_STORE)
1280 : {
1281 : ldst_p = true;
1282 : ldst_masklen_p = true;
1283 : rhs_code = CFN_MASK_STORE;
1284 : }
1285 : else if (cfn == CFN_GOMP_SIMD_LANE)
1286 : ;
1287 91063 : else if ((cfn != CFN_LAST
1288 : && cfn != CFN_MASK_CALL
1289 47096 : && internal_fn_p (cfn)
1290 36893 : && !vectorizable_internal_fn_p (as_internal_fn (cfn)))
1291 90988 : || gimple_call_tail_p (call_stmt)
1292 90988 : || gimple_call_noreturn_p (call_stmt)
1293 182051 : || gimple_call_chain (call_stmt))
1294 : {
1295 424 : if (dump_enabled_p ())
1296 13 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1297 : "Build SLP failed: unsupported call type %G",
1298 : (gimple *) call_stmt);
1299 424 : if (is_a <bb_vec_info> (vinfo) && i != 0)
1300 64 : continue;
1301 : /* Fatal mismatch. */
1302 360 : matches[0] = false;
1303 360 : return false;
1304 : }
1305 : }
1306 16806468 : else if (gimple_code (stmt) == GIMPLE_PHI)
1307 : {
1308 : rhs_code = ERROR_MARK;
1309 : phi_p = true;
1310 : }
1311 : else
1312 : {
1313 16015914 : rhs_code = gimple_assign_rhs_code (stmt);
1314 16015914 : ldst_p = STMT_VINFO_DATA_REF (stmt_info) != nullptr;
1315 : }
1316 :
1317 : /* Check the operation. */
1318 16908641 : if (i == 0)
1319 : {
1320 5065472 : first_lhs = lhs;
1321 5065472 : first_stmt_code = rhs_code;
1322 5065472 : first_stmt_ldst_p = ldst_p;
1323 5065472 : first_stmt_ldst_masklen_p = ldst_masklen_p;
1324 5065472 : first_stmt_phi_p = phi_p;
1325 5065472 : first_reduc_idx = STMT_VINFO_REDUC_IDX (stmt_info);
1326 :
1327 : /* Shift arguments should be equal in all the packed stmts for a
1328 : vector shift with scalar shift operand. */
1329 5065472 : if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
1330 4929982 : || rhs_code == LROTATE_EXPR
1331 9995382 : || rhs_code == RROTATE_EXPR)
1332 : {
1333 : /* First see if we have a vector/vector shift. */
1334 135945 : if (!directly_supported_p (rhs_code, vectype, optab_vector))
1335 : {
1336 : /* No vector/vector shift, try for a vector/scalar shift. */
1337 123911 : if (!directly_supported_p (rhs_code, vectype, optab_scalar))
1338 : {
1339 11991 : if (dump_enabled_p ())
1340 386 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1341 : "Build SLP failed: "
1342 : "op not supported by target.\n");
1343 11991 : if (is_a <bb_vec_info> (vinfo) && i != 0)
1344 : continue; /* NOTE(review): I is 0 in this branch, so this continue looks unreachable.  */
1345 : /* Fatal mismatch. */
1346 11991 : matches[0] = false;
1347 11991 : return false;
1348 : }
1349 111920 : need_same_oprnds = true;
1350 111920 : first_op1 = gimple_assign_rhs2 (stmt);
1351 : }
1352 : }
1353 4929527 : else if (rhs_code == WIDEN_LSHIFT_EXPR)
1354 : {
1355 0 : need_same_oprnds = true;
1356 0 : first_op1 = gimple_assign_rhs2 (stmt);
1357 : }
1358 4929527 : else if (!ldst_p
1359 4929527 : && rhs_code == BIT_FIELD_REF)
1360 : {
1361 5773 : tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
1362 5773 : if (!is_a <bb_vec_info> (vinfo)
1363 5647 : || TREE_CODE (vec) != SSA_NAME
1364 : /* When the element types are not compatible we pun the
1365 : source to the target vectype which requires equal size. */
1366 11408 : || ((!VECTOR_TYPE_P (TREE_TYPE (vec))
1367 4912 : || !types_compatible_p (TREE_TYPE (vectype),
1368 4912 : TREE_TYPE (TREE_TYPE (vec))))
1369 1039 : && !operand_equal_p (TYPE_SIZE (vectype),
1370 1039 : TYPE_SIZE (TREE_TYPE (vec)))))
1371 : {
1372 781 : if (dump_enabled_p ())
1373 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1374 : "Build SLP failed: "
1375 : "BIT_FIELD_REF not supported\n");
1376 : /* Fatal mismatch. */
1377 781 : matches[0] = false;
1378 781 : return false;
1379 : }
1380 : }
1381 4923754 : else if (rhs_code == CFN_DIV_POW2)
1382 : {
1383 0 : need_same_oprnds = true;
1384 0 : first_op1 = gimple_call_arg (call_stmt, 1);
1385 : }
1386 4923754 : else if (rhs_code == CFN_GOMP_SIMD_LANE)
1387 : {
1388 3153 : need_same_oprnds = true;
1389 3153 : first_op1 = gimple_call_arg (call_stmt, 1);
1390 : }
1391 : }
1392 : else
1393 : {
1394 11843169 : int comm_arg;
1395 11843522 : if (first_reduc_idx != STMT_VINFO_REDUC_IDX (stmt_info)
1396 : /* For SLP reduction groups the index isn't necessarily
1397 : uniform but only that of the first stmt matters. */
1398 2169 : && !(first_reduc_idx != -1
1399 2169 : && STMT_VINFO_REDUC_IDX (stmt_info) != -1
1400 2169 : && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
1401 11843169 : && !(first_reduc_idx != -1
1402 982 : && STMT_VINFO_REDUC_IDX (stmt_info) != -1
1403 982 : && (comm_arg = first_commutative_argument
1404 982 : (rhs_code, TREE_TYPE (lhs))) >= 0
1405 : && (first_reduc_idx
1406 773 : == 2 * comm_arg + 1 - STMT_VINFO_REDUC_IDX (stmt_info))))
1407 : {
1408 353 : if (dump_enabled_p ())
1409 : {
1410 12 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1411 : "Build SLP failed: different reduc_idx "
1412 : "%d instead of %d in %G",
1413 : STMT_VINFO_REDUC_IDX (stmt_info),
1414 : first_reduc_idx, stmt);
1415 : }
1416 : /* Mismatch. */
1417 353 : continue;
1418 : }
1419 11842816 : if (!ldst_p
1420 9291882 : && first_stmt_code != rhs_code
1421 13251164 : && alt_stmt_code == ERROR_MARK)
1422 : alt_stmt_code = rhs_code;
1423 13225959 : if ((!ldst_p
1424 9291882 : && first_stmt_code != rhs_code
1425 1408348 : && (first_stmt_code != IMAGPART_EXPR
1426 129 : || rhs_code != REALPART_EXPR)
1427 1408328 : && (first_stmt_code != REALPART_EXPR
1428 526 : || rhs_code != IMAGPART_EXPR)
1429 : /* Handle mismatches in plus/minus by computing both
1430 : and merging the results. */
1431 1408317 : && !((((first_stmt_code == PLUS_EXPR
1432 1304554 : || first_stmt_code == MINUS_EXPR)
1433 132550 : && (alt_stmt_code == PLUS_EXPR
1434 123459 : || alt_stmt_code == MINUS_EXPR))
1435 1379078 : || ((first_stmt_code == CFN_FMA
1436 1379076 : || first_stmt_code == CFN_FMS)
1437 2 : && (alt_stmt_code == CFN_FMA
1438 2 : || alt_stmt_code == CFN_FMS)))
1439 29241 : && rhs_code == alt_stmt_code)
1440 1418950 : && !(first_stmt_code.is_tree_code ()
1441 1302879 : && rhs_code.is_tree_code ()
1442 1209241 : && (TREE_CODE_CLASS (tree_code (first_stmt_code))
1443 : == tcc_comparison)
1444 127081 : && (swap_tree_comparison (tree_code (first_stmt_code))
1445 127081 : == tree_code (rhs_code))
1446 : && (first_reduc_idx == -1
1447 0 : || REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
1448 : || (ldst_p
1449 5101868 : && (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1450 2550934 : != STMT_VINFO_GROUPED_ACCESS (first_stmt_info)))
1451 : || (ldst_p
1452 2507529 : && (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1453 2507529 : != STMT_VINFO_GATHER_SCATTER_P (first_stmt_info)))
1454 10459820 : || first_stmt_ldst_p != ldst_p
1455 10459681 : || (ldst_p && first_stmt_ldst_masklen_p != ldst_masklen_p)
1456 22302489 : || first_stmt_phi_p != phi_p)
1457 : {
1458 1383143 : if (dump_enabled_p ())
1459 : {
1460 2929 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1461 : "Build SLP failed: different operation "
1462 : "in stmt %G", stmt);
1463 2929 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1464 : "original stmt %G", first_stmt_info->stmt);
1465 : }
1466 : /* Mismatch. */
1467 1383143 : continue;
1468 : }
1469 :
1470 10462002 : if (!ldst_p
1471 7952279 : && first_stmt_code == BIT_FIELD_REF
1472 10465391 : && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
1473 5718 : != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
1474 : {
1475 2329 : if (dump_enabled_p ())
1476 40 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1477 : "Build SLP failed: different BIT_FIELD_REF "
1478 : "arguments in %G", stmt);
1479 : /* Mismatch. */
1480 2329 : continue;
1481 : }
1482 :
1483 10457344 : if (call_stmt
1484 22075 : && first_stmt_code != CFN_MASK_LOAD
1485 10478933 : && first_stmt_code != CFN_MASK_STORE)
1486 : {
1487 21243 : if (!is_a <gcall *> (stmts[0]->stmt)
1488 21243 : || !compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
1489 : call_stmt, true))
1490 : {
1491 5258 : if (dump_enabled_p ())
1492 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1493 : "Build SLP failed: different calls in %G",
1494 : stmt);
1495 : /* Mismatch. */
1496 5258 : continue;
1497 : }
1498 : }
1499 :
1500 10265931 : if ((phi_p || gimple_could_trap_p (stmt_info->stmt))
1501 11252021 : && (gimple_bb (first_stmt_info->stmt)
1502 986090 : != gimple_bb (stmt_info->stmt)))
1503 : {
1504 27268 : if (dump_enabled_p ())
1505 8 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1506 : "Build SLP failed: different BB for PHI "
1507 : "or possibly trapping operation in %G", stmt);
1508 : /* Mismatch. */
1509 27268 : continue;
1510 : }
1511 :
1512 10424818 : if (need_same_oprnds)
1513 : {
1514 55226 : tree other_op1 = gimple_arg (stmt, 1);
1515 55226 : if (!operand_equal_p (first_op1, other_op1, 0))
1516 : {
1517 7630 : if (dump_enabled_p ())
1518 123 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1519 : "Build SLP failed: different shift "
1520 : "arguments in %G", stmt);
1521 : /* Mismatch. */
1522 7630 : continue;
1523 : }
1524 : }
1525 :
1526 10417925 : if (first_lhs
1527 10417188 : && lhs
1528 10417188 : && !types_compatible_p (TREE_TYPE (lhs), TREE_TYPE (first_lhs)))
1529 : {
1530 737 : if (dump_enabled_p ())
1531 6 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1532 : "Build SLP failed: different vector type "
1533 : "in %G", stmt);
1534 : /* Mismatch. */
1535 737 : continue;
1536 : }
1537 : }
1538 :
1539 : /* Grouped store or load. */
1540 15469151 : if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1541 : {
1542 3871822 : gcc_assert (ldst_p);
1543 3871822 : if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
1544 : {
1545 : /* Store. */
1546 3035756 : gcc_assert (rhs_code == CFN_MASK_STORE
1547 : || REFERENCE_CLASS_P (lhs)
1548 : || DECL_P (lhs));
1549 : }
1550 : else
1551 : {
1552 : /* Load. */
1553 836066 : first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
1554 836066 : if (prev_first_load)
1555 : {
1556 : /* Check that there are no loads from different interleaving
1557 : chains in the same node. */
1558 381347 : if (prev_first_load != first_load)
1559 : {
1560 54516 : if (dump_enabled_p ())
1561 1994 : dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1562 : vect_location,
1563 : "Build SLP failed: different "
1564 : "interleaving chains in one node %G",
1565 : stmt);
1566 : /* Mismatch. */
1567 54516 : continue;
1568 : }
1569 : }
1570 : else
1571 : prev_first_load = first_load;
1572 : }
1573 : }
1574 : /* Non-grouped store or load. */
1575 11597329 : else if (ldst_p)
1576 : {
1577 888095 : if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))
1578 616978 : && rhs_code != CFN_GATHER_LOAD
1579 : && rhs_code != CFN_MASK_GATHER_LOAD
1580 : && rhs_code != CFN_MASK_LEN_GATHER_LOAD
1581 : && rhs_code != CFN_SCATTER_STORE
1582 : && rhs_code != CFN_MASK_SCATTER_STORE
1583 : && rhs_code != CFN_MASK_LEN_SCATTER_STORE
1584 616978 : && !STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1585 : /* Not grouped loads are handled as externals for BB
1586 : vectorization. For loop vectorization we can handle
1587 : splats the same we handle single element interleaving.
1588 : Likewise we can handle a collection of invariant refs. */
1589 1486222 : && (is_a <bb_vec_info> (vinfo)
1590 598127 : || (stmt_info != first_stmt_info
1591 68088 : && !(integer_zerop (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)))
1592 241 : && integer_zerop (DR_STEP (STMT_VINFO_DATA_REF
1593 : (first_stmt_info)))))))
1594 : {
1595 : /* Not grouped load. */
1596 67606 : if (dump_enabled_p ())
1597 145 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1598 : "Build SLP failed: not grouped load %G", stmt);
1599 :
1600 67606 : if (i != 0)
1601 67606 : continue;
1602 : /* Fatal mismatch. */
1603 0 : matches[0] = false;
1604 0 : return false;
1605 : }
1606 : }
1607 : /* Not memory operation. */
1608 : else
1609 : {
1610 10709234 : if (!phi_p
1611 10041670 : && rhs_code.is_tree_code ()
1612 9993049 : && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_binary
1613 1520940 : && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_unary
1614 940254 : && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_expression
1615 878250 : && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_comparison
1616 64930 : && rhs_code != VIEW_CONVERT_EXPR
1617 : && rhs_code != CALL_EXPR
1618 : && rhs_code != BIT_FIELD_REF
1619 10709234 : && rhs_code != SSA_NAME)
1620 : {
1621 18517 : if (dump_enabled_p ())
1622 7 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1623 : "Build SLP failed: operation unsupported %G",
1624 : stmt);
1625 18517 : if (is_a <bb_vec_info> (vinfo) && i != 0)
1626 0 : continue;
1627 : /* Fatal mismatch. */
1628 18517 : matches[0] = false;
1629 18517 : return false;
1630 : }
1631 :
1632 10690717 : if (rhs_code == COND_EXPR)
1633 : {
1634 59121 : tree cond_expr = gimple_assign_rhs1 (stmt);
1635 59121 : enum tree_code cond_code = TREE_CODE (cond_expr);
1636 59121 : enum tree_code swap_code = ERROR_MARK;
1637 59121 : enum tree_code invert_code = ERROR_MARK;
1638 :
1639 59121 : if (i == 0)
1640 49828 : first_cond_code = TREE_CODE (cond_expr);
1641 9293 : else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1642 : {
1643 0 : bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
1644 0 : swap_code = swap_tree_comparison (cond_code);
1645 0 : invert_code = invert_tree_comparison (cond_code, honor_nans);
1646 : }
1647 :
1648 59121 : if (first_cond_code == cond_code)
1649 : ;
1650 : /* Isomorphic can be achieved by swapping. */
1651 0 : else if (first_cond_code == swap_code)
1652 0 : swap[i] = 1;
1653 : /* Isomorphic can be achieved by inverting. */
1654 0 : else if (first_cond_code == invert_code)
1655 0 : swap[i] = 2;
1656 : else
1657 : {
1658 0 : if (dump_enabled_p ())
1659 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1660 : "Build SLP failed: different"
1661 : " operation %G", stmt);
1662 : /* Mismatch. */
1663 0 : continue;
1664 : }
1665 : }
1666 :
1667 10690717 : if (i != 0
1668 7910343 : && first_stmt_code != rhs_code
1669 68745 : && first_stmt_code.is_tree_code ()
1670 68743 : && rhs_code.is_tree_code ()
1671 68743 : && TREE_CODE_CLASS ((tree_code)first_stmt_code) == tcc_comparison
1672 10730406 : && (swap_tree_comparison ((tree_code)first_stmt_code)
1673 39689 : == (tree_code)rhs_code))
1674 39689 : swap[i] = 1;
1675 :
1676 10690717 : if (i != 0
1677 7910343 : && first_reduc_idx != STMT_VINFO_REDUC_IDX (stmt_info)
1678 1574 : && first_reduc_idx != -1
1679 1574 : && STMT_VINFO_REDUC_IDX (stmt_info) != -1
1680 1574 : && rhs_code.is_tree_code ()
1681 1566 : && commutative_tree_code (tree_code (rhs_code))
1682 10692283 : && first_reduc_idx == 1 - STMT_VINFO_REDUC_IDX (stmt_info))
1683 1566 : swap[i] = 1;
1684 : }
1685 :
1686 15328512 : matches[i] = true;
1687 : }
1688 :
1689 20372186 : for (i = 0; i < group_size; ++i)
1690 16019815 : if (!matches[i])
1691 : return false;
1692 :
1693 : /* If we allowed a two-operation SLP node verify the target can cope
1694 : with the permute we are going to use.
NOTE(review): only the *TWO_OPERATORS flag is set here; no permute check
is visible in this function, so presumably it happens in the caller.  */
1695 4352371 : if (alt_stmt_code != ERROR_MARK
1696 4352371 : && (!alt_stmt_code.is_tree_code ()
1697 53561 : || (TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_reference
1698 53561 : && TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_comparison)))
1699 : {
1700 14463 : *two_operators = true;
1701 : }
1702 :
1703 4352371 : if (maybe_soft_fail)
1704 : {
1705 141610 : unsigned HOST_WIDE_INT const_nunits;
1706 141610 : if (!TYPE_VECTOR_SUBPARTS
1707 141610 : (soft_fail_nunits_vectype).is_constant (&const_nunits)
1708 141610 : || const_nunits > group_size)
1709 0 : matches[0] = false;
1710 : else
1711 : {
1712 : /* With constant vector elements simulate a mismatch at the
1713 : point we need to split.
NOTE(review): the mask below equals GROUP_SIZE modulo CONST_NUNITS only
when CONST_NUNITS is a power of two -- confirm that always holds here.  */
1714 141610 : unsigned tail = group_size & (const_nunits - 1);
1715 141610 : memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
1716 : }
1717 141610 : return false;
1718 : }
1719 :
1720 : return true;
1721 : }
1722 :
1723 : /* Traits for the hash_set to record failed SLP builds for a stmt set.
1724 : Note we never remove apart from at destruction time so we do not
1725 : need a special value for deleted that differs from empty.

Keys are vec <stmt_vec_info>.  A key for which exists () is false is
reported as both empty and deleted.  mark_empty, mark_deleted and remove
all release the vector.  */
1726 : struct bst_traits
1727 : {
1728 : typedef vec <stmt_vec_info> value_type;
1729 : typedef vec <stmt_vec_info> compare_type;
1730 : static inline hashval_t hash (value_type);
1731 : static inline bool equal (value_type existing, value_type candidate);
1732 479989088 : static inline bool is_empty (value_type x) { return !x.exists (); }
1733 107526258 : static inline bool is_deleted (value_type x) { return !x.exists (); }
1734 : static const bool empty_zero_p = true;
1735 0 : static inline void mark_empty (value_type &x) { x.release (); }
1736 : static inline void mark_deleted (value_type &x) { x.release (); }
1737 9231886 : static inline void remove (value_type &x) { x.release (); }
1738 : };
/* Hash the stmt vector X by feeding the gimple UID of each element's
   stmt, in order, into an inchash::hash.  A null element contributes -1.  */
1739 : inline hashval_t
1740 93669909 : bst_traits::hash (value_type x)
1741 : {
1742 93669909 : inchash::hash h;
1743 424237028 : for (unsigned i = 0; i < x.length (); ++i)
1744 330567119 : h.add_int (x[i] ? gimple_uid (x[i]->stmt) : -1);
1745 93669909 : return h.end ();
1746 : }
/* Return true if EXISTING and CANDIDATE have the same length and hold
   the same element at every index.  */
1747 : inline bool
1748 81959155 : bst_traits::equal (value_type existing, value_type candidate)
1749 : {
1750 245877465 : if (existing.length () != candidate.length ())
1751 : return false;
1752 83357726 : for (unsigned i = 0; i < existing.length (); ++i)
1753 79004451 : if (existing[i] != candidate[i])
1754 : return false;
1755 : return true;
1756 : }
1757 :
/* Map from a vector of scalar stmts to an slp_tree, with keys hashed and
   compared through bst_traits.  */
1758 : typedef hash_map <vec <stmt_vec_info>, slp_tree,
1759 : simple_hashmap_traits <bst_traits, slp_tree> >
1760 : scalar_stmts_to_slp_tree_map_t;
1761 :
1762 : /* Release BST_MAP.  Call vect_free_slp_tree on every non-null SLP tree
stored in the map and then delete the map itself, so BST_MAP must not be
used afterwards.  */
1763 :
1764 : static void
1765 1794282 : release_scalar_stmts_to_slp_tree_map (scalar_stmts_to_slp_tree_map_t *bst_map)
1766 : {
1767 : /* The map keeps a reference on SLP nodes built, release that. */
1768 11026168 : for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
1769 20258054 : it != bst_map->end (); ++it)
1770 9231886 : if ((*it).second)
1771 9231886 : vect_free_slp_tree ((*it).second);
1772 1794282 : delete bst_map;
1773 1794282 : }
1774 :
1775 : /* ??? This was std::pair<std::pair<tree_code, vect_def_type>, tree>
1776 : but then vec::insert does memmove and that's not compatible with
1777 : std::pair. */
1778 : struct chain_op_t
1779 : {
1780 3691438 : chain_op_t (tree_code code_, vect_def_type dt_, tree op_)
1781 3691438 : : code (code_), dt (dt_), op (op_) {}
1782 : tree_code code;
1783 : vect_def_type dt;
1784 : tree op;
1785 : };
1786 :
1787 : /* Comparator for sorting associatable chains. */
1788 :
1789 : static int
1790 8248221 : dt_sort_cmp (const void *op1_, const void *op2_, void *)
1791 : {
1792 8248221 : auto *op1 = (const chain_op_t *) op1_;
1793 8248221 : auto *op2 = (const chain_op_t *) op2_;
1794 8248221 : if (op1->dt != op2->dt)
1795 942589 : return (int)op1->dt - (int)op2->dt;
1796 7305632 : return (int)op1->code - (int)op2->code;
1797 : }
1798 :
1799 : /* Linearize the associatable expression chain at START with the
1800 : associatable operation CODE (where PLUS_EXPR also allows MINUS_EXPR),
1801 : filling CHAIN with the result and using WORKLIST as intermediate storage.
1802 : CODE_STMT and ALT_CODE_STMT are filled with the first stmt using CODE
1803 : or MINUS_EXPR. *CHAIN_STMTS if not NULL is filled with all computation
1804 : stmts, starting with START. When ALLOW_ALT_CODE is false, do not
1805 : follow into MINUS_EXPR when building a PLUS chain (treat MINUS as leaf). */
1806 :
1807 : static void
1808 1671252 : vect_slp_linearize_chain (vec_info *vinfo,
1809 : vec<std::pair<tree_code, gimple *> > &worklist,
1810 : vec<chain_op_t> &chain,
1811 : enum tree_code code, gimple *start,
1812 : gimple *&code_stmt, gimple *&alt_code_stmt,
1813 : vec<gimple *> *chain_stmts,
1814 : bool allow_alt_code = true)
1815 : {
1816 : /* For each lane linearize the addition/subtraction (or other
1817 : uniform associatable operation) expression tree. */
1818 1671252 : worklist.safe_push (std::make_pair (code, start));
1819 3691438 : while (!worklist.is_empty ())
1820 : {
1821 2020186 : auto entry = worklist.pop ();
1822 2020186 : gassign *stmt = as_a <gassign *> (entry.second);
1823 2020186 : enum tree_code in_code = entry.first;
1824 4040372 : enum tree_code this_code = gimple_assign_rhs_code (stmt);
1825 : /* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE. */
1826 2020186 : if (!code_stmt
1827 2020186 : && gimple_assign_rhs_code (stmt) == code)
1828 1418096 : code_stmt = stmt;
1829 602090 : else if (!alt_code_stmt
1830 602090 : && gimple_assign_rhs_code (stmt) == MINUS_EXPR)
1831 305953 : alt_code_stmt = stmt;
1832 2020186 : if (chain_stmts)
1833 1947409 : chain_stmts->safe_push (stmt);
1834 6060558 : for (unsigned opnum = 1; opnum <= 2; ++opnum)
1835 : {
1836 4040372 : tree op = gimple_op (stmt, opnum);
1837 4040372 : vect_def_type dt;
1838 4040372 : stmt_vec_info def_stmt_info;
1839 4040372 : bool res = vect_is_simple_use (op, vinfo, &dt, &def_stmt_info);
1840 4040372 : gcc_assert (res);
1841 4040372 : if (dt == vect_internal_def
1842 4040372 : && is_pattern_stmt_p (def_stmt_info))
1843 8636 : op = gimple_get_lhs (def_stmt_info->stmt);
1844 4040372 : gimple *use_stmt;
1845 4040372 : use_operand_p use_p;
1846 4040372 : if (dt == vect_internal_def
1847 3756986 : && single_imm_use (op, &use_p, &use_stmt)
1848 2329185 : && is_gimple_assign (def_stmt_info->stmt)
1849 6190318 : && (gimple_assign_rhs_code (def_stmt_info->stmt) == code
1850 1801324 : || (allow_alt_code
1851 49385 : && code == PLUS_EXPR
1852 28860 : && (gimple_assign_rhs_code (def_stmt_info->stmt)
1853 : == MINUS_EXPR))))
1854 : {
1855 348934 : tree_code op_def_code = this_code;
1856 348934 : if (op_def_code == MINUS_EXPR && opnum == 1)
1857 51097 : op_def_code = PLUS_EXPR;
1858 348934 : if (in_code == MINUS_EXPR)
1859 135 : op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1860 348934 : worklist.safe_push (std::make_pair (op_def_code,
1861 348934 : def_stmt_info->stmt));
1862 : }
1863 : else
1864 : {
1865 3691438 : tree_code op_def_code = this_code;
1866 3691438 : if (op_def_code == MINUS_EXPR && opnum == 1)
1867 254973 : op_def_code = PLUS_EXPR;
1868 3691438 : if (in_code == MINUS_EXPR)
1869 3997 : op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1870 3691438 : chain.safe_push (chain_op_t (op_def_code, dt, op));
1871 : }
1872 : }
1873 : }
1874 1671252 : }
1875 :
1876 : static slp_tree
1877 : vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1878 : vec<stmt_vec_info> stmts, unsigned int group_size,
1879 : poly_uint64 *max_nunits,
1880 : bool *matches, unsigned *limit, unsigned *tree_size,
1881 : scalar_stmts_to_slp_tree_map_t *bst_map);
1882 :
1883 : static slp_tree
1884 6250055 : vect_build_slp_tree (vec_info *vinfo,
1885 : vec<stmt_vec_info> stmts, unsigned int group_size,
1886 : poly_uint64 *max_nunits,
1887 : bool *matches, unsigned *limit, unsigned *tree_size,
1888 : scalar_stmts_to_slp_tree_map_t *bst_map)
1889 : {
1890 6250055 : if (slp_tree *leader = bst_map->get (stmts))
1891 : {
1892 481847 : if (dump_enabled_p ())
1893 17132 : dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
1894 17132 : !(*leader)->failed ? "" : "failed ",
1895 : (void *) *leader);
1896 481847 : if (!(*leader)->failed)
1897 : {
1898 434016 : SLP_TREE_REF_COUNT (*leader)++;
1899 434016 : vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
1900 434016 : stmts.release ();
1901 434016 : return *leader;
1902 : }
1903 47831 : memcpy (matches, (*leader)->failed, sizeof (bool) * group_size);
1904 47831 : return NULL;
1905 : }
1906 :
1907 : /* Single-lane SLP doesn't have the chance of run-away, do not account
1908 : it to the limit. */
1909 5768208 : if (stmts.length () > 1)
1910 : {
1911 3183116 : if (*limit == 0)
1912 : {
1913 1235 : if (dump_enabled_p ())
1914 12 : dump_printf_loc (MSG_NOTE, vect_location,
1915 : "SLP discovery limit exceeded\n");
1916 1235 : memset (matches, 0, sizeof (bool) * group_size);
1917 1235 : return NULL;
1918 : }
1919 3181881 : --*limit;
1920 : }
1921 :
1922 : /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
1923 : so we can pick up backedge destinations during discovery. */
1924 5766973 : slp_tree res = new _slp_tree;
1925 5766973 : SLP_TREE_DEF_TYPE (res) = vect_internal_def;
1926 5766973 : SLP_TREE_SCALAR_STMTS (res) = stmts;
1927 5766973 : bst_map->put (stmts.copy (), res);
1928 :
1929 5766973 : if (dump_enabled_p ())
1930 146007 : dump_printf_loc (MSG_NOTE, vect_location,
1931 : "starting SLP discovery for node %p\n", (void *) res);
1932 :
1933 5766973 : poly_uint64 this_max_nunits = 1;
1934 5766973 : slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
1935 : &this_max_nunits,
1936 : matches, limit, tree_size, bst_map);
1937 5766973 : if (!res_)
1938 : {
1939 2013173 : if (dump_enabled_p ())
1940 8297 : dump_printf_loc (MSG_NOTE, vect_location,
1941 : "SLP discovery for node %p failed\n", (void *) res);
1942 : /* Mark the node invalid so we can detect those when still in use
1943 : as backedge destinations. */
1944 2013173 : SLP_TREE_SCALAR_STMTS (res) = vNULL;
1945 2013173 : SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1946 2013173 : res->failed = XNEWVEC (bool, group_size);
1947 2013173 : if (flag_checking)
1948 : {
1949 : unsigned i;
1950 3567746 : for (i = 0; i < group_size; ++i)
1951 3567746 : if (!matches[i])
1952 : break;
1953 2013173 : gcc_assert (i < group_size);
1954 : }
1955 2013173 : memcpy (res->failed, matches, sizeof (bool) * group_size);
1956 : }
1957 : else
1958 : {
1959 3753800 : if (dump_enabled_p ())
1960 137710 : dump_printf_loc (MSG_NOTE, vect_location,
1961 : "SLP discovery for node %p succeeded\n",
1962 : (void *) res);
1963 3753800 : gcc_assert (res_ == res);
1964 3753800 : res->max_nunits = this_max_nunits;
1965 3753800 : vect_update_max_nunits (max_nunits, this_max_nunits);
1966 : /* Keep a reference for the bst_map use. */
1967 3753800 : SLP_TREE_REF_COUNT (res)++;
1968 : }
1969 : return res_;
1970 : }
1971 :
1972 : /* Helper for building an associated SLP node chain. */
1973 :
1974 : static void
1975 178 : vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype,
1976 : slp_tree op0, slp_tree op1,
1977 : stmt_vec_info oper1, stmt_vec_info oper2,
1978 : vec<std::pair<unsigned, unsigned> > lperm)
1979 : {
1980 178 : unsigned group_size = SLP_TREE_LANES (op1);
1981 :
1982 178 : slp_tree child1 = new _slp_tree;
1983 178 : SLP_TREE_DEF_TYPE (child1) = vect_internal_def;
1984 178 : SLP_TREE_VECTYPE (child1) = vectype;
1985 178 : SLP_TREE_LANES (child1) = group_size;
1986 178 : SLP_TREE_CHILDREN (child1).create (2);
1987 178 : SLP_TREE_CHILDREN (child1).quick_push (op0);
1988 178 : SLP_TREE_CHILDREN (child1).quick_push (op1);
1989 178 : SLP_TREE_REPRESENTATIVE (child1) = oper1;
1990 :
1991 178 : slp_tree child2 = new _slp_tree;
1992 178 : SLP_TREE_DEF_TYPE (child2) = vect_internal_def;
1993 178 : SLP_TREE_VECTYPE (child2) = vectype;
1994 178 : SLP_TREE_LANES (child2) = group_size;
1995 178 : SLP_TREE_CHILDREN (child2).create (2);
1996 178 : SLP_TREE_CHILDREN (child2).quick_push (op0);
1997 178 : SLP_TREE_REF_COUNT (op0)++;
1998 178 : SLP_TREE_CHILDREN (child2).quick_push (op1);
1999 178 : SLP_TREE_REF_COUNT (op1)++;
2000 178 : SLP_TREE_REPRESENTATIVE (child2) = oper2;
2001 :
2002 178 : SLP_TREE_DEF_TYPE (perm) = vect_internal_def;
2003 178 : SLP_TREE_CODE (perm) = VEC_PERM_EXPR;
2004 178 : SLP_TREE_VECTYPE (perm) = vectype;
2005 178 : SLP_TREE_LANES (perm) = group_size;
2006 : /* ??? We should set this NULL but that's not expected. */
2007 178 : SLP_TREE_REPRESENTATIVE (perm) = oper1;
2008 178 : SLP_TREE_LANE_PERMUTATION (perm) = lperm;
2009 178 : SLP_TREE_CHILDREN (perm).quick_push (child1);
2010 178 : SLP_TREE_CHILDREN (perm).quick_push (child2);
2011 178 : }
2012 :
/* Recursively build an SLP tree starting from NODE.
   Fail (and return NULL) if def-stmts are not isomorphic, require data
   permutation or are of unsupported types of operation; MATCHES is
   updated to flag lanes that did not match.  Otherwise, return the
   built SLP node.  */
2019 :
2020 : static slp_tree
2021 5766973 : vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
2022 : vec<stmt_vec_info> stmts, unsigned int group_size,
2023 : poly_uint64 *max_nunits,
2024 : bool *matches, unsigned *limit, unsigned *tree_size,
2025 : scalar_stmts_to_slp_tree_map_t *bst_map)
2026 : {
2027 5766973 : unsigned nops, i, this_tree_size = 0;
2028 5766973 : poly_uint64 this_max_nunits = *max_nunits;
2029 :
2030 5766973 : matches[0] = false;
2031 :
2032 5766973 : stmt_vec_info stmt_info = stmts[0];
2033 5766973 : if (!is_a<gcall *> (stmt_info->stmt)
2034 : && !is_a<gassign *> (stmt_info->stmt)
2035 : && !is_a<gphi *> (stmt_info->stmt))
2036 : return NULL;
2037 :
2038 5766902 : nops = gimple_num_args (stmt_info->stmt);
2039 5766902 : if (const int *map = vect_get_operand_map (stmt_info))
2040 35108 : nops = map[0];
2041 :
2042 : /* If the SLP node is a PHI (induction or reduction), terminate
2043 : the recursion. */
2044 5766902 : bool *skip_args = XALLOCAVEC (bool, nops);
2045 5766902 : memset (skip_args, 0, sizeof (bool) * nops);
2046 5766902 : if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2047 2781491 : if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
2048 : {
2049 299977 : tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
2050 299977 : tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
2051 : group_size);
2052 299977 : if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
2053 : max_nunits))
2054 : return NULL;
2055 :
2056 296052 : vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
2057 296052 : if (def_type == vect_induction_def)
2058 : {
2059 : /* Induction PHIs are not cycles but walk the initial
2060 : value. Only for inner loops through, for outer loops
2061 : we need to pick up the value from the actual PHIs
2062 : to more easily support peeling and epilogue vectorization. */
2063 190200 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2064 190200 : if (!nested_in_vect_loop_p (loop, stmt_info))
2065 189376 : skip_args[loop_preheader_edge (loop)->dest_idx] = true;
2066 : else
2067 : loop = loop->inner;
2068 190200 : skip_args[loop_latch_edge (loop)->dest_idx] = true;
2069 : }
2070 105852 : else if (def_type == vect_reduction_def
2071 : || def_type == vect_double_reduction_def
2072 : || def_type == vect_nested_cycle
2073 105852 : || def_type == vect_first_order_recurrence)
2074 : {
2075 : /* Else def types have to match. */
2076 : stmt_vec_info other_info;
2077 : bool all_same = true;
2078 239614 : FOR_EACH_VEC_ELT (stmts, i, other_info)
2079 : {
2080 135076 : if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
2081 1768716 : return NULL;
2082 135070 : if (other_info != stmt_info)
2083 26185 : all_same = false;
2084 : }
2085 104538 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2086 : /* Reduction initial values are not explicitly represented. */
2087 104538 : if (def_type != vect_first_order_recurrence
2088 104538 : && gimple_bb (stmt_info->stmt) == loop->header)
2089 101393 : skip_args[loop_preheader_edge (loop)->dest_idx] = true;
2090 : /* Reduction chain backedge defs are filled manually.
2091 : ??? Need a better way to identify a SLP reduction chain PHI.
2092 : Or a better overall way to SLP match those. */
2093 104538 : if (stmts.length () > 1
2094 104538 : && all_same && def_type == vect_reduction_def)
2095 2311 : skip_args[loop_latch_edge (loop)->dest_idx] = true;
2096 : }
2097 1308 : else if (def_type != vect_internal_def)
2098 : return NULL;
2099 : }
2100 :
2101 :
2102 5762971 : bool two_operators = false;
2103 5762971 : unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
2104 5762971 : tree vectype = NULL_TREE;
2105 5762971 : if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
2106 : &this_max_nunits, matches, &two_operators,
2107 : &vectype))
2108 : return NULL;
2109 :
2110 : /* If the SLP node is a load, terminate the recursion unless masked. */
2111 4210761 : if (STMT_VINFO_DATA_REF (stmt_info)
2112 2037596 : && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2113 : {
2114 901201 : if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2115 : gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
2116 : else
2117 : {
2118 882601 : *max_nunits = this_max_nunits;
2119 882601 : (*tree_size)++;
2120 882601 : node = vect_create_new_slp_node (node, stmts, 0);
2121 882601 : SLP_TREE_VECTYPE (node) = vectype;
2122 : /* And compute the load permutation. Whether it is actually
2123 : a permutation depends on the unrolling factor which is
2124 : decided later. */
2125 882601 : vec<unsigned> load_permutation;
2126 882601 : int j;
2127 882601 : stmt_vec_info load_info;
2128 882601 : load_permutation.create (group_size);
2129 882601 : stmt_vec_info first_stmt_info
2130 882601 : = STMT_VINFO_GROUPED_ACCESS (stmt_info)
2131 882601 : ? DR_GROUP_FIRST_ELEMENT (stmt_info) : stmt_info;
2132 882601 : bool any_permute = false;
2133 2126748 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
2134 : {
2135 1244147 : int load_place;
2136 1244147 : if (! load_info)
2137 : {
2138 39926 : if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2139 : load_place = j;
2140 : else
2141 : load_place = 0;
2142 : }
2143 1204221 : else if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2144 701848 : load_place = vect_get_place_in_interleaving_chain
2145 701848 : (load_info, first_stmt_info);
2146 : else
2147 : /* Recognize the splat case as { 0, 0, ... } but make
2148 : sure to use the appropriate refs for collections
2149 : of invariant refs. */
2150 502373 : load_place = (load_info == stmt_info) ? 0 : j;
2151 742015 : gcc_assert (load_place != -1);
2152 1244147 : any_permute |= load_place != j;
2153 1244147 : load_permutation.quick_push (load_place);
2154 : }
2155 :
2156 882601 : if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
2157 : {
2158 3406 : gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD));
2159 3406 : bool has_gaps = false;
2160 3406 : if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2161 209 : for (stmt_vec_info si = DR_GROUP_NEXT_ELEMENT (first_stmt_info);
2162 1346 : si; si = DR_GROUP_NEXT_ELEMENT (si))
2163 1137 : if (DR_GROUP_GAP (si) != 1)
2164 160 : has_gaps = true;
2165 : /* We cannot handle permuted masked loads directly, see
2166 : PR114375. We cannot handle strided masked loads or masked
2167 : loads with gaps unless the mask is uniform. */
2168 3406 : if ((STMT_VINFO_GROUPED_ACCESS (stmt_info)
2169 209 : && (DR_GROUP_GAP (first_stmt_info) != 0
2170 149 : || (has_gaps
2171 55 : && STMT_VINFO_SLP_VECT_ONLY (first_stmt_info))))
2172 6717 : || STMT_VINFO_STRIDED_P (stmt_info))
2173 : {
2174 108 : load_permutation.release ();
2175 108 : matches[0] = false;
2176 879347 : return NULL;
2177 : }
2178 :
2179 : /* For permuted masked loads do an unpermuted masked load of
2180 : the whole group followed by a SLP permute node. */
2181 3298 : if (any_permute
2182 3298 : || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2183 84 : && DR_GROUP_SIZE (first_stmt_info) != group_size))
2184 : {
2185 : /* Discover the whole unpermuted load. */
2186 44 : vec<stmt_vec_info> stmts2;
2187 44 : unsigned dr_group_size = STMT_VINFO_GROUPED_ACCESS (stmt_info)
2188 78 : ? DR_GROUP_SIZE (first_stmt_info) : 1;
2189 44 : stmts2.create (dr_group_size);
2190 44 : stmts2.quick_grow_cleared (dr_group_size);
2191 44 : unsigned i = 0;
2192 44 : for (stmt_vec_info si = first_stmt_info;
2193 594 : si; si = DR_GROUP_NEXT_ELEMENT (si))
2194 : {
2195 550 : if (si != first_stmt_info)
2196 2106 : for (unsigned k = 1; k < DR_GROUP_GAP (si); ++k)
2197 1600 : stmts2[i++] = NULL;
2198 550 : stmts2[i++] = si;
2199 : }
2200 44 : bool *matches2 = XALLOCAVEC (bool, dr_group_size);
2201 44 : slp_tree unperm_load
2202 44 : = vect_build_slp_tree (vinfo, stmts2, dr_group_size,
2203 : &this_max_nunits, matches2, limit,
2204 44 : &this_tree_size, bst_map);
2205 : /* When we are able to do the full masked load emit that
2206 : followed by 'node' being the desired final permutation. */
2207 44 : if (unperm_load)
2208 : {
2209 16 : gcc_assert
2210 : (!SLP_TREE_LOAD_PERMUTATION (unperm_load).exists ());
2211 16 : lane_permutation_t lperm;
2212 16 : lperm.create (group_size);
2213 56 : for (unsigned j = 0; j < load_permutation.length (); ++j)
2214 40 : lperm.quick_push
2215 40 : (std::make_pair (0, load_permutation[j]));
2216 16 : SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2217 16 : SLP_TREE_CHILDREN (node).safe_push (unperm_load);
2218 16 : SLP_TREE_LANE_PERMUTATION (node) = lperm;
2219 16 : load_permutation.release ();
2220 16 : return node;
2221 : }
2222 28 : stmts2.release ();
2223 28 : load_permutation.release ();
2224 28 : matches[0] = false;
2225 28 : return NULL;
2226 : }
2227 3254 : load_permutation.release ();
2228 : }
2229 : else
2230 : {
2231 879195 : if (!any_permute
2232 766424 : && STMT_VINFO_GROUPED_ACCESS (stmt_info)
2233 1169250 : && group_size == DR_GROUP_SIZE (first_stmt_info))
2234 126548 : load_permutation.release ();
2235 879195 : SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
2236 879195 : return node;
2237 : }
2238 : }
2239 : }
2240 3309560 : else if (gimple_assign_single_p (stmt_info->stmt)
2241 2277250 : && !gimple_vuse (stmt_info->stmt)
2242 3317356 : && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
2243 : {
2244 : /* vect_build_slp_tree_2 determined all BIT_FIELD_REFs reference
2245 : the same SSA name vector of a compatible type to vectype. */
2246 2366 : vec<std::pair<unsigned, unsigned> > lperm = vNULL;
2247 2366 : tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
2248 2366 : stmt_vec_info estmt_info;
2249 7440 : FOR_EACH_VEC_ELT (stmts, i, estmt_info)
2250 : {
2251 5221 : gassign *estmt = as_a <gassign *> (estmt_info->stmt);
2252 5221 : tree bfref = gimple_assign_rhs1 (estmt);
2253 5221 : HOST_WIDE_INT lane;
2254 5221 : if (!known_eq (bit_field_size (bfref),
2255 : tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
2256 10295 : || !constant_multiple_p (bit_field_offset (bfref),
2257 5074 : bit_field_size (bfref), &lane))
2258 : {
2259 147 : lperm.release ();
2260 147 : matches[0] = false;
2261 147 : return NULL;
2262 : }
2263 5074 : lperm.safe_push (std::make_pair (0, (unsigned)lane));
2264 : }
2265 2219 : slp_tree vnode = vect_create_new_slp_node (vNULL);
2266 2219 : if (operand_equal_p (TYPE_SIZE (vectype), TYPE_SIZE (TREE_TYPE (vec))))
2267 : /* ??? We record vectype here but we hide eventually necessary
2268 : punning and instead rely on code generation to materialize
2269 : VIEW_CONVERT_EXPRs as necessary. We instead should make
2270 : this explicit somehow. */
2271 703 : SLP_TREE_VECTYPE (vnode) = vectype;
2272 : else
2273 : {
2274 : /* For different size but compatible elements we can still
2275 : use VEC_PERM_EXPR without punning. */
2276 1516 : gcc_assert (VECTOR_TYPE_P (TREE_TYPE (vec))
2277 : && types_compatible_p (TREE_TYPE (vectype),
2278 : TREE_TYPE (TREE_TYPE (vec))));
2279 1516 : SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
2280 : }
2281 2219 : auto nunits = TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (vnode));
2282 2219 : unsigned HOST_WIDE_INT const_nunits;
2283 2219 : if (nunits.is_constant (&const_nunits))
2284 2219 : SLP_TREE_LANES (vnode) = const_nunits;
2285 2219 : SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
2286 : /* We are always building a permutation node even if it is an identity
2287 : permute to shield the rest of the vectorizer from the odd node
2288 : representing an actual vector without any scalar ops.
2289 : ??? We could hide it completely with making the permute node
2290 : external? */
2291 2219 : node = vect_create_new_slp_node (node, stmts, 1);
2292 2219 : SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2293 2219 : SLP_TREE_LANE_PERMUTATION (node) = lperm;
2294 2219 : SLP_TREE_VECTYPE (node) = vectype;
2295 2219 : SLP_TREE_CHILDREN (node).quick_push (vnode);
2296 2219 : return node;
2297 : }
2298 : /* When discovery reaches an associatable operation see whether we can
2299 : improve that to match up lanes in a way superior to the operand
2300 : swapping code which at most looks at two defs.
2301 : ??? For BB vectorization we cannot do the brute-force search
2302 : for matching as we can succeed by means of builds from scalars
2303 : and have no good way to "cost" one build against another. */
2304 3307194 : else if (is_a <loop_vec_info> (vinfo)
2305 : /* Do not bother for single-lane SLP. */
2306 1964804 : && group_size > 1
2307 : /* ??? We don't handle !vect_internal_def defs below. */
2308 111580 : && STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2309 : /* ??? Do not associate a reduction, this will wreck REDUC_IDX
2310 : mapping as long as that exists on the stmt_info level. */
2311 86213 : && STMT_VINFO_REDUC_IDX (stmt_info) == -1
2312 77718 : && is_gimple_assign (stmt_info->stmt)
2313 77404 : && (associative_tree_code (gimple_assign_rhs_code (stmt_info->stmt))
2314 50840 : || gimple_assign_rhs_code (stmt_info->stmt) == MINUS_EXPR)
2315 3335666 : && ((FLOAT_TYPE_P (vectype) && flag_associative_math)
2316 16258 : || (INTEGRAL_TYPE_P (TREE_TYPE (vectype))
2317 13736 : && TYPE_OVERFLOW_WRAPS (TREE_TYPE (vectype)))))
2318 : {
2319 : /* See if we have a chain of (mixed) adds or subtracts or other
2320 : associatable ops. */
2321 21439 : enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
2322 21439 : if (code == MINUS_EXPR)
2323 796 : code = PLUS_EXPR;
2324 21439 : stmt_vec_info other_op_stmt_info = NULL;
2325 21439 : stmt_vec_info op_stmt_info = NULL;
2326 21439 : unsigned chain_len = 0;
2327 21439 : auto_vec<chain_op_t> chain;
2328 21439 : auto_vec<std::pair<tree_code, gimple *> > worklist;
2329 21439 : auto_vec<vec<chain_op_t> > chains (group_size);
2330 21439 : auto_vec<slp_tree, 4> children;
2331 21439 : bool hard_fail = true;
2332 22506 : for (unsigned lane = 0; lane < group_size; ++lane)
2333 : {
2334 22150 : if (!stmts[lane])
2335 : {
2336 : /* ??? Below we require lane zero is present. */
2337 0 : if (lane == 0)
2338 : {
2339 : hard_fail = false;
2340 21083 : break;
2341 : }
2342 0 : chains.quick_push (vNULL);
2343 0 : continue;
2344 : }
2345 : /* For each lane linearize the addition/subtraction (or other
2346 : uniform associatable operation) expression tree. */
2347 22150 : gimple *op_stmt = NULL, *other_op_stmt = NULL;
2348 22150 : vect_slp_linearize_chain (vinfo, worklist, chain, code,
2349 22150 : stmts[lane]->stmt, op_stmt, other_op_stmt,
2350 : NULL);
2351 22150 : if (!op_stmt_info && op_stmt)
2352 20860 : op_stmt_info = vinfo->lookup_stmt (op_stmt);
2353 22150 : if (!other_op_stmt_info && other_op_stmt)
2354 832 : other_op_stmt_info = vinfo->lookup_stmt (other_op_stmt);
2355 22150 : if (chain.length () == 2)
2356 : {
2357 : /* In a chain of just two elements resort to the regular
2358 : operand swapping scheme. Likewise if we run into a
2359 : length mismatch process regularly as well as we did not
2360 : process the other lanes we cannot report a good hint what
2361 : lanes to try swapping in the parent. */
2362 : hard_fail = false;
2363 : break;
2364 : }
2365 1070 : else if (chain_len == 0)
2366 396 : chain_len = chain.length ();
2367 1348 : else if (chain.length () != chain_len)
2368 : {
2369 : /* ??? Here we could slip in magic to compensate with
2370 : neutral operands. */
2371 3 : matches[lane] = false;
2372 3 : if (lane != group_size - 1)
2373 3 : matches[0] = false;
2374 : break;
2375 : }
2376 1067 : chains.quick_push (chain.copy ());
2377 1067 : chain.truncate (0);
2378 : }
2379 42878 : if (chains.length () == group_size)
2380 : {
2381 : /* We cannot yet use SLP_TREE_CODE to communicate the operation. */
2382 356 : if (!op_stmt_info)
2383 : {
2384 3 : hard_fail = false;
2385 3 : goto out;
2386 : }
2387 : /* Now we have a set of chains with the same length. */
2388 : /* 1. pre-sort according to def_type and operation. */
2389 1308 : for (unsigned lane = 0; lane < group_size; ++lane)
2390 1910 : chains[lane].stablesort (dt_sort_cmp, vinfo);
2391 353 : if (dump_enabled_p ())
2392 : {
2393 157 : dump_printf_loc (MSG_NOTE, vect_location,
2394 : "pre-sorted chains of %s\n",
2395 : get_tree_code_name (code));
2396 685 : for (unsigned lane = 0; lane < group_size; ++lane)
2397 : {
2398 528 : if (!stmts[lane])
2399 0 : dump_printf (MSG_NOTE, "--");
2400 : else
2401 2422 : for (unsigned opnum = 0; opnum < chain_len; ++opnum)
2402 3788 : dump_printf (MSG_NOTE, "%s %T ",
2403 1894 : get_tree_code_name (chains[lane][opnum].code),
2404 1894 : chains[lane][opnum].op);
2405 528 : dump_printf (MSG_NOTE, "\n");
2406 : }
2407 : }
2408 : /* 2. try to build children nodes, associating as necessary. */
2409 : /* 2a. prepare and perform early checks to avoid eating into
2410 : discovery limit unnecessarily. */
2411 353 : vect_def_type *dts = XALLOCAVEC (vect_def_type, chain_len);
2412 1487 : for (unsigned n = 0; n < chain_len; ++n)
2413 : {
2414 1134 : vect_def_type dt = chains[0][n].dt;
2415 1134 : unsigned lane;
2416 4357 : for (lane = 0; lane < group_size; ++lane)
2417 6446 : if (stmts[lane] && chains[lane][n].dt != dt)
2418 : {
2419 0 : if (dt == vect_constant_def
2420 0 : && chains[lane][n].dt == vect_external_def)
2421 : dt = vect_external_def;
2422 0 : else if (dt == vect_external_def
2423 0 : && chains[lane][n].dt == vect_constant_def)
2424 : ;
2425 : else
2426 : break;
2427 : }
2428 1134 : if (lane != group_size)
2429 : {
2430 0 : if (dump_enabled_p ())
2431 0 : dump_printf_loc (MSG_NOTE, vect_location,
2432 : "giving up on chain due to mismatched "
2433 : "def types\n");
2434 0 : matches[lane] = false;
2435 0 : if (lane != group_size - 1)
2436 0 : matches[0] = false;
2437 0 : goto out;
2438 : }
2439 1134 : dts[n] = dt;
2440 1134 : if (dt == vect_constant_def
2441 1134 : || dt == vect_external_def)
2442 : {
2443 : /* Check whether we can build the invariant. If we can't
2444 : we never will be able to. */
2445 93 : tree type = TREE_TYPE (chains[0][n].op);
2446 1134 : if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
2447 : && (TREE_CODE (type) == BOOLEAN_TYPE
2448 : || !can_duplicate_and_interleave_p (vinfo, group_size,
2449 : type)))
2450 : {
2451 : matches[0] = false;
2452 : goto out;
2453 : }
2454 : }
2455 1041 : else if (dt != vect_internal_def)
2456 : {
2457 : /* Not sure, we might need sth special.
2458 : gcc.dg/vect/pr96854.c,
2459 : gfortran.dg/vect/fast-math-pr37021.f90
2460 : and gfortran.dg/vect/pr61171.f trigger. */
2461 : /* Soft-fail for now. */
2462 0 : hard_fail = false;
2463 0 : goto out;
2464 : }
2465 : }
2466 : /* 2b. do the actual build. */
2467 1429 : for (unsigned n = 0; n < chain_len; ++n)
2468 : {
2469 1096 : vect_def_type dt = dts[n];
2470 1096 : unsigned lane;
2471 1096 : if (dt == vect_constant_def
2472 1096 : || dt == vect_external_def)
2473 : {
2474 93 : vec<tree> ops;
2475 93 : ops.create (group_size);
2476 461 : for (lane = 0; lane < group_size; ++lane)
2477 275 : if (stmts[lane])
2478 275 : ops.quick_push (chains[lane][n].op);
2479 : else
2480 0 : ops.quick_push (NULL_TREE);
2481 93 : slp_tree child = vect_create_new_slp_node (ops);
2482 93 : SLP_TREE_DEF_TYPE (child) = dt;
2483 93 : children.safe_push (child);
2484 : }
2485 : else
2486 : {
2487 1003 : vec<stmt_vec_info> op_stmts;
2488 1003 : op_stmts.create (group_size);
2489 1003 : slp_tree child = NULL;
2490 : /* Brute-force our way. We have to consider a lane
2491 : failing after fixing an earlier fail up in the
2492 : SLP discovery recursion. So track the current
2493 : permute per lane. */
2494 1003 : unsigned *perms = XALLOCAVEC (unsigned, group_size);
2495 1003 : memset (perms, 0, sizeof (unsigned) * group_size);
2496 1097 : do
2497 : {
2498 1097 : op_stmts.truncate (0);
2499 5320 : for (lane = 0; lane < group_size; ++lane)
2500 3126 : if (stmts[lane])
2501 3126 : op_stmts.quick_push
2502 3126 : (vinfo->lookup_def (chains[lane][n].op));
2503 : else
2504 0 : op_stmts.quick_push (NULL);
2505 1097 : child = vect_build_slp_tree (vinfo, op_stmts,
2506 : group_size, &this_max_nunits,
2507 : matches, limit,
2508 : &this_tree_size, bst_map);
2509 : /* ??? We're likely getting too many fatal mismatches
2510 : here so maybe we want to ignore them (but then we
2511 : have no idea which lanes fatally mismatched). */
2512 1097 : if (child || !matches[0])
2513 : break;
2514 : /* Swap another lane we have not yet matched up into
2515 : lanes that did not match. If we run out of
2516 : permute possibilities for a lane terminate the
2517 : search. */
2518 287 : bool term = false;
2519 287 : for (lane = 1; lane < group_size; ++lane)
2520 193 : if (!matches[lane])
2521 : {
2522 165 : if (n + perms[lane] + 1 == chain_len)
2523 : {
2524 : term = true;
2525 : break;
2526 : }
2527 146 : if (dump_enabled_p ())
2528 113 : dump_printf_loc (MSG_NOTE, vect_location,
2529 : "swapping operand %d and %d "
2530 : "of lane %d\n",
2531 : n, n + perms[lane] + 1, lane);
2532 292 : std::swap (chains[lane][n],
2533 146 : chains[lane][n + perms[lane] + 1]);
2534 146 : perms[lane]++;
2535 : }
2536 113 : if (term)
2537 : break;
2538 : }
2539 : while (1);
2540 1003 : if (!child)
2541 : {
2542 20 : if (dump_enabled_p ())
2543 18 : dump_printf_loc (MSG_NOTE, vect_location,
2544 : "failed to match up op %d\n", n);
2545 20 : op_stmts.release ();
2546 20 : if (lane != group_size - 1)
2547 10 : matches[0] = false;
2548 : else
2549 10 : matches[lane] = false;
2550 20 : goto out;
2551 : }
2552 983 : if (dump_enabled_p ())
2553 : {
2554 421 : dump_printf_loc (MSG_NOTE, vect_location,
2555 : "matched up op %d to\n", n);
2556 421 : vect_print_slp_tree (MSG_NOTE, vect_location, child);
2557 : }
2558 983 : children.safe_push (child);
2559 : }
2560 : }
2561 : /* 3. build SLP nodes to combine the chain. */
2562 1213 : for (unsigned lane = 0; lane < group_size; ++lane)
2563 1772 : if (stmts[lane] && chains[lane][0].code != code)
2564 : {
2565 : /* See if there's any alternate all-PLUS entry. */
2566 : unsigned n;
2567 6 : for (n = 1; n < chain_len; ++n)
2568 : {
2569 30 : for (lane = 0; lane < group_size; ++lane)
2570 48 : if (stmts[lane] && chains[lane][n].code != code)
2571 : break;
2572 6 : if (lane == group_size)
2573 : break;
2574 : }
2575 6 : if (n != chain_len)
2576 : {
2577 : /* Swap that in at first position. */
2578 6 : std::swap (children[0], children[n]);
2579 30 : for (lane = 0; lane < group_size; ++lane)
2580 24 : if (stmts[lane])
2581 24 : std::swap (chains[lane][0], chains[lane][n]);
2582 : }
2583 : else
2584 : {
2585 : /* ??? When this triggers and we end up with two
2586 : vect_constant/external_def up-front things break (ICE)
2587 : spectacularly finding an insertion place for the
2588 : all-constant op. We should have a fully
2589 : vect_internal_def operand though(?) so we can swap
2590 : that into first place and then prepend the all-zero
2591 : constant. */
2592 0 : if (dump_enabled_p ())
2593 0 : dump_printf_loc (MSG_NOTE, vect_location,
2594 : "inserting constant zero to compensate "
2595 : "for (partially) negated first "
2596 : "operand\n");
2597 0 : chain_len++;
2598 0 : for (lane = 0; lane < group_size; ++lane)
2599 0 : if (stmts[lane])
2600 0 : chains[lane].safe_insert
2601 0 : (0, chain_op_t (code, vect_constant_def, NULL_TREE));
2602 0 : vec<tree> zero_ops;
2603 0 : zero_ops.create (group_size);
2604 0 : zero_ops.quick_push (build_zero_cst (TREE_TYPE (vectype)));
2605 0 : for (lane = 1; lane < group_size; ++lane)
2606 0 : if (stmts[lane])
2607 0 : zero_ops.quick_push (zero_ops[0]);
2608 : else
2609 0 : zero_ops.quick_push (NULL_TREE);
2610 0 : slp_tree zero = vect_create_new_slp_node (zero_ops);
2611 0 : SLP_TREE_DEF_TYPE (zero) = vect_constant_def;
2612 0 : children.safe_insert (0, zero);
2613 : }
2614 : break;
2615 : }
2616 1071 : for (unsigned i = 1; i < children.length (); ++i)
2617 : {
2618 738 : slp_tree op0 = children[i - 1];
2619 738 : slp_tree op1 = children[i];
2620 738 : bool this_two_op = false;
2621 2660 : for (unsigned lane = 0; lane < group_size; ++lane)
2622 4200 : if (stmts[lane] && chains[lane][i].code != chains[0][i].code)
2623 : {
2624 : this_two_op = true;
2625 : break;
2626 : }
2627 738 : slp_tree child;
2628 738 : if (i == children.length () - 1)
2629 333 : child = vect_create_new_slp_node (node, stmts, 2);
2630 : else
2631 405 : child = vect_create_new_slp_node (2, ERROR_MARK);
2632 738 : if (this_two_op)
2633 : {
2634 178 : vec<std::pair<unsigned, unsigned> > lperm;
2635 178 : lperm.create (group_size);
2636 630 : for (unsigned lane = 0; lane < group_size; ++lane)
2637 904 : lperm.quick_push (std::make_pair
2638 452 : (chains[lane][i].code != chains[0][i].code, lane));
2639 356 : vect_slp_build_two_operator_nodes (child, vectype, op0, op1,
2640 178 : (chains[0][i].code == code
2641 : ? op_stmt_info
2642 : : other_op_stmt_info),
2643 178 : (chains[0][i].code == code
2644 : ? other_op_stmt_info
2645 : : op_stmt_info),
2646 : lperm);
2647 : }
2648 : else
2649 : {
2650 560 : SLP_TREE_DEF_TYPE (child) = vect_internal_def;
2651 560 : SLP_TREE_VECTYPE (child) = vectype;
2652 560 : SLP_TREE_LANES (child) = group_size;
2653 560 : SLP_TREE_CHILDREN (child).quick_push (op0);
2654 560 : SLP_TREE_CHILDREN (child).quick_push (op1);
2655 560 : SLP_TREE_REPRESENTATIVE (child)
2656 1120 : = (chains[0][i].code == code
2657 560 : ? op_stmt_info : other_op_stmt_info);
2658 : }
2659 738 : children[i] = child;
2660 : }
2661 333 : *tree_size += this_tree_size + 1;
2662 333 : *max_nunits = this_max_nunits;
2663 1593 : while (!chains.is_empty ())
2664 904 : chains.pop ().release ();
2665 : return node;
2666 : }
2667 21083 : out:
2668 21106 : if (dump_enabled_p ())
2669 2809 : dump_printf_loc (MSG_NOTE, vect_location,
2670 : "failed to line up SLP graph by re-associating "
2671 : "operations in lanes%s\n",
2672 : !hard_fail ? " trying regular discovery" : "");
2673 21111 : while (!children.is_empty ())
2674 5 : vect_free_slp_tree (children.pop ());
2675 21269 : while (!chains.is_empty ())
2676 163 : chains.pop ().release ();
2677 : /* Hard-fail, otherwise we might run into quadratic processing of the
2678 : chains starting one stmt into the chain again. */
2679 21106 : if (hard_fail)
2680 : return NULL;
2681 : /* Fall thru to normal processing. */
2682 21439 : }
2683 :
2684 : /* Get at the operands, verifying they are compatible. */
2685 3328692 : vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
2686 3328692 : slp_oprnd_info oprnd_info;
2687 16036055 : FOR_EACH_VEC_ELT (stmts, i, stmt_info)
2688 : {
2689 25417152 : int res = vect_get_and_check_slp_defs (vinfo, vectype,
2690 12708576 : swap[i], skip_args,
2691 : stmts, i, &oprnds_info);
2692 12708576 : if (res != 0)
2693 543063 : matches[(res == -1) ? 0 : i] = false;
2694 12708576 : if (!matches[0])
2695 : break;
2696 : }
2697 15725209 : for (i = 0; i < group_size; ++i)
2698 12609021 : if (!matches[i])
2699 : {
2700 212504 : vect_free_oprnd_info (oprnds_info);
2701 212504 : return NULL;
2702 : }
2703 9348564 : swap = NULL;
2704 :
2705 9348564 : bool has_two_operators_perm = false;
2706 18697128 : auto_vec<unsigned> two_op_perm_indices[2];
2707 3116188 : vec<stmt_vec_info> two_op_scalar_stmts[2] = {vNULL, vNULL};
2708 :
2709 3130423 : if (two_operators && oprnds_info.length () == 2 && group_size > 2)
2710 : {
2711 3867 : unsigned idx = 0;
2712 3867 : hash_map<gimple *, unsigned> seen;
2713 3867 : vec<slp_oprnd_info> new_oprnds_info
2714 3867 : = vect_create_oprnd_info (1, group_size);
2715 3867 : bool success = true;
2716 :
2717 3867 : enum tree_code code = ERROR_MARK;
2718 3867 : if (oprnds_info[0]->def_stmts[0]
2719 3867 : && is_a<gassign *> (oprnds_info[0]->def_stmts[0]->stmt))
2720 3809 : code = gimple_assign_rhs_code (oprnds_info[0]->def_stmts[0]->stmt);
2721 3867 : basic_block bb = nullptr;
2722 :
2723 7470 : for (unsigned j = 0; j < group_size; ++j)
2724 : {
2725 17480 : FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2726 : {
2727 13877 : stmt_vec_info stmt_info = oprnd_info->def_stmts[j];
2728 13877 : if (!stmt_info
2729 13654 : || !is_a<gassign *> (stmt_info->stmt)
2730 13651 : || gimple_assign_rhs_code (stmt_info->stmt) != code
2731 24350 : || skip_args[i])
2732 : {
2733 : success = false;
2734 3408 : break;
2735 : }
2736 : /* Avoid mixing lanes with defs in different basic-blocks. */
2737 10473 : if (!bb)
2738 3985 : bb = gimple_bb (vect_orig_stmt (stmt_info)->stmt);
2739 8252 : else if (gimple_bb (vect_orig_stmt (stmt_info)->stmt) != bb)
2740 : {
2741 : success = false;
2742 : break;
2743 : }
2744 :
2745 10469 : bool exists;
2746 10469 : unsigned &stmt_idx
2747 10469 : = seen.get_or_insert (stmt_info->stmt, &exists);
2748 :
2749 10469 : if (!exists)
2750 : {
2751 9128 : new_oprnds_info[0]->def_stmts.safe_push (stmt_info);
2752 9128 : new_oprnds_info[0]->ops.safe_push (oprnd_info->ops[j]);
2753 9128 : stmt_idx = idx;
2754 9128 : idx++;
2755 : }
2756 :
2757 10469 : two_op_perm_indices[i].safe_push (stmt_idx);
2758 : }
2759 :
2760 7011 : if (!success)
2761 : break;
2762 : }
2763 :
2764 3867 : if (success && idx == group_size)
2765 : {
2766 94 : if (dump_enabled_p ())
2767 : {
2768 0 : dump_printf_loc (MSG_NOTE, vect_location,
2769 : "Replace two_operators operands:\n");
2770 :
2771 0 : FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2772 : {
2773 0 : dump_printf_loc (MSG_NOTE, vect_location,
2774 : "Operand %u:\n", i);
2775 0 : for (unsigned j = 0; j < group_size; j++)
2776 0 : dump_printf_loc (MSG_NOTE, vect_location, "\tstmt %u %G",
2777 0 : j, oprnd_info->def_stmts[j]->stmt);
2778 : }
2779 :
2780 0 : dump_printf_loc (MSG_NOTE, vect_location,
2781 : "With a single operand:\n");
2782 0 : for (unsigned j = 0; j < group_size; j++)
2783 0 : dump_printf_loc (MSG_NOTE, vect_location, "\tstmt %u %G",
2784 0 : j, new_oprnds_info[0]->def_stmts[j]->stmt);
2785 : }
2786 :
2787 94 : two_op_scalar_stmts[0].safe_splice (oprnds_info[0]->def_stmts);
2788 94 : two_op_scalar_stmts[1].safe_splice (oprnds_info[1]->def_stmts);
2789 :
2790 94 : new_oprnds_info[0]->first_op_type = oprnds_info[0]->first_op_type;
2791 94 : new_oprnds_info[0]->first_dt = oprnds_info[0]->first_dt;
2792 94 : new_oprnds_info[0]->any_pattern = oprnds_info[0]->any_pattern;
2793 94 : new_oprnds_info[0]->first_gs_p = oprnds_info[0]->first_gs_p;
2794 94 : new_oprnds_info[0]->first_gs_info = oprnds_info[0]->first_gs_info;
2795 :
2796 94 : vect_free_oprnd_info (oprnds_info);
2797 94 : oprnds_info = new_oprnds_info;
2798 94 : nops = 1;
2799 94 : has_two_operators_perm = true;
2800 : }
2801 : else
2802 3773 : vect_free_oprnd_info (new_oprnds_info);
2803 3867 : }
2804 :
2805 6232376 : auto_vec<slp_tree, 4> children;
2806 :
2807 3116188 : stmt_info = stmts[0];
2808 :
2809 3116188 : int reduc_idx = -1;
2810 3116188 : int gs_scale = 0;
2811 3116188 : tree gs_base = NULL_TREE;
2812 :
2813 : /* Create SLP_TREE nodes for the definition node/s. */
2814 7975929 : FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2815 : {
2816 4975482 : slp_tree child = nullptr;
2817 4975482 : unsigned int j;
2818 :
2819 : /* We're skipping certain operands from processing, for example
2820 : outer loop reduction initial defs. */
2821 4975482 : if (skip_args[i])
2822 : {
2823 483280 : children.safe_push (NULL);
2824 5343021 : continue;
2825 : }
2826 :
2827 4492202 : if (oprnd_info->first_dt == vect_uninitialized_def)
2828 : {
2829 : /* COND_EXPR have one too many eventually if the condition
2830 : is a SSA name. */
2831 0 : gcc_assert (i == 3 && nops == 4);
2832 0 : continue;
2833 : }
2834 :
2835 4492202 : if (oprnd_info->first_gs_p)
2836 : {
2837 22453 : gs_scale = oprnd_info->first_gs_info.scale;
2838 22453 : gs_base = oprnd_info->first_gs_info.base;
2839 : }
2840 :
2841 4492202 : if (is_a <bb_vec_info> (vinfo)
2842 1578681 : && oprnd_info->first_dt == vect_internal_def
2843 5314452 : && !oprnd_info->any_pattern)
2844 : {
2845 : /* For BB vectorization, if all defs are the same do not
2846 : bother to continue the build along the single-lane
2847 : graph but use a splat of the scalar value. */
2848 778251 : stmt_vec_info first_def = oprnd_info->def_stmts[0];
2849 834862 : for (j = 1; j < group_size; ++j)
2850 794073 : if (oprnd_info->def_stmts[j] != first_def)
2851 : break;
2852 778251 : if (j == group_size
2853 : /* But avoid doing this for loads where we may be
2854 : able to CSE things, unless the stmt is not
2855 : vectorizable. */
2856 778251 : && (!STMT_VINFO_VECTORIZABLE (first_def)
2857 50111 : || !gimple_vuse (first_def->stmt)))
2858 : {
2859 31723 : if (dump_enabled_p ())
2860 105 : dump_printf_loc (MSG_NOTE, vect_location,
2861 : "Using a splat of the uniform operand %G",
2862 : first_def->stmt);
2863 31723 : oprnd_info->first_dt = vect_external_def;
2864 : }
2865 : }
2866 :
2867 4492202 : if (oprnd_info->first_dt == vect_external_def
2868 4492202 : || oprnd_info->first_dt == vect_constant_def)
2869 : {
2870 1472226 : if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ())
2871 : {
2872 : tree op0;
2873 : tree uniform_val = op0 = oprnd_info->ops[0];
2874 : for (j = 1; j < oprnd_info->ops.length (); ++j)
2875 : if (oprnd_info->ops[j]
2876 : && !operand_equal_p (uniform_val, oprnd_info->ops[j]))
2877 : {
2878 : uniform_val = NULL_TREE;
2879 : break;
2880 : }
2881 : if (!uniform_val
2882 : && !can_duplicate_and_interleave_p (vinfo,
2883 : oprnd_info->ops.length (),
2884 : TREE_TYPE (op0)))
2885 : {
2886 : matches[j] = false;
2887 : if (dump_enabled_p ())
2888 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2889 : "Build SLP failed: invalid type of def "
2890 : "for variable-length SLP %T\n", op0);
2891 : goto fail;
2892 : }
2893 : }
2894 1472226 : slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
2895 1472226 : SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
2896 1472226 : oprnd_info->ops = vNULL;
2897 1472226 : children.safe_push (invnode);
2898 1472226 : continue;
2899 1472226 : }
2900 :
2901 : /* See which SLP operand a reduction chain continues on. We want
2902 : to chain even PHIs but not backedges. */
2903 3019976 : if (STMT_VINFO_REDUC_DEF (oprnd_info->def_stmts[0])
2904 3019976 : || STMT_VINFO_REDUC_IDX (oprnd_info->def_stmts[0]) != -1)
2905 : {
2906 232679 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
2907 : {
2908 756 : if (oprnd_info->first_dt == vect_double_reduction_def)
2909 378 : reduc_idx = i;
2910 : }
2911 231923 : else if (is_a <gphi *> (stmt_info->stmt)
2912 231923 : && gimple_phi_num_args
2913 99465 : (as_a <gphi *> (stmt_info->stmt)) != 1)
2914 : ;
2915 132841 : else if (STMT_VINFO_REDUC_IDX (stmt_info) == -1
2916 383 : && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2917 : ;
2918 132841 : else if (reduc_idx == -1)
2919 124423 : reduc_idx = i;
2920 : else
2921 : /* For .COND_* reduction operations the else value can be the
2922 : same as one of the operation operands. The other def
2923 : stmts have been moved, so we can't check easily. Check
2924 : it's a call at least. */
2925 8418 : gcc_assert (is_a <gcall *> (stmt_info->stmt));
2926 : }
2927 :
2928 : /* When we have a masked load with uniform mask discover this
2929 : as a single-lane mask with a splat permute. This way we can
2930 : recognize this as a masked load-lane by stripping the splat. */
2931 3019976 : if (is_a <gcall *> (STMT_VINFO_STMT (stmt_info))
2932 57460 : && gimple_call_internal_p (STMT_VINFO_STMT (stmt_info),
2933 : IFN_MASK_LOAD)
2934 6075 : && STMT_VINFO_GROUPED_ACCESS (stmt_info)
2935 3020053 : && ! STMT_VINFO_SLP_VECT_ONLY (DR_GROUP_FIRST_ELEMENT (stmt_info)))
2936 : {
2937 35 : vec<stmt_vec_info> def_stmts2;
2938 35 : def_stmts2.create (1);
2939 35 : def_stmts2.quick_push (oprnd_info->def_stmts[0]);
2940 35 : child = vect_build_slp_tree (vinfo, def_stmts2, 1,
2941 : &this_max_nunits,
2942 : matches, limit,
2943 : &this_tree_size, bst_map);
2944 35 : if (child)
2945 : {
2946 35 : slp_tree pnode = vect_create_new_slp_node (1, VEC_PERM_EXPR);
2947 35 : SLP_TREE_VECTYPE (pnode) = SLP_TREE_VECTYPE (child);
2948 35 : SLP_TREE_LANES (pnode) = group_size;
2949 35 : SLP_TREE_SCALAR_STMTS (pnode).create (group_size);
2950 35 : SLP_TREE_LANE_PERMUTATION (pnode).create (group_size);
2951 210 : for (unsigned k = 0; k < group_size; ++k)
2952 : {
2953 175 : SLP_TREE_SCALAR_STMTS (pnode)
2954 175 : .quick_push (oprnd_info->def_stmts[0]);
2955 175 : SLP_TREE_LANE_PERMUTATION (pnode)
2956 175 : .quick_push (std::make_pair (0u, 0u));
2957 : }
2958 35 : SLP_TREE_CHILDREN (pnode).quick_push (child);
2959 35 : pnode->max_nunits = child->max_nunits;
2960 35 : children.safe_push (pnode);
2961 35 : oprnd_info->def_stmts = vNULL;
2962 35 : continue;
2963 35 : }
2964 : else
2965 0 : def_stmts2.release ();
2966 : }
2967 :
2968 3019941 : if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2969 : group_size, &this_max_nunits,
2970 : matches, limit,
2971 : &this_tree_size, bst_map)) != NULL)
2972 : {
2973 2534219 : oprnd_info->def_stmts = vNULL;
2974 2534219 : children.safe_push (child);
2975 2534219 : continue;
2976 : }
2977 :
2978 : /* If the SLP build for operand zero failed and operand zero
2979 : and one can be commutated try that for the scalar stmts
2980 : that failed the match. */
2981 485722 : if (i == 0
2982 : /* A first scalar stmt mismatch signals a fatal mismatch. */
2983 382900 : && matches[0]
2984 : /* ??? For COND_EXPRs we can swap the comparison operands
2985 : as well as the arms under some constraints. */
2986 180923 : && (nops == 2 || nops == 3)
2987 109657 : && oprnds_info[1]->first_dt == vect_internal_def
2988 59997 : && (is_gimple_assign (stmt_info->stmt)
2989 11660 : || is_gimple_call (stmt_info->stmt))
2990 : /* Swapping operands for reductions breaks assumptions later on. */
2991 534072 : && STMT_VINFO_REDUC_IDX (stmt_info) == -1)
2992 : {
2993 : /* See whether we can swap the matching or the non-matching
2994 : stmt operands. */
2995 : bool swap_not_matching = true;
2996 52685 : do
2997 : {
2998 7059348 : for (j = 0; j < group_size; ++j)
2999 : {
3000 7021490 : if (matches[j] != !swap_not_matching)
3001 71026 : continue;
3002 6950464 : stmt_vec_info stmt_info = stmts[j];
3003 : /* Verify if we can swap operands of this stmt. */
3004 6950464 : if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
3005 : {
3006 6950438 : tree_code code = gimple_assign_rhs_code (stmt);
3007 6950438 : if (! commutative_tree_code (code)
3008 6950438 : && ! commutative_ternary_tree_code (code))
3009 : {
3010 14803 : if (!swap_not_matching)
3011 6879 : goto fail;
3012 : swap_not_matching = false;
3013 : break;
3014 : }
3015 : }
3016 7006689 : else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
3017 : {
3018 26 : internal_fn fn = (gimple_call_internal_p (call)
3019 26 : ? gimple_call_internal_fn (call)
3020 : : IFN_LAST);
3021 26 : if ((! commutative_binary_fn_p (fn)
3022 26 : && ! commutative_ternary_fn_p (fn))
3023 28 : || first_commutative_argument (fn) != 0)
3024 : {
3025 24 : if (!swap_not_matching)
3026 12 : goto fail;
3027 : swap_not_matching = false;
3028 : break;
3029 : }
3030 : }
3031 : }
3032 : }
3033 45794 : while (j != group_size);
3034 :
3035 : /* Swap mismatched definition stmts. */
3036 37858 : if (dump_enabled_p ())
3037 351 : dump_printf_loc (MSG_NOTE, vect_location,
3038 : "Re-trying with swapped operands of stmts ");
3039 7035720 : for (j = 0; j < group_size; ++j)
3040 6997862 : if (matches[j] == !swap_not_matching)
3041 : {
3042 13870906 : std::swap (oprnds_info[0]->def_stmts[j],
3043 6935453 : oprnds_info[1]->def_stmts[j]);
3044 13870906 : std::swap (oprnds_info[0]->ops[j],
3045 6935453 : oprnds_info[1]->ops[j]);
3046 6935453 : if (dump_enabled_p ())
3047 956 : dump_printf (MSG_NOTE, "%d ", j);
3048 : }
3049 37858 : if (dump_enabled_p ())
3050 351 : dump_printf (MSG_NOTE, "\n");
3051 : /* After swapping some operands we lost track whether an
3052 : operand has any pattern defs so be conservative here. */
3053 72435 : if (oprnds_info[0]->any_pattern || oprnds_info[1]->any_pattern)
3054 3340 : oprnds_info[0]->any_pattern = oprnds_info[1]->any_pattern = true;
3055 : /* And try again with scratch 'matches' ... */
3056 37858 : bool *tem = XALLOCAVEC (bool, group_size);
3057 37858 : if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
3058 : group_size, &this_max_nunits,
3059 : tem, limit,
3060 : &this_tree_size, bst_map)) != NULL)
3061 : {
3062 6559 : oprnd_info->def_stmts = vNULL;
3063 6559 : children.safe_push (child);
3064 6559 : continue;
3065 : }
3066 : }
3067 479163 : fail:
3068 :
3069 : /* If the SLP build failed and we analyze a basic-block
3070 : simply treat nodes we fail to build as externally defined
3071 : (and thus build vectors from the scalar defs).
3072 : The cost model will reject outright expensive cases.
3073 : ??? This doesn't treat cases where permutation ultimatively
3074 : fails (or we don't try permutation below). Ideally we'd
3075 : even compute a permutation that will end up with the maximum
3076 : SLP tree size... */
3077 479163 : if (is_a <bb_vec_info> (vinfo)
3078 : /* ??? Rejecting patterns this way doesn't work. We'd have to
3079 : do extra work to cancel the pattern so the uses see the
3080 : scalar version. */
3081 398657 : && !is_pattern_stmt_p (stmt_info)
3082 853267 : && !oprnd_info->any_pattern)
3083 : {
3084 : /* But if there's a leading vector sized set of matching stmts
3085 : fail here so we can split the group. This matches the condition
3086 : vect_analyze_slp_instance uses. */
3087 : /* ??? We might want to split here and combine the results to support
3088 : multiple vector sizes better. */
3089 586838 : for (j = 0; j < group_size; ++j)
3090 586838 : if (!matches[j])
3091 : break;
3092 373841 : if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype))
3093 373812 : && vect_slp_can_convert_to_external (oprnd_info->def_stmts))
3094 : {
3095 363422 : if (dump_enabled_p ())
3096 555 : dump_printf_loc (MSG_NOTE, vect_location,
3097 : "Building vector operands from scalars\n");
3098 363422 : this_tree_size++;
3099 363422 : child = vect_create_new_slp_node (oprnd_info->ops);
3100 363422 : children.safe_push (child);
3101 363422 : oprnd_info->ops = vNULL;
3102 363422 : continue;
3103 : }
3104 : }
3105 :
3106 115741 : gcc_assert (child == NULL);
3107 131839 : FOR_EACH_VEC_ELT (children, j, child)
3108 16098 : if (child)
3109 16098 : vect_free_slp_tree (child);
3110 115741 : vect_free_oprnd_info (oprnds_info);
3111 115741 : return NULL;
3112 : }
3113 :
3114 3000447 : vect_free_oprnd_info (oprnds_info);
3115 :
3116 : /* If we have all children of a child built up from uniform scalars
3117 : or does more than one possibly expensive vector construction then
3118 : just throw that away, causing it built up from scalars.
3119 : The exception is the SLP node for the vector store. */
3120 3000447 : if (is_a <bb_vec_info> (vinfo)
3121 1100397 : && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
3122 : /* ??? Rejecting patterns this way doesn't work. We'd have to
3123 : do extra work to cancel the pattern so the uses see the
3124 : scalar version. */
3125 3437427 : && !is_pattern_stmt_p (stmt_info))
3126 : {
3127 : slp_tree child;
3128 : unsigned j;
3129 : bool all_uniform_p = true;
3130 : unsigned n_vector_builds = 0;
3131 1240335 : FOR_EACH_VEC_ELT (children, j, child)
3132 : {
3133 829304 : if (!child)
3134 : ;
3135 829304 : else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3136 : all_uniform_p = false;
3137 590842 : else if (!vect_slp_tree_uniform_p (child))
3138 : {
3139 449490 : all_uniform_p = false;
3140 449490 : if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
3141 415060 : n_vector_builds++;
3142 : }
3143 : }
3144 411031 : if (all_uniform_p
3145 411031 : || n_vector_builds > 1
3146 698479 : || (n_vector_builds == children.length ()
3147 30307 : && is_a <gphi *> (stmt_info->stmt)))
3148 : {
3149 : /* Roll back. */
3150 128410 : matches[0] = false;
3151 407877 : FOR_EACH_VEC_ELT (children, j, child)
3152 279467 : if (child)
3153 279467 : vect_free_slp_tree (child);
3154 :
3155 128410 : if (dump_enabled_p ())
3156 177 : dump_printf_loc (MSG_NOTE, vect_location,
3157 : "Building parent vector operands from "
3158 : "scalars instead\n");
3159 128410 : return NULL;
3160 : }
3161 : }
3162 :
3163 2872037 : *tree_size += this_tree_size + 1;
3164 2872037 : *max_nunits = this_max_nunits;
3165 :
3166 2872037 : if (two_operators)
3167 : {
3168 : /* ??? We'd likely want to either cache in bst_map sth like
3169 : { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
3170 : the true { a+b, a+b, a+b, a+b } ... but there we don't have
3171 : explicit stmts to put in so the keying on 'stmts' doesn't
3172 : work (but we have the same issue with nodes that use 'ops'). */
3173 :
3174 6841 : if (has_two_operators_perm)
3175 : {
3176 40 : slp_tree child = children[0];
3177 40 : children.truncate (0);
3178 120 : for (i = 0; i < 2; i++)
3179 : {
3180 80 : slp_tree pnode
3181 80 : = vect_create_new_slp_node (two_op_scalar_stmts[i], 2);
3182 80 : SLP_TREE_CODE (pnode) = VEC_PERM_EXPR;
3183 80 : SLP_TREE_VECTYPE (pnode) = vectype;
3184 80 : SLP_TREE_CHILDREN (pnode).quick_push (child);
3185 80 : SLP_TREE_CHILDREN (pnode).quick_push (child);
3186 80 : lane_permutation_t& perm = SLP_TREE_LANE_PERMUTATION (pnode);
3187 80 : children.safe_push (pnode);
3188 :
3189 656 : for (unsigned j = 0; j < stmts.length (); j++)
3190 576 : perm.safe_push (std::make_pair (0, two_op_perm_indices[i][j]));
3191 : }
3192 :
3193 40 : SLP_TREE_REF_COUNT (child) += 4;
3194 : }
3195 :
3196 6841 : slp_tree one = new _slp_tree;
3197 6841 : slp_tree two = new _slp_tree;
3198 6841 : SLP_TREE_DEF_TYPE (one) = vect_internal_def;
3199 6841 : SLP_TREE_DEF_TYPE (two) = vect_internal_def;
3200 6841 : SLP_TREE_VECTYPE (one) = vectype;
3201 6841 : SLP_TREE_VECTYPE (two) = vectype;
3202 6841 : SLP_TREE_CHILDREN (one).safe_splice (children);
3203 6841 : SLP_TREE_CHILDREN (two).safe_splice (children);
3204 6841 : slp_tree child;
3205 27366 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
3206 13684 : SLP_TREE_REF_COUNT (child)++;
3207 :
3208 : /* Here we record the original defs since this
3209 : node represents the final lane configuration. */
3210 6841 : node = vect_create_new_slp_node (node, stmts, 2);
3211 6841 : SLP_TREE_VECTYPE (node) = vectype;
3212 6841 : SLP_TREE_CODE (node) = VEC_PERM_EXPR;
3213 6841 : SLP_TREE_CHILDREN (node).quick_push (one);
3214 6841 : SLP_TREE_CHILDREN (node).quick_push (two);
3215 6841 : enum tree_code code0 = ERROR_MARK;
3216 6841 : enum tree_code ocode = ERROR_MARK;
3217 6841 : if (gassign *stmt = dyn_cast <gassign *> (stmts[0]->stmt))
3218 6839 : code0 = gimple_assign_rhs_code (stmt);
3219 6841 : stmt_vec_info ostmt_info;
3220 6841 : unsigned j = 0;
3221 24996 : FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
3222 : {
3223 18155 : int op = 0;
3224 18155 : if (gassign *ostmt = dyn_cast <gassign *> (ostmt_info->stmt))
3225 : {
3226 18151 : if (gimple_assign_rhs_code (ostmt) != code0)
3227 : {
3228 9110 : ocode = gimple_assign_rhs_code (ostmt);
3229 : op = 1;
3230 : j = i;
3231 : }
3232 : }
3233 : else
3234 : {
3235 8 : if (gimple_call_combined_fn (stmts[0]->stmt)
3236 4 : != gimple_call_combined_fn (ostmt_info->stmt))
3237 : {
3238 2 : op = 1;
3239 2 : j = i;
3240 : }
3241 : }
3242 18155 : SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (op, i));
3243 : }
3244 6841 : SLP_TREE_CODE (one) = code0;
3245 6841 : SLP_TREE_CODE (two) = ocode;
3246 6841 : SLP_TREE_LANES (one) = stmts.length ();
3247 6841 : SLP_TREE_LANES (two) = stmts.length ();
3248 6841 : SLP_TREE_REPRESENTATIVE (one) = stmts[0];
3249 6841 : SLP_TREE_REPRESENTATIVE (two) = stmts[j];
3250 :
3251 6841 : return node;
3252 : }
3253 :
3254 2865196 : node = vect_create_new_slp_node (node, stmts, nops);
3255 2865196 : SLP_TREE_VECTYPE (node) = vectype;
3256 2865196 : SLP_TREE_CHILDREN (node).splice (children);
3257 2865196 : SLP_TREE_GS_SCALE (node) = gs_scale;
3258 2865196 : SLP_TREE_GS_BASE (node) = gs_base;
3259 2865196 : if (reduc_idx != -1)
3260 : {
3261 116060 : gcc_assert (STMT_VINFO_REDUC_IDX (stmt_info) != -1
3262 : || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
3263 : || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def);
3264 116060 : SLP_TREE_REDUC_IDX (node) = reduc_idx;
3265 116060 : node->cycle_info.id = SLP_TREE_CHILDREN (node)[reduc_idx]->cycle_info.id;
3266 : }
3267 : /* When reaching the reduction PHI, create a vect_reduc_info. */
3268 2749136 : else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
3269 2749136 : || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
3270 2749136 : && is_a <gphi *> (STMT_VINFO_STMT (stmt_info)))
3271 : {
3272 101393 : loop_vec_info loop_vinfo = as_a <loop_vec_info> (vinfo);
3273 101393 : gcc_assert (STMT_VINFO_REDUC_IDX (stmt_info) == -1);
3274 101393 : node->cycle_info.id = loop_vinfo->reduc_infos.length ();
3275 101393 : vect_reduc_info reduc_info = new vect_reduc_info_s ();
3276 101393 : loop_vinfo->reduc_infos.safe_push (reduc_info);
3277 101393 : stmt_vec_info reduc_phi = stmt_info;
3278 : /* ??? For double reductions vect_is_simple_reduction stores the
3279 : reduction type and code on the inner loop header PHI. */
3280 101393 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
3281 : {
3282 378 : use_operand_p use_p;
3283 378 : gimple *use_stmt;
3284 378 : bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
3285 : &use_p, &use_stmt);
3286 378 : gcc_assert (res);
3287 378 : reduc_phi = loop_vinfo->lookup_stmt (use_stmt);
3288 : }
3289 101393 : VECT_REDUC_INFO_DEF_TYPE (reduc_info) = STMT_VINFO_DEF_TYPE (stmt_info);
3290 101393 : VECT_REDUC_INFO_TYPE (reduc_info) = STMT_VINFO_REDUC_TYPE (reduc_phi);
3291 101393 : VECT_REDUC_INFO_CODE (reduc_info) = STMT_VINFO_REDUC_CODE (reduc_phi);
3292 101393 : VECT_REDUC_INFO_FN (reduc_info) = IFN_LAST;
3293 : }
3294 : return node;
3295 9348564 : }
3296 :
3297 : /* Dump a single SLP tree NODE. */
3298 :
3299 : static void
3300 444694 : vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
3301 : slp_tree node)
3302 : {
3303 444694 : unsigned i, j;
3304 444694 : slp_tree child;
3305 444694 : stmt_vec_info stmt_info;
3306 444694 : tree op;
3307 :
3308 444694 : dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
3309 444694 : dump_user_location_t user_loc = loc.get_user_location ();
3310 444694 : dump_printf_loc (metadata, user_loc,
3311 : "node%s %p (max_nunits=" HOST_WIDE_INT_PRINT_UNSIGNED
3312 : ", refcnt=%u)",
3313 444694 : SLP_TREE_DEF_TYPE (node) == vect_external_def
3314 : ? " (external)"
3315 : : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
3316 429051 : ? " (constant)"
3317 : : ""), (void *) node,
3318 444694 : estimated_poly_value (node->max_nunits),
3319 : SLP_TREE_REF_COUNT (node));
3320 444694 : if (SLP_TREE_VECTYPE (node))
3321 377150 : dump_printf (metadata, " %T", SLP_TREE_VECTYPE (node));
3322 444694 : dump_printf (metadata, "%s",
3323 444694 : node->avoid_stlf_fail ? " (avoid-stlf-fail)" : "");
3324 444694 : if (node->cycle_info.id != -1 || node->cycle_info.reduc_idx != -1)
3325 23869 : dump_printf (metadata, " cycle %d, link %d", node->cycle_info.id,
3326 : node->cycle_info.reduc_idx);
3327 444694 : dump_printf (metadata, "\n");
3328 444694 : if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
3329 : {
3330 362152 : if (SLP_TREE_PERMUTE_P (node))
3331 13668 : dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
3332 : else
3333 348484 : dump_printf_loc (metadata, user_loc, "op template: %G",
3334 348484 : SLP_TREE_REPRESENTATIVE (node)->stmt);
3335 : }
3336 444694 : if (SLP_TREE_SCALAR_STMTS (node).exists ())
3337 866474 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3338 512425 : if (stmt_info)
3339 507144 : dump_printf_loc (metadata, user_loc, "\t%sstmt %u %G",
3340 507144 : SLP_TREE_LIVE_LANES (node).contains (i)
3341 503519 : ? "[l*]" : (STMT_VINFO_LIVE_P (stmt_info)
3342 503519 : ? "[l] " : ""),
3343 : i, stmt_info->stmt);
3344 : else
3345 5281 : dump_printf_loc (metadata, user_loc, "\tstmt %u ---\n", i);
3346 : else
3347 : {
3348 90645 : dump_printf_loc (metadata, user_loc, "\t{ ");
3349 199582 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
3350 108937 : dump_printf (metadata, "%T%s ", op,
3351 108937 : i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
3352 90645 : dump_printf (metadata, "}\n");
3353 : }
3354 444694 : if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
3355 : {
3356 64844 : dump_printf_loc (metadata, user_loc, "\tload permutation {");
3357 147798 : FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
3358 82954 : dump_printf (dump_kind, " %u", j);
3359 64844 : dump_printf (dump_kind, " }\n");
3360 : }
3361 444694 : if (SLP_TREE_LANE_PERMUTATION (node).exists ())
3362 : {
3363 13676 : dump_printf_loc (metadata, user_loc, "\tlane permutation {");
3364 51245 : for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
3365 37569 : dump_printf (dump_kind, " %u[%u]",
3366 37569 : SLP_TREE_LANE_PERMUTATION (node)[i].first,
3367 37569 : SLP_TREE_LANE_PERMUTATION (node)[i].second);
3368 13676 : dump_printf (dump_kind, " }%s\n",
3369 13676 : node->ldst_lanes ? " (load-lanes)" : "");
3370 : }
3371 444694 : if (SLP_TREE_CHILDREN (node).is_empty ())
3372 169596 : return;
3373 275098 : dump_printf_loc (metadata, user_loc, "\tchildren");
3374 725837 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3375 450739 : dump_printf (dump_kind, " %p", (void *)child);
3376 275098 : dump_printf (dump_kind, "%s\n",
3377 275098 : node->ldst_lanes && !SLP_TREE_LANE_PERMUTATION (node).exists ()
3378 : ? " (store-lanes)" : "");
3379 : }
3380 :
3381 : DEBUG_FUNCTION void
3382 0 : debug (slp_tree node)
3383 : {
3384 0 : debug_dump_context ctx;
3385 0 : vect_print_slp_tree (MSG_NOTE,
3386 0 : dump_location_t::from_location_t (UNKNOWN_LOCATION),
3387 : node);
3388 0 : }
3389 :
3390 : /* Recursive helper for the dot producer below. */
3391 :
3392 : static void
3393 0 : dot_slp_tree (FILE *f, slp_tree node, hash_set<slp_tree> &visited)
3394 : {
3395 0 : if (visited.add (node))
3396 : return;
3397 :
3398 0 : fprintf (f, "\"%p\" [label=\"", (void *)node);
3399 0 : vect_print_slp_tree (MSG_NOTE,
3400 0 : dump_location_t::from_location_t (UNKNOWN_LOCATION),
3401 : node);
3402 0 : fprintf (f, "\"];\n");
3403 :
3404 :
3405 0 : for (slp_tree child : SLP_TREE_CHILDREN (node))
3406 0 : fprintf (f, "\"%p\" -> \"%p\";", (void *)node, (void *)child);
3407 :
3408 0 : for (slp_tree child : SLP_TREE_CHILDREN (node))
3409 0 : if (child)
3410 0 : dot_slp_tree (f, child, visited);
3411 : }
3412 :
3413 : DEBUG_FUNCTION void
3414 0 : dot_slp_tree (const char *fname, slp_tree node)
3415 : {
3416 0 : FILE *f = fopen (fname, "w");
3417 0 : fprintf (f, "digraph {\n");
3418 0 : fflush (f);
3419 0 : {
3420 0 : debug_dump_context ctx (f);
3421 0 : hash_set<slp_tree> visited;
3422 0 : dot_slp_tree (f, node, visited);
3423 0 : }
3424 0 : fflush (f);
3425 0 : fprintf (f, "}\n");
3426 0 : fclose (f);
3427 0 : }
3428 :
3429 : DEBUG_FUNCTION void
3430 0 : dot_slp_tree (const char *fname, const vec<slp_instance> &slp_instances)
3431 : {
3432 0 : FILE *f = fopen (fname, "w");
3433 0 : fprintf (f, "digraph {\n");
3434 0 : fflush (f);
3435 0 : {
3436 0 : debug_dump_context ctx (f);
3437 0 : hash_set<slp_tree> visited;
3438 0 : for (auto inst : slp_instances)
3439 0 : dot_slp_tree (f, SLP_INSTANCE_TREE (inst), visited);
3440 0 : }
3441 0 : fflush (f);
3442 0 : fprintf (f, "}\n");
3443 0 : fclose (f);
3444 0 : }
3445 :
3446 : /* Dump a slp tree NODE using flags specified in DUMP_KIND. */
3447 :
3448 : static void
3449 483463 : vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
3450 : slp_tree node, hash_set<slp_tree> &visited)
3451 : {
3452 483463 : unsigned i;
3453 483463 : slp_tree child;
3454 :
3455 483463 : if (visited.add (node))
3456 483463 : return;
3457 :
3458 444220 : vect_print_slp_tree (dump_kind, loc, node);
3459 :
3460 1338665 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3461 450225 : if (child)
3462 407530 : vect_print_slp_graph (dump_kind, loc, child, visited);
3463 : }
3464 :
3465 : static void
3466 46615 : vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
3467 : slp_tree entry)
3468 : {
3469 46615 : hash_set<slp_tree> visited;
3470 46615 : vect_print_slp_graph (dump_kind, loc, entry, visited);
3471 46615 : }
3472 :
3473 : DEBUG_FUNCTION void
3474 0 : debug (slp_instance instance)
3475 : {
3476 0 : debug_dump_context ctx;
3477 0 : vect_print_slp_graph (MSG_NOTE,
3478 0 : dump_location_t::from_location_t (UNKNOWN_LOCATION),
3479 : SLP_INSTANCE_TREE (instance));
3480 0 : }
3481 :
3482 :
3483 : /* Compute the set of scalar stmts participating in external nodes. */
3484 :
3485 : static void
3486 1571433 : vect_slp_gather_extern_scalar_stmts (vec_info *vinfo, slp_tree node,
3487 : hash_set<slp_tree> &visited,
3488 : hash_set<stmt_vec_info> &estmts)
3489 : {
3490 1571433 : if (visited.add (node))
3491 : return;
3492 :
3493 1526321 : if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
3494 : {
3495 : slp_tree child;
3496 : int i;
3497 1765425 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3498 888331 : if (child)
3499 888331 : vect_slp_gather_extern_scalar_stmts (vinfo, child, visited, estmts);
3500 : }
3501 : else
3502 3653787 : for (tree def : SLP_TREE_SCALAR_OPS (node))
3503 : {
3504 1707648 : stmt_vec_info def_stmt = vinfo->lookup_def (def);
3505 1707648 : if (def_stmt)
3506 342420 : estmts.add (def_stmt);
3507 : }
3508 : }
3509 :
3510 : /* Mark the original scalar stmt coverage of the vector SLP graph of VINFO
3511 : with STMT_SLP_TYPE == pure_slp. */
3512 :
3513 : static void
3514 236000 : vect_bb_slp_mark_stmts_vectorized (bb_vec_info vinfo)
3515 : {
3516 : /* Gather the scalar stmt leafs of the SLP graph to stop the below DFS
3517 : walk on. */
3518 236000 : hash_set<stmt_vec_info> scalar_stmts_in_externs;
3519 236000 : hash_set<slp_tree> visited;
3520 1391102 : for (auto instance : BB_VINFO_SLP_INSTANCES (vinfo))
3521 683102 : vect_slp_gather_extern_scalar_stmts (vinfo, SLP_INSTANCE_TREE (instance),
3522 : visited, scalar_stmts_in_externs);
3523 :
3524 : /* DFS walk scalar stmts to compute the vectorized coverage indicated
3525 : by STMT_SLP_TYPE (stmt) == pure_slp on the original scalar (non-pattern)
3526 : stmts. */
3527 1391102 : for (auto instance : BB_VINFO_SLP_INSTANCES (vinfo))
3528 : {
3529 794266 : for (auto stmt : SLP_INSTANCE_ROOT_STMTS (instance))
3530 52972 : if (!scalar_stmts_in_externs.contains (stmt))
3531 52273 : STMT_SLP_TYPE (stmt) = pure_slp;
3532 683102 : auto_vec<stmt_vec_info> worklist;
3533 3860298 : for (auto stmt : SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance)))
3534 : {
3535 1810992 : stmt = vect_orig_stmt (stmt);
3536 1810992 : if (!scalar_stmts_in_externs.contains (stmt)
3537 1810992 : && STMT_SLP_TYPE (stmt) != pure_slp)
3538 : {
3539 1801830 : STMT_SLP_TYPE (stmt) = pure_slp;
3540 1801830 : worklist.safe_push (stmt);
3541 : }
3542 : }
3543 3607790 : while (!worklist.is_empty ())
3544 : {
3545 2244253 : stmt_vec_info stmt = worklist.pop ();
3546 :
3547 : /* Now walk relevant parts of the SSA use-def graph. */
3548 2244253 : slp_oprnds child_ops (stmt);
3549 4728751 : for (unsigned i = 0; i < child_ops.num_slp_children; ++i)
3550 : {
3551 2484498 : tree op = child_ops.get_op_for_slp_child (stmt, i);
3552 2484498 : stmt_vec_info def = vinfo->lookup_def (op);
3553 2484498 : if (def
3554 866683 : && !scalar_stmts_in_externs.contains (def)
3555 3012671 : && STMT_SLP_TYPE (def) != pure_slp)
3556 : {
3557 442423 : STMT_SLP_TYPE (def) = pure_slp;
3558 442423 : worklist.safe_push (def);
3559 : }
3560 : }
3561 : }
3562 683102 : }
3563 236000 : }
3564 :
3565 : /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
3566 :
3567 : static void
3568 2513778 : vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
3569 : {
3570 2513778 : int i;
3571 2513778 : stmt_vec_info stmt_info;
3572 2513778 : slp_tree child;
3573 :
3574 2513778 : if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
3575 : return;
3576 :
3577 1505634 : if (visited.add (node))
3578 : return;
3579 :
3580 4478546 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3581 3121232 : if (stmt_info)
3582 : {
3583 3121232 : gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
3584 : || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
3585 3121232 : STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
3586 : }
3587 :
3588 3087640 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3589 1730326 : if (child)
3590 1730326 : vect_mark_slp_stmts_relevant (child, visited);
3591 : }
3592 :
3593 : static void
3594 783452 : vect_mark_slp_stmts_relevant (slp_tree node)
3595 : {
3596 783452 : hash_set<slp_tree> visited;
3597 783452 : vect_mark_slp_stmts_relevant (node, visited);
3598 783452 : }
3599 :
3600 :
3601 : /* Gather loads in the SLP graph NODE and populate the INST loads array. */
3602 :
3603 : static void
3604 10616414 : vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
3605 : hash_set<slp_tree> &visited)
3606 : {
3607 10616414 : if (!node || visited.add (node))
3608 1747318 : return;
3609 :
3610 8869096 : if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
3611 : return;
3612 :
3613 6570400 : if (!SLP_TREE_PERMUTE_P (node))
3614 : {
3615 6363316 : stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
3616 6363316 : if (STMT_VINFO_DATA_REF (stmt_info)
3617 2760202 : && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
3618 1556814 : loads.safe_push (node);
3619 : }
3620 :
3621 : unsigned i;
3622 : slp_tree child;
3623 14960146 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3624 8389746 : vect_gather_slp_loads (loads, child, visited);
3625 : }
3626 :
3627 :
3628 : /* Find the last store in SLP INSTANCE. */
3629 :
3630 : stmt_vec_info
3631 2742960 : vect_find_last_scalar_stmt_in_slp (slp_tree node)
3632 : {
3633 2742960 : stmt_vec_info last = NULL;
3634 2742960 : stmt_vec_info stmt_vinfo;
3635 :
3636 9997907 : for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
3637 7254947 : if (stmt_vinfo)
3638 : {
3639 7254947 : stmt_vinfo = vect_orig_stmt (stmt_vinfo);
3640 7254947 : last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
3641 : }
3642 :
3643 2742960 : return last;
3644 : }
3645 :
3646 : /* Find the first stmt in NODE. */
3647 :
3648 : stmt_vec_info
3649 535435 : vect_find_first_scalar_stmt_in_slp (slp_tree node)
3650 : {
3651 535435 : stmt_vec_info first = NULL;
3652 535435 : stmt_vec_info stmt_vinfo;
3653 :
3654 1814265 : for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
3655 1278830 : if (stmt_vinfo)
3656 : {
3657 1276136 : stmt_vinfo = vect_orig_stmt (stmt_vinfo);
3658 1276136 : if (!first
3659 1276136 : || get_later_stmt (stmt_vinfo, first) == first)
3660 : first = stmt_vinfo;
3661 : }
3662 :
3663 535435 : return first;
3664 : }
3665 :
3666 : /* Splits a group of stores, currently beginning at FIRST_VINFO, into
3667 : two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
3668 : (also containing the first GROUP1_SIZE stmts, since stores are
3669 : consecutive), the second containing the remainder.
3670 : Return the first stmt in the second group. */
3671 :
3672 : static stmt_vec_info
3673 157571 : vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
3674 : {
3675 157571 : gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
3676 157571 : gcc_assert (group1_size > 0);
3677 157571 : int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
3678 157571 : gcc_assert (group2_size > 0);
3679 157571 : DR_GROUP_SIZE (first_vinfo) = group1_size;
3680 :
3681 157571 : stmt_vec_info stmt_info = first_vinfo;
3682 527855 : for (unsigned i = group1_size; i > 1; i--)
3683 : {
3684 370284 : stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
3685 370284 : gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
3686 : }
3687 : /* STMT is now the last element of the first group. */
3688 157571 : stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
3689 157571 : DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
3690 :
3691 157571 : DR_GROUP_SIZE (group2) = group2_size;
3692 440602 : for (stmt_info = group2; stmt_info;
3693 283031 : stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
3694 : {
3695 283031 : DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
3696 283031 : gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
3697 : }
3698 :
3699 : /* For the second group, the DR_GROUP_GAP is that before the original group,
3700 : plus skipping over the first vector. */
3701 157571 : DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
3702 :
3703 : /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
3704 157571 : DR_GROUP_GAP (first_vinfo) += group2_size;
3705 :
3706 157571 : if (dump_enabled_p ())
3707 61 : dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
3708 : group1_size, group2_size);
3709 :
3710 157571 : return group2;
3711 : }
3712 :
3713 : /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
3714 : statements and a vector of NUNITS elements. */
3715 :
3716 : static poly_uint64
3717 4159928 : calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
3718 : {
3719 4159928 : return exact_div (common_multiple (nunits, group_size), group_size);
3720 : }
3721 :
3722 : /* Helper that checks to see if a node is a load node. */
3723 :
3724 : static inline bool
3725 108 : vect_is_slp_load_node (slp_tree root)
3726 : {
3727 108 : return (!SLP_TREE_PERMUTE_P (root)
3728 108 : && SLP_TREE_DEF_TYPE (root) == vect_internal_def
3729 102 : && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
3730 172 : && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root))));
3731 : }
3732 :
3733 :
3734 : /* Helper function of optimize_load_redistribution that performs the operation
3735 : recursively. */
3736 :
/* Return value and side effects, as implemented below:
   - If LOAD_MAP already has an entry for ROOT, return that entry (which
     may be NULL).
   - If ROOT is NULL or not an internal def, return NULL.
   - If ROOT is a permute node and every lane selects from a child that is
     a load node without children, build a node from the selected scalar
     stmts with vect_build_slp_tree, record the result for ROOT in LOAD_MAP
     and return it (NULL if the build failed).
   - Otherwise record NULL for ROOT in LOAD_MAP, recurse into ROOT's
     children, replace each child for which the recursion returned a node,
     and return NULL.
   BST_MAP, VINFO and GROUP_SIZE are passed through to vect_build_slp_tree
   and the recursive calls.  */
3737 : static slp_tree
3738 20434 : optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
3739 : vec_info *vinfo, unsigned int group_size,
3740 : hash_map<slp_tree, slp_tree> *load_map,
3741 : slp_tree root)
3742 : {
/* ROOT was handled before: reuse the recorded result.  Note this lookup
   happens before the NULL test on ROOT below.  */
3743 20434 : if (slp_tree *leader = load_map->get (root))
3744 3669 : return *leader;
3745 :
3746 16765 : slp_tree node;
3747 16765 : unsigned i;
3748 :
3749 : /* For now, we don't know anything about externals so do not do anything. */
3750 16765 : if (!root || SLP_TREE_DEF_TYPE (root) != vect_internal_def)
3751 : return NULL;
3752 12385 : else if (SLP_TREE_PERMUTE_P (root))
3753 : {
3754 : /* First convert this node into a load node and add it to the leaves
3755 : list and flatten the permute from a lane to a load one. If it's
3756 : unneeded it will be elided later. */
3757 76 : vec<stmt_vec_info> stmts;
3758 76 : stmts.create (SLP_TREE_LANES (root));
3759 76 : lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
3760 140 : for (unsigned j = 0; j < lane_perm.length (); j++)
3761 : {
/* Lane J of ROOT takes lane PERM.SECOND of child PERM.FIRST.  */
3762 108 : std::pair<unsigned, unsigned> perm = lane_perm[j];
3763 108 : node = SLP_TREE_CHILDREN (root)[perm.first];
3764 :
/* Any lane whose source is not a childless load node makes ROOT
   unsuitable: drop the gathered stmts and fall back to the generic
   child walk at NEXT.  */
3765 108 : if (!vect_is_slp_load_node (node)
3766 108 : || SLP_TREE_CHILDREN (node).exists ())
3767 : {
3768 44 : stmts.release ();
3769 44 : goto next;
3770 : }
3771 :
3772 64 : stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
3773 : }
3774 :
3775 32 : if (dump_enabled_p ())
3776 0 : dump_printf_loc (MSG_NOTE, vect_location,
3777 : "converting stmts on permute node %p\n",
3778 : (void *) root);
3779 :
3780 32 : bool *matches = XALLOCAVEC (bool, group_size);
3781 32 : poly_uint64 max_nunits = 1;
3782 32 : unsigned tree_size = 0, limit = 1;
3783 32 : node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
3784 : matches, &limit, &tree_size, bst_map);
/* STMTS is released here only when no node was built.
   NOTE(review): presumably a successful build takes ownership of STMTS --
   confirm against vect_build_slp_tree.  */
3785 32 : if (!node)
3786 0 : stmts.release ();
3787 :
/* Record the outcome for ROOT, including a NULL on failure.  */
3788 32 : load_map->put (root, node);
3789 32 : return node;
3790 : }
3791 :
3792 12309 : next:
/* Record ROOT with no replacement before walking its children, so a later
   visit of ROOT returns NULL from the lookup at the top.  */
3793 12353 : load_map->put (root, NULL);
3794 :
3795 29030 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
3796 : {
3797 16677 : slp_tree value
3798 16677 : = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
3799 : node);
3800 16677 : if (value)
3801 : {
/* Child I is replaced by VALUE: bump VALUE's reference count for the
   new edge and store it in ROOT's child slot.  */
3802 32 : SLP_TREE_REF_COUNT (value)++;
3803 32 : SLP_TREE_CHILDREN (root)[i] = value;
3804 : /* ??? We know the original leafs of the replaced nodes will
3805 : be referenced by bst_map, only the permutes created by
3806 : pattern matching are not. */
/* When the old child has a reference count of 1, its LOAD_MAP entry is
   removed before vect_free_slp_tree is called on it.
   NOTE(review): presumably this avoids leaving a key for a freed node in
   LOAD_MAP -- confirm against vect_free_slp_tree.  */
3807 32 : if (SLP_TREE_REF_COUNT (node) == 1)
3808 32 : load_map->remove (node);
3809 32 : vect_free_slp_tree (node);
3810 : }
3811 : }
3812 :
3813 : return NULL;
3814 : }
3815 :
3816 : /* Temporary workaround for loads not being CSEd during SLP build. This
3817 : function will traverse the SLP tree rooted in ROOT for INSTANCE and find
3818 : VEC_PERM nodes that blend vectors from multiple nodes that all read from the
3819 : same DR such that the final operation is equal to a permuted load. Such
3820 : NODES are then directly converted into LOADS themselves. The nodes are
3821 : CSEd using BST_MAP. */
3822 :
3823 : static void
3824 2851 : optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
3825 : vec_info *vinfo, unsigned int group_size,
3826 : hash_map<slp_tree, slp_tree> *load_map,
3827 : slp_tree root)
3828 : {
3829 2851 : slp_tree node;
3830 2851 : unsigned i;
3831 :
3832 6608 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
3833 : {
3834 3757 : slp_tree value
3835 3757 : = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
3836 : node);
3837 3757 : if (value)
3838 : {
3839 0 : SLP_TREE_REF_COUNT (value)++;
3840 0 : SLP_TREE_CHILDREN (root)[i] = value;
3841 : /* ??? We know the original leafs of the replaced nodes will
3842 : be referenced by bst_map, only the permutes created by
3843 : pattern matching are not. */
3844 0 : if (SLP_TREE_REF_COUNT (node) == 1)
3845 0 : load_map->remove (node);
3846 0 : vect_free_slp_tree (node);
3847 : }
3848 : }
3849 2851 : }
3850 :
3851 : /* Helper function of vect_match_slp_patterns.
3852 :
3853 : Attempts to match patterns against the slp tree rooted in REF_NODE using
3854 : VINFO. Patterns are matched in post-order traversal.
3855 :
3856 : If matching is successful the value in REF_NODE is updated and returned, if
3857 : not then it is returned unchanged. */
3858 :
3859 : static bool
3860 6122731 : vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
3861 : slp_tree_to_load_perm_map_t *perm_cache,
3862 : slp_compat_nodes_map_t *compat_cache,
3863 : hash_set<slp_tree> *visited)
3864 : {
3865 6122731 : unsigned i;
3866 6122731 : slp_tree node = *ref_node;
3867 6122731 : bool found_p = false;
3868 6122731 : if (!node || visited->add (node))
3869 874984 : return false;
3870 :
3871 : slp_tree child;
3872 9817627 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3873 4569880 : found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
3874 : vinfo, perm_cache, compat_cache,
3875 : visited);
3876 :
3877 15743241 : for (unsigned x = 0; x < num__slp_patterns; x++)
3878 : {
3879 10495494 : vect_pattern *pattern
3880 10495494 : = slp_patterns[x] (perm_cache, compat_cache, ref_node);
3881 10495494 : if (pattern)
3882 : {
3883 1171 : pattern->build (vinfo);
3884 1171 : delete pattern;
3885 1171 : found_p = true;
3886 : }
3887 : }
3888 :
3889 : return found_p;
3890 : }
3891 :
3892 : /* Applies pattern matching to the given SLP tree rooted in REF_NODE using
3893 : vec_info VINFO.
3894 :
3895 : The modified tree is returned. Patterns are tried in order and multiple
3896 : patterns may match. */
3897 :
3898 : static bool
3899 1552851 : vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
3900 : hash_set<slp_tree> *visited,
3901 : slp_tree_to_load_perm_map_t *perm_cache,
3902 : slp_compat_nodes_map_t *compat_cache)
3903 : {
3904 1552851 : DUMP_VECT_SCOPE ("vect_match_slp_patterns");
3905 1552851 : slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
3906 :
3907 1552851 : if (dump_enabled_p ())
3908 30482 : dump_printf_loc (MSG_NOTE, vect_location,
3909 : "Analyzing SLP tree %p for patterns\n",
3910 30482 : (void *) SLP_INSTANCE_TREE (instance));
3911 :
3912 1552851 : return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, compat_cache,
3913 1552851 : visited);
3914 : }
3915 :
3916 : /* STMT_INFO is a store group of size GROUP_SIZE that we are considering
3917 : vectorizing with VECTYPE that might be NULL. MASKED_P indicates whether
3918 : the stores are masked.
3919 : Return true if we could use IFN_STORE_LANES instead and if that appears
3920 : to be the better approach. */
3921 :
3922 : static bool
3923 6016 : vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
3924 : tree vectype, bool masked_p,
3925 : unsigned int group_size,
3926 : unsigned int new_group_size)
3927 : {
3928 6016 : if (!vectype)
3929 : {
3930 6016 : tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3931 6016 : vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
3932 : }
3933 6016 : if (!vectype)
3934 : return false;
3935 : /* Allow the split if one of the two new groups would operate on full
3936 : vectors *within* rather than across one scalar loop iteration.
3937 : This is purely a heuristic, but it should work well for group
3938 : sizes of 3 and 4, where the possible splits are:
3939 :
3940 : 3->2+1: OK if the vector has exactly two elements
3941 : 4->2+2: Likewise
3942 : 4->3+1: Less clear-cut. */
3943 6016 : if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
3944 3395 : || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
3945 2644 : return false;
3946 3372 : return vect_store_lanes_supported (vectype, group_size, masked_p) != IFN_LAST;
3947 : }
3948 :
3949 : /* Analyze an SLP instance starting from a group of grouped stores. Call
3950 : vect_build_slp_tree to build a tree of packed stmts if possible.
3951 : Return FALSE if it's impossible to SLP any stmt in the loop. */
3952 :
/* Forward declaration only; no body is given here, so the definition of
   this static function must appear elsewhere in this file.  */
3953 : static bool
3954 : vect_analyze_slp_instance (vec_info *vinfo,
3955 : scalar_stmts_to_slp_tree_map_t *bst_map,
3956 : stmt_vec_info stmt_info, slp_instance_kind kind,
3957 : unsigned max_tree_size, unsigned *limit,
3958 : bool force_single_lane);
3959 :
3960 : /* Build an interleaving scheme for the store sources RHS_NODES from
3961 : SCALAR_STMTS. */
3962 :
/* Details of the construction below:
   - The returned node is created from SCALAR_STMTS and gets as many
     children as RHS_NODES[0] has.
   - Child L is a VEC_PERM_EXPR node whose inputs are child L of every
     entry of RHS_NODES (each gaining a reference) and whose lane
     permutation lists, input by input, all lanes of that input in order.
   - While such a permute has more than two inputs, two of them are merged
     into a new two-input VEC_PERM_EXPR node which takes the place of the
     first, and the lane permutation of the outer permute is renumbered.
   - MAX_NUNITS is stored on every node created here; the vector type is
     copied from RHS_NODES[0].
   RHS_NODES[0] is accessed unconditionally, so RHS_NODES must not be
   empty.  */
3963 : static slp_tree
3964 7914 : vect_build_slp_store_interleaving (vec<slp_tree> &rhs_nodes,
3965 : vec<stmt_vec_info> &scalar_stmts,
3966 : poly_uint64 max_nunits)
3967 : {
3968 7914 : unsigned int group_size = scalar_stmts.length ();
3969 15828 : slp_tree node = vect_create_new_slp_node (scalar_stmts,
3970 7914 : SLP_TREE_CHILDREN
3971 : (rhs_nodes[0]).length ());
3972 7914 : SLP_TREE_VECTYPE (node) = SLP_TREE_VECTYPE (rhs_nodes[0]);
3973 7914 : node->max_nunits = max_nunits;
/* One permute per operand position L of the RHS nodes.  */
3974 7914 : for (unsigned l = 0;
3975 15855 : l < SLP_TREE_CHILDREN (rhs_nodes[0]).length (); ++l)
3976 : {
3977 : /* And a permute merging all RHS SLP trees. */
3978 7941 : slp_tree perm = vect_create_new_slp_node (rhs_nodes.length (),
3979 7941 : VEC_PERM_EXPR);
3980 7941 : SLP_TREE_CHILDREN (node).quick_push (perm);
3981 7941 : SLP_TREE_LANE_PERMUTATION (perm).create (group_size);
3982 7941 : SLP_TREE_VECTYPE (perm) = SLP_TREE_VECTYPE (node);
3983 7941 : perm->max_nunits = max_nunits;
3984 7941 : SLP_TREE_LANES (perm) = group_size;
3985 : /* ??? We should set this NULL but that's not expected. */
3986 7941 : SLP_TREE_REPRESENTATIVE (perm)
3987 7941 : = SLP_TREE_REPRESENTATIVE (SLP_TREE_CHILDREN (rhs_nodes[0])[l]);
3988 30916 : for (unsigned j = 0; j < rhs_nodes.length (); ++j)
3989 : {
/* Input J of PERM is operand L of RHS node J; PERM holds a new
   reference to it.  */
3990 22975 : SLP_TREE_CHILDREN (perm)
3991 22975 : .quick_push (SLP_TREE_CHILDREN (rhs_nodes[j])[l]);
3992 22975 : SLP_TREE_CHILDREN (rhs_nodes[j])[l]->refcnt++;
/* Append (J, K) for every scalar stmt index K of RHS node J, i.e. the
   inputs are concatenated in order.  */
3993 22975 : for (unsigned k = 0;
3994 48282 : k < SLP_TREE_SCALAR_STMTS (rhs_nodes[j]).length (); ++k)
3995 : {
3996 : /* ??? We should populate SLP_TREE_SCALAR_STMTS
3997 : or SLP_TREE_SCALAR_OPS but then we might have
3998 : a mix of both in our children. */
3999 25307 : SLP_TREE_LANE_PERMUTATION (perm)
4000 25307 : .quick_push (std::make_pair (j, k));
4001 : }
4002 : }
4003 :
4004 : /* Now we have a single permute node but we cannot code-generate
4005 : the case with more than two inputs.
4006 : Perform pairwise reduction, reducing the two inputs
4007 : with the least number of lanes to one and then repeat until
4008 : we end up with two inputs. That scheme makes sure we end
4009 : up with permutes satisfying the restriction of requiring at
4010 : most two vector inputs to produce a single vector output
4011 : when the number of lanes is even. */
4012 15034 : while (SLP_TREE_CHILDREN (perm).length () > 2)
4013 : {
4014 : /* When we have three equal sized groups left the pairwise
4015 : reduction does not result in a scheme that avoids using
4016 : three vectors. Instead merge the first two groups
4017 : to the final size with do-not-care elements (chosen
4018 : from the first group) and then merge with the third.
4019 : { A0, B0, x, A1, B1, x, ... }
4020 : -> { A0, B0, C0, A1, B1, C1, ... }
4021 : This handles group size of three (and at least
4022 : power-of-two multiples of that). */
4023 7093 : if (SLP_TREE_CHILDREN (perm).length () == 3
4024 3269 : && (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[0])
4025 3269 : == SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[1]))
4026 7093 : && (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[0])
4027 2451 : == SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[2])))
4028 : {
4029 2145 : int ai = 0;
4030 2145 : int bi = 1;
4031 2145 : slp_tree a = SLP_TREE_CHILDREN (perm)[ai];
4032 2145 : slp_tree b = SLP_TREE_CHILDREN (perm)[bi];
/* Unlike the general merge below, PERMAB gets the lane count of PERM
   itself rather than the sum of A's and B's lanes.  */
4033 2145 : unsigned n = SLP_TREE_LANES (perm);
4034 :
4035 2145 : slp_tree permab = vect_create_new_slp_node (2, VEC_PERM_EXPR);
4036 2145 : SLP_TREE_LANES (permab) = n;
4037 2145 : SLP_TREE_LANE_PERMUTATION (permab).create (n);
4038 2145 : SLP_TREE_VECTYPE (permab) = SLP_TREE_VECTYPE (perm);
4039 2145 : permab->max_nunits = max_nunits;
4040 : /* ??? Should be NULL but that's not expected. */
4041 2145 : SLP_TREE_REPRESENTATIVE (permab) = SLP_TREE_REPRESENTATIVE (perm);
4042 2145 : SLP_TREE_CHILDREN (permab).quick_push (a);
4043 4304 : for (unsigned k = 0; k < SLP_TREE_LANES (a); ++k)
4044 2159 : SLP_TREE_LANE_PERMUTATION (permab)
4045 2159 : .quick_push (std::make_pair (0, k));
4046 2145 : SLP_TREE_CHILDREN (permab).quick_push (b);
4047 4304 : for (unsigned k = 0; k < SLP_TREE_LANES (b); ++k)
4048 2159 : SLP_TREE_LANE_PERMUTATION (permab)
4049 2159 : .quick_push (std::make_pair (1, k));
4050 : /* Push the do-not-care lanes. */
4051 4304 : for (unsigned k = 0; k < SLP_TREE_LANES (a); ++k)
4052 2159 : SLP_TREE_LANE_PERMUTATION (permab)
4053 2159 : .quick_push (std::make_pair (0, k));
4054 :
4055 : /* Put the merged node into 'perm', in place of a. */
4056 2145 : SLP_TREE_CHILDREN (perm)[ai] = permab;
4057 : /* Adjust the references to b in the permutation
4058 : of perm and to the later children which we'll
4059 : remove. */
4060 8622 : for (unsigned k = 0; k < SLP_TREE_LANES (perm); ++k)
4061 : {
4062 6477 : std::pair<unsigned, unsigned> &p
4063 6477 : = SLP_TREE_LANE_PERMUTATION (perm)[k];
/* Lanes of B now live in PERMAB after A's lanes; inputs after B shift
   down by one because B's slot is removed below.  */
4064 6477 : if (p.first == (unsigned) bi)
4065 : {
4066 2159 : p.first = ai;
4067 2159 : p.second += SLP_TREE_LANES (a);
4068 : }
4069 4318 : else if (p.first > (unsigned) bi)
4070 2159 : p.first--;
4071 : }
4072 2145 : SLP_TREE_CHILDREN (perm).ordered_remove (bi);
/* PERM now has two inputs, so the reduction is finished.  */
4073 2145 : break;
4074 : }
4075 :
4076 : /* Pick the two nodes with the least number of lanes,
4077 : prefer the earliest candidate and maintain ai < bi. */
4078 : int ai = -1;
4079 : int bi = -1;
4080 45078 : for (unsigned ci = 0; ci < SLP_TREE_CHILDREN (perm).length (); ++ci)
4081 : {
/* The first two inputs seed AI and BI; a later input CI with fewer lanes
   than either of them displaces the one with more lanes.  */
4082 40130 : if (ai == -1)
4083 4948 : ai = ci;
4084 35182 : else if (bi == -1)
4085 4948 : bi = ci;
4086 30234 : else if ((SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ci])
4087 30234 : < SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ai]))
4088 30234 : || (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ci])
4089 24904 : < SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[bi])))
4090 : {
4091 11548 : if (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ai])
4092 5774 : <= SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[bi]))
4093 2687 : bi = ci;
4094 : else
4095 : {
4096 3087 : ai = bi;
4097 3087 : bi = ci;
4098 : }
4099 : }
4100 : }
4101 :
4102 : /* Produce a merge of nodes ai and bi. */
4103 4948 : slp_tree a = SLP_TREE_CHILDREN (perm)[ai];
4104 4948 : slp_tree b = SLP_TREE_CHILDREN (perm)[bi];
4105 4948 : unsigned n = SLP_TREE_LANES (a) + SLP_TREE_LANES (b);
4106 4948 : slp_tree permab = vect_create_new_slp_node (2, VEC_PERM_EXPR);
4107 4948 : SLP_TREE_LANES (permab) = n;
4108 4948 : SLP_TREE_LANE_PERMUTATION (permab).create (n);
4109 4948 : SLP_TREE_VECTYPE (permab) = SLP_TREE_VECTYPE (perm);
4110 4948 : permab->max_nunits = max_nunits;
4111 : /* ??? Should be NULL but that's not expected. */
4112 4948 : SLP_TREE_REPRESENTATIVE (permab) = SLP_TREE_REPRESENTATIVE (perm);
/* PERMAB is the plain concatenation of A's lanes followed by B's.  */
4113 4948 : SLP_TREE_CHILDREN (permab).quick_push (a);
4114 13096 : for (unsigned k = 0; k < SLP_TREE_LANES (a); ++k)
4115 8148 : SLP_TREE_LANE_PERMUTATION (permab)
4116 8148 : .quick_push (std::make_pair (0, k));
4117 4948 : SLP_TREE_CHILDREN (permab).quick_push (b);
4118 12420 : for (unsigned k = 0; k < SLP_TREE_LANES (b); ++k)
4119 7472 : SLP_TREE_LANE_PERMUTATION (permab)
4120 7472 : .quick_push (std::make_pair (1, k));
4121 :
4122 : /* Put the merged node into 'perm', in place of a. */
4123 4948 : SLP_TREE_CHILDREN (perm)[ai] = permab;
4124 : /* Adjust the references to b in the permutation
4125 : of perm and to the later children which we'll
4126 : remove. */
4127 72097 : for (unsigned k = 0; k < SLP_TREE_LANES (perm); ++k)
4128 : {
4129 67149 : std::pair<unsigned, unsigned> &p
4130 67149 : = SLP_TREE_LANE_PERMUTATION (perm)[k];
4131 67149 : if (p.first == (unsigned) bi)
4132 : {
4133 7472 : p.first = ai;
4134 7472 : p.second += SLP_TREE_LANES (a);
4135 : }
4136 59677 : else if (p.first > (unsigned) bi)
4137 25082 : p.first--;
4138 : }
4139 4948 : SLP_TREE_CHILDREN (perm).ordered_remove (bi);
4140 : }
4141 : }
4142 :
4143 7914 : return node;
4144 : }
4145 :
4146 : /* Analyze an SLP instance starting from SCALAR_STMTS which are a group
4147 : of KIND. Return true if successful. SCALAR_STMTS is owned by this
4148 : function, REMAIN and ROOT_STMT_INFOS ownership is transferred back to
4149 : the caller upon failure. */
4150 :
4151 : static bool
4152 1899245 : vect_build_slp_instance (vec_info *vinfo,
4153 : slp_instance_kind kind,
4154 : vec<stmt_vec_info> &scalar_stmts,
4155 : vec<stmt_vec_info> &root_stmt_infos,
4156 : vec<tree> &remain,
4157 : unsigned max_tree_size, unsigned *limit,
4158 : scalar_stmts_to_slp_tree_map_t *bst_map,
4159 : bool force_single_lane)
4160 : {
4161 : /* If there's no budget left bail out early. */
4162 1899245 : if (*limit == 0)
4163 : {
4164 27238 : scalar_stmts.release ();
4165 27238 : return false;
4166 : }
4167 :
4168 1872007 : if (kind == slp_inst_kind_ctor)
4169 : {
4170 12854 : if (dump_enabled_p ())
4171 86 : dump_printf_loc (MSG_NOTE, vect_location,
4172 : "Analyzing vectorizable constructor: %G\n",
4173 43 : root_stmt_infos[0]->stmt);
4174 : }
4175 1859153 : else if (kind == slp_inst_kind_gcond)
4176 : {
4177 277650 : if (dump_enabled_p ())
4178 5696 : dump_printf_loc (MSG_NOTE, vect_location,
4179 : "Analyzing vectorizable control flow: %G",
4180 2848 : root_stmt_infos[0]->stmt);
4181 : }
4182 :
4183 1872007 : if (dump_enabled_p ())
4184 : {
4185 25550 : dump_printf_loc (MSG_NOTE, vect_location,
4186 : "Starting SLP discovery for\n");
4187 54536 : for (unsigned i = 0; i < scalar_stmts.length (); ++i)
4188 57972 : dump_printf_loc (MSG_NOTE, vect_location,
4189 28986 : " %G", scalar_stmts[i]->stmt);
4190 : }
4191 :
4192 : /* Build the tree for the SLP instance. */
4193 1872007 : unsigned int group_size = scalar_stmts.length ();
4194 1872007 : bool *matches = XALLOCAVEC (bool, group_size);
4195 1872007 : poly_uint64 max_nunits = 1;
4196 1872007 : unsigned tree_size = 0;
4197 :
4198 1872007 : slp_tree node = NULL;
4199 1872007 : if (group_size > 1 && force_single_lane)
4200 : {
4201 0 : matches[0] = true;
4202 0 : matches[1] = false;
4203 : }
4204 : else
4205 1872007 : node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
4206 : &max_nunits, matches, limit,
4207 : &tree_size, bst_map);
4208 1872007 : if (node != NULL)
4209 : {
4210 : /* Calculate the unrolling factor based on the smallest type. */
4211 764838 : poly_uint64 unrolling_factor
4212 764838 : = calculate_unrolling_factor (max_nunits, group_size);
4213 :
4214 764838 : if (maybe_ne (unrolling_factor, 1U)
4215 764838 : && is_a <bb_vec_info> (vinfo))
4216 : {
4217 0 : unsigned HOST_WIDE_INT const_max_nunits;
4218 0 : if (!max_nunits.is_constant (&const_max_nunits)
4219 0 : || const_max_nunits > group_size)
4220 : {
4221 0 : if (dump_enabled_p ())
4222 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4223 : "Build SLP failed: store group "
4224 : "size not a multiple of the vector size "
4225 : "in basic block SLP\n");
4226 0 : vect_free_slp_tree (node);
4227 0 : return false;
4228 : }
4229 : /* Fatal mismatch. */
4230 0 : if (dump_enabled_p ())
4231 0 : dump_printf_loc (MSG_NOTE, vect_location,
4232 : "SLP discovery succeeded but node needs "
4233 : "splitting\n");
4234 0 : memset (matches, true, group_size);
4235 0 : matches[group_size / const_max_nunits * const_max_nunits] = false;
4236 0 : vect_free_slp_tree (node);
4237 : }
4238 : else
4239 : {
4240 : /* Create a new SLP instance. */
4241 764838 : slp_instance new_instance = XNEW (class _slp_instance);
4242 764838 : SLP_INSTANCE_TREE (new_instance) = node;
4243 764838 : SLP_INSTANCE_LOADS (new_instance) = vNULL;
4244 764838 : SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
4245 764838 : SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
4246 764838 : SLP_INSTANCE_KIND (new_instance) = kind;
4247 764838 : new_instance->reduc_phis = NULL;
4248 764838 : new_instance->cost_vec = vNULL;
4249 764838 : new_instance->subgraph_entries = vNULL;
4250 :
4251 764838 : if (dump_enabled_p ())
4252 22489 : dump_printf_loc (MSG_NOTE, vect_location,
4253 : "SLP size %u vs. limit %u.\n",
4254 : tree_size, max_tree_size);
4255 :
4256 764838 : vinfo->slp_instances.safe_push (new_instance);
4257 :
4258 : /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
4259 : the number of scalar stmts in the root in a few places.
4260 : Verify that assumption holds. */
4261 1529676 : gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
4262 : .length () == group_size);
4263 :
4264 764838 : if (dump_enabled_p ())
4265 : {
4266 22489 : if (kind == slp_inst_kind_reduc_group)
4267 1455 : dump_printf_loc (MSG_NOTE, vect_location,
4268 : "SLP discovery of size %d reduction group "
4269 : "succeeded\n", group_size);
4270 22489 : dump_printf_loc (MSG_NOTE, vect_location,
4271 : "Final SLP tree for instance %p:\n",
4272 : (void *) new_instance);
4273 22489 : vect_print_slp_graph (MSG_NOTE, vect_location,
4274 : SLP_INSTANCE_TREE (new_instance));
4275 : }
4276 :
4277 764838 : return true;
4278 : }
4279 : }
4280 : /* Failed to SLP. */
4281 :
4282 : /* While we arrive here even with slp_inst_kind_store we should only
4283 : do so for group_size == 1. The code to split store groups is only in
4284 : vect_analyze_slp_instance now. */
4285 1107169 : gcc_assert (kind != slp_inst_kind_store || group_size == 1);
4286 :
4287 : /* Free the allocated memory. */
4288 1107169 : scalar_stmts.release ();
4289 :
4290 : /* Failed to SLP. */
4291 1107169 : if (dump_enabled_p ())
4292 3061 : dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
4293 : return false;
4294 : }
4295 :
4296 : /* Analyze an SLP instance starting from a the start of a reduction chain.
4297 : Call vect_build_slp_tree to build a tree of packed stmts if possible.
4298 : Return FALSE if SLP build fails. */
4299 :
4300 : static bool
4301 63364 : vect_analyze_slp_reduc_chain (loop_vec_info vinfo,
4302 : scalar_stmts_to_slp_tree_map_t *bst_map,
4303 : stmt_vec_info scalar_stmt,
4304 : unsigned max_tree_size, unsigned *limit)
4305 : {
4306 63364 : vec<stmt_vec_info> scalar_stmts = vNULL;
4307 :
4308 63364 : bool fail = false;
4309 : /* ??? We could leave operation code checking to SLP discovery. */
4310 63364 : code_helper code = STMT_VINFO_REDUC_CODE (STMT_VINFO_REDUC_DEF
4311 : (vect_orig_stmt (scalar_stmt)));
4312 63364 : bool first = true;
4313 63364 : stmt_vec_info next_stmt = scalar_stmt;
4314 71551 : do
4315 : {
4316 71551 : stmt_vec_info stmt = next_stmt;
4317 71551 : gimple_match_op op;
4318 71551 : if (!gimple_extract_op (STMT_VINFO_STMT (stmt), &op))
4319 0 : gcc_unreachable ();
4320 143102 : tree reduc_def = gimple_arg (STMT_VINFO_STMT (stmt),
4321 71551 : STMT_VINFO_REDUC_IDX (stmt));
4322 71551 : next_stmt = vect_stmt_to_vectorize (vinfo->lookup_def (reduc_def));
4323 71551 : gcc_assert (is_a <gphi *> (STMT_VINFO_STMT (next_stmt))
4324 : || STMT_VINFO_REDUC_IDX (next_stmt) != -1);
4325 77095 : if (!gimple_extract_op (STMT_VINFO_STMT (vect_orig_stmt (stmt)), &op))
4326 0 : gcc_unreachable ();
4327 71551 : if (CONVERT_EXPR_CODE_P (op.code)
4328 3421 : && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0]))
4329 74960 : && (first
4330 1692 : || is_a <gphi *> (STMT_VINFO_STMT (next_stmt))))
4331 : ;
4332 68146 : else if (code != op.code)
4333 : {
4334 2559 : fail = true;
4335 2559 : break;
4336 : }
4337 : else
4338 65587 : scalar_stmts.safe_push (stmt);
4339 68992 : first = false;
4340 : }
4341 68992 : while (!is_a <gphi *> (STMT_VINFO_STMT (next_stmt)));
4342 63364 : if (fail)
4343 2559 : return false;
4344 :
4345 : /* Remember a stmt with the actual reduction operation. */
4346 60805 : stmt_vec_info reduc_scalar_stmt = scalar_stmts[0];
4347 :
4348 : /* When the SSA def chain through reduc-idx does not form a natural
4349 : reduction chain try to linearize an associative operation manually. */
4350 60805 : if (scalar_stmts.length () == 1
4351 58186 : && code.is_tree_code ()
4352 52128 : && associative_tree_code ((tree_code)code)
4353 : /* We may not associate if a fold-left reduction is required. */
4354 112068 : && !needs_fold_left_reduction_p (TREE_TYPE (gimple_get_lhs
4355 : (reduc_scalar_stmt->stmt)),
4356 : code))
4357 : {
4358 49134 : auto_vec<chain_op_t> chain;
4359 49134 : auto_vec<std::pair<tree_code, gimple *> > worklist;
4360 49134 : gimple *op_stmt = NULL, *other_op_stmt = NULL;
4361 49134 : if (is_a <gassign *> (scalar_stmts[0]->stmt)
4362 : /* We cannot linearize an operation that vect_slp_linearize_chain
4363 : would not put on its worklist. */
4364 49134 : && gimple_assign_rhs_code (scalar_stmts[0]->stmt) == (tree_code)code)
4365 : {
4366 48487 : vect_slp_linearize_chain (vinfo, worklist, chain, (tree_code)code,
4367 48487 : scalar_stmts[0]->stmt, op_stmt,
4368 : other_op_stmt,
4369 : NULL);
4370 :
4371 48487 : scalar_stmts.truncate (0);
4372 48487 : stmt_vec_info tail = NULL;
4373 242680 : for (auto el : chain)
4374 : {
4375 97530 : if (el.dt == vect_external_def
4376 97530 : || el.dt == vect_constant_def
4377 97530 : || el.code != (tree_code) code)
4378 : {
4379 311 : scalar_stmts.release ();
4380 311 : return false;
4381 : }
4382 97219 : stmt_vec_info stmt = vinfo->lookup_def (el.op);
4383 97219 : if (STMT_VINFO_REDUC_IDX (stmt) != -1
4384 95675 : || STMT_VINFO_REDUC_DEF (stmt))
4385 : {
4386 48374 : gcc_assert (tail == NULL);
4387 48374 : tail = stmt;
4388 48374 : continue;
4389 : }
4390 48845 : scalar_stmts.safe_push (stmt);
4391 : }
4392 48176 : gcc_assert (tail);
4393 : }
4394 :
4395 : /* When this linearization didn't produce a chain see if stripping
4396 : a wrapping sign conversion produces one. */
4397 48823 : if (scalar_stmts.length () == 1
4398 48823 : && (code == PLUS_EXPR || code == MULT_EXPR || code == BIT_IOR_EXPR
4399 : || code == BIT_AND_EXPR || code == BIT_XOR_EXPR))
4400 : {
4401 47093 : gimple *stmt = scalar_stmts[0]->stmt;
4402 47093 : if (!is_gimple_assign (stmt)
4403 46034 : || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt))
4404 4498 : || TREE_CODE (gimple_assign_rhs1 (stmt)) != SSA_NAME
4405 51591 : || !tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
4406 4498 : TREE_TYPE (gimple_assign_rhs1 (stmt))))
4407 : {
4408 45341 : scalar_stmts.release ();
4409 45341 : return false;
4410 : }
4411 1752 : stmt = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (stmt));
4412 1752 : if (!is_gimple_assign (stmt)
4413 1752 : || gimple_assign_rhs_code (stmt) != (tree_code)code)
4414 : {
4415 1733 : scalar_stmts.release ();
4416 1733 : return false;
4417 : }
4418 19 : chain.truncate (0);
4419 19 : vect_slp_linearize_chain (vinfo, worklist, chain, (tree_code)code,
4420 : stmt, op_stmt, other_op_stmt, NULL);
4421 :
4422 19 : scalar_stmts.truncate (0);
4423 19 : stmt_vec_info tail = NULL;
4424 93 : for (auto el : chain)
4425 : {
4426 44 : if (el.dt == vect_external_def
4427 44 : || el.dt == vect_constant_def
4428 44 : || el.code != (tree_code) code)
4429 : {
4430 8 : scalar_stmts.release ();
4431 8 : return false;
4432 : }
4433 36 : stmt_vec_info stmt = vinfo->lookup_def (el.op);
4434 36 : if (STMT_VINFO_REDUC_IDX (stmt) != -1
4435 36 : || STMT_VINFO_REDUC_DEF (stmt))
4436 : {
4437 0 : gcc_assert (tail == NULL);
4438 0 : tail = stmt;
4439 0 : continue;
4440 : }
4441 36 : scalar_stmts.safe_push (stmt);
4442 : }
4443 : /* Unlike the above this does not include the reduction SSA
4444 : cycle. */
4445 11 : gcc_assert (!tail);
4446 : }
4447 :
4448 1741 : if (scalar_stmts.length () < 2)
4449 : {
4450 1622 : scalar_stmts.release ();
4451 1622 : return false;
4452 : }
4453 :
4454 119 : if (dump_enabled_p ())
4455 : {
4456 34 : dump_printf_loc (MSG_NOTE, vect_location,
4457 : "Starting SLP discovery of reduction chain for\n");
4458 140 : for (unsigned i = 0; i < scalar_stmts.length (); ++i)
4459 212 : dump_printf_loc (MSG_NOTE, vect_location,
4460 106 : " %G", scalar_stmts[i]->stmt);
4461 : }
4462 :
4463 119 : unsigned int group_size = scalar_stmts.length ();
4464 119 : bool *matches = XALLOCAVEC (bool, group_size);
4465 119 : poly_uint64 max_nunits = 1;
4466 119 : unsigned tree_size = 0;
4467 119 : slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
4468 : &max_nunits, matches, limit,
4469 119 : &tree_size, bst_map);
4470 119 : if (!node)
4471 : {
4472 47 : scalar_stmts.release ();
4473 47 : return false;
4474 : }
4475 :
4476 72 : unsigned cycle_id = vinfo->reduc_infos.length ();
4477 72 : vect_reduc_info reduc_info = new vect_reduc_info_s ();
4478 72 : vinfo->reduc_infos.safe_push (reduc_info);
4479 72 : VECT_REDUC_INFO_DEF_TYPE (reduc_info) = STMT_VINFO_DEF_TYPE (next_stmt);
4480 72 : VECT_REDUC_INFO_TYPE (reduc_info) = STMT_VINFO_REDUC_TYPE (next_stmt);
4481 72 : VECT_REDUC_INFO_CODE (reduc_info) = STMT_VINFO_REDUC_CODE (next_stmt);
4482 72 : VECT_REDUC_INFO_FN (reduc_info) = IFN_LAST;
4483 72 : reduc_info->is_reduc_chain = true;
4484 :
4485 : /* Build the node for the PHI and possibly the conversions. */
4486 72 : slp_tree phis = vect_create_new_slp_node (2, ERROR_MARK);
4487 72 : SLP_TREE_REPRESENTATIVE (phis) = next_stmt;
4488 72 : phis->cycle_info.id = cycle_id;
4489 72 : SLP_TREE_LANES (phis) = group_size;
4490 72 : if (reduc_scalar_stmt == scalar_stmt)
4491 68 : SLP_TREE_VECTYPE (phis) = SLP_TREE_VECTYPE (node);
4492 : else
4493 4 : SLP_TREE_VECTYPE (phis)
4494 4 : = signed_or_unsigned_type_for (TYPE_UNSIGNED
4495 : (TREE_TYPE (gimple_get_lhs
4496 : (scalar_stmt->stmt))),
4497 : SLP_TREE_VECTYPE (node));
4498 : /* ??? vect_cse_slp_nodes cannot cope with cycles without any
4499 : SLP_TREE_SCALAR_STMTS. */
4500 72 : SLP_TREE_SCALAR_STMTS (phis).create (group_size);
4501 375 : for (unsigned i = 0; i < group_size; ++i)
4502 303 : SLP_TREE_SCALAR_STMTS (phis).quick_push (next_stmt);
4503 :
4504 72 : slp_tree op_input = phis;
4505 72 : if (reduc_scalar_stmt != scalar_stmt)
4506 : {
4507 4 : slp_tree conv = vect_create_new_slp_node (1, ERROR_MARK);
4508 4 : SLP_TREE_REPRESENTATIVE (conv)
4509 4 : = vinfo->lookup_def (gimple_arg (reduc_scalar_stmt->stmt,
4510 4 : STMT_VINFO_REDUC_IDX
4511 : (reduc_scalar_stmt)));
4512 4 : SLP_TREE_CHILDREN (conv).quick_push (phis);
4513 4 : conv->cycle_info.id = cycle_id;
4514 4 : SLP_TREE_REDUC_IDX (conv) = 0;
4515 4 : SLP_TREE_LANES (conv) = group_size;
4516 4 : SLP_TREE_VECTYPE (conv) = SLP_TREE_VECTYPE (node);
4517 4 : SLP_TREE_SCALAR_STMTS (conv) = vNULL;
4518 4 : op_input = conv;
4519 : }
4520 :
4521 72 : slp_tree reduc = vect_create_new_slp_node (2, ERROR_MARK);
4522 72 : SLP_TREE_REPRESENTATIVE (reduc) = reduc_scalar_stmt;
4523 72 : SLP_TREE_CHILDREN (reduc).quick_push (op_input);
4524 72 : SLP_TREE_CHILDREN (reduc).quick_push (node);
4525 72 : reduc->cycle_info.id = cycle_id;
4526 72 : SLP_TREE_REDUC_IDX (reduc) = 0;
4527 72 : SLP_TREE_LANES (reduc) = group_size;
4528 72 : SLP_TREE_VECTYPE (reduc) = SLP_TREE_VECTYPE (node);
4529 : /* ??? For the reduction epilogue we need a live lane. */
4530 72 : SLP_TREE_SCALAR_STMTS (reduc).create (group_size);
4531 72 : SLP_TREE_SCALAR_STMTS (reduc).quick_push (reduc_scalar_stmt);
4532 303 : for (unsigned i = 1; i < group_size; ++i)
4533 231 : SLP_TREE_SCALAR_STMTS (reduc).quick_push (NULL);
4534 :
4535 72 : if (reduc_scalar_stmt != scalar_stmt)
4536 : {
4537 4 : slp_tree conv = vect_create_new_slp_node (1, ERROR_MARK);
4538 4 : SLP_TREE_REPRESENTATIVE (conv) = scalar_stmt;
4539 4 : SLP_TREE_CHILDREN (conv).quick_push (reduc);
4540 4 : conv->cycle_info.id = cycle_id;
4541 4 : SLP_TREE_REDUC_IDX (conv) = 0;
4542 4 : SLP_TREE_LANES (conv) = group_size;
4543 4 : SLP_TREE_VECTYPE (conv) = SLP_TREE_VECTYPE (phis);
4544 : /* ??? For the reduction epilogue we need a live lane. */
4545 4 : SLP_TREE_SCALAR_STMTS (conv).create (group_size);
4546 4 : SLP_TREE_SCALAR_STMTS (conv).quick_push (scalar_stmt);
4547 8 : for (unsigned i = 1; i < group_size; ++i)
4548 4 : SLP_TREE_SCALAR_STMTS (conv).quick_push (NULL);
4549 4 : reduc = conv;
4550 : }
4551 :
4552 72 : edge le = loop_latch_edge (LOOP_VINFO_LOOP (vinfo));
4553 72 : SLP_TREE_CHILDREN (phis).quick_push (NULL);
4554 72 : SLP_TREE_CHILDREN (phis).quick_push (NULL);
4555 72 : SLP_TREE_CHILDREN (phis)[le->dest_idx] = reduc;
4556 72 : SLP_TREE_REF_COUNT (reduc)++;
4557 :
4558 : /* Create a new SLP instance. */
4559 72 : slp_instance new_instance = XNEW (class _slp_instance);
4560 72 : SLP_INSTANCE_TREE (new_instance) = reduc;
4561 72 : SLP_INSTANCE_LOADS (new_instance) = vNULL;
4562 72 : SLP_INSTANCE_ROOT_STMTS (new_instance) = vNULL;
4563 72 : SLP_INSTANCE_REMAIN_DEFS (new_instance) = vNULL;
4564 72 : SLP_INSTANCE_KIND (new_instance) = slp_inst_kind_reduc_chain;
4565 72 : new_instance->reduc_phis = NULL;
4566 72 : new_instance->cost_vec = vNULL;
4567 72 : new_instance->subgraph_entries = vNULL;
4568 :
4569 72 : vinfo->slp_instances.safe_push (new_instance);
4570 :
4571 72 : if (dump_enabled_p ())
4572 : {
4573 24 : dump_printf_loc (MSG_NOTE, vect_location,
4574 : "Final SLP tree for instance %p:\n",
4575 : (void *) new_instance);
4576 24 : vect_print_slp_graph (MSG_NOTE, vect_location,
4577 : SLP_INSTANCE_TREE (new_instance));
4578 : }
4579 :
4580 72 : return true;
4581 49134 : }
4582 :
4583 11671 : if (scalar_stmts.length () <= 1)
4584 : {
4585 9052 : scalar_stmts.release ();
4586 9052 : return false;
4587 : }
4588 :
4589 2619 : scalar_stmts.reverse ();
4590 2619 : stmt_vec_info reduc_phi_info = next_stmt;
4591 :
4592 : /* Build the tree for the SLP instance. */
4593 2619 : vec<stmt_vec_info> root_stmt_infos = vNULL;
4594 2619 : vec<tree> remain = vNULL;
4595 :
4596 2619 : if (dump_enabled_p ())
4597 : {
4598 180 : dump_printf_loc (MSG_NOTE, vect_location,
4599 : "Starting SLP discovery of reduction chain for\n");
4600 966 : for (unsigned i = 0; i < scalar_stmts.length (); ++i)
4601 1572 : dump_printf_loc (MSG_NOTE, vect_location,
4602 786 : " %G", scalar_stmts[i]->stmt);
4603 : }
4604 :
4605 : /* Build the tree for the SLP instance. */
4606 2619 : unsigned int group_size = scalar_stmts.length ();
4607 2619 : bool *matches = XALLOCAVEC (bool, group_size);
4608 2619 : poly_uint64 max_nunits = 1;
4609 2619 : unsigned tree_size = 0;
4610 :
4611 : /* ??? We need this only for SLP discovery. */
4612 10014 : for (unsigned i = 0; i < scalar_stmts.length (); ++i)
4613 7395 : REDUC_GROUP_FIRST_ELEMENT (scalar_stmts[i]) = scalar_stmts[0];
4614 :
4615 2619 : slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
4616 : &max_nunits, matches, limit,
4617 2619 : &tree_size, bst_map);
4618 :
4619 10014 : for (unsigned i = 0; i < scalar_stmts.length (); ++i)
4620 7395 : REDUC_GROUP_FIRST_ELEMENT (scalar_stmts[i]) = NULL;
4621 :
4622 2619 : if (node != NULL)
4623 : {
4624 : /* Create a new SLP instance. */
4625 2286 : slp_instance new_instance = XNEW (class _slp_instance);
4626 2286 : SLP_INSTANCE_TREE (new_instance) = node;
4627 2286 : SLP_INSTANCE_LOADS (new_instance) = vNULL;
4628 2286 : SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
4629 2286 : SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
4630 2286 : SLP_INSTANCE_KIND (new_instance) = slp_inst_kind_reduc_chain;
4631 2286 : new_instance->reduc_phis = NULL;
4632 2286 : new_instance->cost_vec = vNULL;
4633 2286 : new_instance->subgraph_entries = vNULL;
4634 :
4635 2286 : vect_reduc_info reduc_info = info_for_reduction (vinfo, node);
4636 2286 : reduc_info->is_reduc_chain = true;
4637 :
4638 2286 : if (dump_enabled_p ())
4639 135 : dump_printf_loc (MSG_NOTE, vect_location,
4640 : "SLP size %u vs. limit %u.\n",
4641 : tree_size, max_tree_size);
4642 :
4643 : /* Fixup SLP reduction chains. If this is a reduction chain with
4644 : a conversion in front amend the SLP tree with a node for that. */
4645 2286 : gimple *scalar_def = STMT_VINFO_REDUC_DEF (reduc_phi_info)->stmt;
4646 2286 : if (is_gimple_assign (scalar_def)
4647 2286 : && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (scalar_def)))
4648 : {
4649 43 : stmt_vec_info conv_info = vect_stmt_to_vectorize
4650 43 : (STMT_VINFO_REDUC_DEF (reduc_phi_info));
4651 43 : scalar_stmts = vNULL;
4652 43 : scalar_stmts.create (group_size);
4653 135 : for (unsigned i = 0; i < group_size; ++i)
4654 92 : scalar_stmts.quick_push (conv_info);
4655 43 : slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
4656 43 : SLP_TREE_VECTYPE (conv)
4657 43 : = get_vectype_for_scalar_type (vinfo,
4658 43 : TREE_TYPE
4659 : (gimple_assign_lhs (scalar_def)),
4660 : group_size);
4661 43 : SLP_TREE_REDUC_IDX (conv) = 0;
4662 43 : conv->cycle_info.id = node->cycle_info.id;
4663 43 : SLP_TREE_CHILDREN (conv).quick_push (node);
4664 43 : SLP_INSTANCE_TREE (new_instance) = conv;
4665 : }
4666 : /* Fill the backedge child of the PHI SLP node. The
4667 : general matching code cannot find it because the
4668 : scalar code does not reflect how we vectorize the
4669 : reduction. */
4670 2286 : use_operand_p use_p;
4671 2286 : imm_use_iterator imm_iter;
4672 2286 : class loop *loop = LOOP_VINFO_LOOP (vinfo);
4673 11023 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
4674 : gimple_get_lhs (scalar_def))
4675 : /* There are exactly two non-debug uses, the reduction
4676 : PHI and the loop-closed PHI node. */
4677 6451 : if (!is_gimple_debug (USE_STMT (use_p))
4678 6451 : && gimple_bb (USE_STMT (use_p)) == loop->header)
4679 : {
4680 2286 : auto_vec<stmt_vec_info, 64> phis (group_size);
4681 2286 : stmt_vec_info phi_info = vinfo->lookup_stmt (USE_STMT (use_p));
4682 8842 : for (unsigned i = 0; i < group_size; ++i)
4683 6556 : phis.quick_push (phi_info);
4684 2286 : slp_tree *phi_node = bst_map->get (phis);
4685 2286 : unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
4686 4572 : SLP_TREE_CHILDREN (*phi_node)[dest_idx]
4687 2286 : = SLP_INSTANCE_TREE (new_instance);
4688 2286 : SLP_INSTANCE_TREE (new_instance)->refcnt++;
4689 2286 : }
4690 :
4691 2286 : vinfo->slp_instances.safe_push (new_instance);
4692 :
4693 : /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
4694 : the number of scalar stmts in the root in a few places.
4695 : Verify that assumption holds. */
4696 4572 : gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
4697 : .length () == group_size);
4698 :
4699 2286 : if (dump_enabled_p ())
4700 : {
4701 135 : dump_printf_loc (MSG_NOTE, vect_location,
4702 : "Final SLP tree for instance %p:\n",
4703 : (void *) new_instance);
4704 135 : vect_print_slp_graph (MSG_NOTE, vect_location,
4705 : SLP_INSTANCE_TREE (new_instance));
4706 : }
4707 :
4708 2286 : return true;
4709 : }
4710 :
4711 : /* Failed to SLP. */
4712 333 : scalar_stmts.release ();
4713 333 : if (dump_enabled_p ())
4714 45 : dump_printf_loc (MSG_NOTE, vect_location,
4715 : "SLP discovery of reduction chain failed\n");
4716 : return false;
4717 : }
4718 :
4719 : /* Analyze an SLP instance starting from SCALAR_STMTS which are a group
4720 : of KIND. Return true if successful. */
4721 :
4722 : static bool
4723 89261 : vect_analyze_slp_reduction (loop_vec_info vinfo,
4724 : stmt_vec_info scalar_stmt,
4725 : unsigned max_tree_size, unsigned *limit,
4726 : scalar_stmts_to_slp_tree_map_t *bst_map,
4727 : bool force_single_lane)
4728 : {
4729 89261 : slp_instance_kind kind = slp_inst_kind_reduc_group;
4730 :
4731 : /* Try to gather a reduction chain. Only attempt if there's budget left
4732 : since chain analysis may build multi-lane trees that consume limit. */
4733 89261 : if (! force_single_lane
4734 63634 : && *limit != 0
4735 63634 : && STMT_VINFO_DEF_TYPE (scalar_stmt) == vect_reduction_def
4736 152625 : && vect_analyze_slp_reduc_chain (vinfo, bst_map, scalar_stmt,
4737 : max_tree_size, limit))
4738 : return true;
4739 :
4740 86903 : vec<stmt_vec_info> scalar_stmts;
4741 86903 : scalar_stmts.create (1);
4742 86903 : scalar_stmts.quick_push (scalar_stmt);
4743 :
4744 86903 : if (dump_enabled_p ())
4745 : {
4746 3483 : dump_printf_loc (MSG_NOTE, vect_location,
4747 : "Starting SLP discovery for\n");
4748 6966 : for (unsigned i = 0; i < scalar_stmts.length (); ++i)
4749 6966 : dump_printf_loc (MSG_NOTE, vect_location,
4750 3483 : " %G", scalar_stmts[i]->stmt);
4751 : }
4752 :
4753 : /* Build the tree for the SLP instance. */
4754 86903 : unsigned int group_size = scalar_stmts.length ();
4755 86903 : bool *matches = XALLOCAVEC (bool, group_size);
4756 86903 : poly_uint64 max_nunits = 1;
4757 86903 : unsigned tree_size = 0;
4758 :
4759 86903 : slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
4760 : &max_nunits, matches, limit,
4761 : &tree_size, bst_map);
4762 86903 : if (node != NULL)
4763 : {
4764 : /* Create a new SLP instance. */
4765 83895 : slp_instance new_instance = XNEW (class _slp_instance);
4766 83895 : SLP_INSTANCE_TREE (new_instance) = node;
4767 83895 : SLP_INSTANCE_LOADS (new_instance) = vNULL;
4768 83895 : SLP_INSTANCE_ROOT_STMTS (new_instance) = vNULL;
4769 83895 : SLP_INSTANCE_REMAIN_DEFS (new_instance) = vNULL;
4770 83895 : SLP_INSTANCE_KIND (new_instance) = kind;
4771 83895 : new_instance->reduc_phis = NULL;
4772 83895 : new_instance->cost_vec = vNULL;
4773 83895 : new_instance->subgraph_entries = vNULL;
4774 :
4775 83895 : if (dump_enabled_p ())
4776 3363 : dump_printf_loc (MSG_NOTE, vect_location,
4777 : "SLP size %u vs. limit %u.\n",
4778 : tree_size, max_tree_size);
4779 :
4780 83895 : vinfo->slp_instances.safe_push (new_instance);
4781 :
4782 : /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
4783 : the number of scalar stmts in the root in a few places.
4784 : Verify that assumption holds. */
4785 167790 : gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
4786 : .length () == group_size);
4787 :
4788 83895 : if (dump_enabled_p ())
4789 : {
4790 3363 : dump_printf_loc (MSG_NOTE, vect_location,
4791 : "Final SLP tree for instance %p:\n",
4792 : (void *) new_instance);
4793 3363 : vect_print_slp_graph (MSG_NOTE, vect_location,
4794 : SLP_INSTANCE_TREE (new_instance));
4795 : }
4796 :
4797 83895 : return true;
4798 : }
4799 : /* Failed to SLP. */
4800 :
4801 : /* Free the allocated memory. */
4802 3008 : scalar_stmts.release ();
4803 :
4804 : /* Failed to SLP. */
4805 3008 : if (dump_enabled_p ())
4806 120 : dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
4807 : return false;
4808 : }
4809 :
4810 : /* Analyze a single SLP reduction group. If successful add a SLP instance
4811 : for it and return true, otherwise return false and have *MATCHES
4812 : populated. */
4813 :
4814 : static bool
4815 26957 : vect_analyze_slp_reduction_group (loop_vec_info loop_vinfo,
4816 : vec<stmt_vec_info> scalar_stmts,
4817 : scalar_stmts_to_slp_tree_map_t *bst_map,
4818 : unsigned max_tree_size, unsigned *limit,
4819 : bool *matches)
4820 : {
4821 : /* Try to form a reduction group. */
4822 26957 : unsigned int group_size = scalar_stmts.length ();
4823 26957 : if (!matches)
4824 11205 : matches = XALLOCAVEC (bool, group_size);
4825 26957 : poly_uint64 max_nunits = 1;
4826 26957 : unsigned tree_size = 0;
4827 26957 : slp_tree node = vect_build_slp_tree (loop_vinfo, scalar_stmts,
4828 : group_size,
4829 : &max_nunits, matches, limit,
4830 : &tree_size, bst_map);
4831 26957 : if (!node)
4832 : return false;
4833 :
4834 : /* Create a new SLP instance. */
4835 12245 : slp_instance new_instance = XNEW (class _slp_instance);
4836 12245 : SLP_INSTANCE_TREE (new_instance) = node;
4837 12245 : SLP_INSTANCE_LOADS (new_instance) = vNULL;
4838 12245 : SLP_INSTANCE_ROOT_STMTS (new_instance) = vNULL;
4839 12245 : SLP_INSTANCE_REMAIN_DEFS (new_instance) = vNULL;
4840 12245 : SLP_INSTANCE_KIND (new_instance) = slp_inst_kind_reduc_group;
4841 12245 : new_instance->reduc_phis = NULL;
4842 12245 : new_instance->cost_vec = vNULL;
4843 12245 : new_instance->subgraph_entries = vNULL;
4844 :
4845 12245 : if (dump_enabled_p ())
4846 579 : dump_printf_loc (MSG_NOTE, vect_location,
4847 : "SLP size %u vs. limit %u.\n",
4848 : tree_size, max_tree_size);
4849 :
4850 12245 : loop_vinfo->slp_instances.safe_push (new_instance);
4851 :
4852 : /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
4853 : the number of scalar stmts in the root in a few places.
4854 : Verify that assumption holds. */
4855 24490 : gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
4856 : .length () == group_size);
4857 :
4858 12245 : if (dump_enabled_p ())
4859 : {
4860 579 : dump_printf_loc (MSG_NOTE, vect_location,
4861 : "SLP discovery of size %d reduction group "
4862 : "succeeded\n", group_size);
4863 579 : dump_printf_loc (MSG_NOTE, vect_location,
4864 : "Final SLP tree for instance %p:\n",
4865 : (void *) new_instance);
4866 579 : vect_print_slp_graph (MSG_NOTE, vect_location,
4867 : SLP_INSTANCE_TREE (new_instance));
4868 : }
4869 :
4870 : return true;
4871 : }
4872 :
4873 : /* Analyze reductions in LOOP_VINFO and populate SLP instances
4874 : accordingly. Returns false if something fails. */
4875 :
4876 : static bool
4877 491685 : vect_analyze_slp_reductions (loop_vec_info loop_vinfo,
4878 : unsigned max_tree_size, unsigned *limit,
4879 : scalar_stmts_to_slp_tree_map_t *bst_map,
4880 : bool force_single_lane)
4881 : {
4882 557348 : if (loop_vinfo->reductions.is_empty ())
4883 : return true;
4884 :
4885 : /* Collect reduction statements we can combine into
4886 : a SLP reduction. */
4887 73065 : vec<stmt_vec_info> scalar_stmts;
4888 73065 : scalar_stmts.create (loop_vinfo->reductions.length ());
4889 324286 : for (auto next_info : loop_vinfo->reductions)
4890 : {
4891 105091 : next_info = vect_stmt_to_vectorize (next_info);
4892 105091 : if ((STMT_VINFO_RELEVANT_P (next_info)
4893 14 : || STMT_VINFO_LIVE_P (next_info))
4894 : /* ??? Make sure we didn't skip a conversion around a
4895 : reduction path. In that case we'd have to reverse
4896 : engineer that conversion stmt following the chain using
4897 : reduc_idx and from the PHI using reduc_def. */
4898 105077 : && (STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def
4899 105077 : || (STMT_VINFO_DEF_TYPE (next_info)
4900 : == vect_double_reduction_def)))
4901 : {
4902 : /* Do not discover SLP reductions combining lane-reducing
4903 : ops, that will fail later. */
4904 105077 : if (!force_single_lane
4905 105077 : && !lane_reducing_stmt_p (STMT_VINFO_STMT (next_info)))
4906 78761 : scalar_stmts.quick_push (next_info);
4907 : /* Do SLP discovery for single-lane reductions. */
4908 26316 : else if (! vect_analyze_slp_reduction (loop_vinfo, next_info,
4909 : max_tree_size, limit,
4910 : bst_map,
4911 : force_single_lane))
4912 : {
4913 0 : scalar_stmts.release ();
4914 0 : return false;
4915 : }
4916 : }
4917 : }
4918 :
4919 73065 : if (scalar_stmts.length () > 1)
4920 : {
4921 : /* Try to form a reduction group. */
4922 4572 : unsigned int group_size = scalar_stmts.length ();
4923 4572 : bool *matches = XALLOCAVEC (bool, group_size);
4924 4572 : if (vect_analyze_slp_reduction_group (loop_vinfo, scalar_stmts, bst_map,
4925 : max_tree_size, limit, matches))
4926 4419 : return true;
4927 :
4928 : /* When analysis as a single SLP reduction group failed try to
4929 : form sub-groups by collecting matching lanes. Do not recurse
4930 : that on failure (to limit compile-time costs), but recurse
4931 : for the initial non-matching parts. Everything not covered
4932 : by a sub-group gets single-reduction treatment. */
4933 3496 : vec<stmt_vec_info> cands = vNULL;
4934 11358 : while (matches[0])
4935 : {
4936 11205 : cands.truncate (0);
4937 11205 : cands.reserve (group_size, true);
4938 88267 : for (unsigned i = 0; i < group_size; ++i)
4939 77062 : if (matches[i])
4940 19538 : cands.quick_push (scalar_stmts[i]);
4941 :
4942 : /* Try to form a reduction group. */
4943 11205 : if (vect_analyze_slp_reduction_group (loop_vinfo, cands, bst_map,
4944 : max_tree_size, limit, NULL))
4945 7851 : cands = vNULL;
4946 : else
4947 : {
4948 : /* Do SLP discovery for single-lane reductions. */
4949 20489 : for (auto stmt_info : cands)
4950 10452 : if (! vect_analyze_slp_reduction (loop_vinfo,
4951 : vect_stmt_to_vectorize
4952 : (stmt_info),
4953 : max_tree_size, limit,
4954 : bst_map, force_single_lane))
4955 : {
4956 25 : scalar_stmts.release ();
4957 25 : cands.release ();
4958 25 : return false;
4959 : }
4960 : }
4961 : /* Remove the handled stmts from scalar_stmts and try again,
4962 : possibly repeating the above with updated matches[]. */
4963 : unsigned j = 0;
4964 88172 : for (unsigned i = 0; i < group_size; ++i)
4965 76992 : if (!matches[i])
4966 : {
4967 57494 : scalar_stmts[j] = scalar_stmts[i];
4968 57494 : ++j;
4969 : }
4970 11180 : scalar_stmts.truncate (j);
4971 11180 : group_size = scalar_stmts.length ();
4972 11180 : if (vect_analyze_slp_reduction_group (loop_vinfo, scalar_stmts,
4973 : bst_map, max_tree_size, limit,
4974 : matches))
4975 : return true;
4976 : }
4977 : }
4978 : /* Do SLP discovery for single-lane reductions. */
4979 255448 : for (auto stmt_info : scalar_stmts)
4980 52493 : if (! vect_analyze_slp_reduction (loop_vinfo,
4981 : vect_stmt_to_vectorize (stmt_info),
4982 : max_tree_size, limit,
4983 : bst_map, force_single_lane))
4984 : {
4985 2983 : scalar_stmts.release ();
4986 2983 : return false;
4987 : }
4988 :
4989 65663 : scalar_stmts.release ();
4990 65663 : return true;
4991 : }
4992 :
4993 : /* Analyze an SLP instance starting from a group of grouped stores. Call
4994 : vect_build_slp_tree to build a tree of packed stmts if possible.
4995 : Return FALSE if it's impossible to SLP any stmt in the group. */
4996 :
4997 : static bool
4998 1100259 : vect_analyze_slp_instance (vec_info *vinfo,
4999 : scalar_stmts_to_slp_tree_map_t *bst_map,
5000 : stmt_vec_info stmt_info,
5001 : slp_instance_kind kind,
5002 : unsigned max_tree_size, unsigned *limit,
5003 : bool force_single_lane)
5004 : {
5005 1100259 : vec<stmt_vec_info> scalar_stmts;
5006 :
5007 1100259 : if (is_a <bb_vec_info> (vinfo))
5008 1070962 : vect_location = stmt_info->stmt;
5009 :
5010 1100259 : gcc_assert (kind == slp_inst_kind_store);
5011 :
5012 : /* Collect the stores and store them in scalar_stmts. */
5013 1100259 : scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
5014 1100259 : stmt_vec_info next_info = stmt_info;
5015 5469793 : while (next_info)
5016 : {
5017 3269275 : scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
5018 3269275 : next_info = DR_GROUP_NEXT_ELEMENT (next_info);
5019 : }
5020 :
5021 1100259 : vec<stmt_vec_info> root_stmt_infos = vNULL;
5022 1100259 : vec<tree> remain = vNULL;
5023 :
5024 : /* Build the tree for the SLP instance. */
5025 :
5026 : /* If there's no budget left bail out early. */
5027 1100259 : if (*limit == 0)
5028 : return false;
5029 :
5030 1100236 : if (dump_enabled_p ())
5031 : {
5032 4131 : dump_printf_loc (MSG_NOTE, vect_location,
5033 : "Starting SLP discovery for\n");
5034 23829 : for (unsigned i = 0; i < scalar_stmts.length (); ++i)
5035 39396 : dump_printf_loc (MSG_NOTE, vect_location,
5036 19698 : " %G", scalar_stmts[i]->stmt);
5037 : }
5038 :
5039 : /* Build the tree for the SLP instance. */
5040 1100236 : unsigned int group_size = scalar_stmts.length ();
5041 1100236 : bool *matches = XALLOCAVEC (bool, group_size);
5042 1100236 : poly_uint64 max_nunits = 1;
5043 1100236 : unsigned tree_size = 0;
5044 1100236 : unsigned i;
5045 :
5046 1100236 : slp_tree node = NULL;
5047 1100236 : if (group_size > 1 && force_single_lane)
5048 : {
5049 1689 : matches[0] = true;
5050 1689 : matches[1] = false;
5051 : }
5052 : else
5053 1098547 : node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
5054 : &max_nunits, matches, limit,
5055 : &tree_size, bst_map);
5056 1100236 : if (node != NULL)
5057 : {
5058 : /* Calculate the unrolling factor based on the smallest type. */
5059 684528 : poly_uint64 unrolling_factor
5060 684528 : = calculate_unrolling_factor (max_nunits, group_size);
5061 :
5062 684528 : if (maybe_ne (unrolling_factor, 1U)
5063 684528 : && is_a <bb_vec_info> (vinfo))
5064 : {
5065 0 : unsigned HOST_WIDE_INT const_max_nunits;
5066 0 : if (!max_nunits.is_constant (&const_max_nunits)
5067 0 : || const_max_nunits > group_size)
5068 : {
5069 0 : if (dump_enabled_p ())
5070 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5071 : "Build SLP failed: store group "
5072 : "size not a multiple of the vector size "
5073 : "in basic block SLP\n");
5074 0 : vect_free_slp_tree (node);
5075 0 : return false;
5076 : }
5077 : /* Fatal mismatch. */
5078 0 : if (dump_enabled_p ())
5079 0 : dump_printf_loc (MSG_NOTE, vect_location,
5080 : "SLP discovery succeeded but node needs "
5081 : "splitting\n");
5082 0 : memset (matches, true, group_size);
5083 0 : matches[group_size / const_max_nunits * const_max_nunits] = false;
5084 0 : vect_free_slp_tree (node);
5085 : }
5086 : else
5087 : {
5088 : /* Create a new SLP instance. */
5089 684528 : slp_instance new_instance = XNEW (class _slp_instance);
5090 684528 : SLP_INSTANCE_TREE (new_instance) = node;
5091 684528 : SLP_INSTANCE_LOADS (new_instance) = vNULL;
5092 684528 : SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
5093 684528 : SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
5094 684528 : SLP_INSTANCE_KIND (new_instance) = kind;
5095 684528 : new_instance->reduc_phis = NULL;
5096 684528 : new_instance->cost_vec = vNULL;
5097 684528 : new_instance->subgraph_entries = vNULL;
5098 :
5099 684528 : if (dump_enabled_p ())
5100 3147 : dump_printf_loc (MSG_NOTE, vect_location,
5101 : "SLP size %u vs. limit %u.\n",
5102 : tree_size, max_tree_size);
5103 :
5104 684528 : vinfo->slp_instances.safe_push (new_instance);
5105 :
5106 : /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
5107 : the number of scalar stmts in the root in a few places.
5108 : Verify that assumption holds. */
5109 1369056 : gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
5110 : .length () == group_size);
5111 :
5112 684528 : if (dump_enabled_p ())
5113 : {
5114 3147 : dump_printf_loc (MSG_NOTE, vect_location,
5115 : "Final SLP tree for instance %p:\n",
5116 : (void *) new_instance);
5117 3147 : vect_print_slp_graph (MSG_NOTE, vect_location,
5118 : SLP_INSTANCE_TREE (new_instance));
5119 : }
5120 :
5121 684528 : return true;
5122 : }
5123 : }
5124 : /* Failed to SLP. */
5125 :
5126 : /* Try to break the group up into pieces. */
5127 415708 : if (*limit > 0 && kind == slp_inst_kind_store)
5128 : {
5129 : /* ??? We could delay all the actual splitting of store-groups
5130 : until after SLP discovery of the original group completed.
5131 : Then we can recurse to vect_build_slp_instance directly. */
5132 1086898 : for (i = 0; i < group_size; i++)
5133 1086898 : if (!matches[i])
5134 : break;
5135 :
5136 : /* For basic block SLP, try to break the group up into multiples of
5137 : a vector size. */
5138 415707 : if (is_a <bb_vec_info> (vinfo)
5139 415707 : && (i > 1 && i < group_size))
5140 : {
5141 : /* Free the allocated memory. */
5142 155113 : scalar_stmts.release ();
5143 :
5144 155113 : tree scalar_type
5145 155113 : = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
5146 310226 : tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
5147 155113 : 1 << floor_log2 (i));
5148 155113 : unsigned HOST_WIDE_INT const_nunits;
5149 155113 : if (vectype
5150 155113 : && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
5151 : {
5152 : /* Split into two groups at the first vector boundary. */
5153 155113 : gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
5154 155113 : unsigned group1_size = i & ~(const_nunits - 1);
5155 :
5156 155113 : if (dump_enabled_p ())
5157 59 : dump_printf_loc (MSG_NOTE, vect_location,
5158 : "Splitting SLP group at stmt %u\n", i);
5159 155113 : stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
5160 : group1_size);
5161 155113 : bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
5162 : kind, max_tree_size,
5163 : limit, false);
5164 : /* Split the rest at the failure point and possibly
5165 : re-analyze the remaining matching part if it has
5166 : at least two lanes. */
5167 155113 : if (group1_size < i
5168 5363 : && (i + 1 < group_size
5169 2937 : || i - group1_size > 1))
5170 : {
5171 2458 : stmt_vec_info rest2 = rest;
5172 2458 : rest = vect_split_slp_store_group (rest, i - group1_size);
5173 2458 : if (i - group1_size > 1)
5174 61 : res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
5175 : kind, max_tree_size,
5176 : limit, false);
5177 : }
5178 : /* Re-analyze the non-matching tail if it has at least
5179 : two lanes. */
5180 155113 : if (i + 1 < group_size)
5181 22086 : res |= vect_analyze_slp_instance (vinfo, bst_map,
5182 : rest, kind, max_tree_size,
5183 : limit, false);
5184 155113 : return res;
5185 : }
5186 : }
5187 :
5188 : /* For loop vectorization split the RHS into arbitrary pieces of
5189 : size >= 1. */
5190 260594 : else if (is_a <loop_vec_info> (vinfo)
5191 260594 : && (group_size != 1 && i < group_size))
5192 : {
5193 8175 : gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
5194 28 : bool masked_p = call
5195 28 : && gimple_call_internal_p (call)
5196 28 : && internal_fn_mask_index (gimple_call_internal_fn (call)) != -1;
5197 : /* There are targets that cannot do even/odd interleaving schemes
5198 : so they absolutely need to use load/store-lanes. For now
5199 : force single-lane SLP for them - they would be happy with
5200 : uniform power-of-two lanes (but depending on element size),
5201 : but even if we can use 'i' as indicator we would need to
5202 : backtrack when later lanes fail to discover with the same
5203 : granularity. We cannot turn any of strided or scatter store
5204 : into store-lanes. */
5205 : /* ??? If this is not in sync with what get_load_store_type
5206 : later decides the SLP representation is not good for other
5207 : store vectorization methods. */
5208 8175 : bool want_store_lanes
5209 8175 : = (! STMT_VINFO_GATHER_SCATTER_P (stmt_info)
5210 8175 : && ! STMT_VINFO_STRIDED_P (stmt_info)
5211 6100 : && ! STMT_VINFO_SLP_VECT_ONLY (stmt_info)
5212 6096 : && compare_step_with_zero (vinfo, stmt_info) > 0
5213 14191 : && vect_slp_prefer_store_lanes_p (vinfo, stmt_info, NULL_TREE,
5214 16350 : masked_p, group_size, i));
5215 8175 : if (want_store_lanes || force_single_lane)
5216 : i = 1;
5217 :
5218 : /* A fatal discovery fail doesn't always mean single-lane SLP
5219 : isn't a possibility, so try. */
5220 6486 : if (i == 0)
5221 : i = 1;
5222 :
5223 8175 : if (dump_enabled_p ())
5224 883 : dump_printf_loc (MSG_NOTE, vect_location,
5225 : "Splitting SLP group at stmt %u\n", i);
5226 :
5227 : /* Analyze the stored values and pinch them together with
5228 : a permute node so we can preserve the whole store group. */
5229 8175 : auto_vec<slp_tree> rhs_nodes;
5230 8175 : poly_uint64 max_nunits = 1;
5231 :
5232 8175 : unsigned int rhs_common_nlanes = 0;
5233 8175 : unsigned int start = 0, end = i;
5234 36622 : while (start < group_size)
5235 : {
5236 28708 : gcc_assert (end - start >= 1);
5237 28708 : vec<stmt_vec_info> substmts;
5238 28708 : substmts.create (end - start);
5239 89549 : for (unsigned j = start; j < end; ++j)
5240 60841 : substmts.quick_push (scalar_stmts[j]);
5241 28708 : max_nunits = 1;
5242 28708 : node = vect_build_slp_tree (vinfo, substmts, end - start,
5243 : &max_nunits,
5244 : matches, limit, &tree_size, bst_map);
5245 28708 : if (node)
5246 : {
5247 22920 : rhs_nodes.safe_push (node);
5248 22920 : vect_update_max_nunits (&max_nunits, node->max_nunits);
5249 22920 : if (start == 0)
5250 7920 : rhs_common_nlanes = SLP_TREE_LANES (node);
5251 15000 : else if (rhs_common_nlanes != SLP_TREE_LANES (node))
5252 1375 : rhs_common_nlanes = 0;
5253 22920 : start = end;
5254 22920 : if (want_store_lanes || force_single_lane)
5255 5084 : end = start + 1;
5256 : else
5257 : end = group_size;
5258 : }
5259 : else
5260 : {
5261 5788 : substmts.release ();
5262 5788 : if (end - start == 1)
5263 : {
5264 : /* Single-lane discovery failed. Free resources. */
5265 281 : for (auto node : rhs_nodes)
5266 8 : vect_free_slp_tree (node);
5267 261 : scalar_stmts.release ();
5268 261 : if (dump_enabled_p ())
5269 39 : dump_printf_loc (MSG_NOTE, vect_location,
5270 : "SLP discovery failed\n");
5271 261 : return false;
5272 : }
5273 :
5274 : /* ??? It really happens that we soft-fail SLP
5275 : build at a mismatch but the matching part hard-fails
5276 : later. As we know we arrived here with a group
5277 : larger than one try a group of size one! */
5278 5527 : if (!matches[0])
5279 44 : end = start + 1;
5280 : else
5281 12065 : for (unsigned j = start; j < end; j++)
5282 12065 : if (!matches[j - start])
5283 : {
5284 : end = j;
5285 : break;
5286 : }
5287 : }
5288 : }
5289 :
5290 : /* Now re-assess whether we want store lanes in case the
5291 : discovery ended up producing all single-lane RHSs. */
5292 7914 : if (! want_store_lanes
5293 7914 : && rhs_common_nlanes == 1
5294 6857 : && ! STMT_VINFO_GATHER_SCATTER_P (stmt_info)
5295 6857 : && ! STMT_VINFO_STRIDED_P (stmt_info)
5296 5156 : && ! STMT_VINFO_SLP_VECT_ONLY (stmt_info)
5297 5153 : && compare_step_with_zero (vinfo, stmt_info) > 0
5298 13012 : && (vect_store_lanes_supported (SLP_TREE_VECTYPE (rhs_nodes[0]),
5299 : group_size, masked_p)
5300 : != IFN_LAST))
5301 : want_store_lanes = true;
5302 :
5303 : /* Now we assume we can build the root SLP node from all stores. */
5304 7914 : if (want_store_lanes)
5305 : {
5306 : /* For store-lanes feed the store node with all RHS nodes
5307 : in order. */
5308 0 : node = vect_create_new_slp_node (scalar_stmts,
5309 0 : SLP_TREE_CHILDREN
5310 : (rhs_nodes[0]).length ());
5311 0 : SLP_TREE_VECTYPE (node) = SLP_TREE_VECTYPE (rhs_nodes[0]);
5312 0 : node->max_nunits = max_nunits;
5313 0 : node->ldst_lanes = true;
5314 0 : SLP_TREE_CHILDREN (node)
5315 0 : .reserve_exact (SLP_TREE_CHILDREN (rhs_nodes[0]).length ()
5316 0 : + rhs_nodes.length () - 1);
5317 : /* First store value and possibly mask. */
5318 0 : SLP_TREE_CHILDREN (node)
5319 0 : .splice (SLP_TREE_CHILDREN (rhs_nodes[0]));
5320 : /* Rest of the store values. All mask nodes are the same,
5321 : this should be guaranteed by dataref group discovery. */
5322 0 : for (unsigned j = 1; j < rhs_nodes.length (); ++j)
5323 0 : SLP_TREE_CHILDREN (node)
5324 0 : .quick_push (SLP_TREE_CHILDREN (rhs_nodes[j])[0]);
5325 0 : for (slp_tree child : SLP_TREE_CHILDREN (node))
5326 0 : child->refcnt++;
5327 : }
5328 : else
5329 7914 : node = vect_build_slp_store_interleaving (rhs_nodes, scalar_stmts,
5330 : max_nunits);
5331 :
5332 30826 : while (!rhs_nodes.is_empty ())
5333 22912 : vect_free_slp_tree (rhs_nodes.pop ());
5334 :
5335 : /* Create a new SLP instance. */
5336 7914 : slp_instance new_instance = XNEW (class _slp_instance);
5337 7914 : SLP_INSTANCE_TREE (new_instance) = node;
5338 7914 : SLP_INSTANCE_LOADS (new_instance) = vNULL;
5339 7914 : SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
5340 7914 : SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
5341 7914 : SLP_INSTANCE_KIND (new_instance) = kind;
5342 7914 : new_instance->reduc_phis = NULL;
5343 7914 : new_instance->cost_vec = vNULL;
5344 7914 : new_instance->subgraph_entries = vNULL;
5345 :
5346 7914 : if (dump_enabled_p ())
5347 844 : dump_printf_loc (MSG_NOTE, vect_location,
5348 : "SLP size %u vs. limit %u.\n",
5349 : tree_size, max_tree_size);
5350 :
5351 7914 : vinfo->slp_instances.safe_push (new_instance);
5352 :
5353 : /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
5354 : the number of scalar stmts in the root in a few places.
5355 : Verify that assumption holds. */
5356 15828 : gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
5357 : .length () == group_size);
5358 :
5359 7914 : if (dump_enabled_p ())
5360 : {
5361 844 : dump_printf_loc (MSG_NOTE, vect_location,
5362 : "Final SLP tree for instance %p:\n",
5363 : (void *) new_instance);
5364 844 : vect_print_slp_graph (MSG_NOTE, vect_location,
5365 : SLP_INSTANCE_TREE (new_instance));
5366 : }
5367 7914 : return true;
5368 8175 : }
5369 : else
5370 : /* Free the allocated memory. */
5371 252419 : scalar_stmts.release ();
5372 :
5373 : /* Even though the first vector did not all match, we might be able to SLP
5374 : (some) of the remainder. FORNOW ignore this possibility. */
5375 : }
5376 : else
5377 : /* Free the allocated memory. */
5378 1 : scalar_stmts.release ();
5379 :
5380 : /* Failed to SLP. */
5381 252420 : if (dump_enabled_p ())
5382 42 : dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
5383 : return false;
5384 : }
5385 :
5386 : /* qsort comparator ordering SLP load nodes. A_ and B_ point to slp_tree elements. Loads of the same dataref group are ordered by decreasing number of lanes and then by load permutation, all others by the gimple UID of their first scalar stmt (the group leader for grouped accesses). */
5387 :
5388 : static int
5389 2642523 : vllp_cmp (const void *a_, const void *b_)
5390 : {
5391 2642523 : const slp_tree a = *(const slp_tree *)a_;
5392 2642523 : const slp_tree b = *(const slp_tree *)b_;
5393 2642523 : stmt_vec_info a0 = SLP_TREE_SCALAR_STMTS (a)[0];
5394 2642523 : stmt_vec_info b0 = SLP_TREE_SCALAR_STMTS (b)[0];
5395 2642523 : if (STMT_VINFO_GROUPED_ACCESS (a0)
5396 1537931 : && STMT_VINFO_GROUPED_ACCESS (b0)
5397 4118996 : && DR_GROUP_FIRST_ELEMENT (a0) == DR_GROUP_FIRST_ELEMENT (b0))
5398 : {
5399 : /* Same group, order by the number of lanes used, more lanes first. */
5400 343995 : if (SLP_TREE_LANES (a) < SLP_TREE_LANES (b))
5401 : return 1;
5402 335212 : else if (SLP_TREE_LANES (a) > SLP_TREE_LANES (b))
5403 : return -1;
5404 : else
5405 : {
5406 : /* Try to order loads using the same lanes together, breaking
5407 : the tie with the lane number that first differs. */
5408 325678 : if (!SLP_TREE_LOAD_PERMUTATION (a).exists ()
5409 325678 : && !SLP_TREE_LOAD_PERMUTATION (b).exists ())
5410 : return 0;
5411 325678 : else if (SLP_TREE_LOAD_PERMUTATION (a).exists ()
5412 325678 : && !SLP_TREE_LOAD_PERMUTATION (b).exists ())
5413 : return 1;
5414 321635 : else if (!SLP_TREE_LOAD_PERMUTATION (a).exists ()
5415 321635 : && SLP_TREE_LOAD_PERMUTATION (b).exists ())
5416 : return -1;
5417 : else
5418 : {
5419 314249 : for (unsigned i = 0; i < SLP_TREE_LANES (a); ++i)
5420 314249 : if (SLP_TREE_LOAD_PERMUTATION (a)[i]
5421 314249 : != SLP_TREE_LOAD_PERMUTATION (b)[i])
5422 : {
5423 : /* In-order lane first, that's what the above case for
5424 : no permutation does. */
5425 312937 : if (SLP_TREE_LOAD_PERMUTATION (a)[i] == i)
5426 : return -1;
5427 191758 : else if (SLP_TREE_LOAD_PERMUTATION (b)[i] == i)
5428 : return 1;
5429 100787 : else if (SLP_TREE_LOAD_PERMUTATION (a)[i]
5430 100787 : < SLP_TREE_LOAD_PERMUTATION (b)[i])
5431 : return -1;
5432 : else
5433 : return 1;
5434 : }
5435 : return 0;
5436 : }
5437 : }
5438 : }
5439 : else /* Different groups or non-groups. */
5440 : {
5441 : /* Order groups as their first element to keep them together. */
5442 2298528 : if (STMT_VINFO_GROUPED_ACCESS (a0))
5443 2298528 : a0 = DR_GROUP_FIRST_ELEMENT (a0);
5444 2298528 : if (STMT_VINFO_GROUPED_ACCESS (b0))
5445 2298528 : b0 = DR_GROUP_FIRST_ELEMENT (b0);
5446 2298528 : if (a0 == b0)
5447 : return 0;
5448 : /* Break the tie using the stmt UID. */
5449 2298408 : else if (gimple_uid (STMT_VINFO_STMT (a0))
5450 2298408 : < gimple_uid (STMT_VINFO_STMT (b0)))
5451 : return -1;
5452 : else
5453 : {
5454 1020907 : gcc_assert (gimple_uid (STMT_VINFO_STMT (a0))
5455 : != gimple_uid (STMT_VINFO_STMT (b0)));
5456 : return 1;
5457 : }
5458 : }
5459 : }
5460 :
5461 : /* Return whether the load permutation of NODE is consecutive starting
5462 : with value START_VAL in the first element. If START_VAL is not given
5463 : the first element's value is used. START_VAL == UINT_MAX is treated as not given. Returns false if NODE has no or an empty load permutation. */
5464 :
5465 : bool
5466 623055 : vect_load_perm_consecutive_p (slp_tree node, unsigned start_val)
5467 : {
5468 623055 : load_permutation_t perm = SLP_TREE_LOAD_PERMUTATION (node);
5469 :
5470 623055 : if (!perm.exists () || !perm.length ())
5471 : return false;
5472 :
5473 623055 : if (start_val == UINT_MAX)
5474 79184 : start_val = perm[0];
5475 :
5476 1230186 : for (unsigned int i = 0; i < perm.length (); i++)
5477 630332 : if (perm[i] != start_val + (unsigned int) i)
5478 : return false;
5479 :
5480 : return true;
5481 : }
5482 :
5483 : /* Process the set of LOADS that are all from the same dataref group. Loads that need lowering are turned into a VEC_PERM_EXPR node applying the original load permutation on top of an unpermuted load of the whole group, which, unless load-lanes is used, is first reduced by a chain of intermediate permute nodes each roughly halving the number of lanes. With FORCE_SINGLE_LANE contiguous power-of-two aligned chunks are lowered as well instead of being left alone. */
5484 :
5485 : static void
5486 161048 : vect_lower_load_permutations (loop_vec_info loop_vinfo,
5487 : scalar_stmts_to_slp_tree_map_t *bst_map,
5488 : const array_slice<slp_tree> &loads,
5489 : bool force_single_lane)
5490 : {
5491 : /* We at this point want to lower without a fixed VF or vector
5492 : size in mind which means we cannot actually compute whether we
5493 : need three or more vectors for a load permutation yet. So always
5494 : lower. */
5495 161048 : stmt_vec_info first
5496 161048 : = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (loads[0])[0]);
5497 161048 : unsigned group_lanes = DR_GROUP_SIZE (first);
5498 :
5499 : /* Verify if all load permutations can be implemented with a suitably
5500 : large element load-lanes operation. */
5501 161048 : unsigned ld_lanes_lanes = SLP_TREE_LANES (loads[0]);
5502 161048 : if (STMT_VINFO_STRIDED_P (first)
5503 158755 : || compare_step_with_zero (loop_vinfo, first) <= 0
5504 156097 : || exact_log2 (ld_lanes_lanes) == -1
5505 : /* ??? For now only support the single-lane case as there is
5506 : missing support on the store-lane side and code generation
5507 : isn't up to the task yet. */
5508 153312 : || ld_lanes_lanes != 1
5509 303410 : || vect_load_lanes_supported (SLP_TREE_VECTYPE (loads[0]),
5510 : group_lanes / ld_lanes_lanes,
5511 : false) == IFN_LAST)
5512 : ld_lanes_lanes = 0;
5513 : else
5514 : /* Verify the loads access the same number of lanes aligned to
5515 : ld_lanes_lanes. */
5516 0 : for (slp_tree load : loads)
5517 : {
5518 0 : if (SLP_TREE_LANES (load) != ld_lanes_lanes)
5519 : {
5520 : ld_lanes_lanes = 0;
5521 : break;
5522 : }
5523 0 : unsigned first = SLP_TREE_LOAD_PERMUTATION (load)[0];
5524 0 : if (first % ld_lanes_lanes != 0)
5525 : {
5526 : ld_lanes_lanes = 0;
5527 : break;
5528 : }
5529 0 : if (!vect_load_perm_consecutive_p (load))
5530 : {
5531 : ld_lanes_lanes = 0;
5532 : break;
5533 : }
5534 : }
5535 :
5536 : /* Only a power-of-two number of lanes matches interleaving with N levels.
5537 : ??? An even number of lanes could be reduced to 1<<ceil_log2(N)-1 lanes
5538 : at each step. */
5539 261768 : if (ld_lanes_lanes == 0 && exact_log2 (group_lanes) == -1 && group_lanes != 3)
5540 : return;
5541 :
5542 264142 : for (slp_tree load : loads)
5543 : {
5544 : /* Leave masked or gather loads alone for now. */
5545 186462 : if (!SLP_TREE_CHILDREN (load).is_empty ())
5546 60028 : continue;
5547 :
5548 : /* For single-element interleaving spanning multiple vectors avoid
5549 : lowering, we want to use VMAT_ELEMENTWISE later. */
5550 186456 : if (ld_lanes_lanes == 0
5551 186456 : && SLP_TREE_LANES (load) == 1
5552 167228 : && !DR_GROUP_NEXT_ELEMENT (first)
5553 265901 : && maybe_gt (group_lanes,
5554 : TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (load))))
5555 51246 : return;
5556 :
5557 : /* We want to pattern-match special cases here and keep those
5558 : alone. Candidates are splats and load-lane. */
5559 :
5560 : /* We need to lower only loads of less than half of the groups
5561 : lanes, including duplicate lanes. Note this leaves nodes
5562 : with a non-1:1 load permutation around instead of canonicalizing
5563 : those into a load and a permute node. Removing this early
5564 : check would do such canonicalization. */
5565 135210 : if (SLP_TREE_LANES (load) >= (group_lanes + 1) / 2
5566 56454 : && ld_lanes_lanes == 0)
5567 56454 : continue;
5568 :
5569 : /* Build the permute to get the original load permutation order. */
5570 78756 : bool contiguous = vect_load_perm_consecutive_p (load);
5571 78756 : lane_permutation_t final_perm;
5572 78756 : final_perm.create (SLP_TREE_LANES (load));
5573 158426 : for (unsigned i = 0; i < SLP_TREE_LANES (load); ++i)
5574 159340 : final_perm.quick_push (
5575 79670 : std::make_pair (0, SLP_TREE_LOAD_PERMUTATION (load)[i]));
5576 :
5577 : /* When the load permutation accesses a contiguous unpermuted,
5578 : power-of-two aligned and sized chunk leave the load alone.
5579 : We can likely (re-)load it more efficiently rather than
5580 : extracting it from the larger load.
5581 : ??? Long-term some of the lowering should move to where
5582 : the vector types involved are fixed. */
5583 82324 : if (!force_single_lane
5584 78756 : && ld_lanes_lanes == 0
5585 53099 : && contiguous
5586 52856 : && (SLP_TREE_LANES (load) > 1 || loads.size () == 1)
5587 6563 : && pow2p_hwi (SLP_TREE_LANES (load))
5588 6527 : && pow2p_hwi (group_lanes)
5589 3568 : && SLP_TREE_LOAD_PERMUTATION (load)[0] % SLP_TREE_LANES (load) == 0
5590 82324 : && group_lanes % SLP_TREE_LANES (load) == 0)
5591 : {
5592 3568 : final_perm.release ();
5593 3568 : continue;
5594 : }
5595 :
5596 : /* First build (and possibly re-use) a load node for the
5597 : unpermuted group. Gaps in the middle and on the end are
5598 : represented with NULL stmts. */
5599 75188 : vec<stmt_vec_info> stmts;
5600 75188 : stmts.create (group_lanes);
5601 267221 : for (stmt_vec_info s = first; s; s = DR_GROUP_NEXT_ELEMENT (s))
5602 : {
5603 192033 : if (s != first)
5604 121598 : for (unsigned i = 1; i < DR_GROUP_GAP (s); ++i)
5605 4753 : stmts.quick_push (NULL);
5606 192033 : stmts.quick_push (s);
5607 : }
5608 137098 : for (unsigned i = 0; i < DR_GROUP_GAP (first); ++i)
5609 61910 : stmts.quick_push (NULL);
5610 75188 : poly_uint64 max_nunits = 1;
5611 75188 : bool *matches = XALLOCAVEC (bool, group_lanes);
5612 75188 : unsigned limit = 1;
5613 75188 : unsigned tree_size = 0;
5614 75188 : slp_tree l0 = vect_build_slp_tree (loop_vinfo, stmts,
5615 : group_lanes,
5616 : &max_nunits, matches, &limit,
5617 75188 : &tree_size, bst_map);
5618 75188 : gcc_assert (!SLP_TREE_LOAD_PERMUTATION (l0).exists ());
5619 :
5620 75188 : if (ld_lanes_lanes != 0)
5621 : {
5622 : /* ??? If this is not in sync with what get_load_store_type
5623 : later decides the SLP representation is not good for other
5624 : store vectorization methods. */
5625 0 : l0->ldst_lanes = true;
5626 0 : load->ldst_lanes = true;
5627 : }
5628 :
5629 233230 : while (1)
5630 : {
5631 154209 : unsigned group_lanes = SLP_TREE_LANES (l0);
5632 154209 : if (ld_lanes_lanes != 0
5633 154209 : || SLP_TREE_LANES (load) >= (group_lanes + 1) / 2)
5634 : break;
5635 :
5636 : /* Try to lower by reducing the group to half its size using an
5637 : interleaving scheme. For this try to compute whether all
5638 : elements needed for this load are in even or odd elements of
5639 : an even/odd decomposition with N consecutive elements.
5640 : Thus { e, e, o, o, e, e, o, o } would be an even/odd decomposition
5641 : with N == 2. */
5642 : /* ??? Only an even number of lanes can be handled this way, but the
5643 : fallback below could work for any number. We have to make sure
5644 : to round up in that case. */
5645 79021 : gcc_assert ((group_lanes & 1) == 0 || group_lanes == 3);
5646 11009 : unsigned even = 0, odd = 0;
5647 11009 : if ((group_lanes & 1) == 0)
5648 : {
5649 11009 : even = (1 << ceil_log2 (group_lanes)) - 1;
5650 11009 : odd = even;
5651 44713 : for (auto l : final_perm)
5652 : {
5653 11686 : even &= ~l.second;
5654 11686 : odd &= l.second;
5655 : }
5656 : }
5657 :
5658 : /* Now build an even or odd extraction from the unpermuted load. */
5659 79021 : lane_permutation_t perm;
5660 79021 : perm.create ((group_lanes + 1) / 2);
5661 79021 : unsigned even_level = even ? 1 << ctz_hwi (even) : 0;
5662 79021 : unsigned odd_level = odd ? 1 << ctz_hwi (odd) : 0;
5663 79021 : if (even_level
5664 10092 : && group_lanes % (2 * even_level) == 0
5665 : /* ??? When code generating permutes we do not try to pun
5666 : to larger component modes so level != 1 isn't a natural
5667 : even/odd extract. Prefer one if possible. */
5668 10092 : && (even_level == 1 || !odd_level || odd_level != 1))
5669 : {
5670 : /* { 0, 1, ... 4, 5 ..., } */
5671 36375 : for (unsigned i = 0; i < group_lanes / 2 / even_level; ++i)
5672 57438 : for (unsigned j = 0; j < even_level; ++j)
5673 28892 : perm.quick_push (std::make_pair (0, 2 * i * even_level + j));
5674 : }
5675 68929 : else if (odd_level)
5676 : {
5677 : /* { ..., 2, 3, ... 6, 7 } */
5678 3150 : gcc_assert (group_lanes % (2 * odd_level) == 0);
5679 13714 : for (unsigned i = 0; i < group_lanes / 2 / odd_level; ++i)
5680 21182 : for (unsigned j = 0; j < odd_level; ++j)
5681 10618 : perm.quick_push
5682 10618 : (std::make_pair (0, (2 * i + 1) * odd_level + j));
5683 : }
5684 : else
5685 : {
5686 : /* As fallback extract all used lanes and fill to half the
5687 : group size by repeating the last element.
5688 : ??? This is quite a bad strategy for re-use - we could
5689 : brute force our way to find more optimal filling lanes to
5690 : maximize re-use when looking at all loads from the group. */
5691 68042 : auto_bitmap l;
5692 272224 : for (auto p : final_perm)
5693 68098 : bitmap_set_bit (l, p.second);
5694 68042 : unsigned i = 0;
5695 68042 : bitmap_iterator bi;
5696 136140 : EXECUTE_IF_SET_IN_BITMAP (l, 0, i, bi)
5697 68098 : perm.quick_push (std::make_pair (0, i));
5698 272320 : while (perm.length () < (group_lanes + 1) / 2)
5699 68118 : perm.quick_push (perm.last ());
5700 68042 : }
5701 :
5702 : /* Update final_perm with the intermediate permute. */
5703 158719 : for (unsigned i = 0; i < final_perm.length (); ++i)
5704 : {
5705 79698 : unsigned l = final_perm[i].second;
5706 79698 : unsigned j;
5707 88146 : for (j = 0; j < perm.length (); ++j)
5708 88146 : if (perm[j].second == l)
5709 : {
5710 79698 : final_perm[i].second = j;
5711 79698 : break;
5712 : }
5713 79698 : gcc_assert (j < perm.length ());
5714 : }
5715 :
5716 : /* And create scalar stmts. */
5717 79021 : vec<stmt_vec_info> perm_stmts;
5718 79021 : perm_stmts.create (perm.length ());
5719 254747 : for (unsigned i = 0; i < perm.length (); ++i)
5720 175726 : perm_stmts.quick_push (SLP_TREE_SCALAR_STMTS (l0)[perm[i].second]);
5721 :
5722 79021 : slp_tree p = vect_create_new_slp_node (1, VEC_PERM_EXPR);
5723 79021 : SLP_TREE_CHILDREN (p).quick_push (l0);
5724 79021 : SLP_TREE_LANE_PERMUTATION (p) = perm;
5725 79021 : SLP_TREE_VECTYPE (p) = SLP_TREE_VECTYPE (load);
5726 79021 : SLP_TREE_LANES (p) = perm.length ();
5727 79021 : SLP_TREE_REPRESENTATIVE (p) = SLP_TREE_REPRESENTATIVE (load);
5728 : /* ??? As we have scalar stmts for this intermediate permute we
5729 : could CSE it via bst_map but we do not want to pick up
5730 : another SLP node with a load permutation. We instead should
5731 : have a "local" CSE map here. */
5732 79021 : SLP_TREE_SCALAR_STMTS (p) = perm_stmts;
5733 :
5734 : /* We now have a node for (group_lanes + 1) / 2 lanes. */
5735 79021 : l0 = p;
5736 79021 : }
5737 :
5738 : /* And finally from the ordered reduction node create the
5739 : permute to shuffle the lanes into the original load-permutation
5740 : order. We replace the original load node with this. */
5741 75188 : SLP_TREE_CODE (load) = VEC_PERM_EXPR;
5742 75188 : SLP_TREE_LOAD_PERMUTATION (load).release ();
5743 75188 : SLP_TREE_LANE_PERMUTATION (load) = final_perm;
5744 75188 : SLP_TREE_CHILDREN (load).create (1);
5745 75188 : SLP_TREE_CHILDREN (load).quick_push (l0);
5746 : }
5747 : }
5748 :
5749 : /* Transform SLP loads in the SLP graph created by SLP discovery to
5750 : group loads from the same group and lower load permutations that
5751 : are unlikely to be supported into a series of permutes.
5752 : In the degenerate case of having only single-lane SLP instances
5753 : this should result in a series of permute nodes emulating an
5754 : interleaving scheme. */
5755 :
5756 : static void
5757 473922 : vect_lower_load_permutations (loop_vec_info loop_vinfo,
5758 : scalar_stmts_to_slp_tree_map_t *bst_map,
5759 : bool force_single_lane)
5760 : {
5761 : /* Gather and sort loads across all instances. */
5762 473922 : hash_set<slp_tree> visited;
5763 473922 : auto_vec<slp_tree> loads;
5764 2180441 : for (auto inst : loop_vinfo->slp_instances)
5765 760609 : vect_gather_slp_loads (loads, SLP_INSTANCE_TREE (inst), visited);
5766 473922 : if (loads.is_empty ())
5767 89883 : return;
5768 384039 : loads.qsort (vllp_cmp);
5769 :
5770 : /* Now process each dataref group separately. Sorting made loads of the same group adjacent; the last run is handled after the loop. */
5771 384039 : unsigned firsti = 0;
5772 718994 : for (unsigned i = 1; i < loads.length (); ++i)
5773 : {
5774 334955 : slp_tree first = loads[firsti];
5775 334955 : slp_tree next = loads[i];
5776 334955 : stmt_vec_info a0 = SLP_TREE_SCALAR_STMTS (first)[0];
5777 334955 : stmt_vec_info b0 = SLP_TREE_SCALAR_STMTS (next)[0];
5778 334955 : if (STMT_VINFO_GROUPED_ACCESS (a0)
5779 157621 : && STMT_VINFO_GROUPED_ACCESS (b0)
5780 479513 : && DR_GROUP_FIRST_ELEMENT (a0) == DR_GROUP_FIRST_ELEMENT (b0))
5781 62679 : continue;
5782 : /* Now we have one or multiple SLP loads of the same group from
5783 : firsti to i - 1. */
5784 272276 : if (STMT_VINFO_GROUPED_ACCESS (a0))
5785 94942 : vect_lower_load_permutations (loop_vinfo, bst_map,
5786 94942 : make_array_slice (&loads[firsti],
5787 : i - firsti),
5788 : force_single_lane);
5789 : firsti = i;
5790 : }
5791 768078 : if (firsti < loads.length ()
5792 768078 : && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (loads[firsti])[0]))
5793 66106 : vect_lower_load_permutations (loop_vinfo, bst_map,
5794 66106 : make_array_slice (&loads[firsti],
5795 66106 : loads.length () - firsti),
5796 : force_single_lane);
5797 473922 : }
5798 :
5799 : /* Check if there are stmts in the loop that can be vectorized using SLP. Build SLP
5800 : trees of packed scalar stmts if SLP is possible. */
5801 :
5802 : opt_result
5803 1113267 : vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size,
5804 : bool force_single_lane)
5805 : {
5806 1113267 : loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5807 1113267 : unsigned int i;
5808 1113267 : stmt_vec_info first_element;
5809 1113267 : slp_instance instance;
5810 :
5811 1113267 : DUMP_VECT_SCOPE ("vect_analyze_slp");
5812 :
5813 1113267 : unsigned limit = max_tree_size;
5814 :
5815 1113267 : scalar_stmts_to_slp_tree_map_t *bst_map
5816 1113267 : = new scalar_stmts_to_slp_tree_map_t ();
5817 :
5818 : /* Find SLP sequences starting from groups of grouped stores. */
5819 3149264 : FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
5820 922999 : if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
5821 : slp_inst_kind_store, max_tree_size, &limit,
5822 : force_single_lane)
5823 922999 : && loop_vinfo)
5824 : {
5825 269 : release_scalar_stmts_to_slp_tree_map (bst_map);
5826 269 : return opt_result::failure_at (vect_location, "SLP build failed.\n");
5827 : }
5828 :
5829 : /* For loops also start SLP discovery from non-grouped stores. */
5830 1112998 : if (loop_vinfo)
5831 : {
5832 : data_reference_p dr;
5833 1630458 : FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr)
5834 1138773 : if (DR_IS_WRITE (dr))
5835 : {
5836 370755 : stmt_vec_info stmt_info = vinfo->lookup_dr (dr)->stmt;
5837 : /* Grouped stores are already handled above. */
5838 370755 : if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
5839 99638 : continue;
5840 271117 : vec<stmt_vec_info> stmts;
5841 271117 : vec<stmt_vec_info> roots = vNULL;
5842 271117 : vec<tree> remain = vNULL;
5843 271117 : stmts.create (1);
5844 271117 : stmts.quick_push (stmt_info);
5845 271117 : if (! vect_build_slp_instance (vinfo, slp_inst_kind_store,
5846 : stmts, roots, remain, max_tree_size,
5847 : &limit, bst_map, force_single_lane))
5848 : {
5849 6930 : release_scalar_stmts_to_slp_tree_map (bst_map);
5850 6930 : return opt_result::failure_at (vect_location,
5851 : "SLP build failed.\n");
5852 : }
5853 : }
5854 :
5855 : stmt_vec_info stmt_info;
5856 491725 : FOR_EACH_VEC_ELT (LOOP_VINFO_ALTERNATE_DEFS (loop_vinfo), i, stmt_info)
5857 : {
5858 20 : vec<stmt_vec_info> stmts;
5859 20 : vec<stmt_vec_info> roots = vNULL;
5860 20 : vec<tree> remain = vNULL;
5861 20 : stmts.create (1);
5862 20 : stmts.quick_push (stmt_info);
5863 20 : if (! vect_build_slp_instance (vinfo, slp_inst_kind_store,
5864 : stmts, roots, remain, max_tree_size,
5865 : &limit, bst_map, force_single_lane))
5866 : {
5867 0 : release_scalar_stmts_to_slp_tree_map (bst_map);
5868 0 : return opt_result::failure_at (vect_location,
5869 : "SLP build failed.\n");
5870 : }
5871 : }
5872 : }
5873 :
5874 1106068 : if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
5875 : {
5876 1857585 : for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
5877 : {
5878 1243202 : vect_location = bb_vinfo->roots[i].roots[0]->stmt;
5879 : /* Apply patterns. */
5880 3884056 : for (unsigned j = 0; j < bb_vinfo->roots[i].stmts.length (); ++j)
5881 5281708 : bb_vinfo->roots[i].stmts[j]
5882 2723176 : = vect_stmt_to_vectorize (bb_vinfo->roots[i].stmts[j]);
5883 1243202 : if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
5884 1243202 : bb_vinfo->roots[i].stmts,
5885 1243202 : bb_vinfo->roots[i].roots,
5886 1243202 : bb_vinfo->roots[i].remain,
5887 : max_tree_size, &limit, bst_map, false))
5888 : {
5889 128828 : bb_vinfo->roots[i].roots = vNULL;
5890 128828 : bb_vinfo->roots[i].remain = vNULL;
5891 : }
5892 1243202 : bb_vinfo->roots[i].stmts = vNULL;
5893 : }
5894 : }
5895 :
5896 1106068 : if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
5897 : {
5898 : /* Find SLP sequences starting from groups of reductions. */
5899 491685 : if (!vect_analyze_slp_reductions (loop_vinfo, max_tree_size, &limit,
5900 : bst_map, force_single_lane))
5901 : {
5902 3008 : release_scalar_stmts_to_slp_tree_map (bst_map);
5903 3008 : return opt_result::failure_at (vect_location, "SLP build failed.\n");
5904 : }
5905 :
5906 : /* Make sure to vectorize only-live stmts, usually inductions. */
5907 2200801 : for (edge e : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
5908 1423481 : for (auto gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi);
5909 679370 : gsi_next (&gsi))
5910 : {
5911 688711 : gphi *lc_phi = *gsi;
5912 688711 : tree def = gimple_phi_arg_def_from_edge (lc_phi, e);
5913 688711 : stmt_vec_info stmt_info;
5914 688711 : if (TREE_CODE (def) == SSA_NAME
5915 576843 : && !virtual_operand_p (def)
5916 299961 : && (stmt_info = loop_vinfo->lookup_def (def))
5917 268605 : && ((stmt_info = vect_stmt_to_vectorize (stmt_info)), true)
5918 268605 : && STMT_VINFO_RELEVANT (stmt_info) == vect_used_only_live
5919 208767 : && STMT_VINFO_LIVE_P (stmt_info)
5920 208767 : && !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info))
5921 796054 : && STMT_VINFO_REDUC_IDX (stmt_info) == -1)
5922 : {
5923 107256 : vec<stmt_vec_info> stmts;
5924 107256 : vec<stmt_vec_info> roots = vNULL;
5925 107256 : vec<tree> remain = vNULL;
5926 107256 : stmts.create (1);
5927 107256 : stmts.quick_push (vect_stmt_to_vectorize (stmt_info));
5928 107256 : if (! vect_build_slp_instance (vinfo,
5929 : slp_inst_kind_reduc_group,
5930 : stmts, roots, remain,
5931 : max_tree_size, &limit,
5932 : bst_map, force_single_lane))
5933 : {
5934 9341 : release_scalar_stmts_to_slp_tree_map (bst_map);
5935 9341 : return opt_result::failure_at (vect_location,
5936 : "SLP build failed.\n");
5937 : }
5938 : }
5939 9341 : }
5940 :
5941 : /* Find SLP sequences starting from gconds. */
5942 1190700 : for (auto cond : LOOP_VINFO_LOOP_CONDS (loop_vinfo))
5943 : {
5944 279302 : auto cond_info = loop_vinfo->lookup_stmt (cond);
5945 :
5946 279302 : cond_info = vect_stmt_to_vectorize (cond_info);
5947 279302 : vec<stmt_vec_info> roots = vNULL;
5948 279302 : roots.safe_push (cond_info);
5949 279302 : gimple *stmt = STMT_VINFO_STMT (cond_info);
5950 279302 : tree args0 = gimple_cond_lhs (stmt);
5951 279302 : tree args1 = gimple_cond_rhs (stmt);
5952 :
5953 : /* These should be enforced by cond lowering, but if it failed
5954 : bail. */
5955 279302 : if (gimple_cond_code (stmt) != NE_EXPR
5956 278218 : || TREE_TYPE (args0) != boolean_type_node
5957 556952 : || !integer_zerop (args1))
5958 : {
5959 1652 : roots.release ();
5960 1652 : release_scalar_stmts_to_slp_tree_map (bst_map);
5961 1652 : return opt_result::failure_at (vect_location,
5962 : "SLP build failed.\n");
5963 : }
5964 :
5965 : /* An argument without a loop def will be codegened from vectorizing the
5966 : root gcond itself. As such we don't need to try to build an SLP tree
5967 : from them. It's highly likely that the resulting SLP tree here if both
5968 : arguments have a def will be incompatible, but we rely on it being split
5969 : later on. */
5970 277650 : auto varg = loop_vinfo->lookup_def (args0);
5971 277650 : vec<stmt_vec_info> stmts;
5972 277650 : vec<tree> remain = vNULL;
5973 277650 : stmts.create (1);
5974 277650 : stmts.quick_push (vect_stmt_to_vectorize (varg));
5975 :
5976 277650 : if (! vect_build_slp_instance (vinfo, slp_inst_kind_gcond,
5977 : stmts, roots, remain,
5978 : max_tree_size, &limit,
5979 : bst_map, force_single_lane))
5980 : {
5981 3762 : roots.release ();
5982 3762 : release_scalar_stmts_to_slp_tree_map (bst_map);
5983 3762 : return opt_result::failure_at (vect_location,
5984 : "SLP build failed.\n");
5985 : }
5986 : }
5987 : }
5988 :
5989 1088305 : hash_set<slp_tree> visited_patterns;
5990 1088305 : slp_tree_to_load_perm_map_t perm_cache;
5991 1088305 : slp_compat_nodes_map_t compat_cache;
5992 :
5993 : /* See if any patterns can be found in the SLP tree. */
5994 1088305 : bool pattern_found = false;
5995 3729461 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
5996 1552851 : pattern_found |= vect_match_slp_patterns (instance, vinfo,
5997 : &visited_patterns, &perm_cache,
5998 : &compat_cache);
5999 :
6000 : /* If any were found optimize permutations of loads. */
6001 1088305 : if (pattern_found)
6002 : {
6003 285 : hash_map<slp_tree, slp_tree> load_map;
6004 3421 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
6005 : {
6006 2851 : slp_tree root = SLP_INSTANCE_TREE (instance);
6007 2851 : optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
6008 : &load_map, root);
6009 : }
6010 285 : }
6011 :
6012 : /* Check whether we should force some SLP instances to use load/store-lanes
6013 : and do so by forcing SLP re-discovery with single lanes. We used
6014 : to cancel SLP when this applied to all instances in a loop but now
6015 : we decide this per SLP instance. It's important to do this only
6016 : after SLP pattern recognition. */
6017 1088305 : if (is_a <loop_vec_info> (vinfo))
6018 1234531 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
6019 760609 : if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
6020 291398 : && !SLP_INSTANCE_TREE (instance)->ldst_lanes)
6021 : {
6022 291398 : slp_tree slp_root = SLP_INSTANCE_TREE (instance);
6023 291398 : unsigned int group_size = SLP_TREE_LANES (slp_root);
6024 291398 : tree vectype = SLP_TREE_VECTYPE (slp_root);
6025 :
6026 291398 : stmt_vec_info rep_info = SLP_TREE_REPRESENTATIVE (slp_root);
6027 291398 : gimple *rep = STMT_VINFO_STMT (rep_info);
6028 291398 : bool masked = (is_gimple_call (rep)
6029 2556 : && gimple_call_internal_p (rep)
6030 293934 : && internal_fn_mask_index
6031 2536 : (gimple_call_internal_fn (rep)) != -1);
6032 291378 : if (!STMT_VINFO_GROUPED_ACCESS (rep_info)
6033 28993 : || slp_root->ldst_lanes
6034 320391 : || (vect_store_lanes_supported (vectype, group_size, masked)
6035 : == IFN_LAST))
6036 291398 : continue;
6037 :
6038 0 : auto_vec<slp_tree> loads;
6039 0 : hash_set<slp_tree> visited;
6040 0 : vect_gather_slp_loads (loads, slp_root, visited);
6041 :
6042 : /* Check whether any load in the SLP instance is possibly
6043 : permuted. */
6044 0 : bool loads_permuted = false;
6045 0 : slp_tree load_node;
6046 0 : unsigned j;
6047 0 : FOR_EACH_VEC_ELT (loads, j, load_node)
6048 : {
6049 0 : if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
6050 0 : continue;
6051 : unsigned k;
6052 : stmt_vec_info load_info;
6053 0 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), k, load_info)
6054 0 : if (SLP_TREE_LOAD_PERMUTATION (load_node)[k] != k)
6055 : {
6056 : loads_permuted = true;
6057 : break;
6058 : }
6059 : }
6060 :
6061 : /* If the loads and stores can use load/store-lanes force re-discovery
6062 : with single lanes. */
6063 0 : if (loads_permuted)
6064 : {
6065 0 : bool can_use_lanes = true;
6066 : bool prefer_load_lanes = false;
6067 0 : FOR_EACH_VEC_ELT (loads, j, load_node)
6068 0 : if (STMT_VINFO_GROUPED_ACCESS
6069 : (SLP_TREE_REPRESENTATIVE (load_node)))
6070 : {
6071 0 : stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
6072 : (SLP_TREE_REPRESENTATIVE (load_node));
6073 0 : rep = STMT_VINFO_STMT (stmt_vinfo);
6074 0 : masked = (is_gimple_call (rep)
6075 0 : && gimple_call_internal_p (rep)
6076 0 : && internal_fn_mask_index
6077 0 : (gimple_call_internal_fn (rep)));
6078 : /* Use SLP for strided accesses (or if we can't
6079 : load-lanes). */
6080 0 : if (STMT_VINFO_STRIDED_P (stmt_vinfo)
6081 0 : || compare_step_with_zero (vinfo, stmt_vinfo) <= 0
6082 0 : || vect_load_lanes_supported
6083 0 : (SLP_TREE_VECTYPE (load_node),
6084 0 : DR_GROUP_SIZE (stmt_vinfo), masked) == IFN_LAST
6085 : /* ??? During SLP re-discovery with a single lane
6086 : a masked grouped load will appear permuted and
6087 : discovery will fail. We have to rework this
6088 : on the discovery side - for now avoid ICEing. */
6089 0 : || masked)
6090 : {
6091 : can_use_lanes = false;
6092 : break;
6093 : }
6094 : /* Make sure that the target would prefer store-lanes
6095 : for at least one of the loads.
6096 :
6097 : ??? Perhaps we should instead require this for
6098 : all loads? */
6099 0 : prefer_load_lanes
6100 : = (prefer_load_lanes
6101 0 : || SLP_TREE_LANES (load_node) == group_size
6102 0 : || (vect_slp_prefer_store_lanes_p
6103 0 : (vinfo, stmt_vinfo,
6104 : SLP_TREE_VECTYPE (load_node), masked,
6105 : group_size, SLP_TREE_LANES (load_node))));
6106 : }
6107 :
6108 0 : if (can_use_lanes && prefer_load_lanes)
6109 : {
6110 0 : if (dump_enabled_p ())
6111 0 : dump_printf_loc (MSG_NOTE, vect_location,
6112 : "SLP instance %p can use load/store-lanes,"
6113 : " re-discovering with single-lanes\n",
6114 : (void *) instance);
6115 :
6116 0 : stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (slp_root);
6117 :
6118 0 : vect_free_slp_instance (instance);
6119 0 : limit = max_tree_size;
6120 0 : bool res = vect_analyze_slp_instance (vinfo, bst_map,
6121 : stmt_info,
6122 : slp_inst_kind_store,
6123 : max_tree_size, &limit,
6124 : true);
6125 0 : gcc_assert (res);
6126 0 : auto new_inst = LOOP_VINFO_SLP_INSTANCES (vinfo).pop ();
6127 0 : LOOP_VINFO_SLP_INSTANCES (vinfo)[i] = new_inst;
6128 : }
6129 : }
6130 0 : }
6131 :
6132 : /* When we end up with load permutations that we cannot possibly handle,
6133 : like those requiring three vector inputs, lower them using interleaving
6134 : like schemes. */
6135 1088305 : if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
6136 : {
6137 473922 : vect_lower_load_permutations (loop_vinfo, bst_map, force_single_lane);
6138 473922 : if (dump_enabled_p ())
6139 : {
6140 19971 : dump_printf_loc (MSG_NOTE, vect_location,
6141 : "SLP graph after lowering permutations:\n");
6142 19971 : hash_set<slp_tree> visited;
6143 89069 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
6144 29181 : vect_print_slp_graph (MSG_NOTE, vect_location,
6145 : SLP_INSTANCE_TREE (instance), visited);
6146 19971 : }
6147 : }
6148 :
6149 1088305 : release_scalar_stmts_to_slp_tree_map (bst_map);
6150 :
6151 1088305 : if (pattern_found && dump_enabled_p ())
6152 : {
6153 18 : dump_printf_loc (MSG_NOTE, vect_location,
6154 : "Pattern matched SLP tree\n");
6155 18 : hash_set<slp_tree> visited;
6156 90 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
6157 36 : vect_print_slp_graph (MSG_NOTE, vect_location,
6158 : SLP_INSTANCE_TREE (instance), visited);
6159 18 : }
6160 :
6161 1088305 : return opt_result::success ();
6162 1088305 : }
6163 :
6164 : /* Estimates the cost of inserting layout changes into the SLP graph.
6165 : It can also say that the insertion is impossible; that state is encoded as a DEPTH equal to sreal::max (). */
6166 :
6167 : struct slpg_layout_cost
6168 : {
6169 10629581 : slpg_layout_cost () = default;
6170 : slpg_layout_cost (sreal, bool);
6171 : /* impossible () builds the sentinel cost (depth of sreal::max ()) and is_possible () tests for it. */
6172 499113 : static slpg_layout_cost impossible () { return { sreal::max (), 0 }; }
6173 5510603 : bool is_possible () const { return depth != sreal::max (); }
6174 :
6175 : bool operator== (const slpg_layout_cost &) const;
6176 : bool operator!= (const slpg_layout_cost &) const;
6177 :
6178 : bool is_better_than (const slpg_layout_cost &, bool) const;
6179 :
6180 : void add_parallel_cost (const slpg_layout_cost &);
6181 : void add_serial_cost (const slpg_layout_cost &);
6182 : void split (unsigned int);
6183 :
6184 : /* The longest sequence of layout changes needed during any traversal
6185 : of the partition dag, weighted by execution frequency.
6186 :
6187 : This is the most important metric when optimizing for speed, since
6188 : it helps to ensure that we keep the number of operations on
6189 : critical paths to a minimum. */
6190 : sreal depth = 0;
6191 :
6192 : /* An estimate of the total number of operations needed. It is weighted by
6193 : execution frequency when optimizing for speed but not when optimizing for
6194 : size. In order to avoid double-counting, a node with a fanout of N will
6195 : distribute 1/N of its total cost to each successor.
6196 :
6197 : This is the most important metric when optimizing for size, since
6198 : it helps to keep the total number of operations to a minimum. */
6199 : sreal total = 0;
6200 : };
6201 :
6202 : /* Construct costs for a node with weight WEIGHT. A higher weight
6203 : indicates more frequent execution. IS_FOR_SIZE is true if we are
6204 : optimizing for size rather than speed. DEPTH is always set to WEIGHT.
6205 : TOTAL is also WEIGHT, except that when optimizing for size any positive WEIGHT counts as exactly one operation. */
6206 1301522 : slpg_layout_cost::slpg_layout_cost (sreal weight, bool is_for_size)
6207 1302390 : : depth (weight), total (is_for_size && weight > 0 ? 1 : weight)
6208 : {
6209 1301522 : }
6210 : /* Return true if both the depth and the total of OTHER equal ours. */
6211 : bool
6212 0 : slpg_layout_cost::operator== (const slpg_layout_cost &other) const
6213 : {
6214 0 : return depth == other.depth && total == other.total;
6215 : }
6216 : /* Return true if the depth or the total of OTHER differs from ours. */
6217 : bool
6218 0 : slpg_layout_cost::operator!= (const slpg_layout_cost &other) const
6219 : {
6220 0 : return !operator== (other);
6221 : }
6222 :
6223 : /* Return true if these costs are strictly better (lower) than OTHER.
6224 : IS_FOR_SIZE is true if we are optimizing for size rather than speed; it selects which metric is compared first, the other one only breaking ties. */
6225 :
6226 : bool
6227 321743 : slpg_layout_cost::is_better_than (const slpg_layout_cost &other,
6228 : bool is_for_size) const
6229 : {
6230 321743 : if (is_for_size)
6231 : { /* Size: TOTAL decides, DEPTH breaks ties. */
6232 382 : if (total != other.total)
6233 155 : return total < other.total;
6234 227 : return depth < other.depth;
6235 : }
6236 : else
6237 : { /* Speed: DEPTH decides, TOTAL breaks ties. */
6238 321361 : if (depth != other.depth)
6239 137023 : return depth < other.depth;
6240 184338 : return total < other.total;
6241 : }
6242 : }
6243 :
6244 : /* Increase the costs to account for something with cost INPUT_COST
6245 : happening in parallel with the current costs. The depth becomes the larger of the two depths; the totals are added together. */
6246 :
6247 : void
6248 385614 : slpg_layout_cost::add_parallel_cost (const slpg_layout_cost &input_cost)
6249 : {
6250 385614 : depth = std::max (depth, input_cost.depth);
6251 385614 : total += input_cost.total;
6252 385614 : }
6253 :
6254 : /* Increase the costs to account for something with cost OTHER
6255 : happening in series with the current costs. Both the depth and the total of OTHER are added to ours. */
6256 :
6257 : void
6258 1560665 : slpg_layout_cost::add_serial_cost (const slpg_layout_cost &other)
6259 : {
6260 1560665 : depth += other.depth;
6261 1560665 : total += other.total;
6262 1560665 : }
6263 :
6264 : /* Split the total cost among TIMES successors or predecessors. The depth is left unchanged and a TIMES of 0 or 1 is a no-op. */
6265 :
6266 : void
6267 1297356 : slpg_layout_cost::split (unsigned int times)
6268 : {
6269 1297356 : if (times > 1)
6270 568326 : total /= times;
6271 1297356 : }
6272 :
6273 : /* Information about one node in the SLP graph, for use during
6274 : vect_optimize_slp_pass. */
6275 :
6276 : struct slpg_vertex
6277 : {
6278 9950475 : slpg_vertex (slp_tree node_) : node (node_) {} /* Only records NODE_; the other fields start at their defaults. */
6279 :
6280 : /* The node itself. */
6281 : slp_tree node;
6282 :
6283 : /* Which partition the node belongs to, or -1 if none. Nodes outside of
6284 : partitions are flexible; they can have whichever layout consumers
6285 : want them to have. */
6286 : int partition = -1;
6287 :
6288 : /* The number of nodes that directly use the result of this one
6289 : (i.e. the number of nodes that count this one as a child). */
6290 : unsigned int out_degree = 0;
6291 :
6292 : /* The execution frequency of the node. */
6293 : sreal weight = 0;
6294 :
6295 : /* The total execution frequency of all nodes that directly use the
6296 : result of this one. */
6297 : sreal out_weight = 0;
6298 : };
6299 :
6300 : /* Information about one partition of the SLP graph, for use during
6301 : vect_optimize_slp_pass. */
6302 :
6303 : struct slpg_partition_info
6304 : {
6305 : /* The nodes in the partition occupy indices [NODE_BEGIN, NODE_END)
6306 : of m_partitioned_nodes. Both default to 0, i.e. an empty range. */
6307 : unsigned int node_begin = 0;
6308 : unsigned int node_end = 0;
6309 :
6310 : /* Which layout we've chosen to use for this partition, or -1 if
6311 : we haven't picked one yet. */
6312 : int layout = -1;
6313 :
6314 : /* The number of predecessors and successors in the partition dag.
6315 : The predecessors always have lower partition numbers and the
6316 : successors always have higher partition numbers.
6317 :
6318 : Note that the directions of these edges are not necessarily the
6319 : same as in the data flow graph. For example, if an SCC has separate
6320 : partitions for an inner loop and an outer loop, the inner loop's
6321 : partition will have at least two incoming edges from the outer loop's
6322 : partition: one for a live-in value and one for a live-out value.
6323 : In data flow terms, one of these edges would also be from the outer loop
6324 : to the inner loop, but the other would be in the opposite direction. */
6325 : unsigned int in_degree = 0;
6326 : unsigned int out_degree = 0;
6327 : };
6328 :
6329 : /* Information about the costs of using a particular layout for a
6330 : particular partition. It can also say that the combination is
6331 : impossible; that state is recorded in INTERNAL_COST alone, and IN_COST and OUT_COST are not consulted by is_possible (). */
6332 :
6333 : struct slpg_partition_layout_costs
6334 : {
6335 1571894 : bool is_possible () const { return internal_cost.is_possible (); }
6336 55736 : void mark_impossible () { internal_cost = slpg_layout_cost::impossible (); }
6337 :
6338 : /* The costs inherited from predecessor partitions. */
6339 : slpg_layout_cost in_cost;
6340 :
6341 : /* The inherent cost of the layout within the node itself. For example,
6342 : this is nonzero for a load if choosing a particular layout would require
6343 : the load to permute the loaded elements. It is nonzero for a
6344 : VEC_PERM_EXPR if the permutation cannot be eliminated or converted
6345 : to full-vector moves. */
6346 : slpg_layout_cost internal_cost;
6347 :
6348 : /* The costs inherited from successor partitions. */
6349 : slpg_layout_cost out_cost;
6350 : };
6351 :
6352 : /* This class tries to optimize the layout of vectors in order to avoid
6353 : unnecessary shuffling. At the moment, the set of possible layouts is
6354 : restricted to bijective permutations.
6355 :
6356 : The goal of the pass depends on whether we're optimizing for size or
6357 : for speed. When optimizing for size, the goal is to reduce the overall
6358 : number of layout changes (including layout changes implied by things
6359 : like load permutations). When optimizing for speed, the goal is to
6360 : reduce the maximum latency attributable to layout changes on any
6361 : non-cyclical path through the data flow graph.
6362 :
6363 : For example, when optimizing a loop nest for speed, we will prefer
6364 : to make layout changes outside of a loop rather than inside of a loop,
6365 : and will prefer to make layout changes in parallel rather than serially,
6366 : even if that increases the overall number of layout changes.
6367 :
6368 : The high-level procedure is:
6369 :
6370 : (1) Build a graph in which edges go from uses (parents) to definitions
6371 : (children).
6372 :
6373 : (2) Divide the graph into a dag of strongly-connected components (SCCs).
6374 :
6375 : (3) When optimizing for speed, partition the nodes in each SCC based
6376 : on their containing cfg loop. When optimizing for size, treat
6377 : each SCC as a single partition.
6378 :
6379 : This gives us a dag of partitions. The goal is now to assign a
6380 : layout to each partition.
6381 :
6382 : (4) Construct a set of vector layouts that are worth considering.
6383 : Record which nodes must keep their current layout.
6384 :
6385 : (5) Perform a forward walk over the partition dag (from loads to stores)
6386 : accumulating the "forward" cost of using each layout. When visiting
6387 : each partition, assign a tentative choice of layout to the partition
6388 : and use that choice when calculating the cost of using a different
6389 : layout in successor partitions.
6390 :
6391 : (6) Perform a backward walk over the partition dag (from stores to loads),
6392 : accumulating the "backward" cost of using each layout. When visiting
6393 : each partition, make a final choice of layout for that partition based
6394 : on the accumulated forward costs (from (5)) and backward costs
6395 : (from (6)).
6396 :
6397 : (7) Apply the chosen layouts to the SLP graph.
6398 :
6399 : For example, consider the SLP statements:
6400 :
6401 : S1: a_1 = load
6402 : loop:
6403 : S2: a_2 = PHI<a_1, a_3>
6404 : S3: b_1 = load
6405 : S4: a_3 = a_2 + b_1
6406 : exit:
6407 : S5: a_4 = PHI<a_3>
6408 : S6: store a_4
6409 :
6410 : S2 and S4 form an SCC and are part of the same loop. Every other
6411 : statement is in a singleton SCC. In this example there is a one-to-one
6412 : mapping between SCCs and partitions and the partition dag looks like this:
6413 :
6414 : S1 S3
6415 : \ /
6416 : S2+S4
6417 : |
6418 : S5
6419 : |
6420 : S6
6421 :
6422 : S2, S3 and S4 will have a higher execution frequency than the other
6423 : statements, so when optimizing for speed, the goal is to avoid any
6424 : layout changes:
6425 :
6426 : - within S3
6427 : - within S2+S4
6428 : - on the S3->S2+S4 edge
6429 :
6430 : For example, if S3 was originally a reversing load, the goal of the
6431 : pass is to make it an unreversed load and change the layout on the
6432 : S1->S2+S4 and S2+S4->S5 edges to compensate. (Changing the layout
6433 : on S1->S2+S4 and S5->S6 would also be acceptable.)
6434 :
6435 : The difference between SCCs and partitions becomes important if we
6436 : add an outer loop:
6437 :
6438 : S1: a_1 = ...
6439 : loop1:
6440 : S2: a_2 = PHI<a_1, a_6>
6441 : S3: b_1 = load
6442 : S4: a_3 = a_2 + b_1
6443 : loop2:
6444 : S5: a_4 = PHI<a_3, a_5>
6445 : S6: c_1 = load
6446 : S7: a_5 = a_4 + c_1
6447 : exit2:
6448 : S8: a_6 = PHI<a_5>
6449 : S9: store a_6
6450 : exit1:
6451 :
6452 : Here, S2, S4, S5, S7 and S8 form a single SCC. However, when optimizing
6453 : for speed, we usually do not want restrictions in the outer loop to "infect"
6454 : the decision for the inner loop. For example, if an outer-loop node
6455 : in the SCC contains a statement with a fixed layout, that should not
6456 : prevent the inner loop from using a different layout. Conversely,
6457 : the inner loop should not dictate a layout to the outer loop: if the
6458 : outer loop does a lot of computation, then it may not be efficient to
6459 : do all of that computation in the inner loop's preferred layout.
6460 :
6461 : So when optimizing for speed, we partition the SCC into S2+S4+S8 (outer)
6462 : and S5+S7 (inner). We also try to arrange partitions so that:
6463 :
6464 : - the partition for an outer loop comes before the partition for
6465 : an inner loop
6466 :
6467 : - if a sibling loop A dominates a sibling loop B, A's partition
6468 : comes before B's
6469 :
6470 : This gives the following partition dag for the example above:
6471 :
6472 : S1 S3
6473 : \ /
6474 : S2+S4+S8 S6
6475 : | \\ /
6476 : | S5+S7
6477 : |
6478 : S9
6479 :
6480 : There are two edges from S2+S4+S8 to S5+S7: one for the edge S4->S5 and
6481 : one for a reversal of the edge S7->S8.
6482 :
6483 : The backward walk picks a layout for S5+S7 before S2+S4+S8. The choice
6484 : for S2+S4+S8 therefore has to balance the cost of using the outer loop's
6485 : preferred layout against the cost of changing the layout on entry to the
6486 : inner loop (S4->S5) and on exit from the inner loop (S7->S8 reversed).
6487 :
6488 : Although this works well when optimizing for speed, it has the downside
6489 : when optimizing for size that the choice of layout for S5+S7 is completely
6490 : independent of S9, which lessens the chance of reducing the overall number
6491 : of permutations. We therefore do not partition SCCs when optimizing
6492 : for size.
6493 :
6494 : To give a concrete example of the difference between optimizing
6495 : for size and speed, consider:
6496 :
6497 : a[0] = (b[1] << c[3]) - d[1];
6498 : a[1] = (b[0] << c[2]) - d[0];
6499 : a[2] = (b[3] << c[1]) - d[3];
6500 : a[3] = (b[2] << c[0]) - d[2];
6501 :
6502 : There are three different layouts here: one for a, one for b and d,
6503 : and one for c. When optimizing for speed it is better to permute each
6504 : of b, c and d into the order required by a, since those permutations
6505 : happen in parallel. But when optimizing for size, it is better to:
6506 :
6507 : - permute c into the same order as b
6508 : - do the arithmetic
6509 : - permute the result into the order required by a
6510 :
6511 : This gives 2 permutations rather than 3. */
6512 :
6513 : class vect_optimize_slp_pass
6514 : {
6515 : public:
6516 681015 : vect_optimize_slp_pass (vec_info *vinfo) : m_vinfo (vinfo) {}
6517 : void run ();
6518 :
6519 : private:
6520 : /* Graph building. */
6521 : struct loop *containing_loop (slp_tree);
6522 : bool is_cfg_latch_edge (graph_edge *);
6523 : void build_vertices (hash_set<slp_tree> &, slp_tree);
6524 : void build_vertices ();
6525 : void build_graph ();
6526 :
6527 : /* Partitioning. */
6528 : void create_partitions ();
6529 : template<typename T> void for_each_partition_edge (unsigned int, T);
6530 :
6531 : /* Layout selection. */
6532 : bool is_compatible_layout (slp_tree, unsigned int);
6533 : bool is_compatible_layout (const slpg_partition_info &, unsigned int);
6534 : int change_layout_cost (slp_tree, unsigned int, unsigned int);
6535 : slpg_partition_layout_costs &partition_layout_costs (unsigned int,
6536 : unsigned int);
6537 : void change_vec_perm_layout (slp_tree, lane_permutation_t &,
6538 : int, unsigned int);
6539 : int internal_node_cost (slp_tree, int, unsigned int);
6540 : void start_choosing_layouts ();
6541 : bool legitimize ();
6542 :
6543 : /* Cost propagation. */
6544 : slpg_layout_cost edge_layout_cost (graph_edge *, unsigned int,
6545 : unsigned int, unsigned int);
6546 : slpg_layout_cost total_in_cost (unsigned int);
6547 : slpg_layout_cost forward_cost (graph_edge *, unsigned int, unsigned int);
6548 : slpg_layout_cost backward_cost (graph_edge *, unsigned int, unsigned int);
6549 : void forward_pass ();
6550 : void backward_pass ();
6551 :
6552 : /* Rematerialization. */
6553 : slp_tree get_result_with_layout (slp_tree, unsigned int);
6554 : void materialize ();
6555 :
6556 : /* Clean-up. */
6557 : void remove_redundant_permutations ();
6558 :
6559 : /* Masked load lanes discovery. */
6560 : void decide_masked_load_lanes ();
6561 :
6562 : void dump ();
6563 : /* The vectorization context whose SLP instances are processed; set once by the constructor. */
6564 : vec_info *m_vinfo;
6565 :
6566 : /* True if we should optimize the graph for size, false if we should
6567 : optimize it for speed. (It wouldn't be easy to make this decision
6568 : more locally.) NOTE(review): there is no default initializer here and build_vertices only ever clears the flag, so it is presumably set to true before the vertices are built -- confirm. */
6569 : bool m_optimize_size;
6570 :
6571 : /* A graph of all SLP nodes, with edges leading from uses to definitions.
6572 : In other words, a node's predecessors are its slp_tree parents and
6573 : a node's successors are its slp_tree children. */
6574 : graph *m_slpg = nullptr;
6575 :
6576 : /* The vertices of M_SLPG, indexed by slp_tree::vertex. */
6577 : auto_vec<slpg_vertex> m_vertices;
6578 :
6579 : /* The list of all leaves of M_SLPG, such as external definitions, constants,
6580 : and loads. Nodes that have a null child are also recorded here. */
6581 : auto_vec<int> m_leafs;
6582 :
6583 : /* This array has one entry for every vector layout that we're considering.
6584 : Element 0 is null and indicates "no change". Other entries describe
6585 : permutations that are inherent in the current graph and that we would
6586 : like to reverse if possible.
6587 :
6588 : For example, a permutation { 1, 2, 3, 0 } means that something has
6589 : effectively been permuted in that way, such as a load group
6590 : { a[1], a[2], a[3], a[0] } (viewed as a permutation of a[0:3]).
6591 : We'd then like to apply the reverse permutation { 3, 0, 1, 2 }
6592 : in order to put things "back" in order. */
6593 : auto_vec<vec<unsigned> > m_perms;
6594 :
6595 : /* A partitioning of the nodes for which a layout must be chosen.
6596 : Each partition represents an <SCC, cfg loop> pair; that is,
6597 : nodes in different SCCs belong to different partitions, and nodes
6598 : within an SCC can be further partitioned according to a containing
6599 : cfg loop. Partition <SCC1, L1> comes before <SCC2, L2> if:
6600 :
6601 : - SCC1 != SCC2 and SCC1 is a predecessor of SCC2 in a forward walk
6602 : from leaves (such as loads) to roots (such as stores).
6603 :
6604 : - SCC1 == SCC2 and L1's header strictly dominates L2's header. */
6605 : auto_vec<slpg_partition_info> m_partitions;
6606 :
6607 : /* The list of all nodes for which a layout must be chosen. Nodes for
6608 : partition P come before the nodes for partition P+1. Nodes within a
6609 : partition are in reverse postorder. */
6610 : auto_vec<unsigned int> m_partitioned_nodes;
6611 :
6612 : /* Index P * num-layouts + L contains the cost of using layout L
6613 : for partition P. */
6614 : auto_vec<slpg_partition_layout_costs> m_partition_layout_costs;
6615 :
6616 : /* Index N * num-layouts + L, if nonnull, is a node that provides the
6617 : original output of node N adjusted to have layout L. */
6618 : auto_vec<slp_tree> m_node_layouts;
6619 : };
6620 :
6621 : /* Add NODE, and recursively every node reachable through its children,
6622 : to the vertices vector, and record the leafs among them in the leafs vector. VISITED holds the nodes already processed, so a node shared by several parents gets a single vertex.
6623 : Also record whether we should optimize anything for speed rather than size. */
6624 :
6625 : void
6626 10761481 : vect_optimize_slp_pass::build_vertices (hash_set<slp_tree> &visited,
6627 : slp_tree node)
6628 : {
6629 10761481 : unsigned i;
6630 10761481 : slp_tree child;
6631 : /* Nothing to do if NODE was already in VISITED. */
6632 10761481 : if (visited.add (node))
6633 10761481 : return;
6634 : /* One representative statement in a block that is optimized for speed is enough to clear m_optimize_size. */
6635 9950475 : if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
6636 : {
6637 7832928 : basic_block bb = gimple_bb (vect_orig_stmt (rep)->stmt);
6638 6979966 : if (optimize_bb_for_speed_p (bb))
6639 6860194 : m_optimize_size = false;
6640 : }
6641 : /* The vertex number is the position of the node's entry in m_vertices. */
6642 9950475 : node->vertex = m_vertices.length ();
6643 9950475 : m_vertices.safe_push (slpg_vertex (node));
6644 : /* LEAF stays true if there is no non-null child; FORCE_LEAF is set if any child is null. */
6645 9950475 : bool leaf = true;
6646 9950475 : bool force_leaf = false;
6647 18643354 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6648 8692879 : if (child)
6649 : {
6650 7829363 : leaf = false;
6651 7829363 : build_vertices (visited, child);
6652 : }
6653 : else
6654 : force_leaf = true;
6655 : /* Since SLP discovery works along use-def edges all cycles have an
6656 : entry - but there's the exception of cycles where we do not handle
6657 : the entry explicitly (but with a NULL SLP node), like some reductions
6658 : and inductions. Force those SLP PHIs to act as leafs to make them
6659 : backwards reachable. */
6660 9950475 : if (leaf || force_leaf)
6661 4913728 : m_leafs.safe_push (node->vertex);
6662 : }
6663 :
6664 : /* Fill the vertices and leafs vector with all nodes in the SLP graph. Any previous contents of both vectors are discarded first, and one visited set is shared by all SLP instances so that a node reachable from several instances is added only once. */
6665 :
6666 : void
6667 1362030 : vect_optimize_slp_pass::build_vertices ()
6668 : {
6669 1362030 : hash_set<slp_tree> visited;
6670 1362030 : unsigned i;
6671 1362030 : slp_instance instance;
6672 1362030 : m_vertices.truncate (0);
6673 1362030 : m_leafs.truncate (0);
6674 7018208 : FOR_EACH_VEC_ELT (m_vinfo->slp_instances, i, instance)
6675 2932118 : build_vertices (visited, SLP_INSTANCE_TREE (instance));
6676 1362030 : }
6677 :
6678 : /* Apply (reverse) bijective PERM to VEC.

       VEC is updated in place from a saved copy of its old contents.
       With REVERSE false, element I of the result is the old element
       PERM[I]. With REVERSE true, the old element I ends up at index
       PERM[I]. The asserts recheck the result; in the REVERSE case this
       catches a PERM that maps two indices holding different values to
       the same slot. */
6679 :
6680 : template <class T>
6681 : static void
6682 207947 : vect_slp_permute (vec<unsigned> perm,
6683 : vec<T> &vec, bool reverse)
6684 : {
6685 207947 : auto_vec<T, 64> saved;
6686 207947 : saved.create (vec.length ());
6687 674387 : for (unsigned i = 0; i < vec.length (); ++i)
6688 466440 : saved.quick_push (vec[i]);
6689 :
6690 207947 : if (reverse)
6691 : {
6692 1338182 : for (unsigned i = 0; i < vec.length (); ++i)
6693 465112 : vec[perm[i]] = saved[i];
6694 672463 : for (unsigned i = 0; i < vec.length (); ++i)
6695 822797 : gcc_assert (vec[perm[i]] == saved[i]);
6696 : }
6697 : else
6698 : {
6699 3848 : for (unsigned i = 0; i < vec.length (); ++i)
6700 1328 : vec[i] = saved[perm[i]];
6701 209275 : for (unsigned i = 0; i < vec.length (); ++i)
6702 1992 : gcc_assert (vec[i] == saved[perm[i]]);
6703 : }
6704 207947 : }
6705 :
6706 : /* Return the cfg loop that contains NODE.

       The loop is taken from the basic block of the original statement
       behind the representative of NODE. A node without a representative
       is attributed to the loop_father of the function entry block. */
6707 :
6708 : struct loop *
6709 3892470 : vect_optimize_slp_pass::containing_loop (slp_tree node)
6710 : {
6711 3892470 : stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
6712 3892470 : if (!rep)
6713 5295 : return ENTRY_BLOCK_PTR_FOR_FN (cfun)->loop_father;
6714 4326835 : return gimple_bb (vect_orig_stmt (rep)->stmt)->loop_father;
6715 : }
6716 :
6717 : /* Return true if UD (an edge from a use to a definition) is associated
6718 : with a loop latch edge in the cfg.

       The test applied is: both ends are internal defs, the use is not a
       permute node, the original statement of the use is a PHI in a loop
       header block, and the definition lies in the same cfg loop as the
       use. */
6719 :
6720 : bool
6721 7829363 : vect_optimize_slp_pass::is_cfg_latch_edge (graph_edge *ud)
6722 : {
6723 7829363 : slp_tree use = m_vertices[ud->src].node;
6724 7829363 : slp_tree def = m_vertices[ud->dest].node;
6725 7829363 : if ((SLP_TREE_DEF_TYPE (use) != vect_internal_def
6726 7829363 : || SLP_TREE_PERMUTE_P (use))
6727 7518800 : || SLP_TREE_DEF_TYPE (def) != vect_internal_def)
6728 : return false;
6729 :
6730 4561306 : stmt_vec_info use_rep = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (use));
6731 4561306 : return (is_a<gphi *> (use_rep->stmt)
6732 377372 : && bb_loop_header_p (gimple_bb (use_rep->stmt))
6733 4772270 : && containing_loop (def) == containing_loop (use));
6734 : }
6735 :
6736 : /* Build the graph. Mark edges that correspond to cfg loop latch edges with
6737 : a nonnull data field.

       m_optimize_size starts out true and may be cleared by build_vertices.
       One edge is added from each node to each of its nonnull children,
       i.e. in use-to-def direction. Within this chunk the value stored in
       the data field is only ever tested for being nonnull. */
6738 :
6739 : void
6740 1362030 : vect_optimize_slp_pass::build_graph ()
6741 : {
6742 1362030 : m_optimize_size = true;
6743 1362030 : build_vertices ();
6744 :
6745 2724060 : m_slpg = new_graph (m_vertices.length ());
6746 14036565 : for (slpg_vertex &v : m_vertices)
6747 29760840 : for (slp_tree child : SLP_TREE_CHILDREN (v.node))
6748 8692879 : if (child)
6749 : {
6750 7829363 : graph_edge *ud = add_edge (m_slpg, v.node->vertex, child->vertex);
6751 7829363 : if (is_cfg_latch_edge (ud))
6752 202120 : ud->data = this;
6753 : }
6754 1362030 : }
6755 :
6756 : /* Return true if E corresponds to a loop latch edge in the cfg, which
       build_graph marks with a nonnull data field. create_partitions
       passes this function to graphds_dfs as its last argument. */
6757 :
6758 : static bool
6759 4015456 : skip_cfg_latch_edges (graph_edge *e)
6760 : {
6761 4015456 : return e->data;
6762 : }
6763 :
6764 : /* Create the node partitions.

       On return m_partitions describes every partition and
       m_partitioned_nodes lists the partitioned nodes so that the nodes of
       each partition are consecutive. Constants, and externals that have
       no existing vector defs, get partition -1 and are left out of
       m_partitioned_nodes. */
6765 :
6766 : void
6767 681015 : vect_optimize_slp_pass::create_partitions ()
6768 : {
6769 : /* Calculate a postorder of the graph, ignoring edges that correspond
6770 : to natural latch edges in the cfg. Reading the vector from the end
6771 : to the beginning gives the reverse postorder. */
6772 681015 : auto_vec<int> initial_rpo;
6773 1362030 : graphds_dfs (m_slpg, &m_leafs[0], m_leafs.length (), &initial_rpo,
6774 : false, NULL, skip_cfg_latch_edges);
6775 2043045 : gcc_assert (initial_rpo.length () == m_vertices.length ());
6776 :
6777 : /* Calculate the strongly connected components of the graph. */
6778 681015 : auto_vec<int> scc_grouping;
6779 681015 : unsigned int num_sccs = graphds_scc (m_slpg, NULL, NULL, &scc_grouping);
6780 :
6781 : /* Create a new index order in which all nodes from the same SCC are
6782 : consecutive. Use scc_pos to record the index of the first node in
6783 : each SCC. */
6784 681015 : auto_vec<unsigned int> scc_pos (num_sccs);
6785 681015 : int last_component = -1;
6786 681015 : unsigned int node_count = 0;
6787 7017999 : for (unsigned int node_i : scc_grouping)
6788 : {
6789 4974954 : if (last_component != m_slpg->vertices[node_i].component)
6790 : {
6791 4848031 : last_component = m_slpg->vertices[node_i].component;
6792 9696062 : gcc_assert (last_component == int (scc_pos.length ()));
6793 4848031 : scc_pos.quick_push (node_count);
6794 : }
6795 4974954 : node_count += 1;
6796 : }
6797 1362030 : gcc_assert (node_count == initial_rpo.length ()
6798 : && last_component + 1 == int (num_sccs));
6799 :
6800 : /* Use m_partitioned_nodes to group nodes into SCC order, with the nodes
6801 : inside each SCC following the RPO we calculated above. The fact that
6802 : we ignored natural latch edges when calculating the RPO should ensure
6803 : that, for natural loop nests:
6804 :
6805 : - the first node that we encounter in a cfg loop is the loop header phi
6806 : - the loop header phis are in dominance order
6807 :
6808 : Arranging for this is an optimization (see below) rather than a
6809 : correctness issue. Unnatural loops with a tangled mess of backedges
6810 : will still work correctly, but might give poorer results.
6811 :
6812 : Also update scc_pos so that it gives 1 + the index of the last node
6813 : in the SCC. */
6814 681015 : m_partitioned_nodes.safe_grow (node_count);
6815 6336984 : for (unsigned int old_i = initial_rpo.length (); old_i-- > 0;)
6816 : {
6817 4974954 : unsigned int node_i = initial_rpo[old_i];
6818 4974954 : unsigned int new_i = scc_pos[m_slpg->vertices[node_i].component]++;
6819 4974954 : m_partitioned_nodes[new_i] = node_i;
6820 : }
6821 :
6822 : /* When optimizing for speed, partition each SCC based on the containing
6823 : cfg loop. The order we constructed above should ensure that, for natural
6824 : cfg loops, we'll create sub-SCC partitions for outer loops before
6825 : the corresponding sub-SCC partitions for inner loops. Similarly,
6826 : when one sibling loop A dominates another sibling loop B, we should
6827 : create a sub-SCC partition for A before a sub-SCC partition for B.
6828 :
6829 : As above, nothing depends for correctness on whether this achieves
6830 : a natural nesting, but we should get better results when it does. */
6831 1362030 : m_partitions.reserve (m_vertices.length ());
6832 681015 : unsigned int next_partition_i = 0;
6833 681015 : hash_map<struct loop *, int> loop_partitions;
6834 681015 : unsigned int rpo_begin = 0;
6835 681015 : unsigned int num_partitioned_nodes = 0;
6836 6891076 : for (unsigned int rpo_end : scc_pos)
6837 : {
6838 4848031 : loop_partitions.empty ();
6839 : unsigned int partition_i = next_partition_i;
6840 9822985 : for (unsigned int rpo_i = rpo_begin; rpo_i < rpo_end; ++rpo_i)
6841 : {
6842 : /* Handle externals and constants optimistically throughout.
6843 : But treat existing vectors as fixed since we do not handle
6844 : permuting them. */
6845 4974954 : unsigned int node_i = m_partitioned_nodes[rpo_i];
6846 4974954 : auto &vertex = m_vertices[node_i];
6847 4974954 : if ((SLP_TREE_DEF_TYPE (vertex.node) == vect_external_def
6848 506256 : && !SLP_TREE_VEC_DEFS (vertex.node).exists ())
6849 4977173 : || SLP_TREE_DEF_TYPE (vertex.node) == vect_constant_def)
6850 1479959 : vertex.partition = -1;
6851 : else
6852 : {
6853 3494995 : bool existed;
       /* When optimizing for size every partitioned node of the SCC
          shares one partition: PARTITION_I still holds the value
          NEXT_PARTITION_I had at the start of the SCC, so only the
          first partitioned node sees EXISTED false. */
6854 3494995 : if (m_optimize_size)
6855 24453 : existed = next_partition_i > partition_i;
6856 : else
6857 : {
6858 3470542 : struct loop *loop = containing_loop (vertex.node);
6859 3470542 : auto &entry = loop_partitions.get_or_insert (loop, &existed);
6860 3470542 : if (!existed)
6861 3344644 : entry = next_partition_i;
6862 3470542 : partition_i = entry;
6863 : }
6864 3494995 : if (!existed)
6865 : {
6866 3369019 : m_partitions.quick_push (slpg_partition_info ());
6867 3369019 : next_partition_i += 1;
6868 : }
6869 3494995 : vertex.partition = partition_i;
6870 3494995 : num_partitioned_nodes += 1;
6871 3494995 : m_partitions[partition_i].node_end += 1;
6872 : }
6873 : }
6874 4848031 : rpo_begin = rpo_end;
6875 : }
6876 :
6877 : /* Assign ranges of consecutive node indices to each partition,
6878 : in partition order. Start with node_end being the same as
6879 : node_begin so that the next loop can use it as a counter. */
6880 681015 : unsigned int node_begin = 0;
6881 5412064 : for (auto &partition : m_partitions)
6882 : {
6883 3369019 : partition.node_begin = node_begin;
6884 3369019 : node_begin += partition.node_end;
6885 3369019 : partition.node_end = partition.node_begin;
6886 : }
6887 681015 : gcc_assert (node_begin == num_partitioned_nodes);
6888 :
6889 : /* Finally build the list of nodes in partition order. */
6890 681015 : m_partitioned_nodes.truncate (num_partitioned_nodes);
6891 5655969 : for (unsigned int node_i = 0; node_i < m_vertices.length (); ++node_i)
6892 : {
6893 4974954 : int partition_i = m_vertices[node_i].partition;
6894 4974954 : if (partition_i >= 0)
6895 : {
6896 3494995 : unsigned int order_i = m_partitions[partition_i].node_end++;
6897 3494995 : m_partitioned_nodes[order_i] = node_i;
6898 : }
6899 : }
6900 681015 : }
6901 :
6902 : /* Look for edges from earlier partitions into node NODE_I and edges from
6903 : node NODE_I into later partitions. Call:
6904 :
6905 : FN (ud, other_node_i)
6906 :
6907 : for each such use-to-def edge ud, where other_node_i is the node at the
6908 : other end of the edge.

       The function itself only requires the other end to have a partition
       (partition >= 0) that differs from the partition of NODE_I; it does
       not compare partition numbers. Incoming (pred) edges are reported
       before outgoing (succ) edges. */
6909 :
6910 : template<typename T>
6911 : void
6912 3937338 : vect_optimize_slp_pass::for_each_partition_edge (unsigned int node_i, T fn)
6913 : {
6914 3937338 : int partition_i = m_vertices[node_i].partition;
6915 3937338 : for (graph_edge *pred = m_slpg->vertices[node_i].pred;
6916 6830708 : pred; pred = pred->pred_next)
6917 : {
6918 2893370 : int src_partition_i = m_vertices[pred->src].partition;
6919 2893370 : if (src_partition_i >= 0 && src_partition_i != partition_i)
6920 2572159 : fn (pred, pred->src);
6921 : }
6922 3937338 : for (graph_edge *succ = m_slpg->vertices[node_i].succ;
6923 8465300 : succ; succ = succ->succ_next)
6924 : {
6925 4527962 : int dest_partition_i = m_vertices[succ->dest].partition;
6926 4527962 : if (dest_partition_i >= 0 && dest_partition_i != partition_i)
6927 2599922 : fn (succ, succ->dest);
6928 : }
6929 3937338 : }
6930 :
6931 : /* Return true if layout LAYOUT_I is compatible with the number of SLP lanes
6932 : that NODE would operate on. This test is independent of NODE's actual
6933 : operation.

       Layout 0 is always compatible. Any other layout is compatible only
       when its permutation in m_perms has exactly SLP_TREE_LANES (NODE)
       entries. */
6934 :
6935 : bool
6936 1769406 : vect_optimize_slp_pass::is_compatible_layout (slp_tree node,
6937 : unsigned int layout_i)
6938 : {
6939 1769406 : if (layout_i == 0)
6940 : return true;
6941 :
6942 1012492 : if (SLP_TREE_LANES (node) != m_perms[layout_i].length ())
6943 14802 : return false;
6944 :
6945 : return true;
6946 : }
6947 :
6948 : /* Return true if layout LAYOUT_I is compatible with the number of SLP lanes
6949 : that NODE would operate on for each NODE in PARTITION.
6950 : This test is independent of NODE's actual operations.

       The nodes checked are those at positions node_begin (inclusive) to
       node_end (exclusive) of m_partitioned_nodes. An empty range yields
       true. */
6951 :
6952 : bool
6953 17791 : vect_optimize_slp_pass::is_compatible_layout (const slpg_partition_info
6954 : &partition,
6955 : unsigned int layout_i)
6956 : {
6957 35854 : for (unsigned int order_i = partition.node_begin;
6958 35854 : order_i < partition.node_end; ++order_i)
6959 : {
6960 18129 : unsigned int node_i = m_partitioned_nodes[order_i];
6961 18129 : auto &vertex = m_vertices[node_i];
6962 :
6963 : /* The layout is incompatible if it is individually incompatible
6964 : with any node in the partition. */
6965 18129 : if (!is_compatible_layout (vertex.node, layout_i))
6966 : return false;
6967 : }
6968 : return true;
6969 : }
6970 :
6971 : /* Return the cost (in arbitrary units) of going from layout FROM_LAYOUT_I
6972 : to layout TO_LAYOUT_I for a node like NODE. Return -1 if either of the
6973 : layouts is incompatible with NODE or if the change is not possible for
6974 : some other reason.
6975 :
6976 : The properties taken from NODE include the number of lanes and the
6977 : vector type. The actual operation doesn't matter.

       Equal layouts cost 0. A change between different layouts that the
       permutation check accepts costs at least 1, even if the check
       reports a count of zero. */
6978 :
6979 : int
6980 756986 : vect_optimize_slp_pass::change_layout_cost (slp_tree node,
6981 : unsigned int from_layout_i,
6982 : unsigned int to_layout_i)
6983 : {
6984 756986 : if (!is_compatible_layout (node, from_layout_i)
6985 756986 : || !is_compatible_layout (node, to_layout_i))
6986 563 : return -1;
6987 :
6988 756423 : if (from_layout_i == to_layout_i)
6989 : return 0;
6990 :
       /* Describe the change as a lane permutation with NODE as its single
          input: start from m_perms[FROM_LAYOUT_I] (or from the identity
          for layout 0), then apply the reverse of m_perms[TO_LAYOUT_I]. */
6991 320840 : auto_vec<slp_tree, 1> children (1);
6992 320840 : children.quick_push (node);
6993 320840 : auto_lane_permutation_t perm (SLP_TREE_LANES (node));
6994 320840 : if (from_layout_i > 0)
6995 900928 : for (unsigned int i : m_perms[from_layout_i])
6996 393811 : perm.quick_push ({ 0, i });
6997 : else
6998 488218 : for (unsigned int i = 0; i < SLP_TREE_LANES (node); ++i)
6999 336417 : perm.quick_push ({ 0, i });
7000 320840 : if (to_layout_i > 0)
7001 152228 : vect_slp_permute (m_perms[to_layout_i], perm, true);
7002 320840 : auto count = vectorizable_slp_permutation_1 (m_vinfo, nullptr, node, perm,
7003 : children, false);
7004 320840 : if (count >= 0)
7005 316687 : return MAX (count, 1);
7006 :
7007 : /* ??? In principle we could try changing via layout 0, giving two
7008 : layout changes rather than 1. Doing that would require
7009 : corresponding support in get_result_with_layout. */
7010 : return -1;
7011 320840 : }
7012 :
7013 : /* Return the costs of assigning layout LAYOUT_I to partition PARTITION_I.

       m_partition_layout_costs is indexed as a flattened table with
       m_perms.length () consecutive entries per partition. The result is
       a reference into that table. */
7014 :
7015 : inline slpg_partition_layout_costs &
7016 1083115 : vect_optimize_slp_pass::partition_layout_costs (unsigned int partition_i,
7017 : unsigned int layout_i)
7018 : {
7019 2166230 : return m_partition_layout_costs[partition_i * m_perms.length () + layout_i];
7020 : }
7021 :
7022 : /* Change PERM in one of two ways:
7023 :
7024 : - if IN_LAYOUT_I < 0, accept input operand I in the layout that has been
7025 : chosen for child I of NODE.
7026 :
7027 : - if IN_LAYOUT_I >= 0, accept all input operands with that layout.
7028 :
7029 : In both cases, arrange for the output to have layout OUT_LAYOUT_I.

       PERM is modified in place. An input layout of 0 leaves the lane
       index of an entry unchanged, and OUT_LAYOUT_I of 0 leaves the order
       of the entries unchanged. */
7030 :
7031 : void
7032 30650 : vect_optimize_slp_pass::
7033 : change_vec_perm_layout (slp_tree node, lane_permutation_t &perm,
7034 : int in_layout_i, unsigned int out_layout_i)
7035 : {
7036 178116 : for (auto &entry : perm)
7037 : {
7038 86166 : int this_in_layout_i = in_layout_i;
7039 86166 : if (this_in_layout_i < 0)
7040 : {
7041 59911 : slp_tree in_node = SLP_TREE_CHILDREN (node)[entry.first];
7042 59911 : unsigned int in_partition_i = m_vertices[in_node->vertex].partition;
       /* A child without a partition (partition -1) keeps its lane
          index as it is. */
7043 59911 : if (in_partition_i == -1u)
7044 329 : continue;
7045 59582 : this_in_layout_i = m_partitions[in_partition_i].layout;
7046 : }
7047 85837 : if (this_in_layout_i > 0)
7048 19151 : entry.second = m_perms[this_in_layout_i][entry.second];
7049 : }
7050 30650 : if (out_layout_i > 0)
7051 7153 : vect_slp_permute (m_perms[out_layout_i], perm, true);
7052 30650 : }
7053 :
7054 : /* Check whether the target allows NODE to be rearranged so that the node's
7055 : output has layout OUT_LAYOUT_I. Return the cost of the change if so,
7056 : in the same arbitrary units as for change_layout_cost. Return -1 otherwise.
7057 :
7058 : If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I < 0, also check whether
7059 : NODE can adapt to the layout changes that have (perhaps provisionally)
7060 : been chosen for NODE's children, so that no extra permutations are
7061 : needed on either the input or the output of NODE.
7062 :
7063 : If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I >= 0, instead assume
7064 : that all inputs will be forced into layout IN_LAYOUT_I beforehand.
7065 :
7066 : IN_LAYOUT_I has no meaning for other types of node.
7067 :
7068 : Keeping the node as-is is always valid. If the target doesn't appear
7069 : to support the node as-is, but might realistically support other layouts,
7070 : then layout 0 instead has the cost of a worst-case permutation. On the
7071 : one hand, this ensures that every node has at least one valid layout,
7072 : avoiding what would otherwise be an awkward special case. On the other,
7073 : it still encourages the pass to change an invalid pre-existing layout
7074 : choice into a valid one.

       A node that is neither a permute node nor a read with a load
       permutation costs 0 whatever the layout. For the two handled kinds
       the result is one of -1, 0 and 1 (FALLBACK_COST is 1). */
7075 :
7076 : int
7077 233603 : vect_optimize_slp_pass::internal_node_cost (slp_tree node, int in_layout_i,
7078 : unsigned int out_layout_i)
7079 : {
7080 233603 : const int fallback_cost = 1;
7081 :
7082 233603 : if (SLP_TREE_PERMUTE_P (node))
7083 : {
       /* Work on a copy so that the lane permutation of NODE itself is
          left untouched. */
7084 25506 : auto_lane_permutation_t tmp_perm;
7085 25506 : tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
7086 :
7087 : /* Check that the child nodes support the chosen layout. Checking
7088 : the first child is enough, since any second child would have the
7089 : same shape. */
7090 25506 : auto first_child = SLP_TREE_CHILDREN (node)[0];
7091 25506 : if (in_layout_i > 0
7092 25506 : && !is_compatible_layout (first_child, in_layout_i))
7093 : return -1;
7094 :
7095 24947 : change_vec_perm_layout (node, tmp_perm, in_layout_i, out_layout_i);
7096 49894 : int count = vectorizable_slp_permutation_1 (m_vinfo, nullptr,
7097 : node, tmp_perm,
7098 24947 : SLP_TREE_CHILDREN (node),
7099 : false);
7100 24947 : if (count < 0)
7101 : {
7102 1498 : if (in_layout_i == 0 && out_layout_i == 0)
7103 : {
7104 : /* Use the fallback cost if the node could in principle support
7105 : some nonzero layout for both the inputs and the outputs.
7106 : Otherwise assume that the node will be rejected later
7107 : and rebuilt from scalars. */
7108 363 : if (SLP_TREE_LANES (node) == SLP_TREE_LANES (first_child))
7109 : return fallback_cost;
7110 293 : return 0;
7111 : }
7112 : return -1;
7113 : }
7114 :
7115 : /* We currently have no way of telling whether the new layout is cheaper
7116 : or more expensive than the old one. But at least in principle,
7117 : it should be worth making zero permutations (whole-vector shuffles)
7118 : cheaper than real permutations, in case the pass is able to remove
7119 : the latter. */
7120 23449 : return count == 0 ? 0 : 1;
7121 25506 : }
7122 :
7123 208097 : stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
7124 208097 : if (rep
7125 207165 : && STMT_VINFO_DATA_REF (rep)
7126 64004 : && DR_IS_READ (STMT_VINFO_DATA_REF (rep))
7127 254673 : && SLP_TREE_LOAD_PERMUTATION (node).exists ())
7128 : {
       /* Work on a copy of the load permutation, rearranged for
          OUT_LAYOUT_I when that is a nonzero layout. */
7129 39514 : auto_load_permutation_t tmp_perm;
7130 39514 : tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
7131 39514 : if (out_layout_i > 0)
7132 13543 : vect_slp_permute (m_perms[out_layout_i], tmp_perm, true);
7133 :
       /* The vectorization factor is 1 unless m_vinfo is a
          loop_vec_info. */
7134 39514 : poly_uint64 vf = 1;
7135 39514 : if (auto loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo))
7136 12066 : vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7137 39514 : unsigned int n_perms;
7138 39514 : if (!vect_transform_slp_perm_load_1 (m_vinfo, node, tmp_perm, vNULL,
7139 : nullptr, vf, true, false, &n_perms))
7140 : {
       /* NOTE(review): this REP shadows the outer REP; both are
          initialized from SLP_TREE_REPRESENTATIVE (node). */
7141 1492 : auto rep = SLP_TREE_REPRESENTATIVE (node);
7142 1492 : if (out_layout_i == 0)
7143 : {
7144 : /* Use the fallback cost if the load is an N-to-N permutation.
7145 : Otherwise assume that the node will be rejected later
7146 : and rebuilt from scalars. */
7147 1089 : if (STMT_VINFO_GROUPED_ACCESS (rep)
7148 2178 : && (DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (rep))
7149 1089 : == SLP_TREE_LANES (node)))
7150 593 : return fallback_cost;
7151 : return 0;
7152 : }
7153 : return -1;
7154 : }
7155 :
7156 : /* See the comment above the corresponding VEC_PERM_EXPR handling. */
7157 38022 : return n_perms == 0 ? 0 : 1;
7158 39514 : }
7159 :
7160 : return 0;
7161 : }
7162 :
7163 : /* Decide which element layouts we should consider using. Calculate the
7164 : weights associated with inserting layout changes on partition edges.
7165 : Also mark partitions that cannot change layout, by setting their
7166 : layout to zero.

       Candidate layouts are stored in m_perms, with index 0 reserved for
       "no change". No new layout is added once m_perms has reached
       param_vect_max_layout_candidates entries. m_partition_layout_costs
       is sized here to one cleared entry per (partition, layout) pair. */
7167 :
7168 : void
7169 681015 : vect_optimize_slp_pass::start_choosing_layouts ()
7170 : {
7171 : /* Used to assign unique permutation indices. */
7172 681015 : using perm_hash = unbounded_hashmap_traits<
7173 : vec_free_hash_base<int_hash_base<unsigned>>,
7174 : int_hash<int, -1, -2>
7175 : >;
7176 681015 : hash_map<vec<unsigned>, int, perm_hash> layout_ids;
7177 :
7178 : /* Layout 0 is "no change". */
7179 681015 : m_perms.safe_push (vNULL);
7180 :
7181 : /* Create layouts from existing permutations. */
7182 681015 : auto_load_permutation_t tmp_perm;
7183 5538040 : for (unsigned int node_i : m_partitioned_nodes)
7184 : {
7185 : /* Leafs also double as entries to the reverse graph. Allow the
7186 : layout of those to be changed. */
7187 3494995 : auto &vertex = m_vertices[node_i];
7188 3494995 : auto &partition = m_partitions[vertex.partition];
7189 3494995 : if (!m_slpg->vertices[node_i].succ)
7190 886667 : partition.layout = 0;
7191 :
7192 : /* Loads and VEC_PERM_EXPRs are the only things generating permutes. */
7193 3494995 : slp_tree node = vertex.node;
7194 3494995 : stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
7195 3494995 : slp_tree child;
7196 3494995 : unsigned HOST_WIDE_INT imin, imax = 0;
7197 3494995 : bool any_permute = false;
7198 3494995 : tmp_perm.truncate (0);
7199 3494995 : if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
7200 : {
7201 : /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the node
7202 : unpermuted, record a layout that reverses this permutation.
7203 :
7204 : We would need more work to cope with loads that are internally
7205 : permuted and also have inputs (such as masks for
7206 : IFN_MASK_LOADs). */
7207 596312 : gcc_assert (partition.layout == 0 && !m_slpg->vertices[node_i].succ);
7208 596312 : if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
7209 : {
       /* NOTE(review): a layout of -1 appears to mean that no
          layout has been fixed for the partition; the field is
          declared outside this chunk - confirm. */
7210 423197 : partition.layout = -1;
7211 3477716 : continue;
7212 : }
7213 173115 : dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
7214 173115 : imin = DR_GROUP_SIZE (dr_stmt) + 1;
7215 173115 : tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
7216 : }
7217 5679334 : else if (SLP_TREE_PERMUTE_P (node)
7218 136514 : && SLP_TREE_CHILDREN (node).length () == 1
7219 118032 : && (child = SLP_TREE_CHILDREN (node)[0])
7220 3016715 : && (TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (child))
7221 118032 : .is_constant (&imin)))
7222 : {
7223 : /* If the child has the same vector size as this node,
7224 : reversing the permutation can make the permutation a no-op.
7225 : In other cases it can change a true permutation into a
7226 : full-vector extract. */
7227 118032 : tmp_perm.reserve (SLP_TREE_LANES (node));
7228 316665 : for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
7229 198633 : tmp_perm.quick_push (SLP_TREE_LANE_PERMUTATION (node)[j].second);
7230 : }
7231 : else
7232 2780651 : continue;
7233 :
       /* Find the smallest and largest index used, and note whether the
          indices are anything other than consecutive and ascending. */
7234 768326 : for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
7235 : {
7236 477179 : unsigned idx = tmp_perm[j];
7237 477179 : imin = MIN (imin, idx);
7238 477179 : imax = MAX (imax, idx);
7239 477179 : if (idx - tmp_perm[0] != j)
7240 139073 : any_permute = true;
7241 : }
7242 : /* If the span doesn't match we'd disrupt VF computation, avoid
7243 : that for now. */
7244 291147 : if (imax - imin + 1 != SLP_TREE_LANES (node))
7245 82561 : continue;
7246 : /* If there's no permute no need to split one out. In this case
7247 : we can consider turning a load into a permuted load, if that
7248 : turns out to be cheaper than alternatives. */
7249 208586 : if (!any_permute)
7250 : {
7251 191169 : partition.layout = -1;
7252 191169 : continue;
7253 : }
7254 :
7255 : /* For now only handle true permutes, like
7256 : vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
7257 : when permuting constants and invariants keeping the permute
7258 : bijective. */
7259 17417 : auto_sbitmap load_index (SLP_TREE_LANES (node));
7260 17417 : bitmap_clear (load_index);
7261 66465 : for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
7262 49048 : bitmap_set_bit (load_index, tmp_perm[j] - imin);
7263 : unsigned j;
7264 65781 : for (j = 0; j < SLP_TREE_LANES (node); ++j)
7265 48502 : if (!bitmap_bit_p (load_index, j))
7266 : break;
7267 17417 : if (j != SLP_TREE_LANES (node))
7268 138 : continue;
7269 :
       /* The candidate layout is the permutation rebased to start at
          zero. */
7270 17279 : vec<unsigned> perm = vNULL;
7271 17279 : perm.safe_grow (SLP_TREE_LANES (node), true);
7272 65542 : for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
7273 48263 : perm[j] = tmp_perm[j] - imin;
7274 :
7275 34558 : if (int (m_perms.length ()) >= param_vect_max_layout_candidates)
7276 : {
7277 : /* Continue to use existing layouts, but don't add any more. */
7278 0 : int *entry = layout_ids.get (perm);
7279 0 : partition.layout = entry ? *entry : 0;
7280 0 : perm.release ();
7281 : }
7282 : else
7283 : {
7284 17279 : bool existed;
7285 17279 : int &layout_i = layout_ids.get_or_insert (perm, &existed);
7286 17279 : if (existed)
7287 6233 : perm.release ();
7288 : else
7289 : {
7290 11046 : layout_i = m_perms.length ();
7291 11046 : m_perms.safe_push (perm);
7292 : }
7293 17279 : partition.layout = layout_i;
7294 : }
7295 17417 : }
7296 :
7297 : /* Initially assume that every layout is possible and has zero cost
7298 : in every partition. */
7299 681015 : m_partition_layout_costs.safe_grow_cleared (m_partitions.length ()
7300 1362030 : * m_perms.length ());
7301 :
7302 : /* We have to mark outgoing permutations facing non-associating-reduction
7303 : graph entries that are not represented as to be materialized.
7304 : slp_inst_kind_bb_reduc currently only covers associatable reductions. */
7305 3509104 : for (slp_instance instance : m_vinfo->slp_instances)
7306 1466059 : if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor)
7307 : {
7308 6350 : unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
7309 6350 : m_partitions[m_vertices[node_i].partition].layout = 0;
7310 : }
7311 1459709 : else if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_reduc_chain)
7312 : {
7313 2255 : stmt_vec_info stmt_info
7314 2255 : = SLP_TREE_REPRESENTATIVE (SLP_INSTANCE_TREE (instance));
7315 2255 : vect_reduc_info reduc_info
7316 2255 : = info_for_reduction (as_a <loop_vec_info> (m_vinfo),
7317 : SLP_INSTANCE_TREE (instance));
7318 2255 : if (needs_fold_left_reduction_p (TREE_TYPE
7319 : (gimple_get_lhs (stmt_info->stmt)),
7320 : VECT_REDUC_INFO_CODE (reduc_info)))
7321 : {
7322 97 : unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
7323 97 : m_partitions[m_vertices[node_i].partition].layout = 0;
7324 : }
7325 : }
7326 :
7327 : /* Check which layouts each node and partition can handle. Calculate the
7328 : weights associated with inserting layout changes on edges. */
7329 5538040 : for (unsigned int node_i : m_partitioned_nodes)
7330 : {
7331 3494995 : auto &vertex = m_vertices[node_i];
7332 3494995 : auto &partition = m_partitions[vertex.partition];
7333 3494995 : slp_tree node = vertex.node;
7334 :
7335 3494995 : if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
7336 : {
7337 3489700 : vertex.weight = vect_slp_node_weight (node);
7338 :
7339 : /* We do not handle stores with a permutation, so all
7340 : incoming permutations must have been materialized.
7341 :
7342 : We also don't handle masked grouped loads, which lack a
7343 : permutation vector. In this case the memory locations
7344 : form an implicit second input to the loads, on top of the
7345 : explicit mask input, and the memory input's layout cannot
7346 : be changed.
7347 :
7348 : On the other hand, we do support permuting gather loads and
7349 : masked gather loads, where each scalar load is independent
7350 : of the others. This can be useful if the address/index input
7351 : benefits from permutation. */
7352 3489700 : if (STMT_VINFO_DATA_REF (rep)
7353 1761443 : && STMT_VINFO_GROUPED_ACCESS (rep)
7354 4583495 : && !SLP_TREE_LOAD_PERMUTATION (node).exists ())
7355 920680 : partition.layout = 0;
7356 :
7357 : /* We cannot change the layout of an operation that is
7358 : not independent on lanes. Note this is an explicit
7359 : negative list since that's much shorter than the respective
7360 : positive one but it's critical to keep maintaining it. */
7361 3489700 : if (is_gimple_call (STMT_VINFO_STMT (rep)))
7362 31722 : switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep)))
7363 : {
7364 1155 : case CFN_COMPLEX_ADD_ROT90:
7365 1155 : case CFN_COMPLEX_ADD_ROT270:
7366 1155 : case CFN_COMPLEX_MUL:
7367 1155 : case CFN_COMPLEX_MUL_CONJ:
7368 1155 : case CFN_VEC_ADDSUB:
7369 1155 : case CFN_VEC_FMADDSUB:
7370 1155 : case CFN_VEC_FMSUBADD:
7371 1155 : partition.layout = 0;
       /* No break is needed: the only thing that follows is the
          empty default. */
7372 : default:;
7373 : }
7374 : }
7375 :
7376 7882313 : auto process_edge = [&](graph_edge *ud, unsigned int other_node_i)
7377 : {
7378 4387318 : auto &other_vertex = m_vertices[other_node_i];
7379 :
7380 : /* Count the number of edges from earlier partitions and the number
7381 : of edges to later partitions. */
7382 4387318 : if (other_vertex.partition < vertex.partition)
7383 2193659 : partition.in_degree += 1;
7384 : else
7385 2193659 : partition.out_degree += 1;
7386 :
7387 : /* If the current node uses the result of OTHER_NODE_I, accumulate
7388 : the effects of that. */
7389 4387318 : if (ud->src == int (node_i))
7390 : {
7391 2193659 : other_vertex.out_weight += vertex.weight;
7392 2193659 : other_vertex.out_degree += 1;
7393 : }
7394 7882313 : };
7395 3494995 : for_each_partition_edge (node_i, process_edge);
7396 : }
7397 681015 : }
7398 :
7399 : /* Return the incoming costs for node NODE_I, assuming that each input keeps
7400 : its current (provisional) choice of layout. The inputs do not necessarily
7401 : have the same layout as each other.

       Only cross-partition edges whose other end lies in an earlier
       partition contribute. Each contribution is that partition's in_cost
       plus internal_cost for its current layout, split by its out_degree;
       the contributions are combined with add_parallel_cost. */
7402 :
7403 : slpg_layout_cost
7404 3183 : vect_optimize_slp_pass::total_in_cost (unsigned int node_i)
7405 : {
7406 3183 : auto &vertex = m_vertices[node_i];
7407 3183 : slpg_layout_cost cost;
7408 11635 : auto add_cost = [&](graph_edge *, unsigned int other_node_i)
7409 : {
7410 8452 : auto &other_vertex = m_vertices[other_node_i];
7411 8452 : if (other_vertex.partition < vertex.partition)
7412 : {
7413 5357 : auto &other_partition = m_partitions[other_vertex.partition];
7414 10714 : auto &other_costs = partition_layout_costs (other_vertex.partition,
7415 5357 : other_partition.layout);
7416 5357 : slpg_layout_cost this_cost = other_costs.in_cost;
7417 5357 : this_cost.add_serial_cost (other_costs.internal_cost);
7418 5357 : this_cost.split (other_partition.out_degree);
7419 5357 : cost.add_parallel_cost (this_cost);
7420 : }
7421 11635 : };
7422 3183 : for_each_partition_edge (node_i, add_cost);
7423 3183 : return cost;
7424 : }
7425 :
7426 : /* Return the cost of switching between layout LAYOUT1_I (at node NODE1_I)
7427 : and layout LAYOUT2_I on cross-partition use-to-def edge UD. Return
7428 : slpg_layout_cost::impossible () if the change isn't possible.

       NODE1_I may be either end of UD. The change is always costed on the
       definition node, going from the definition's layout to the use's
       layout. */
7429 :
7430 : slpg_layout_cost
7431 756986 : vect_optimize_slp_pass::
7432 : edge_layout_cost (graph_edge *ud, unsigned int node1_i, unsigned int layout1_i,
7433 : unsigned int layout2_i)
7434 : {
7435 756986 : auto &def_vertex = m_vertices[ud->dest];
7436 756986 : auto &use_vertex = m_vertices[ud->src];
       /* Work out which of the two layouts belongs to the definition and
          which to the use. */
7437 756986 : auto def_layout_i = ud->dest == int (node1_i) ? layout1_i : layout2_i;
7438 756986 : auto use_layout_i = ud->dest == int (node1_i) ? layout2_i : layout1_i;
7439 756986 : auto factor = change_layout_cost (def_vertex.node, def_layout_i,
7440 : use_layout_i);
7441 756986 : if (factor < 0)
7442 4716 : return slpg_layout_cost::impossible ();
7443 :
7444 : /* We have a choice of putting the layout change at the site of the
7445 : definition or at the site of the use. Prefer the former when
7446 : optimizing for size or when the execution frequency of the
7447 : definition is no greater than the combined execution frequencies of
7448 : the uses. When putting the layout change at the site of the definition,
7449 : divvy up the cost among all consumers. */
7450 752270 : if (m_optimize_size || def_vertex.weight <= def_vertex.out_weight)
7451 : {
7452 734850 : slpg_layout_cost cost = { def_vertex.weight * factor, m_optimize_size };
7453 734850 : cost.split (def_vertex.out_degree);
7454 734850 : return cost;
7455 : }
7456 17420 : return { use_vertex.weight * factor, m_optimize_size };
7457 : }
7458 :
7459 : /* UD represents a use-def link between FROM_NODE_I and a node in a later
7460 : partition; FROM_NODE_I could be the definition node or the use node.
7461 : The node at the other end of the link wants to use layout TO_LAYOUT_I.
7462 : Return the cost of any necessary fix-ups on edge UD, or return
7463 : slpg_layout_cost::impossible () if the change isn't possible.
7464 :
7465 : At this point, FROM_NODE_I's partition has chosen the cheapest
7466 : layout based on the information available so far, but this choice
7467 : is only provisional. */
7468 :
7469 : slpg_layout_cost
7470 199137 : vect_optimize_slp_pass::forward_cost (graph_edge *ud, unsigned int from_node_i,
7471 : unsigned int to_layout_i)
7472 : {
7473 199137 : auto &from_vertex = m_vertices[from_node_i];
7474 199137 : unsigned int from_partition_i = from_vertex.partition;
7475 199137 : slpg_partition_info &from_partition = m_partitions[from_partition_i];
7476 199137 : gcc_assert (from_partition.layout >= 0);
7477 :
7478 : /* First calculate the cost on the assumption that FROM_PARTITION sticks
7479 : with its current layout preference. */
7480 199137 : slpg_layout_cost cost = slpg_layout_cost::impossible ();
7481 199137 : auto edge_cost = edge_layout_cost (ud, from_node_i,
7482 199137 : from_partition.layout, to_layout_i);
7483 199137 : if (edge_cost.is_possible ())
7484 : {
7485 393344 : auto &from_costs = partition_layout_costs (from_partition_i,
7486 196672 : from_partition.layout);
7487 196672 : cost = from_costs.in_cost;
7488 196672 : cost.add_serial_cost (from_costs.internal_cost);
7489 196672 : cost.split (from_partition.out_degree);
7490 196672 : cost.add_serial_cost (edge_cost);
7491 : }
7492 2465 : else if (from_partition.layout == 0)
7493 : /* We must allow the source partition to have layout 0 as a fallback,
7494 : in case all other options turn out to be impossible. */
7495 2465 : return cost;
7496 :
7497 : /* Take the minimum of that cost and the cost that applies if
7498 : FROM_PARTITION instead switches to TO_LAYOUT_I. */
7499 196672 : auto &direct_layout_costs = partition_layout_costs (from_partition_i,
7500 : to_layout_i);
7501 196672 : if (direct_layout_costs.is_possible ())
7502 : {
7503 176892 : slpg_layout_cost direct_cost = direct_layout_costs.in_cost;
7504 176892 : direct_cost.add_serial_cost (direct_layout_costs.internal_cost);
7505 176892 : direct_cost.split (from_partition.out_degree);
7506 176892 : if (!cost.is_possible ()
7507 176892 : || direct_cost.is_better_than (cost, m_optimize_size))
7508 45073 : cost = direct_cost;
7509 : }
7510 :
7511 196672 : return cost;
7512 : }
7513 :
7514 : /* UD represents a use-def link between TO_NODE_I and a node in an earlier
7515 : partition; TO_NODE_I could be the definition node or the use node.
7516 : The node at the other end of the link wants to use layout FROM_LAYOUT_I;
7517 : return the cost of any necessary fix-ups on edge UD, or
7518 : slpg_layout_cost::impossible () if the choice cannot be made.
7519 :
7520 : At this point, TO_NODE_I's partition has a fixed choice of layout. */
7521 :
7522 : slpg_layout_cost
7523 183585 : vect_optimize_slp_pass::backward_cost (graph_edge *ud, unsigned int to_node_i,
7524 : unsigned int from_layout_i)
7525 : {
7526 183585 : auto &to_vertex = m_vertices[to_node_i];
7527 183585 : unsigned int to_partition_i = to_vertex.partition;
7528 183585 : slpg_partition_info &to_partition = m_partitions[to_partition_i];
7529 183585 : gcc_assert (to_partition.layout >= 0);
7530 :
7531 : /* If TO_NODE_I is a VEC_PERM_EXPR consumer, see whether it can be
7532 : adjusted for this input having layout FROM_LAYOUT_I. Assume that
7533 : any other inputs keep their current choice of layout. */
7534 183585 : auto &to_costs = partition_layout_costs (to_partition_i,
7535 : to_partition.layout);
7536 183585 : if (ud->src == int (to_node_i)
7537 183383 : && SLP_TREE_PERMUTE_P (to_vertex.node))
7538 : {
7539 9507 : auto &from_partition = m_partitions[m_vertices[ud->dest].partition];
7540 9507 : auto old_layout = from_partition.layout;
7541 9507 : from_partition.layout = from_layout_i;
7542 19014 : int factor = internal_node_cost (to_vertex.node, -1,
7543 9507 : to_partition.layout);
7544 9507 : from_partition.layout = old_layout;
7545 9507 : if (factor >= 0)
7546 : {
7547 8881 : slpg_layout_cost cost = to_costs.out_cost;
7548 17762 : cost.add_serial_cost ({ to_vertex.weight * factor,
7549 8881 : m_optimize_size });
7550 8881 : cost.split (to_partition.in_degree);
7551 8881 : return cost;
7552 : }
7553 : }
7554 :
7555 : /* Compute the cost if we insert any necessary layout change on edge UD. */
7556 174704 : auto edge_cost = edge_layout_cost (ud, to_node_i,
7557 174704 : to_partition.layout, from_layout_i);
7558 174704 : if (edge_cost.is_possible ())
7559 : {
7560 174704 : slpg_layout_cost cost = to_costs.out_cost;
7561 174704 : cost.add_serial_cost (to_costs.internal_cost);
7562 174704 : cost.split (to_partition.in_degree);
7563 174704 : cost.add_serial_cost (edge_cost);
7564 174704 : return cost;
7565 : }
7566 :
7567 0 : return slpg_layout_cost::impossible ();
7568 : }
7569 :
7570 : /* Make a forward pass through the partitions, accumulating input costs.
7571 : Make a tentative (provisional) choice of layout for each partition,
7572 : ensuring that this choice still allows later partitions to keep
7573 : their original layout.
     :
     : For every (partition, layout) pair that is still possible, the pass
     : accumulates into the in_cost and internal_cost fields of the entry
     : returned by partition_layout_costs, or marks that entry impossible.
     : The cheapest surviving layout is then stored in the partition's
     : layout field; the function asserts that at least one layout
     : survives for each partition. */
7574 :
7575 : void
7576 5690 : vect_optimize_slp_pass::forward_pass ()
7577 : {
7578 125452 : for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
7579 : ++partition_i)
7580 : {
7581 119762 : auto &partition = m_partitions[partition_i];
7582 :
7583 : /* If the partition consists of a single VEC_PERM_EXPR, precompute
7584 : the incoming cost that would apply if every predecessor partition
7585 : keeps its current layout. This is used within the loop below.
     : SINGLE_NODE is set for every single-node partition, but IN_COST
     : is only computed when that node is a permutation. */
7586 119762 : slpg_layout_cost in_cost;
7587 119762 : slp_tree single_node = nullptr;
7588 119762 : if (partition.node_end == partition.node_begin + 1)
7589 : {
7590 113591 : unsigned int node_i = m_partitioned_nodes[partition.node_begin];
7591 113591 : single_node = m_vertices[node_i].node;
7592 113591 : if (SLP_TREE_PERMUTE_P (single_node))
7593 3183 : in_cost = total_in_cost (node_i);
7594 : }
7595 :
7596 : /* Go through the possible layouts. Decide which ones are valid
7597 : for this partition and record which of the valid layouts has
7598 : the lowest cost. */
7599 119762 : unsigned int min_layout_i = 0;
7600 119762 : slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
7601 365139 : for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
7602 : {
7603 245377 : auto &layout_costs = partition_layout_costs (partition_i, layout_i);
7604 245377 : if (!layout_costs.is_possible ())
7605 55736 : continue;
7606 :
7607 : /* If the recorded layout is already 0 then the layout cannot
7608 : change, so reject every nonzero layout for this partition. */
7609 245377 : if (partition.layout == 0 && layout_i != 0)
7610 : {
7611 39012 : layout_costs.mark_impossible ();
7612 39012 : continue;
7613 : }
7614 :
7615 206365 : bool is_possible = true;
7616 423897 : for (unsigned int order_i = partition.node_begin;
7617 423897 : order_i < partition.node_end; ++order_i)
7618 : {
7619 232178 : unsigned int node_i = m_partitioned_nodes[order_i];
7620 232178 : auto &vertex = m_vertices[node_i];
7621 :
7622 : /* Reject the layout if it is individually incompatible
7623 : with any node in the partition. */
7624 232178 : if (!is_compatible_layout (vertex.node, layout_i))
7625 : {
7626 13614 : is_possible = false;
7627 14646 : break;
7628 : }
7629 :
     : /* Callback for each cross-partition edge UD of NODE_I, where
     : OTHER_NODE_I is the node at the other end. It updates
     : IS_POSSIBLE and LAYOUT_COSTS.in_cost by reference. */
7630 604299 : auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
7631 : {
7632 385735 : auto &other_vertex = m_vertices[other_node_i];
7633 385735 : if (other_vertex.partition < vertex.partition)
7634 : {
7635 : /* Accumulate the incoming costs from earlier
7636 : partitions, plus the cost of any layout changes
7637 : on UD itself. */
7638 199137 : auto cost = forward_cost (ud, other_node_i, layout_i);
7639 199137 : if (!cost.is_possible ())
7640 2465 : is_possible = false;
7641 : else
7642 196672 : layout_costs.in_cost.add_parallel_cost (cost);
7643 : }
7644 : else
7645 : /* Reject the layout if it would make layout 0 impossible
7646 : for later partitions. This amounts to testing that the
7647 : target supports reversing the layout change on edges
7648 : to later partitions.
7649 :
7650 : In principle, it might be possible to push a layout
7651 : change all the way down a graph, so that it never
7652 : needs to be reversed and so that the target doesn't
7653 : need to support the reverse operation. But it would
7654 : be awkward to bail out if we hit a partition that
7655 : does not support the new layout, especially since
7656 : we are not dealing with a lattice. */
7657 186598 : is_possible &= edge_layout_cost (ud, other_node_i, 0,
7658 186598 : layout_i).is_possible ();
7659 604299 : };
7660 218564 : for_each_partition_edge (node_i, add_cost);
7661 :
7662 : /* Accumulate the cost of using LAYOUT_I within NODE,
7663 : both for the inputs and the outputs. Note that a failure
     : recorded by ADD_COST above does not stop the walk over the
     : nodes; only an incompatible node or a negative internal
     : cost breaks out early. Either way the layout is rejected
     : after the loop if IS_POSSIBLE is false. */
7664 218564 : int factor = internal_node_cost (vertex.node, layout_i,
7665 : layout_i);
7666 218564 : if (factor < 0)
7667 : {
7668 1032 : is_possible = false;
7669 1032 : break;
7670 : }
7671 217532 : else if (factor)
7672 36165 : layout_costs.internal_cost.add_serial_cost
7673 36165 : ({ vertex.weight * factor, m_optimize_size });
7674 : }
7675 206365 : if (!is_possible)
7676 : {
7677 16724 : layout_costs.mark_impossible ();
7678 16724 : continue;
7679 : }
7680 :
7681 : /* Combine the incoming and partition-internal costs. */
7682 189641 : slpg_layout_cost combined_cost = layout_costs.in_cost;
7683 189641 : combined_cost.add_serial_cost (layout_costs.internal_cost);
7684 :
7685 : /* If this partition consists of a single VEC_PERM_EXPR, see
7686 : if the VEC_PERM_EXPR can be changed to support output layout
7687 : LAYOUT_I while keeping all the provisional choices of input
7688 : layout. If that alternative is better, it replaces both the
     : combined cost and the recorded in_cost and internal_cost for
     : this layout. */
7689 189641 : if (single_node && SLP_TREE_PERMUTE_P (single_node))
7690 : {
7691 5532 : int factor = internal_node_cost (single_node, -1, layout_i);
7692 5532 : if (factor >= 0)
7693 : {
7694 5093 : auto weight = m_vertices[single_node->vertex].weight;
7695 5093 : slpg_layout_cost internal_cost
7696 5093 : = { weight * factor, m_optimize_size };
7697 :
7698 5093 : slpg_layout_cost alt_cost = in_cost;
7699 5093 : alt_cost.add_serial_cost (internal_cost);
7700 5093 : if (alt_cost.is_better_than (combined_cost, m_optimize_size))
7701 : {
7702 1604 : combined_cost = alt_cost;
7703 1604 : layout_costs.in_cost = in_cost;
7704 1604 : layout_costs.internal_cost = internal_cost;
7705 : }
7706 : }
7707 : }
7708 :
7709 : /* Record the layout with the lowest cost. Prefer layout 0 in
7710 : the event of a tie between it and another layout: layout 0 is
     : visited first and a later layout only displaces the current
     : minimum when is_better_than returns true for it. */
7711 189641 : if (!min_layout_cost.is_possible ()
7712 69879 : || combined_cost.is_better_than (min_layout_cost,
7713 69879 : m_optimize_size))
7714 : {
7715 134500 : min_layout_i = layout_i;
7716 134500 : min_layout_cost = combined_cost;
7717 : }
7718 : }
7719 :
7720 : /* This loop's handling of earlier partitions should ensure that
7721 : choosing the original layout for the current partition is no
7722 : less valid than it was in the original graph, even with the
7723 : provisional layout choices for those earlier partitions. */
7724 119762 : gcc_assert (min_layout_cost.is_possible ());
7725 119762 : partition.layout = min_layout_i;
7726 : }
7727 5690 : }
7728 :
7729 : /* Make a backward pass through the partitions, accumulating output costs.
7730 : Make a final choice of layout for each partition. */
7731 :
7732 : void
7733 5690 : vect_optimize_slp_pass::backward_pass ()
7734 : {
7735 131142 : for (unsigned int partition_i = m_partitions.length (); partition_i-- > 0;)
7736 : {
7737 119762 : auto &partition = m_partitions[partition_i];
7738 :
7739 119762 : unsigned int min_layout_i = 0;
7740 119762 : slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
7741 365139 : for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
7742 : {
7743 245377 : auto &layout_costs = partition_layout_costs (partition_i, layout_i);
7744 245377 : if (!layout_costs.is_possible ())
7745 55736 : continue;
7746 :
7747 : /* Accumulate the costs from successor partitions. */
7748 189641 : bool is_possible = true;
7749 405064 : for (unsigned int order_i = partition.node_begin;
7750 405064 : order_i < partition.node_end; ++order_i)
7751 : {
7752 215423 : unsigned int node_i = m_partitioned_nodes[order_i];
7753 215423 : auto &vertex = m_vertices[node_i];
7754 595555 : auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
7755 : {
7756 380132 : auto &other_vertex = m_vertices[other_node_i];
7757 380132 : auto &other_partition = m_partitions[other_vertex.partition];
7758 380132 : if (other_vertex.partition > vertex.partition)
7759 : {
7760 : /* Accumulate the incoming costs from later
7761 : partitions, plus the cost of any layout changes
7762 : on UD itself. */
7763 183585 : auto cost = backward_cost (ud, other_node_i, layout_i);
7764 183585 : if (!cost.is_possible ())
7765 0 : is_possible = false;
7766 : else
7767 183585 : layout_costs.out_cost.add_parallel_cost (cost);
7768 : }
7769 : else
7770 : /* Make sure that earlier partitions can (if necessary
7771 : or beneficial) keep the layout that they chose in
7772 : the forward pass. This ensures that there is at
7773 : least one valid choice of layout. */
7774 196547 : is_possible &= edge_layout_cost (ud, other_node_i,
7775 196547 : other_partition.layout,
7776 196547 : layout_i).is_possible ();
7777 595555 : };
7778 215423 : for_each_partition_edge (node_i, add_cost);
7779 : }
7780 189641 : if (!is_possible)
7781 : {
7782 0 : layout_costs.mark_impossible ();
7783 0 : continue;
7784 : }
7785 :
7786 : /* Locally combine the costs from the forward and backward passes.
7787 : (This combined cost is not passed on, since that would lead
7788 : to double counting.) */
7789 189641 : slpg_layout_cost combined_cost = layout_costs.in_cost;
7790 189641 : combined_cost.add_serial_cost (layout_costs.internal_cost);
7791 189641 : combined_cost.add_serial_cost (layout_costs.out_cost);
7792 :
7793 : /* Record the layout with the lowest cost. Prefer layout 0 in
7794 : the event of a tie between it and another layout. */
7795 189641 : if (!min_layout_cost.is_possible ()
7796 69879 : || combined_cost.is_better_than (min_layout_cost,
7797 69879 : m_optimize_size))
7798 : {
7799 127848 : min_layout_i = layout_i;
7800 127848 : min_layout_cost = combined_cost;
7801 : }
7802 : }
7803 :
7804 119762 : gcc_assert (min_layout_cost.is_possible ());
7805 119762 : partition.layout = min_layout_i;
7806 : }
7807 5690 : }
7808 :
7809 : /* Return a node that applies layout TO_LAYOUT_I to the original form of NODE.
7810 : NODE already has the layout that was selected for its partition.
     :
     : Results are cached in m_node_layouts, indexed by NODE's vertex number
     : and TO_LAYOUT_I, so that a repeated request returns the same node.
     : Constant nodes, and external nodes without vector defs, are either
     : reused as-is or rebuilt from permuted scalar operands. For any other
     : node whose partition layout differs from TO_LAYOUT_I, a new
     : VEC_PERM_EXPR node is created; it takes a reference on each of its
     : children. */
7811 :
7812 : slp_tree
7813 166411 : vect_optimize_slp_pass::get_result_with_layout (slp_tree node,
7814 : unsigned int to_layout_i)
7815 : {
7816 166411 : unsigned int result_i = node->vertex * m_perms.length () + to_layout_i; /* Cache slot for (NODE, TO_LAYOUT_I). */
7817 166411 : slp_tree result = m_node_layouts[result_i];
7818 166411 : if (result)
7819 : return result;
7820 :
7821 165919 : if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
7822 165919 : || (SLP_TREE_DEF_TYPE (node) == vect_external_def
7823 : /* We can't permute vector defs in place. */
7824 20220 : && SLP_TREE_VEC_DEFS (node).is_empty ()))
7825 : {
7826 : /* If the vector is uniform or unchanged, there's nothing to do. */
7827 38143 : if (to_layout_i == 0 || vect_slp_tree_uniform_p (node))
7828 : result = node;
7829 : else
7830 : {
     : /* Build a new node from a copy of the scalar operands and then
     : permute that copy into layout TO_LAYOUT_I.
     : NOTE(review): the permutation is applied after the node is
     : created, so this presumably relies on the new node sharing
     : storage with SCALAR_OPS -- confirm against
     : vect_create_new_slp_node. */
7831 2009 : auto scalar_ops = SLP_TREE_SCALAR_OPS (node).copy ();
7832 2009 : result = vect_create_new_slp_node (scalar_ops);
7833 2009 : vect_slp_permute (m_perms[to_layout_i], scalar_ops, true);
7834 : }
7835 : }
7836 : else
7837 : {
7838 127776 : unsigned int partition_i = m_vertices[node->vertex].partition;
7839 127776 : unsigned int from_layout_i = m_partitions[partition_i].layout;
7840 127776 : if (from_layout_i == to_layout_i)
7841 127207 : return node; /* Already in the requested layout; this path does not update m_node_layouts. */
7842 :
7843 : /* If NODE is itself a VEC_PERM_EXPR, try to create a parallel
7844 : permutation instead of a serial one. Leave the new permutation
7845 : in TMP_PERM on success. On failure TMP_PERM is emptied again and
     : NUM_INPUTS stays at 1, which selects the serial form below. */
7846 569 : auto_lane_permutation_t tmp_perm;
7847 569 : unsigned int num_inputs = 1;
7848 569 : if (SLP_TREE_PERMUTE_P (node))
7849 : {
7850 7 : tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
7851 7 : if (from_layout_i != 0)
7852 7 : vect_slp_permute (m_perms[from_layout_i], tmp_perm, false);
7853 7 : if (to_layout_i != 0)
7854 4 : vect_slp_permute (m_perms[to_layout_i], tmp_perm, true);
7855 7 : if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
7856 : tmp_perm,
7857 7 : SLP_TREE_CHILDREN (node),
7858 : false) >= 0)
7859 7 : num_inputs = SLP_TREE_CHILDREN (node).length ();
7860 : else
7861 0 : tmp_perm.truncate (0);
7862 : }
7863 :
7864 569 : if (dump_enabled_p ())
7865 : {
7866 68 : if (tmp_perm.length () > 0)
7867 6 : dump_printf_loc (MSG_NOTE, vect_location,
7868 : "duplicating permutation node %p with"
7869 : " layout %d\n",
7870 : (void *) node, to_layout_i);
7871 : else
7872 62 : dump_printf_loc (MSG_NOTE, vect_location,
7873 : "inserting permutation node in place of %p\n",
7874 : (void *) node);
7875 : }
7876 :
     : /* Create the new VEC_PERM_EXPR node. Its scalar statements, if
     : NODE has any, are NODE's statements taken from layout
     : FROM_LAYOUT_I to layout TO_LAYOUT_I. */
7877 569 : unsigned int num_lanes = SLP_TREE_LANES (node);
7878 569 : result = vect_create_new_slp_node (num_inputs, VEC_PERM_EXPR);
7879 569 : if (SLP_TREE_SCALAR_STMTS (node).length ())
7880 : {
7881 568 : auto &stmts = SLP_TREE_SCALAR_STMTS (result);
7882 568 : stmts.safe_splice (SLP_TREE_SCALAR_STMTS (node));
7883 568 : if (from_layout_i != 0)
7884 298 : vect_slp_permute (m_perms[from_layout_i], stmts, false);
7885 568 : if (to_layout_i != 0)
7886 274 : vect_slp_permute (m_perms[to_layout_i], stmts, true);
7887 : }
7888 569 : SLP_TREE_REPRESENTATIVE (result) = SLP_TREE_REPRESENTATIVE (node);
7889 569 : SLP_TREE_LANES (result) = num_lanes;
7890 569 : SLP_TREE_VECTYPE (result) = SLP_TREE_VECTYPE (node);
7891 569 : result->vertex = -1; /* NOTE(review): presumably marks RESULT as having no vertex in the partition graph -- confirm. */
7892 :
7893 569 : auto &lane_perm = SLP_TREE_LANE_PERMUTATION (result);
7894 569 : if (tmp_perm.length ())
7895 : {
     : /* Parallel form: reuse NODE's own children with the adjusted
     : lane permutation. */
7896 7 : lane_perm.safe_splice (tmp_perm);
7897 7 : SLP_TREE_CHILDREN (result).safe_splice (SLP_TREE_CHILDREN (node));
7898 : }
7899 : else
7900 : {
     : /* Serial form: start from an identity selection of the lanes
     : of input 0, convert it from layout FROM_LAYOUT_I to layout
     : TO_LAYOUT_I, and make NODE the only child. */
7901 562 : lane_perm.create (num_lanes);
7902 1750 : for (unsigned j = 0; j < num_lanes; ++j)
7903 1188 : lane_perm.quick_push ({ 0, j });
7904 562 : if (from_layout_i != 0)
7905 291 : vect_slp_permute (m_perms[from_layout_i], lane_perm, false);
7906 562 : if (to_layout_i != 0)
7907 271 : vect_slp_permute (m_perms[to_layout_i], lane_perm, true);
7908 562 : SLP_TREE_CHILDREN (result).safe_push (node);
7909 : }
     : /* The new node holds a reference to each of its children. */
7910 2280 : for (slp_tree child : SLP_TREE_CHILDREN (result))
7911 573 : child->refcnt++;
7912 569 : }
7913 38712 : m_node_layouts[result_i] = result; /* Remember the result for later requests. */
7914 38712 : return result;
7915 : }
7916 :
7917 : /* Apply the chosen vector layouts to the SLP graph. */
7918 :
7919 : void
7920 10629 : vect_optimize_slp_pass::materialize ()
7921 : {
7922 : /* We no longer need the costs, so avoid having two O(N * P) arrays
7923 : live at the same time. */
7924 10629 : m_partition_layout_costs.release ();
7925 31887 : m_node_layouts.safe_grow_cleared (m_vertices.length () * m_perms.length ());
7926 :
7927 21258 : auto_sbitmap fully_folded (m_vertices.length ());
7928 10629 : bitmap_clear (fully_folded);
7929 174264 : for (unsigned int node_i : m_partitioned_nodes)
7930 : {
7931 142377 : auto &vertex = m_vertices[node_i];
7932 142377 : slp_tree node = vertex.node;
7933 142377 : int layout_i = m_partitions[vertex.partition].layout;
7934 142377 : gcc_assert (layout_i >= 0);
7935 :
7936 : /* Rearrange the scalar statements to match the chosen layout. */
7937 142377 : if (layout_i > 0)
7938 15986 : vect_slp_permute (m_perms[layout_i],
7939 15986 : SLP_TREE_SCALAR_STMTS (node), true);
7940 :
7941 : /* Update load and lane permutations. */
7942 142377 : if (SLP_TREE_PERMUTE_P (node))
7943 : {
7944 : /* First try to absorb the input vector layouts. If that fails,
7945 : force the inputs to have layout LAYOUT_I too. We checked that
7946 : that was possible before deciding to use nonzero output layouts.
7947 : (Note that at this stage we don't really have any guarantee that
7948 : the target supports the original VEC_PERM_EXPR.) */
7949 5340 : auto &perm = SLP_TREE_LANE_PERMUTATION (node);
7950 5340 : auto_lane_permutation_t tmp_perm;
7951 5340 : tmp_perm.safe_splice (perm);
7952 5340 : change_vec_perm_layout (node, tmp_perm, -1, layout_i);
7953 5340 : if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
7954 : tmp_perm,
7955 5340 : SLP_TREE_CHILDREN (node),
7956 : false) >= 0)
7957 : {
7958 4977 : if (dump_enabled_p ()
7959 5897 : && !std::equal (tmp_perm.begin (), tmp_perm.end (),
7960 : perm.begin ()))
7961 58 : dump_printf_loc (MSG_NOTE, vect_location,
7962 : "absorbing input layouts into %p\n",
7963 : (void *) node);
7964 28034 : std::copy (tmp_perm.begin (), tmp_perm.end (), perm.begin ());
7965 4977 : bitmap_set_bit (fully_folded, node_i);
7966 : }
7967 : else
7968 : {
7969 : /* Not MSG_MISSED because it would make no sense to users. */
7970 363 : if (dump_enabled_p ())
7971 46 : dump_printf_loc (MSG_NOTE, vect_location,
7972 : "failed to absorb input layouts into %p\n",
7973 : (void *) node);
7974 363 : change_vec_perm_layout (nullptr, perm, layout_i, layout_i);
7975 : }
7976 5340 : }
7977 : else
7978 : {
7979 137037 : gcc_assert (!SLP_TREE_LANE_PERMUTATION (node).exists ());
7980 137037 : auto &load_perm = SLP_TREE_LOAD_PERMUTATION (node);
7981 137037 : if (layout_i > 0)
7982 : /* ??? When we handle non-bijective permutes the idea
7983 : is that we can force the load-permutation to be
7984 : { min, min + 1, min + 2, ... max }. But then the
7985 : scalar defs might no longer match the lane content
7986 : which means wrong-code with live lane vectorization.
7987 : So we possibly have to have NULL entries for those. */
7988 15883 : vect_slp_permute (m_perms[layout_i], load_perm, true);
7989 : }
7990 : }
7991 :
7992 : /* Do this before any nodes disappear, since it involves a walk
7993 : over the leaves. */
7994 10629 : remove_redundant_permutations ();
7995 :
7996 : /* Replace each child with a correctly laid-out version. */
7997 174264 : for (unsigned int node_i : m_partitioned_nodes)
7998 : {
7999 : /* Skip nodes that have already been handled above. */
8000 142377 : if (bitmap_bit_p (fully_folded, node_i))
8001 4977 : continue;
8002 :
8003 137400 : auto &vertex = m_vertices[node_i];
8004 137400 : int in_layout_i = m_partitions[vertex.partition].layout;
8005 137400 : gcc_assert (in_layout_i >= 0);
8006 :
8007 : unsigned j;
8008 : slp_tree child;
8009 412812 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (vertex.node), j, child)
8010 : {
8011 172369 : if (!child)
8012 5958 : continue;
8013 :
8014 166411 : slp_tree new_child = get_result_with_layout (child, in_layout_i);
8015 166411 : if (new_child != child)
8016 : {
8017 2813 : vect_free_slp_tree (child);
8018 2813 : SLP_TREE_CHILDREN (vertex.node)[j] = new_child;
8019 2813 : new_child->refcnt += 1;
8020 : }
8021 : }
8022 : }
8023 10629 : }
8024 :
8025 : /* Elide load permutations that are not necessary. Such permutations might
8026 : be pre-existing, rather than created by the layout optimizations. */
8027 :
8028 : void
8029 681015 : vect_optimize_slp_pass::remove_redundant_permutations ()
8030 : {
8031 4499909 : for (unsigned int node_i : m_leafs)
8032 : {
8033 2456864 : slp_tree node = m_vertices[node_i].node;
8034 2456864 : if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
8035 1860552 : continue;
8036 :
8037 : /* In basic block vectorization we allow any subchain of an interleaving
8038 : chain.
8039 : FORNOW: not in loop SLP because of realignment complications. */
8040 596312 : if (is_a <bb_vec_info> (m_vinfo))
8041 : {
8042 159613 : bool subchain_p = true;
8043 : stmt_vec_info next_load_info = NULL;
8044 : stmt_vec_info load_info;
8045 : unsigned j;
8046 159613 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
8047 : {
8048 129714 : if (j != 0
8049 129714 : && (next_load_info != load_info
8050 61955 : || ! load_info
8051 61955 : || DR_GROUP_GAP (load_info) != 1))
8052 : {
8053 : subchain_p = false;
8054 : break;
8055 : }
8056 107172 : next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
8057 : }
8058 52441 : if (subchain_p)
8059 : {
8060 29899 : SLP_TREE_LOAD_PERMUTATION (node).release ();
8061 29899 : continue;
8062 : }
8063 : }
8064 : else
8065 : {
8066 543871 : loop_vec_info loop_vinfo = as_a<loop_vec_info> (m_vinfo);
8067 543871 : bool this_load_permuted = !vect_load_perm_consecutive_p (node, 0);
8068 : /* When this isn't a grouped access we know it's single element
8069 : and contiguous. */
8070 543871 : if (!STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (node)[0]))
8071 : {
8072 423197 : if (!this_load_permuted
8073 423197 : && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
8074 422402 : || SLP_TREE_LANES (node) == 1))
8075 422404 : SLP_TREE_LOAD_PERMUTATION (node).release ();
8076 423197 : continue;
8077 : }
8078 120674 : stmt_vec_info first_stmt_info
8079 120674 : = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
8080 121179 : if (!this_load_permuted
8081 : /* The load requires permutation when unrolling exposes
8082 : a gap either because the group is larger than the SLP
8083 : group-size or because there is a gap between the groups. */
8084 120674 : && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
8085 98477 : || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
8086 140 : && DR_GROUP_GAP (first_stmt_info) == 0)))
8087 : {
8088 505 : SLP_TREE_LOAD_PERMUTATION (node).release ();
8089 505 : continue;
8090 : }
8091 : }
8092 : }
8093 681015 : }
8094 :
8095 : /* Print the partition graph and layout information to the dump file. */
8096 :
8097 : void
8098 674 : vect_optimize_slp_pass::dump ()
8099 : {
8100 674 : dump_printf_loc (MSG_NOTE, vect_location,
8101 : "SLP optimize permutations:\n");
8102 1361 : for (unsigned int layout_i = 1; layout_i < m_perms.length (); ++layout_i)
8103 : {
8104 687 : dump_printf_loc (MSG_NOTE, vect_location, " %d: { ", layout_i);
8105 687 : const char *sep = "";
8106 5866 : for (unsigned int idx : m_perms[layout_i])
8107 : {
8108 3805 : dump_printf (MSG_NOTE, "%s%d", sep, idx);
8109 3805 : sep = ", ";
8110 : }
8111 687 : dump_printf (MSG_NOTE, " }\n");
8112 : }
8113 674 : dump_printf_loc (MSG_NOTE, vect_location,
8114 : "SLP optimize partitions:\n");
8115 5612 : for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
8116 : ++partition_i)
8117 : {
8118 4938 : auto &partition = m_partitions[partition_i];
8119 4938 : dump_printf_loc (MSG_NOTE, vect_location, " -------------\n");
8120 4938 : dump_printf_loc (MSG_NOTE, vect_location,
8121 : " partition %d (layout %d):\n",
8122 : partition_i, partition.layout);
8123 4938 : dump_printf_loc (MSG_NOTE, vect_location, " nodes:\n");
8124 10111 : for (unsigned int order_i = partition.node_begin;
8125 10111 : order_i < partition.node_end; ++order_i)
8126 : {
8127 5173 : auto &vertex = m_vertices[m_partitioned_nodes[order_i]];
8128 10346 : dump_printf_loc (MSG_NOTE, vect_location, " - %p:\n",
8129 5173 : (void *) vertex.node);
8130 5173 : dump_printf_loc (MSG_NOTE, vect_location,
8131 : " weight: %f\n",
8132 : vertex.weight.to_double ());
8133 5173 : if (vertex.out_degree)
8134 4050 : dump_printf_loc (MSG_NOTE, vect_location,
8135 : " out weight: %f (degree %d)\n",
8136 : vertex.out_weight.to_double (),
8137 : vertex.out_degree);
8138 5173 : if (SLP_TREE_PERMUTE_P (vertex.node))
8139 506 : dump_printf_loc (MSG_NOTE, vect_location,
8140 : " op: VEC_PERM_EXPR\n");
8141 4667 : else if (auto rep = SLP_TREE_REPRESENTATIVE (vertex.node))
8142 4649 : dump_printf_loc (MSG_NOTE, vect_location,
8143 : " op template: %G", rep->stmt);
8144 : }
8145 4938 : dump_printf_loc (MSG_NOTE, vect_location, " edges:\n");
8146 10111 : for (unsigned int order_i = partition.node_begin;
8147 10111 : order_i < partition.node_end; ++order_i)
8148 : {
8149 5173 : unsigned int node_i = m_partitioned_nodes[order_i];
8150 5173 : auto &vertex = m_vertices[node_i];
8151 15617 : auto print_edge = [&](graph_edge *, unsigned int other_node_i)
8152 : {
8153 10444 : auto &other_vertex = m_vertices[other_node_i];
8154 10444 : if (other_vertex.partition < vertex.partition)
8155 5222 : dump_printf_loc (MSG_NOTE, vect_location,
8156 : " - %p [%d] --> %p\n",
8157 5222 : (void *) other_vertex.node,
8158 : other_vertex.partition,
8159 5222 : (void *) vertex.node);
8160 : else
8161 5222 : dump_printf_loc (MSG_NOTE, vect_location,
8162 : " - %p --> [%d] %p\n",
8163 5222 : (void *) vertex.node,
8164 : other_vertex.partition,
8165 5222 : (void *) other_vertex.node);
8166 15617 : };
8167 5173 : for_each_partition_edge (node_i, print_edge);
8168 : }
8169 :
8170 15013 : for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
8171 : {
8172 10075 : auto &layout_costs = partition_layout_costs (partition_i, layout_i);
8173 10075 : if (layout_costs.is_possible ())
8174 : {
8175 8301 : dump_printf_loc (MSG_NOTE, vect_location,
8176 : " layout %d:%s\n", layout_i,
8177 8301 : partition.layout == int (layout_i)
8178 : ? " (*)" : "");
8179 8301 : slpg_layout_cost combined_cost = layout_costs.in_cost;
8180 8301 : combined_cost.add_serial_cost (layout_costs.internal_cost);
8181 8301 : combined_cost.add_serial_cost (layout_costs.out_cost);
8182 : #define TEMPLATE "{depth: %f, total: %f}"
8183 8301 : dump_printf_loc (MSG_NOTE, vect_location,
8184 : " " TEMPLATE "\n",
8185 : layout_costs.in_cost.depth.to_double (),
8186 : layout_costs.in_cost.total.to_double ());
8187 8301 : dump_printf_loc (MSG_NOTE, vect_location,
8188 : " + " TEMPLATE "\n",
8189 : layout_costs.internal_cost.depth.to_double (),
8190 : layout_costs.internal_cost.total.to_double ());
8191 8301 : dump_printf_loc (MSG_NOTE, vect_location,
8192 : " + " TEMPLATE "\n",
8193 : layout_costs.out_cost.depth.to_double (),
8194 : layout_costs.out_cost.total.to_double ());
8195 8301 : dump_printf_loc (MSG_NOTE, vect_location,
8196 : " = " TEMPLATE "\n",
8197 : combined_cost.depth.to_double (),
8198 : combined_cost.total.to_double ());
8199 : #undef TEMPLATE
8200 : }
8201 : else
8202 1774 : dump_printf_loc (MSG_NOTE, vect_location,
8203 : " layout %d: rejected\n", layout_i);
8204 : }
8205 : }
8206 674 : }
8207 :
/* Masked load lanes discovery.

   Walk all vertices of the SLP graph and flag grouped IFN_MASK_LOAD
   nodes, together with their consumers, as using load-lanes
   (ldst_lanes) when
     - the target supports masked load-lanes for the node's vector type
       and group size,
     - the mask operand (child 0) is a single-input permute whose lane
       permutation consists only of (0, 0) entries, and
     - every consumer is a permute producing a single lane.
   For accepted nodes the mask permute is replaced by its only child.  */

void
vect_optimize_slp_pass::decide_masked_load_lanes ()
{
  for (auto v : m_vertices)
    {
      slp_tree node = v.node;
      /* Only internal, non-permute nodes can be masked loads.  */
      if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
          || SLP_TREE_PERMUTE_P (node))
        continue;
      stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
      if (! STMT_VINFO_GROUPED_ACCESS (stmt_info)
          /* The mask has to be uniform.  */
          || STMT_VINFO_SLP_VECT_ONLY (stmt_info)
          || ! is_a <gcall *> (STMT_VINFO_STMT (stmt_info))
          || ! gimple_call_internal_p (STMT_VINFO_STMT (stmt_info),
                                       IFN_MASK_LOAD))
        continue;
      /* The remaining checks are done on the first element of the
         access group.  */
      stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
      if (STMT_VINFO_STRIDED_P (stmt_info)
          || compare_step_with_zero (m_vinfo, stmt_info) <= 0
          || vect_load_lanes_supported (SLP_TREE_VECTYPE (node),
                                        DR_GROUP_SIZE (stmt_info),
                                        true) == IFN_LAST)
        continue;

      /* Uniform masks need to be suitably represented.  */
      slp_tree mask = SLP_TREE_CHILDREN (node)[0];
      if (!SLP_TREE_PERMUTE_P (mask)
          || SLP_TREE_CHILDREN (mask).length () != 1)
        continue;
      /* Every lane of the mask permute has to be the (0, 0) entry.  */
      bool match = true;
      for (auto perm : SLP_TREE_LANE_PERMUTATION (mask))
        if (perm.first != 0 || perm.second != 0)
          {
            match = false;
            break;
          }
      if (!match)
        continue;

      /* Now see if the consumer side matches.  */
      for (graph_edge *pred = m_slpg->vertices[node->vertex].pred;
           pred; pred = pred->pred_next)
        {
          slp_tree pred_node = m_vertices[pred->src].node;
          /* All consumers should be a permute with a single outgoing lane.  */
          if (!SLP_TREE_PERMUTE_P (pred_node)
              || SLP_TREE_LANES (pred_node) != 1)
            {
              match = false;
              break;
            }
          gcc_assert (SLP_TREE_CHILDREN (pred_node).length () == 1);
        }
      if (!match)
        continue;
      /* Now we can mark the nodes as to use load lanes.  */
      node->ldst_lanes = true;
      for (graph_edge *pred = m_slpg->vertices[node->vertex].pred;
           pred; pred = pred->pred_next)
        m_vertices[pred->src].node->ldst_lanes = true;
      /* The catch is we have to massage the mask.  We have arranged
         analyzed uniform masks to be represented by a splat VEC_PERM
         which we can now simply elide as we cannot easily re-do SLP
         discovery here.  */
      slp_tree new_mask = SLP_TREE_CHILDREN (mask)[0];
      /* Take the new reference before dropping the one held on the
         permute, which is the only other user visible here.  */
      SLP_TREE_REF_COUNT (new_mask)++;
      SLP_TREE_CHILDREN (node)[0] = new_mask;
      vect_free_slp_tree (mask);
    }
}
8281 :
8282 : /* Perform legitimizing attempts. This is intended to improve the
8283 : situation when layout 0 is not valid which is a situation the cost
8284 : based propagation does not handle well.
8285 : Return true if further layout optimization is possible, false if
8286 : the layout configuration should be considered final. */
8287 :
8288 : bool
8289 10629 : vect_optimize_slp_pass::legitimize ()
8290 : {
8291 : /* Perform a very simple legitimizing attempt by attempting to choose
8292 : a single layout for all partitions that will make all permutations
8293 : a noop. That should also be the optimal layout choice in case
8294 : layout zero is legitimate.
8295 : ??? Disconnected components of the SLP graph could have distinct
8296 : single layouts. */
8297 10629 : int single_layout_i = -1;
8298 10629 : unsigned deferred_up_to = -1U;
8299 31472 : for (unsigned partition_i = 0; partition_i < m_partitions.length ();
8300 : ++partition_i)
8301 : {
8302 26527 : auto &partition = m_partitions[partition_i];
8303 26527 : if (single_layout_i == -1)
8304 : {
8305 13863 : single_layout_i = partition.layout;
8306 13863 : deferred_up_to = partition_i;
8307 : }
8308 12664 : else if (partition.layout == single_layout_i || partition.layout == -1)
8309 : ;
8310 : else
8311 : single_layout_i = 0;
8312 23255 : if (single_layout_i == 0)
8313 : return true;
8314 :
8315 20903 : if (single_layout_i != -1
8316 20903 : && !is_compatible_layout (partition, single_layout_i))
8317 : return true;
8318 : }
8319 :
8320 4945 : if (single_layout_i <= 0)
8321 : return true;
8322 :
8323 5061 : for (unsigned partition_i = 0; partition_i < deferred_up_to; ++partition_i)
8324 122 : if (!is_compatible_layout (m_partitions[partition_i],
8325 : single_layout_i))
8326 : return true;
8327 :
8328 12517 : for (unsigned partition_i = 0; partition_i < m_partitions.length ();
8329 : ++partition_i)
8330 : {
8331 7578 : auto &partition = m_partitions[partition_i];
8332 7578 : partition.layout = single_layout_i;
8333 : }
8334 :
8335 : return false;
8336 : }
8337 :
8338 : /* Main entry point for the SLP graph optimization pass. */
8339 :
8340 : void
8341 681015 : vect_optimize_slp_pass::run ()
8342 : {
8343 681015 : build_graph ();
8344 681015 : create_partitions ();
8345 681015 : start_choosing_layouts ();
8346 681015 : if (m_perms.length () > 1)
8347 : {
8348 10629 : if (legitimize ())
8349 : {
8350 5690 : forward_pass ();
8351 5690 : backward_pass ();
8352 : }
8353 10629 : if (dump_enabled_p ())
8354 674 : dump ();
8355 10629 : materialize ();
8356 42933 : while (!m_perms.is_empty ())
8357 21675 : m_perms.pop ().release ();
8358 : }
8359 : else
8360 670386 : remove_redundant_permutations ();
8361 681015 : free_graph (m_slpg);
8362 681015 : build_graph ();
8363 681015 : decide_masked_load_lanes ();
8364 681015 : free_graph (m_slpg);
8365 681015 : }
8366 :
8367 : /* Apply CSE to NODE and its children using BST_MAP. */
8368 :
8369 : static void
8370 5377344 : vect_cse_slp_nodes (scalar_stmts_to_slp_tree_map_t *bst_map, slp_tree& node)
8371 : {
8372 5377344 : bool put_p = false;
8373 5377344 : if (SLP_TREE_DEF_TYPE (node) == vect_internal_def
8374 : /* Besides some VEC_PERM_EXPR, two-operator nodes also
8375 : lack scalar stmts and thus CSE doesn't work via bst_map. Ideally
8376 : we'd have sth that works for all internal and external nodes. */
8377 5377344 : && !SLP_TREE_SCALAR_STMTS (node).is_empty ())
8378 : {
8379 3869142 : slp_tree *leader = bst_map->get (SLP_TREE_SCALAR_STMTS (node));
8380 3869142 : if (leader)
8381 : {
8382 : /* We've visited this node already. */
8383 404229 : if (!*leader || *leader == node)
8384 : return;
8385 :
8386 2800 : if (dump_enabled_p ())
8387 907 : dump_printf_loc (MSG_NOTE, vect_location,
8388 : "re-using SLP tree %p for %p\n",
8389 : (void *)*leader, (void *)node);
8390 2800 : vect_free_slp_tree (node);
8391 2800 : (*leader)->refcnt += 1;
8392 2800 : node = *leader;
8393 2800 : return;
8394 : }
8395 :
8396 : /* Avoid creating a cycle by populating the map only after recursion. */
8397 3464913 : bst_map->put (SLP_TREE_SCALAR_STMTS (node).copy (), nullptr);
8398 3464913 : node->refcnt += 1;
8399 3464913 : put_p = true;
8400 : /* And recurse. */
8401 : }
8402 :
8403 14868276 : for (slp_tree &child : SLP_TREE_CHILDREN (node))
8404 4343037 : if (child)
8405 3911285 : vect_cse_slp_nodes (bst_map, child);
8406 :
8407 : /* Now record the node for CSE in other siblings. */
8408 4973115 : if (put_p)
8409 3464913 : *bst_map->get (SLP_TREE_SCALAR_STMTS (node)) = node;
8410 : }
8411 :
8412 : /* Optimize the SLP graph of VINFO. */
8413 :
8414 : void
8415 1027203 : vect_optimize_slp (vec_info *vinfo)
8416 : {
8417 1027203 : if (vinfo->slp_instances.is_empty ())
8418 : return;
8419 681015 : vect_optimize_slp_pass (vinfo).run ();
8420 :
8421 : /* Apply CSE again to nodes after permute optimization. */
8422 681015 : scalar_stmts_to_slp_tree_map_t *bst_map
8423 681015 : = new scalar_stmts_to_slp_tree_map_t ();
8424 :
8425 3509104 : for (auto inst : vinfo->slp_instances)
8426 1466059 : vect_cse_slp_nodes (bst_map, SLP_INSTANCE_TREE (inst));
8427 :
8428 681015 : release_scalar_stmts_to_slp_tree_map (bst_map);
8429 : }
8430 :
8431 : /* Gather loads reachable from the individual SLP graph entries. */
8432 :
8433 : void
8434 1027203 : vect_gather_slp_loads (vec_info *vinfo)
8435 : {
8436 1027203 : unsigned i;
8437 1027203 : slp_instance instance;
8438 2493262 : FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
8439 : {
8440 1466059 : hash_set<slp_tree> visited;
8441 1466059 : vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
8442 : SLP_INSTANCE_TREE (instance), visited);
8443 1466059 : }
8444 1027203 : }
8445 :
8446 : /* For NODE update VF based on the number of lanes and the vector types
8447 : used. */
8448 :
8449 : static void
8450 4226623 : vect_update_slp_vf_for_node (slp_tree node, poly_uint64 &vf,
8451 : hash_set<slp_tree> &visited)
8452 : {
8453 4226623 : if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
8454 1519506 : return;
8455 3070087 : if (visited.add (node))
8456 : return;
8457 :
8458 10265523 : for (slp_tree child : SLP_TREE_CHILDREN (node))
8459 3466014 : vect_update_slp_vf_for_node (child, vf, visited);
8460 :
8461 : /* We do not visit SLP nodes for constants or externals - those neither
8462 : have a vector type set yet (vectorizable_* does this) nor do they
8463 : have max_nunits set. Instead we rely on internal nodes max_nunit
8464 : to cover constant/external operands.
8465 : Note that when we stop using fixed size vectors externs and constants
8466 : shouldn't influence the (minimum) vectorization factor, instead
8467 : vectorizable_* should honor the vectorization factor when trying to
8468 : assign vector types to constants and externals and cause iteration
8469 : to a higher vectorization factor when required. */
8470 2707117 : poly_uint64 node_vf
8471 2707117 : = calculate_unrolling_factor (node->max_nunits, SLP_TREE_LANES (node));
8472 2707117 : vf = force_common_multiple (vf, node_vf);
8473 :
8474 : /* For permute nodes that are fed from externs or constants we have to
8475 : consider their number of lanes as well. Likewise for store-lanes. */
8476 2707117 : if (SLP_TREE_PERMUTE_P (node) || node->ldst_lanes)
8477 706215 : for (slp_tree child : SLP_TREE_CHILDREN (node))
8478 189930 : if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
8479 : {
8480 3445 : poly_uint64 child_vf
8481 3445 : = calculate_unrolling_factor (node->max_nunits,
8482 : SLP_TREE_LANES (child));
8483 3445 : vf = force_common_multiple (vf, child_vf);
8484 : }
8485 : }
8486 :
8487 : /* For each possible SLP instance decide whether to SLP it and calculate overall
8488 : unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
8489 : least one instance. */
8490 :
8491 : bool
8492 473922 : vect_make_slp_decision (loop_vec_info loop_vinfo)
8493 : {
8494 473922 : unsigned int i;
8495 473922 : poly_uint64 unrolling_factor = 1;
8496 473922 : const vec<slp_instance> &slp_instances
8497 : = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
8498 473922 : slp_instance instance;
8499 473922 : int decided_to_slp = 0;
8500 :
8501 473922 : DUMP_VECT_SCOPE ("vect_make_slp_decision");
8502 :
8503 473922 : hash_set<slp_tree> visited;
8504 1234531 : FOR_EACH_VEC_ELT (slp_instances, i, instance)
8505 : {
8506 760609 : slp_tree root = SLP_INSTANCE_TREE (instance);
8507 :
8508 : /* All unroll factors have the form:
8509 :
8510 : GET_MODE_SIZE (vinfo->vector_mode) * X
8511 :
8512 : for some rational X, so they must have a common multiple. */
8513 760609 : vect_update_slp_vf_for_node (root, unrolling_factor, visited);
8514 :
8515 : /* If all instances ended up with vector(1) T roots make sure to
8516 : not vectorize. RVV for example relies on loop vectorization
8517 : when some instances are essentially kept scalar. See PR121048. */
8518 760609 : if (SLP_TREE_VECTYPE (root)
8519 760609 : && known_gt (TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (root)), 1U))
8520 622309 : decided_to_slp++;
8521 : }
8522 :
8523 473922 : LOOP_VINFO_VECT_FACTOR (loop_vinfo) = unrolling_factor;
8524 :
8525 473922 : if (decided_to_slp && dump_enabled_p ())
8526 : {
8527 19072 : dump_printf_loc (MSG_NOTE, vect_location,
8528 : "Decided to SLP %d instances. Unrolling factor ",
8529 : decided_to_slp);
8530 19072 : dump_dec (MSG_NOTE, unrolling_factor);
8531 19072 : dump_printf (MSG_NOTE, "\n");
8532 : }
8533 :
8534 473922 : return (decided_to_slp > 0);
8535 473922 : }
8536 :
8537 : /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
8538 :
8539 2205447 : _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
8540 : : vec_info (vec_info::bb, shared),
8541 2205447 : roots (vNULL)
8542 : {
8543 : /* The region we are operating on. bbs[0] is the entry, excluding
8544 : its PHI nodes. In the future we might want to track an explicit
8545 : entry edge to cover bbs[0] PHI nodes and have a region entry
8546 : insert location. */
8547 2205447 : bbs = _bbs.address ();
8548 2205447 : nbbs = _bbs.length ();
8549 :
8550 17680234 : for (unsigned i = 0; i < nbbs; ++i)
8551 : {
8552 15474787 : if (i != 0)
8553 20118615 : for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
8554 6849275 : gsi_next (&si))
8555 : {
8556 6849275 : gphi *phi = si.phi ();
8557 6849275 : gimple_set_uid (phi, 0);
8558 6849275 : add_stmt (phi);
8559 : }
8560 30949574 : for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
8561 137971545 : !gsi_end_p (gsi); gsi_next (&gsi))
8562 : {
8563 122496758 : gimple *stmt = gsi_stmt (gsi);
8564 122496758 : gimple_set_uid (stmt, 0);
8565 122496758 : if (is_gimple_debug (stmt))
8566 77082069 : continue;
8567 45414689 : add_stmt (stmt);
8568 : }
8569 : }
8570 2205447 : }
8571 :
8572 :
8573 : /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
8574 : stmts in the basic block. */
8575 :
8576 2205447 : _bb_vec_info::~_bb_vec_info ()
8577 : {
8578 : /* Reset region marker. */
8579 17680234 : for (unsigned i = 0; i < nbbs; ++i)
8580 : {
8581 15474787 : if (i != 0)
8582 20134427 : for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
8583 6865087 : gsi_next (&si))
8584 : {
8585 6865087 : gphi *phi = si.phi ();
8586 6865087 : gimple_set_uid (phi, -1);
8587 : }
8588 30949574 : for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
8589 137914411 : !gsi_end_p (gsi); gsi_next (&gsi))
8590 : {
8591 122439624 : gimple *stmt = gsi_stmt (gsi);
8592 122439624 : gimple_set_uid (stmt, -1);
8593 : }
8594 : }
8595 :
8596 3448649 : for (unsigned i = 0; i < roots.length (); ++i)
8597 : {
8598 1243202 : roots[i].stmts.release ();
8599 1243202 : roots[i].roots.release ();
8600 1243202 : roots[i].remain.release ();
8601 : }
8602 2205447 : roots.release ();
8603 2205447 : }
8604 :
8605 : /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
8606 : given then that child nodes have already been processed, and that
8607 : their def types currently match their SLP node's def type. */
8608 :
8609 : static bool
8610 2819368 : vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
8611 : slp_instance node_instance,
8612 : stmt_vector_for_cost *cost_vec)
8613 : {
8614 : /* Handle purely internal nodes. */
8615 2819368 : if (SLP_TREE_PERMUTE_P (node))
8616 : {
8617 122699 : if (!vectorizable_slp_permutation (vinfo, NULL, node, cost_vec))
8618 : return false;
8619 :
8620 : stmt_vec_info slp_stmt_info;
8621 : unsigned int i;
8622 323678 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
8623 : {
8624 202306 : if (slp_stmt_info
8625 196765 : && STMT_VINFO_LIVE_P (slp_stmt_info)
8626 202306 : && !vectorizable_live_operation (vinfo, slp_stmt_info, node,
8627 : node_instance, i,
8628 : false, cost_vec))
8629 : return false;
8630 : }
8631 121372 : SLP_TREE_TYPE (node) = permute_info_type;
8632 121372 : return true;
8633 : }
8634 :
8635 2696669 : return vect_analyze_stmt (vinfo, node, node_instance, cost_vec);
8636 : }
8637 :
/* qsort comparison function ordering the ints pointed to by A_ and B_
   in ascending order.  Returns a negative value, zero or a positive
   value when *A_ is less than, equal to or greater than *B_.
   The result is derived from comparisons rather than a subtraction so
   it cannot overflow for operands that are far apart.  */

static int
sort_ints (const void *a_, const void *b_)
{
  int a = *(const int *)a_;
  int b = *(const int *)b_;
  return (a > b) - (a < b);
}
8645 :
8646 : /* Verify if we can externalize a set of internal defs. */
8647 :
8648 : static bool
8649 383274 : vect_slp_can_convert_to_external (const vec<stmt_vec_info> &stmts)
8650 : {
8651 : /* Constant generation uses get_later_stmt which can only handle
8652 : defs from the same BB or a set of defs that can be ordered
8653 : with a dominance query. */
8654 383274 : basic_block bb = NULL;
8655 383274 : bool all_same = true;
8656 383274 : auto_vec<int> bbs;
8657 766548 : bbs.reserve_exact (stmts.length ());
8658 2073018 : for (stmt_vec_info stmt : stmts)
8659 : {
8660 923196 : if (!stmt)
8661 : return false;
8662 923196 : else if (!bb)
8663 383274 : bb = gimple_bb (stmt->stmt);
8664 539922 : else if (gimple_bb (stmt->stmt) != bb)
8665 174681 : all_same = false;
8666 923196 : bbs.quick_push (gimple_bb (stmt->stmt)->index);
8667 : }
8668 383274 : if (all_same)
8669 : return true;
8670 :
8671 : /* Produce a vector of unique BB indexes for the defs. */
8672 130880 : bbs.qsort (sort_ints);
8673 : unsigned i, j;
8674 318876 : for (i = 1, j = 1; i < bbs.length (); ++i)
8675 187996 : if (bbs[i] != bbs[j-1])
8676 139650 : bbs[j++] = bbs[i];
8677 130880 : gcc_assert (j >= 2);
8678 130880 : bbs.truncate (j);
8679 :
8680 261760 : if (bbs.length () == 2)
8681 127360 : return (dominated_by_p (CDI_DOMINATORS,
8682 127360 : BASIC_BLOCK_FOR_FN (cfun, bbs[0]),
8683 127360 : BASIC_BLOCK_FOR_FN (cfun, bbs[1]))
8684 248076 : || dominated_by_p (CDI_DOMINATORS,
8685 120716 : BASIC_BLOCK_FOR_FN (cfun, bbs[1]),
8686 120716 : BASIC_BLOCK_FOR_FN (cfun, bbs[0])));
8687 :
8688 : /* ??? For more than two BBs we can sort the vector and verify the
8689 : result is a total order. But we can't use vec::qsort with a
8690 : compare function using a dominance query since there's no way to
8691 : signal failure and any fallback for an unordered pair would
8692 : fail qsort_chk later.
8693 : For now simply hope that ordering after BB index provides the
8694 : best candidate total order. If required we can implement our
8695 : own mergesort or export an entry without checking. */
8696 399031 : for (unsigned i = 1; i < bbs.length (); ++i)
8697 12266 : if (!dominated_by_p (CDI_DOMINATORS,
8698 12266 : BASIC_BLOCK_FOR_FN (cfun, bbs[i]),
8699 12266 : BASIC_BLOCK_FOR_FN (cfun, bbs[i-1])))
8700 : return false;
8701 :
8702 : return true;
8703 383274 : }
8704 :
8705 : /* Try to build NODE from scalars, returning true on success.
8706 : NODE_INSTANCE is the SLP instance that contains NODE. */
8707 :
8708 : static bool
8709 562222 : vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
8710 : slp_instance node_instance)
8711 : {
8712 562222 : stmt_vec_info stmt_info;
8713 562222 : unsigned int i;
8714 :
8715 562222 : if (!is_a <bb_vec_info> (vinfo)
8716 71191 : || node == SLP_INSTANCE_TREE (node_instance)
8717 22299 : || !SLP_TREE_SCALAR_STMTS (node).exists ()
8718 22258 : || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node))
8719 : /* Force the mask use to be built from scalars instead. */
8720 20022 : || VECTOR_BOOLEAN_TYPE_P (SLP_TREE_VECTYPE (node))
8721 582045 : || !vect_slp_can_convert_to_external (SLP_TREE_SCALAR_STMTS (node)))
8722 542399 : return false;
8723 :
8724 19823 : if (dump_enabled_p ())
8725 76 : dump_printf_loc (MSG_NOTE, vect_location,
8726 : "Building vector operands of %p from scalars instead\n",
8727 : (void *) node);
8728 :
8729 : /* Don't remove and free the child nodes here, since they could be
8730 : referenced by other structures. The analysis and scheduling phases
8731 : (need to) ignore child nodes of anything that isn't vect_internal_def. */
8732 19823 : unsigned int group_size = SLP_TREE_LANES (node);
8733 19823 : SLP_TREE_DEF_TYPE (node) = vect_external_def;
8734 : /* Invariants get their vector type from the uses. */
8735 19823 : SLP_TREE_VECTYPE (node) = NULL_TREE;
8736 19823 : SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
8737 19823 : SLP_TREE_LOAD_PERMUTATION (node).release ();
8738 68899 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
8739 : {
8740 49076 : tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
8741 49076 : SLP_TREE_SCALAR_OPS (node)[i] = lhs;
8742 : }
8743 : return true;
8744 : }
8745 :
8746 : /* Return true if all elements of the slice are the same. */
8747 : bool
8748 483923 : vect_scalar_ops_slice::all_same_p () const
8749 : {
8750 532064 : for (unsigned int i = 1; i < length; ++i)
8751 449430 : if (!operand_equal_p (op (0), op (i)))
8752 : return false;
8753 : return true;
8754 : }
8755 :
8756 : hashval_t
8757 406059 : vect_scalar_ops_slice_hash::hash (const value_type &s)
8758 : {
8759 406059 : hashval_t hash = 0;
8760 1560783 : for (unsigned i = 0; i < s.length; ++i)
8761 1154724 : hash = iterative_hash_expr (s.op (i), hash);
8762 406059 : return hash;
8763 : }
8764 :
8765 : bool
8766 220111 : vect_scalar_ops_slice_hash::equal (const value_type &s1,
8767 : const compare_type &s2)
8768 : {
8769 220111 : if (s1.length != s2.length)
8770 : return false;
8771 384682 : for (unsigned i = 0; i < s1.length; ++i)
8772 334547 : if (!operand_equal_p (s1.op (i), s2.op (i)))
8773 : return false;
8774 : return true;
8775 : }
8776 :
/* Compute the prologue cost for invariant or constant operands represented
   by NODE.  One vect_prologue cost entry is recorded in COST_VEC for
   every distinct vector that has to be built: a vector_load for constant
   nodes, a scalar_to_vec when all elements of the vector are the same
   and a vec_construct otherwise.  */

static void
vect_prologue_cost_for_slp (vec_info *vinfo, slp_tree node,
                            stmt_vector_for_cost *cost_vec)
{
  /* There's a special case of an existing vector, that costs nothing.  */
  if (SLP_TREE_SCALAR_OPS (node).length () == 0
      && !SLP_TREE_VEC_DEFS (node).is_empty ())
    return;
  /* Without looking at the actual initializer a vector of
     constants can be implemented as load from the constant pool.
     When all elements are the same we can use a splat.  */
  tree vectype = SLP_TREE_VECTYPE (node);
  unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
  unsigned HOST_WIDE_INT const_nunits;
  /* Number of scalar operands making up one costed vector.  */
  unsigned nelt_limit;
  unsigned nvectors = vect_get_num_copies (vinfo, node);
  auto ops = &SLP_TREE_SCALAR_OPS (node);
  /* Start offsets into OPS of the vectors that need to be costed.  */
  auto_vec<unsigned int> starts (nvectors);
  if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
      && ! multiple_p (const_nunits, group_size))
    {
      nelt_limit = const_nunits;
      /* Only cost vectors whose operand slice was not seen before.  */
      hash_set<vect_scalar_ops_slice_hash> vector_ops;
      for (unsigned int i = 0; i < nvectors; ++i)
        if (!vector_ops.add ({ ops, i * nelt_limit, nelt_limit }))
          starts.quick_push (i * nelt_limit);
    }
  else
    {
      /* If either the vector has variable length or the vectors
         are composed of repeated whole groups we only need to
         cost construction once.  All vectors will be the same.  */
      nelt_limit = group_size;
      starts.quick_push (0);
    }
  /* ??? We're just tracking whether vectors in a single node are the same.
     Ideally we'd do something more global.  */
  /* Set once NODE has been handed to record_stmt_cost.  */
  bool passed = false;
  for (unsigned int start : starts)
    {
      vect_cost_for_stmt kind;
      if (SLP_TREE_DEF_TYPE (node) == vect_constant_def)
        kind = vector_load;
      else if (vect_scalar_ops_slice { ops, start, nelt_limit }.all_same_p ())
        kind = scalar_to_vec;
      else
        kind = vec_construct;
      /* The target cost hook has no idea which part of the SLP node
         we are costing so avoid passing it down more than once.  Pass
         it to the first vec_construct or scalar_to_vec part since for those
         the x86 backend tries to account for GPR to XMM register moves.  */
      record_stmt_cost (cost_vec, 1, kind, nullptr,
                        (kind != vector_load && !passed) ? node : nullptr,
                        vectype, 0, vect_prologue);
      if (kind != vector_load)
        passed = true;
    }
}
8838 :
/* Analyze statements contained in SLP tree NODE after recursively analyzing
   the subtree.  NODE_INSTANCE contains NODE and VINFO contains INSTANCE.

   VISITED_SET holds the nodes analyzed so far and VISITED_VEC the same
   nodes in the order they were added, so that the additions made for a
   failed subtree can be undone.  Costs are accumulated in COST_VEC and
   likewise rolled back on failure.

   Return true if the operations are supported.  */

static bool
vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
                                  slp_instance node_instance,
                                  hash_set<slp_tree> &visited_set,
                                  vec<slp_tree> &visited_vec,
                                  stmt_vector_for_cost *cost_vec)
{
  int i, j;
  slp_tree child;

  /* Assume we can code-generate all invariants.  */
  if (!node
      || SLP_TREE_DEF_TYPE (node) == vect_constant_def
      || SLP_TREE_DEF_TYPE (node) == vect_external_def)
    return true;

  if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "Failed cyclic SLP reference in %p\n", (void *) node);
      return false;
    }
  gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);

  /* If we already analyzed the exact same set of scalar stmts we're done.
     We share the generated vector stmts for those.  */
  if (visited_set.add (node))
    return true;
  visited_vec.safe_push (node);

  bool res = true;
  /* Remember the state to roll back to.  VISITED_REC_START is taken after
     pushing NODE, so it is one past NODE's own slot.  */
  unsigned visited_rec_start = visited_vec.length ();
  unsigned cost_vec_rec_start = cost_vec->length ();
  bool seen_non_constant_child = false;
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    {
      res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
                                              visited_set, visited_vec,
                                              cost_vec);
      if (!res)
        break;
      if (child && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
        seen_non_constant_child = true;
    }
  /* We're having difficulties scheduling nodes with just constant
     operands and no scalar stmts since we then cannot compute a stmt
     insertion place.  */
  if (res
      && !seen_non_constant_child
      && SLP_TREE_SCALAR_STMTS (node).is_empty ())
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "Cannot vectorize all-constant op node %p\n",
                         (void *) node);
      res = false;
    }

  if (res)
    res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
                                              cost_vec);
  /* If analysis failed we have to pop all recursive visited nodes
     plus ourselves.  */
  if (!res)
    {
      /* The >= comparison makes the loop pop NODE's own entry as well.  */
      while (visited_vec.length () >= visited_rec_start)
        visited_set.remove (visited_vec.pop ());
      cost_vec->truncate (cost_vec_rec_start);
    }

  /* When the node can be vectorized cost invariant nodes it references.
     This is not done in DFS order to allow the referring node
     vectorizable_* calls to nail down the invariant nodes vector type
     and possibly unshare it if it needs a different vector type than
     other referrers.  */
  if (res)
    FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
      if (child
          && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
              || SLP_TREE_DEF_TYPE (child) == vect_external_def)
          /* Perform usual caching, note code-generation still
             code-gens these nodes multiple times but we expect
             to CSE them later.  */
          && !visited_set.add (child))
        {
          visited_vec.safe_push (child);
          /* ??? After auditing more code paths make a "default"
             and push the vector type from NODE to all children
             if it is not already set.  */
          /* Compute the number of vectors to be generated.  */
          tree vector_type = SLP_TREE_VECTYPE (child);
          if (!vector_type)
            {
              /* Masked loads can have an undefined (default SSA definition)
                 else operand.  We do not need to cost it.  */
              vec<tree> ops = SLP_TREE_SCALAR_OPS (child);
              if (SLP_TREE_TYPE (node) == load_vec_info_type
                  && ((ops.length ()
                       && TREE_CODE (ops[0]) == SSA_NAME
                       && SSA_NAME_IS_DEFAULT_DEF (ops[0])
                       && VAR_P (SSA_NAME_VAR (ops[0])))
                      || SLP_TREE_DEF_TYPE (child) == vect_constant_def))
                continue;

              /* For shifts with a scalar argument we don't need
                 to cost or code-generate anything.
                 ??? Represent this more explicitly.  */
              gcc_assert (SLP_TREE_TYPE (node) == shift_vec_info_type
                          && j == 1);
              continue;
            }

          /* And cost them.  */
          vect_prologue_cost_for_slp (vinfo, child, cost_vec);
        }

  /* If this node or any of its children can't be vectorized, try pruning
     the tree here rather than felling the whole thing.  */
  if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
    {
      /* We'll need to revisit this for invariant costing and number
         of vectorized stmt setting.  */
      res = true;
    }

  return res;
}
8972 :
/* Mark lanes of NODE that are live outside of the basic-block vectorized
   region and that can be vectorized using vectorizable_live_operation
   with STMT_VINFO_LIVE_P.  Not handled live operations will cause the
   scalar code computing it to be retained.

   INSTANCE and COST_VEC are passed on to vectorizable_live_operation.
   SVISITED records the scalar stmts that have been dealt with and VISITED
   the SLP nodes that have been walked; internal child nodes are processed
   recursively.  */

static void
vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
                             slp_instance instance,
                             stmt_vector_for_cost *cost_vec,
                             hash_set<stmt_vec_info> &svisited,
                             hash_set<slp_tree> &visited)
{
  if (visited.add (node))
    return;

  unsigned i;
  stmt_vec_info stmt_info;
  /* Used below as the reference point for the dominance check.  */
  stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    {
      if (!stmt_info || svisited.contains (stmt_info))
        continue;
      stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
      if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
          && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
        /* Only the pattern root stmt computes the original scalar value.  */
        continue;
      if (!PURE_SLP_STMT (orig_stmt_info))
        /* Iff the stmt is not part of the vector coverage because it or
           uses of it are used by SLP graph leafs as extern input there is
           no point in trying to live code-generate from a vector stmt as
           the scalar stmt will survive anyway.  */
        continue;
      /* Cleared when a live operation turns out not to be vectorizable,
         in which case the stmt is not added to SVISITED.  */
      bool mark_visited = true;
      gimple *orig_stmt = orig_stmt_info->stmt;
      ssa_op_iter op_iter;
      def_operand_p def_p;
      FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
        {
          /* We have to verify whether we can insert the lane extract
             before all uses.  The following is a conservative approximation.
             We cannot put this into vectorizable_live_operation because
             iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
             doesn't work.
             Note that while the fact that we emit code for loads at the
             first load should make this a non-problem leafs we construct
             from scalars are vectorized after the last scalar def.
             ??? If we'd actually compute the insert location during
             analysis we could use sth less conservative than the last
             scalar stmt in the node for the dominance check.  */
          /* ??? What remains is "live" uses in vector CTORs in the same
             SLP graph which is where those uses can end up code-generated
             right after their definition instead of close to their original
             use.  But that would restrict us to code-generate lane-extracts
             from the latest stmt in a node.  So we compensate for this
             during code-generation, simply not replacing uses for those
             hopefully rare cases.  */
          imm_use_iterator use_iter;
          gimple *use_stmt;
          stmt_vec_info use_stmt_info;

          /* LIVE_P: the def has a non-debug use that is either unknown to
             BB_VINFO or not a pure SLP stmt.  CAN_INSERT: LAST_STMT
             dominates all such uses seen.  */
          bool live_p = false;
          bool can_insert = true;
          FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
            if (!is_gimple_debug (use_stmt)
                && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
                    || !PURE_SLP_STMT (use_stmt_info)))
              {
                live_p = true;
                if (!vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
                  {
                    if (dump_enabled_p ())
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "Cannot determine insertion place for "
                                       "lane extract\n");
                    can_insert = false;
                    break;
                  }
              }
          if (live_p && can_insert)
            {
              /* Only record a live stmt when we can replace all uses.  We
                 record from which SLP tree we vectorize the uses, so we'll
                 cost once and can deal with the case that not all SLP nodes
                 may be suitable for code-generation of all live uses.
                 ??? But we never split up the work between multiple SLP
                 nodes.  */
              STMT_VINFO_LIVE_P (stmt_info) = true;
              if (!vectorizable_live_operation (bb_vinfo, stmt_info, node,
                                                instance, i, false, cost_vec))
                {
                  STMT_VINFO_LIVE_P (stmt_info) = false;
                  mark_visited = false;
                }
            }
        }
      if (mark_visited)
        svisited.add (stmt_info);
    }

  /* Recurse into the internal children only.  */
  slp_tree child;
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
      vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance, cost_vec,
                                   svisited, visited);
}
9079 :
9080 : /* Traverse all slp instances of BB_VINFO, and mark lanes of every node that
9081 : are live outside of the basic-block vectorized region and that can be
9082 : vectorized using vectorizable_live_operation with STMT_VINFO_LIVE_P. */
9083 :
9084 : static void
9085 236000 : vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo)
9086 : {
9087 236000 : if (bb_vinfo->slp_instances.is_empty ())
9088 0 : return;
9089 :
9090 236000 : hash_set<slp_tree> visited;
9091 236000 : hash_set<stmt_vec_info> svisited;
9092 1391102 : for (slp_instance instance : bb_vinfo->slp_instances)
9093 : {
9094 683102 : if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
9095 29096 : STMT_VINFO_LIVE_P (SLP_INSTANCE_ROOT_STMTS (instance)[0]) = true;
9096 683102 : vect_location = instance->location ();
9097 683102 : vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
9098 : instance, &instance->cost_vec,
9099 : svisited, visited);
9100 : }
9101 236000 : }
9102 :
9103 : /* Determine whether we can vectorize the reduction epilogue for INSTANCE. */
9104 :
9105 : static bool
9106 74913 : vectorizable_bb_reduc_epilogue (slp_instance instance,
9107 : stmt_vector_for_cost *cost_vec)
9108 : {
9109 74913 : gassign *stmt = as_a <gassign *> (instance->root_stmts[0]->stmt);
9110 74913 : enum tree_code reduc_code = gimple_assign_rhs_code (stmt);
9111 74913 : if (reduc_code == MINUS_EXPR)
9112 0 : reduc_code = PLUS_EXPR;
9113 74913 : internal_fn reduc_fn;
9114 74913 : tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
9115 74913 : if (!vectype
9116 74901 : || !reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
9117 74901 : || reduc_fn == IFN_LAST
9118 74901 : || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH)
9119 110479 : || !useless_type_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
9120 35566 : TREE_TYPE (vectype)))
9121 : {
9122 49928 : if (dump_enabled_p ())
9123 277 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9124 : "not vectorized: basic block reduction epilogue "
9125 : "operation unsupported.\n");
9126 49928 : return false;
9127 : }
9128 :
9129 : /* There's no way to cost a horizontal vector reduction via REDUC_FN so
9130 : cost log2 vector operations plus shuffles and one extraction. */
9131 24985 : unsigned steps = floor_log2 (vect_nunits_for_cost (vectype));
9132 24985 : record_stmt_cost (cost_vec, steps, vector_stmt, instance->root_stmts[0],
9133 : vectype, 0, vect_body);
9134 24985 : record_stmt_cost (cost_vec, steps, vec_perm, instance->root_stmts[0],
9135 : vectype, 0, vect_body);
9136 24985 : record_stmt_cost (cost_vec, 1, vec_to_scalar, instance->root_stmts[0],
9137 : vectype, 0, vect_body);
9138 :
9139 : /* Since we replace all stmts of a possibly longer scalar reduction
9140 : chain account for the extra scalar stmts for that. */
9141 24985 : if (!instance->remain_defs.is_empty ())
9142 20262 : record_stmt_cost (cost_vec, instance->remain_defs.length (), scalar_stmt,
9143 10131 : instance->root_stmts[0], 0, vect_body);
9144 : return true;
9145 : }
9146 :
9147 : /* Prune from ROOTS all stmts that are computed as part of lanes of NODE
9148 : and recurse to children. */
9149 :
9150 : static void
9151 189776 : vect_slp_prune_covered_roots (slp_tree node, hash_set<stmt_vec_info> &roots,
9152 : hash_set<slp_tree> &visited)
9153 : {
9154 189776 : if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
9155 189776 : || visited.add (node))
9156 83463 : return;
9157 :
9158 : stmt_vec_info stmt;
9159 : unsigned i;
9160 359932 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
9161 253619 : if (stmt)
9162 258946 : roots.remove (vect_orig_stmt (stmt));
9163 :
9164 : slp_tree child;
9165 234910 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9166 128597 : if (child)
9167 127191 : vect_slp_prune_covered_roots (child, roots, visited);
9168 : }
9169 :
9170 : /* Hand over COST_VEC to the target COSTS grouped by SLP node. */
9171 :
9172 : static void
9173 944801 : add_slp_costs (vector_costs *costs, stmt_vector_for_cost& cost_vec)
9174 : {
9175 3579128 : for (unsigned start = 0; start < cost_vec.length ();)
9176 : {
9177 2634327 : unsigned end = start + 1;
9178 3214686 : while (end < cost_vec.length ()
9179 5493175 : && cost_vec[start].node == cost_vec[end].node)
9180 580359 : end++;
9181 2634327 : costs->add_slp_cost (cost_vec[start].node,
9182 2634327 : array_slice<stmt_info_for_cost>
9183 2634327 : (cost_vec.begin () + start, end - start));
9184 2634327 : start = end;
9185 : }
9186 944801 : }
9187 :
9188 : /* Analyze statements in SLP instances of VINFO. Return true if the
9189 : operations are supported. */
9190 :
9191 : bool
9192 662023 : vect_slp_analyze_operations (vec_info *vinfo)
9193 : {
9194 662023 : slp_instance instance;
9195 662023 : int i;
9196 :
9197 662023 : DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
9198 :
9199 662023 : hash_set<slp_tree> visited;
9200 1727792 : for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
9201 : {
9202 1306023 : auto_vec<slp_tree> visited_vec;
9203 1306023 : stmt_vector_for_cost cost_vec;
9204 1306023 : cost_vec.create (2);
9205 1306023 : if (is_a <bb_vec_info> (vinfo))
9206 783452 : vect_location = instance->location ();
9207 1306023 : if (!vect_slp_analyze_node_operations (vinfo,
9208 : SLP_INSTANCE_TREE (instance),
9209 : instance, visited, visited_vec,
9210 : &cost_vec)
9211 : /* CTOR instances require vectorized defs for the SLP tree root. */
9212 1075382 : || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor
9213 5641 : && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
9214 : != vect_internal_def
9215 : /* Make sure we vectorized with the expected type. */
9216 5641 : || !useless_type_conversion_p
9217 5641 : (TREE_TYPE (TREE_TYPE (gimple_assign_rhs1
9218 : (instance->root_stmts[0]->stmt))),
9219 5641 : TREE_TYPE (SLP_TREE_VECTYPE
9220 : (SLP_INSTANCE_TREE (instance))))))
9221 : /* Check we can vectorize the reduction. */
9222 1075367 : || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc
9223 74913 : && !vectorizable_bb_reduc_epilogue (instance, &cost_vec))
9224 : /* Check we can vectorize the gcond. */
9225 2331462 : || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_gcond
9226 61107 : && !vectorizable_early_exit (as_a <loop_vec_info> (vinfo),
9227 61107 : SLP_INSTANCE_ROOT_STMTS (instance)[0],
9228 : NULL,
9229 : SLP_INSTANCE_TREE (instance),
9230 : &cost_vec)))
9231 : {
9232 339089 : cost_vec.release ();
9233 339089 : slp_tree node = SLP_INSTANCE_TREE (instance);
9234 339089 : stmt_vec_info stmt_info;
9235 339089 : if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
9236 256254 : stmt_info = SLP_INSTANCE_ROOT_STMTS (instance)[0];
9237 82835 : else if (!SLP_TREE_SCALAR_STMTS (node).is_empty ()
9238 82835 : && SLP_TREE_SCALAR_STMTS (node)[0])
9239 : stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
9240 : else
9241 0 : stmt_info = SLP_TREE_REPRESENTATIVE (node);
9242 339089 : if (is_a <loop_vec_info> (vinfo))
9243 : {
9244 240254 : if (dump_enabled_p ())
9245 6485 : dump_printf_loc (MSG_NOTE, vect_location,
9246 : "unsupported SLP instance starting from: %G",
9247 : stmt_info->stmt);
9248 240254 : return false;
9249 : }
9250 98835 : if (dump_enabled_p ())
9251 331 : dump_printf_loc (MSG_NOTE, vect_location,
9252 : "removing SLP instance operations starting from: %G",
9253 : stmt_info->stmt);
9254 540692 : while (!visited_vec.is_empty ())
9255 : {
9256 441857 : slp_tree node = visited_vec.pop ();
9257 441857 : SLP_TREE_TYPE (node) = undef_vec_info_type;
9258 441857 : if (node->data)
9259 : {
9260 12131 : delete node->data;
9261 12131 : node->data = nullptr;
9262 : }
9263 441857 : visited.remove (node);
9264 : }
9265 98835 : vect_free_slp_instance (instance);
9266 98835 : vinfo->slp_instances.ordered_remove (i);
9267 : }
9268 : else
9269 : {
9270 966934 : i++;
9271 966934 : if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
9272 : {
9273 282317 : add_slp_costs (loop_vinfo->vector_costs, cost_vec);
9274 282317 : cost_vec.release ();
9275 : }
9276 : else
9277 : /* For BB vectorization remember the SLP graph entry
9278 : cost for later. */
9279 684617 : instance->cost_vec = cost_vec;
9280 : }
9281 1306023 : }
9282 :
9283 : /* Now look for SLP instances with a root that are covered by other
9284 : instances and remove them. */
9285 421769 : hash_set<stmt_vec_info> roots;
9286 1741656 : for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
9287 931319 : if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
9288 33201 : roots.add (SLP_INSTANCE_ROOT_STMTS (instance)[0]);
9289 421769 : if (!roots.is_empty ())
9290 : {
9291 13186 : visited.empty ();
9292 75771 : for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
9293 62585 : vect_slp_prune_covered_roots (SLP_INSTANCE_TREE (instance), roots,
9294 : visited);
9295 75771 : for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
9296 62585 : if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()
9297 33201 : && !roots.contains (SLP_INSTANCE_ROOT_STMTS (instance)[0]))
9298 : {
9299 1515 : stmt_vec_info root = SLP_INSTANCE_ROOT_STMTS (instance)[0];
9300 1515 : if (dump_enabled_p ())
9301 20 : dump_printf_loc (MSG_NOTE, vect_location,
9302 : "removing SLP instance operations starting "
9303 : "from: %G", root->stmt);
9304 1515 : vect_free_slp_instance (instance);
9305 1515 : vinfo->slp_instances.ordered_remove (i);
9306 : }
9307 : else
9308 61070 : ++i;
9309 : }
9310 :
9311 843538 : return !vinfo->slp_instances.is_empty ();
9312 1083792 : }
9313 :
9314 : /* Get the SLP instance leader from INSTANCE_LEADER thereby transitively
9315 : closing the eventual chain. */
9316 :
9317 : static slp_instance
9318 753763 : get_ultimate_leader (slp_instance instance,
9319 : hash_map<slp_instance, slp_instance> &instance_leader)
9320 : {
9321 753763 : auto_vec<slp_instance *, 8> chain;
9322 753763 : slp_instance *tem;
9323 838223 : while (*(tem = instance_leader.get (instance)) != instance)
9324 : {
9325 84460 : chain.safe_push (tem);
9326 84460 : instance = *tem;
9327 : }
9328 838223 : while (!chain.is_empty ())
9329 84460 : *chain.pop () = instance;
9330 753763 : return instance;
9331 753763 : }
9332 :
9333 : namespace {
9334 : /* Subroutine of vect_bb_partition_graph_r. Map KEY to INSTANCE in
9335 : KEY_TO_INSTANCE, making INSTANCE the leader of any previous mapping
9336 : for KEY. Return true if KEY was already in KEY_TO_INSTANCE.
9337 :
9338 : INSTANCE_LEADER is as for get_ultimate_leader. */
9339 :
9340 : template<typename T>
9341 : bool
9342 3326223 : vect_map_to_instance (slp_instance instance, T key,
9343 : hash_map<T, slp_instance> &key_to_instance,
9344 : hash_map<slp_instance, slp_instance> &instance_leader)
9345 : {
9346 : bool existed_p;
9347 3326223 : slp_instance &key_instance = key_to_instance.get_or_insert (key, &existed_p);
9348 3326223 : if (!existed_p)
9349 : ;
9350 182591 : else if (key_instance != instance)
9351 : {
9352 : /* If we're running into a previously marked key make us the
9353 : leader of the current ultimate leader. This keeps the
9354 : leader chain acyclic and works even when the current instance
9355 : connects two previously independent graph parts. */
9356 70661 : slp_instance key_leader
9357 70661 : = get_ultimate_leader (key_instance, instance_leader);
9358 70661 : if (key_leader != instance)
9359 20843 : instance_leader.put (key_leader, instance);
9360 : }
9361 3326223 : key_instance = instance;
9362 3326223 : return existed_p;
9363 : }
9364 : }
9365 :
9366 : /* Worker of vect_bb_partition_graph, recurse on NODE. */
9367 :
9368 : static void
9369 920835 : vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
9370 : slp_instance instance, slp_tree node,
9371 : hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
9372 : hash_map<slp_tree, slp_instance> &node_to_instance,
9373 : hash_map<slp_instance, slp_instance> &instance_leader)
9374 : {
9375 920835 : stmt_vec_info stmt_info;
9376 920835 : unsigned i;
9377 :
9378 3326223 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
9379 2405388 : if (stmt_info)
9380 2405388 : vect_map_to_instance (instance, stmt_info, stmt_to_instance,
9381 : instance_leader);
9382 :
9383 920835 : if (vect_map_to_instance (instance, node, node_to_instance,
9384 : instance_leader))
9385 920835 : return;
9386 :
9387 : slp_tree child;
9388 1765466 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9389 888355 : if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
9390 237733 : vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
9391 : node_to_instance, instance_leader);
9392 : }
9393 :
9394 : /* Partition the SLP graph into pieces that can be costed independently. */
9395 :
9396 : static void
9397 236000 : vect_bb_partition_graph (bb_vec_info bb_vinfo)
9398 : {
9399 236000 : DUMP_VECT_SCOPE ("vect_bb_partition_graph");
9400 :
9401 : /* First walk the SLP graph assigning each involved scalar stmt a
9402 : corresponding SLP graph entry and upon visiting a previously
9403 : marked stmt, make the stmts leader the current SLP graph entry. */
9404 236000 : hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
9405 236000 : hash_map<slp_tree, slp_instance> node_to_instance;
9406 236000 : hash_map<slp_instance, slp_instance> instance_leader;
9407 236000 : slp_instance instance;
9408 919102 : for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
9409 : {
9410 683102 : instance_leader.put (instance, instance);
9411 683102 : vect_bb_partition_graph_r (bb_vinfo,
9412 : instance, SLP_INSTANCE_TREE (instance),
9413 : stmt_to_instance, node_to_instance,
9414 : instance_leader);
9415 : }
9416 :
9417 : /* Then collect entries to each independent subgraph. */
9418 1155102 : for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
9419 : {
9420 683102 : slp_instance leader = get_ultimate_leader (instance, instance_leader);
9421 683102 : leader->subgraph_entries.safe_push (instance);
9422 683102 : if (dump_enabled_p ()
9423 683102 : && leader != instance)
9424 69 : dump_printf_loc (MSG_NOTE, vect_location,
9425 : "instance %p is leader of %p\n",
9426 : (void *) leader, (void *) instance);
9427 : }
9428 236000 : }
9429 :
9430 : /* Record in COST_VEC the scalar cost of the stmts of VINFO reachable from
9431 : WORKLIST. WORKLIST is drained; the defs of the operands of each costed stmt are queued in turn unless they are already in VISITED. Stmts that are not
9432 : pure SLP are skipped, as are stmts not marked live that have a non-debug use outside of pure-SLP stmts. Nothing is returned. */
9433 :
9434 : static void
9435 679602 : vect_bb_slp_scalar_cost (bb_vec_info vinfo,
9436 : vec<stmt_vec_info> &worklist,
9437 : stmt_vector_for_cost *cost_vec,
9438 : hash_set<stmt_vec_info> &visited)
9439 : {
9440 3155976 : while (!worklist.is_empty ())
9441 : {
9442 2476374 : stmt_vec_info stmt = worklist.pop ();
9443 2766805 : if (!PURE_SLP_STMT (stmt))
9444 306076 : continue;
9445 :
9446 : /* When the stmt is live but not actually vectorized we have
9447 : to keep the feeding scalar defs. */
9448 2188839 : if (!STMT_VINFO_LIVE_P (vect_stmt_to_vectorize (stmt)))
9449 : {
9450 2121835 : bool live_p = false;
9451 2121835 : ssa_op_iter op_iter;
9452 2121835 : def_operand_p def_p;
9453 4645141 : FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt->stmt, op_iter, SSA_OP_DEF)
9454 : {
9455 401471 : imm_use_iterator use_iter;
9456 401471 : gimple *use_stmt;
9457 1451120 : FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
9458 648178 : if (!is_gimple_debug (use_stmt))
9459 : {
9460 479421 : stmt_vec_info use_stmt_info = vinfo->lookup_stmt (use_stmt);
9461 479421 : if (!use_stmt_info || !PURE_SLP_STMT (use_stmt_info))
9462 : {
9463 24420 : if (dump_enabled_p ())
9464 : {
9465 36 : dump_printf_loc (MSG_NOTE, vect_location,
9466 : "stmt considered live: %G",
9467 : stmt->stmt);
9468 36 : dump_printf_loc (MSG_NOTE, vect_location,
9469 : "because of use in: %G",
9470 : use_stmt);
9471 : }
9472 : live_p = true; /* The walk over the remaining uses continues. */
9473 : }
9474 401471 : }
9475 : }
9476 2121835 : if (live_p)
9477 15645 : continue; /* Neither cost this stmt nor queue the defs feeding it. */
9478 : }
9479 :
9480 : /* The following assert verifies that vect_bb_partition_graph
9481 : partitions the SLP graph in a way that each scalar stmt of
9482 : the coverage of the SLP graph belongs to exactly one subgraph.
9483 : ??? This is currently not guaranteed since the function
9484 : works purely on SLP_TREE_SCALAR_STMTS, resulting in the assert
9485 : tripping or scalar stmts costed multiple times, making vectorization
9486 : more profitable than it really is. */
9487 : /* gcc_checking_assert (!gimple_visited_p (stmt->stmt)); */
9488 :
9489 2170298 : if (vect_nop_conversion_p (stmt))
9490 : ; /* No cost is recorded for nop conversions. */
9491 : /* For single-argument PHIs assume coalescing which means zero
9492 : cost for the scalar and the vector PHIs. This avoids
9493 : artificially favoring the vector path (but may pessimize it
9494 : in some cases). */
9495 2148939 : else if (is_a <gphi *> (stmt->stmt)
9496 2148939 : && gimple_phi_num_args (as_a <gphi *> (stmt->stmt)) == 1)
9497 : ;
9498 : else
9499 : {
9500 2140082 : vect_cost_for_stmt kind;
9501 2140082 : if (STMT_VINFO_DATA_REF (stmt))
9502 : {
9503 1963897 : data_reference_p dr = STMT_VINFO_DATA_REF (stmt);
9504 1963897 : tree base = get_base_address (DR_REF (dr));
9505 : /* When the scalar access is to a non-global not
9506 : address-taken decl that is not BLKmode assume we can
9507 : access it with a single non-load/store instruction. */
9508 1963897 : if (DECL_P (base)
9509 1514036 : && !is_global_var (base)
9510 1438187 : && !TREE_ADDRESSABLE (base)
9511 2512939 : && DECL_MODE (base) != BLKmode)
9512 : kind = scalar_stmt;
9513 1820792 : else if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt)))
9514 : kind = scalar_load;
9515 : else
9516 1593384 : kind = scalar_store;
9517 : }
9518 : else
9519 : kind = scalar_stmt;
9520 : /* Cost each scalar stmt only once. */
9521 2140082 : gimple_set_visited (stmt->stmt, true);
9522 2140082 : record_stmt_cost (cost_vec, 1, kind, stmt, NULL_TREE, 0, vect_body);
9523 : }
9524 :
9525 : /* Now walk relevant parts of the SSA use-def graph. */
9526 2170298 : slp_oprnds child_ops (stmt);
9527 4553796 : for (unsigned i = 0; i < child_ops.num_slp_children; ++i)
9528 : {
9529 2383498 : tree op = child_ops.get_op_for_slp_child (stmt, i);
9530 2383498 : stmt_vec_info def = vinfo->lookup_def (op);
9531 2383498 : if (def && !visited.add (def)) /* Queue each def at most once. */
9532 697562 : worklist.safe_push (def);
9533 : }
9534 : }
9535 679602 : }
9536 :
9537 :
9538 : /* Comparator for the loop-index sorted cost vectors. */
9539 :
9540 : static int
9541 17038004 : li_cost_vec_cmp (const void *a_, const void *b_, void *)
9542 : {
9543 17038004 : auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
9544 17038004 : auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
9545 17038004 : if (a->first < b->first)
9546 : return -1;
9547 16196014 : else if (a->first == b->first)
9548 15491323 : return 0;
9549 : return 1;
9550 : }
9551 :
9552 : /* Check if vectorization of the basic block is profitable for the
9553 : subgraph denoted by SLP_INSTANCES of BB_VINFO. Scalar and vector costs are compared separately for the stmts of each containing loop; the subgraph is profitable only if no such part has a vector cost above its scalar cost and no vector cost is left without a matching scalar part. The visited flags set on the costed scalar stmts are cleared before returning unless ORIG_LOOP is non-null and the subgraph is profitable. */
9554 :
9555 : static bool
9556 658896 : vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
9557 : vec<slp_instance> slp_instances,
9558 : loop_p orig_loop)
9559 : {
9560 658896 : slp_instance instance;
9561 658896 : int i;
9562 658896 : unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
9563 658896 : unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
9564 :
9565 658896 : if (dump_enabled_p ())
9566 : {
9567 98 : dump_printf_loc (MSG_NOTE, vect_location, "Costing subgraph: \n");
9568 98 : hash_set<slp_tree> visited;
9569 395 : FOR_EACH_VEC_ELT (slp_instances, i, instance)
9570 101 : vect_print_slp_graph (MSG_NOTE, vect_location,
9571 : SLP_INSTANCE_TREE (instance), visited);
9572 98 : }
9573 :
9574 : /* Then DFS walk scalar stmts, performing costing and handling
9575 : still live scalar stmts via the previously computed vector coverage. */
9576 658896 : stmt_vector_for_cost scalar_costs = vNULL;
9577 658896 : stmt_vector_for_cost vector_costs = vNULL;
9578 658896 : hash_set<slp_tree> visited;
9579 658896 : hash_set<stmt_vec_info> svisited;
9580 1338498 : FOR_EACH_VEC_ELT (slp_instances, i, instance)
9581 : {
9582 679602 : auto_vec<stmt_vec_info> worklist;
9583 679602 : if (SLP_INSTANCE_ROOT_STMTS (instance).exists ())
9584 57632 : record_stmt_cost (&scalar_costs,
9585 28816 : SLP_INSTANCE_ROOT_STMTS (instance).length (),
9586 : scalar_stmt,
9587 28816 : SLP_INSTANCE_ROOT_STMTS (instance)[0], 0, vect_body);
9588 3825954 : for (auto stmt : SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance)))
9589 : {
9590 1787148 : stmt = vect_orig_stmt (stmt);
9591 1787148 : if (!svisited.add (stmt))
9592 1778812 : worklist.safe_push (stmt);
9593 : }
9594 679602 : vect_bb_slp_scalar_cost (bb_vinfo, worklist, &scalar_costs, svisited);
9595 679602 : vector_costs.safe_splice (instance->cost_vec); /* The vector side was costed during analysis and stored on the instance. */
9596 679602 : instance->cost_vec.release ();
9597 679602 : }
9598 :
9599 658896 : if (dump_enabled_p ())
9600 98 : dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
9601 :
9602 : /* When costing non-loop vectorization we need to consider each covered
9603 : loop independently and make sure vectorization is profitable. For
9604 : now we assume a loop may be not entered or executed an arbitrary
9605 : number of iterations (??? static information can provide more
9606 : precise info here) which means we can simply cost each containing
9607 : loops stmts separately. */
9608 :
9609 : /* First produce cost vectors sorted by loop index. */
9610 658896 : auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
9611 658896 : li_scalar_costs (scalar_costs.length ());
9612 658896 : auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
9613 658896 : li_vector_costs (vector_costs.length ());
9614 658896 : stmt_info_for_cost *cost;
9615 2827794 : FOR_EACH_VEC_ELT (scalar_costs, i, cost)
9616 : {
9617 2168898 : unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
9618 2168898 : li_scalar_costs.quick_push (std::make_pair (l, cost));
9619 : }
9620 : /* Use a random used loop as fallback in case the first vector_costs
9621 : entry does not have a stmt_info associated with it. NOTE(review): this indexes element 0 of li_scalar_costs, so it presumably relies on at least one scalar cost having been recorded - confirm. */
9622 658896 : unsigned l = li_scalar_costs[0].first;
9623 2408453 : FOR_EACH_VEC_ELT (vector_costs, i, cost)
9624 : {
9625 : /* We inherit from the previous COST, invariants, externals and
9626 : extracts immediately follow the cost for the related stmt. */
9627 1749557 : if (cost->stmt_info)
9628 1028078 : l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
9629 1749557 : li_vector_costs.quick_push (std::make_pair (l, cost));
9630 : }
9631 658896 : li_scalar_costs.stablesort (li_cost_vec_cmp, NULL);
9632 658896 : li_vector_costs.stablesort (li_cost_vec_cmp, NULL);
9633 :
9634 : /* Now cost the portions individually. SI and VI index the next unprocessed entry of the sorted scalar and vector cost vectors. */
9635 : unsigned vi = 0;
9636 : unsigned si = 0;
9637 1143503 : bool profitable = true;
9638 1143503 : while (si < li_scalar_costs.length ()
9639 1807037 : && vi < li_vector_costs.length ())
9640 : {
9641 663522 : unsigned sl = li_scalar_costs[si].first;
9642 663522 : unsigned vl = li_vector_costs[vi].first;
9643 663522 : if (sl != vl)
9644 : {
9645 1038 : if (dump_enabled_p ())
9646 0 : dump_printf_loc (MSG_NOTE, vect_location,
9647 : "Scalar %d and vector %d loop part do not "
9648 : "match up, skipping scalar part\n", sl, vl);
9649 : /* Skip the scalar part, assuming zero cost on the vector side. */
9650 1708 : do
9651 : {
9652 1708 : si++;
9653 : }
9654 1708 : while (si < li_scalar_costs.length ()
9655 3533 : && li_scalar_costs[si].first == sl);
9656 1038 : continue;
9657 : }
9658 :
9659 662484 : class vector_costs *scalar_target_cost_data = init_cost (bb_vinfo, true);
9660 2149632 : do
9661 : {
9662 2149632 : add_stmt_cost (scalar_target_cost_data, li_scalar_costs[si].second);
9663 2149632 : si++;
9664 : }
9665 2149632 : while (si < li_scalar_costs.length ()
9666 4306840 : && li_scalar_costs[si].first == sl);
9667 662484 : scalar_target_cost_data->finish_cost (nullptr);
9668 662484 : scalar_cost = scalar_target_cost_data->body_cost ();
9669 :
9670 : /* Complete the target-specific vector cost calculation. */
9671 662484 : class vector_costs *vect_target_cost_data = init_cost (bb_vinfo, false);
9672 662484 : auto_vec<stmt_info_for_cost> tem;
9673 1721602 : do
9674 : {
9675 1721602 : tem.safe_push (*li_vector_costs[vi].second);
9676 1721602 : vi++;
9677 : }
9678 1721602 : while (vi < li_vector_costs.length ()
9679 3451940 : && li_vector_costs[vi].first == vl);
9680 662484 : add_slp_costs (vect_target_cost_data, tem);
9681 662484 : vect_target_cost_data->finish_cost (scalar_target_cost_data);
9682 662484 : vec_prologue_cost = vect_target_cost_data->prologue_cost ();
9683 662484 : vec_inside_cost = vect_target_cost_data->body_cost ();
9684 662484 : vec_epilogue_cost = vect_target_cost_data->epilogue_cost ();
9685 662484 : delete scalar_target_cost_data;
9686 662484 : delete vect_target_cost_data;
9687 :
9688 662484 : vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
9689 :
9690 662484 : if (dump_enabled_p ())
9691 : {
9692 98 : dump_printf_loc (MSG_NOTE, vect_location,
9693 : "Cost model analysis for part in loop %d:\n", sl);
9694 98 : dump_printf (MSG_NOTE, " Vector cost: %d\n",
9695 : vec_inside_cost + vec_outside_cost);
9696 98 : dump_printf (MSG_NOTE, " Scalar cost: %d\n", scalar_cost);
9697 : }
9698 :
9699 : /* Vectorization is profitable if its cost is no more than the cost of the scalar
9700 : version. Note that we err on the vector side for equal cost because
9701 : the cost estimate is otherwise quite pessimistic (constant uses are
9702 : free on the scalar side but cost a load on the vector side for
9703 : example). */
9704 662484 : if (vec_outside_cost + vec_inside_cost > scalar_cost)
9705 : {
9706 178915 : profitable = false;
9707 178915 : break;
9708 : }
9709 483569 : }
9710 658896 : if (profitable && vi < li_vector_costs.length ()) /* Vector costs remain that no scalar part pays for. */
9711 : {
9712 1082 : if (dump_enabled_p ())
9713 12 : dump_printf_loc (MSG_NOTE, vect_location,
9714 : "Excess vector cost for part in loop %d:\n",
9715 6 : li_vector_costs[vi].first);
9716 : profitable = false;
9717 : }
9718 :
9719 : /* Unset visited flag. This is delayed when the subgraph is profitable
9720 : and we process the loop for remaining unvectorized if-converted code. */
9721 658896 : if (!orig_loop || !profitable)
9722 2826375 : FOR_EACH_VEC_ELT (scalar_costs, i, cost)
9723 2167582 : gimple_set_visited (cost->stmt_info->stmt, false);
9724 :
9725 658896 : scalar_costs.release ();
9726 658896 : vector_costs.release ();
9727 :
9728 658896 : return profitable;
9729 658896 : }
9730 :
9731 : /* qsort comparator for lane defs. */
9732 :
9733 : static int
9734 40 : vld_cmp (const void *a_, const void *b_)
9735 : {
9736 40 : auto *a = (const std::pair<unsigned, tree> *)a_;
9737 40 : auto *b = (const std::pair<unsigned, tree> *)b_;
9738 40 : return a->first - b->first;
9739 : }
9740 :
9741 : /* Return true if USE_STMT is a vector lane insert into VEC and set
9742 : *THIS_LANE to the lane number that is set. */
9743 :
9744 : static bool
9745 248 : vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
9746 : {
9747 248 : gassign *use_ass = dyn_cast <gassign *> (use_stmt);
9748 91 : if (!use_ass
9749 91 : || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
9750 22 : || (vec
9751 22 : ? gimple_assign_rhs1 (use_ass) != vec
9752 24 : : ((vec = gimple_assign_rhs1 (use_ass)), false))
9753 46 : || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
9754 46 : TREE_TYPE (gimple_assign_rhs2 (use_ass)))
9755 46 : || !constant_multiple_p
9756 46 : (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
9757 92 : tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
9758 : this_lane))
9759 202 : return false;
9760 : return true;
9761 : }
9762 :
9763 : /* Find any vectorizable constructors and add them to the grouped_store
9764 : array. */
9765 :
9766 : static void
9767 2205447 : vect_slp_check_for_roots (bb_vec_info bb_vinfo)
9768 : {
9769 17680234 : for (unsigned i = 0; i < bb_vinfo->nbbs; ++i)
9770 30949574 : for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
9771 137971545 : !gsi_end_p (gsi); gsi_next (&gsi))
9772 : {
9773 122496758 : gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
9774 : /* This can be used to start SLP discovery for early breaks for BB early breaks
9775 : when we get that far. */
9776 122496758 : if (!assign)
9777 184803758 : continue;
9778 :
9779 31028261 : tree rhs = gimple_assign_rhs1 (assign);
9780 31028261 : enum tree_code code = gimple_assign_rhs_code (assign);
9781 31028261 : use_operand_p use_p;
9782 31028261 : gimple *use_stmt;
9783 31028261 : if (code == CONSTRUCTOR)
9784 : {
9785 1597947 : if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
9786 63480 : || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
9787 92543 : CONSTRUCTOR_NELTS (rhs))
9788 42948 : || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
9789 1640891 : || uniform_vector_p (rhs))
9790 1585095 : continue;
9791 :
9792 : unsigned j;
9793 : tree val;
9794 63645 : FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
9795 50793 : if (TREE_CODE (val) != SSA_NAME
9796 50793 : || !bb_vinfo->lookup_def (val))
9797 : break;
9798 31736 : if (j != CONSTRUCTOR_NELTS (rhs))
9799 3016 : continue;
9800 :
9801 12852 : vec<stmt_vec_info> roots = vNULL;
9802 12852 : roots.safe_push (bb_vinfo->lookup_stmt (assign));
9803 12852 : vec<stmt_vec_info> stmts;
9804 12852 : stmts.create (CONSTRUCTOR_NELTS (rhs));
9805 71840 : FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
9806 46136 : stmts.quick_push
9807 46136 : (vect_stmt_to_vectorize (bb_vinfo->lookup_def (val)));
9808 12852 : bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
9809 12852 : stmts, roots));
9810 : }
9811 29430314 : else if (code == BIT_INSERT_EXPR
9812 933 : && VECTOR_TYPE_P (TREE_TYPE (rhs))
9813 611 : && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
9814 611 : && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
9815 608 : && integer_zerop (gimple_assign_rhs3 (assign))
9816 341 : && useless_type_conversion_p
9817 341 : (TREE_TYPE (TREE_TYPE (rhs)),
9818 341 : TREE_TYPE (gimple_assign_rhs2 (assign)))
9819 29430936 : && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
9820 : {
9821 : /* We start to match on insert to lane zero but since the
9822 : inserts need not be ordered we'd have to search both
9823 : the def and the use chains. */
9824 215 : tree vectype = TREE_TYPE (rhs);
9825 215 : unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
9826 215 : auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
9827 215 : auto_sbitmap lanes (nlanes);
9828 215 : bitmap_clear (lanes);
9829 215 : bitmap_set_bit (lanes, 0);
9830 215 : tree def = gimple_assign_lhs (assign);
9831 215 : lane_defs.quick_push
9832 215 : (std::make_pair (0, gimple_assign_rhs2 (assign)));
9833 215 : unsigned lanes_found = 1;
9834 : /* Start with the use chains, the last stmt will be the root. */
9835 215 : stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
9836 215 : vec<stmt_vec_info> roots = vNULL;
9837 215 : roots.safe_push (last);
9838 217 : do
9839 : {
9840 217 : use_operand_p use_p;
9841 217 : gimple *use_stmt;
9842 217 : if (!single_imm_use (def, &use_p, &use_stmt))
9843 : break;
9844 211 : unsigned this_lane;
9845 211 : if (!bb_vinfo->lookup_stmt (use_stmt)
9846 211 : || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
9847 233 : || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
9848 : break;
9849 22 : if (bitmap_bit_p (lanes, this_lane))
9850 : break;
9851 2 : lanes_found++;
9852 2 : bitmap_set_bit (lanes, this_lane);
9853 2 : gassign *use_ass = as_a <gassign *> (use_stmt);
9854 2 : lane_defs.quick_push (std::make_pair
9855 2 : (this_lane, gimple_assign_rhs2 (use_ass)));
9856 2 : last = bb_vinfo->lookup_stmt (use_ass);
9857 2 : roots.safe_push (last);
9858 2 : def = gimple_assign_lhs (use_ass);
9859 : }
9860 2 : while (lanes_found < nlanes);
9861 215 : if (roots.length () > 1)
9862 2 : std::swap(roots[0], roots[roots.length () - 1]);
9863 215 : if (lanes_found < nlanes)
9864 : {
9865 : /* Now search the def chain. */
9866 215 : def = gimple_assign_rhs1 (assign);
9867 217 : do
9868 : {
9869 217 : if (TREE_CODE (def) != SSA_NAME
9870 217 : || !has_single_use (def))
9871 : break;
9872 56 : gimple *def_stmt = SSA_NAME_DEF_STMT (def);
9873 56 : unsigned this_lane;
9874 56 : if (!bb_vinfo->lookup_stmt (def_stmt)
9875 37 : || !vect_slp_is_lane_insert (def_stmt,
9876 : NULL_TREE, &this_lane)
9877 80 : || !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
9878 : break;
9879 24 : if (bitmap_bit_p (lanes, this_lane))
9880 : break;
9881 4 : lanes_found++;
9882 4 : bitmap_set_bit (lanes, this_lane);
9883 8 : lane_defs.quick_push (std::make_pair
9884 4 : (this_lane,
9885 4 : gimple_assign_rhs2 (def_stmt)));
9886 4 : roots.safe_push (bb_vinfo->lookup_stmt (def_stmt));
9887 4 : def = gimple_assign_rhs1 (def_stmt);
9888 : }
9889 4 : while (lanes_found < nlanes);
9890 : }
9891 215 : if (lanes_found == nlanes)
9892 : {
9893 : /* Sort lane_defs after the lane index and register the root. */
9894 2 : lane_defs.qsort (vld_cmp);
9895 2 : vec<stmt_vec_info> stmts;
9896 2 : stmts.create (nlanes);
9897 10 : for (unsigned i = 0; i < nlanes; ++i)
9898 8 : stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
9899 2 : bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
9900 2 : stmts, roots));
9901 : }
9902 : else
9903 213 : roots.release ();
9904 215 : }
9905 29430099 : else if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
9906 28450600 : && (associative_tree_code (code) || code == MINUS_EXPR)
9907 : /* ??? This pessimizes a two-element reduction. PR54400.
9908 : ??? In-order reduction could be handled if we only
9909 : traverse one operand chain in vect_slp_linearize_chain. */
9910 33377145 : && !needs_fold_left_reduction_p (TREE_TYPE (rhs), code)
9911 : /* Ops with constants at the tail can be stripped here. */
9912 5829311 : && TREE_CODE (rhs) == SSA_NAME
9913 5762785 : && TREE_CODE (gimple_assign_rhs2 (assign)) == SSA_NAME
9914 : /* Should be the chain end. */
9915 31720412 : && (!single_imm_use (gimple_assign_lhs (assign),
9916 : &use_p, &use_stmt)
9917 1763985 : || !is_gimple_assign (use_stmt)
9918 1209555 : || (gimple_assign_rhs_code (use_stmt) != code
9919 899164 : && ((code != PLUS_EXPR && code != MINUS_EXPR)
9920 498179 : || (gimple_assign_rhs_code (use_stmt)
9921 498179 : != (code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR))))))
9922 : {
9923 : /* We start the match at the end of a possible association
9924 : chain. */
9925 1882265 : auto_vec<chain_op_t> chain;
9926 1882265 : auto_vec<std::pair<tree_code, gimple *> > worklist;
9927 1882265 : auto_vec<gimple *> chain_stmts;
9928 1882265 : gimple *code_stmt = NULL, *alt_code_stmt = NULL;
9929 1882265 : if (code == MINUS_EXPR)
9930 304825 : code = PLUS_EXPR;
9931 1882265 : internal_fn reduc_fn;
9932 2163934 : if (!reduction_fn_for_scalar_code (code, &reduc_fn)
9933 1882265 : || reduc_fn == IFN_LAST)
9934 281669 : continue;
9935 1600596 : vect_slp_linearize_chain (bb_vinfo, worklist, chain, code, assign,
9936 : /* ??? */
9937 : code_stmt, alt_code_stmt, &chain_stmts,
9938 : false);
9939 3201192 : if (chain.length () > 1)
9940 : {
9941 : /* Sort the chain according to def_type and operation. */
9942 1600596 : chain.sort (dt_sort_cmp, bb_vinfo);
9943 : /* ??? Now we'd want to strip externals and constants
9944 : but record those to be handled in the epilogue. */
9945 : /* ??? For now do not allow mixing ops or externs/constants. */
9946 1600596 : bool invalid = false;
9947 1600596 : unsigned remain_cnt = 0;
9948 1600596 : unsigned last_idx = 0;
9949 4831863 : for (unsigned i = 0; i < chain.length (); ++i)
9950 : {
9951 3536092 : if (chain[i].code != code)
9952 : {
9953 : invalid = true;
9954 : break;
9955 : }
9956 3231267 : if (chain[i].dt != vect_internal_def
9957 : /* Avoid stmts where the def is not the LHS, like
9958 : ASMs. */
9959 6249703 : || (gimple_get_lhs (bb_vinfo->lookup_def
9960 3018436 : (chain[i].op)->stmt)
9961 3018436 : != chain[i].op))
9962 215775 : remain_cnt++;
9963 : else
9964 : last_idx = i;
9965 : }
9966 : /* Make sure to have an even number of lanes as we later do
9967 : all-or-nothing discovery, not trying to split further. */
9968 1600596 : if ((chain.length () - remain_cnt) & 1)
9969 169686 : remain_cnt++;
9970 1600596 : if (!invalid && chain.length () - remain_cnt > 1)
9971 : {
9972 1230348 : vec<stmt_vec_info> stmts;
9973 1230348 : vec<tree> remain = vNULL;
9974 1230348 : stmts.create (chain.length ());
9975 1230348 : if (remain_cnt > 0)
9976 115003 : remain.create (remain_cnt);
9977 3951375 : for (unsigned i = 0; i < chain.length (); ++i)
9978 : {
9979 2721027 : stmt_vec_info stmt_info;
9980 2721027 : if (chain[i].dt == vect_internal_def
9981 2681057 : && ((stmt_info = bb_vinfo->lookup_def (chain[i].op)),
9982 2681057 : gimple_get_lhs (stmt_info->stmt) == chain[i].op)
9983 5402000 : && (i != last_idx
9984 1230348 : || (stmts.length () & 1)))
9985 2594710 : stmts.quick_push (stmt_info);
9986 : else
9987 126317 : remain.quick_push (chain[i].op);
9988 : }
9989 1230348 : vec<stmt_vec_info> roots;
9990 1230348 : roots.create (chain_stmts.length ());
9991 2721027 : for (unsigned i = 0; i < chain_stmts.length (); ++i)
9992 1490679 : roots.quick_push (bb_vinfo->lookup_stmt (chain_stmts[i]));
9993 1230348 : bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_bb_reduc,
9994 1230348 : stmts, roots, remain));
9995 : }
9996 : }
9997 1882265 : }
9998 : }
9999 2205447 : }
10000 :
10001 : /* Walk the grouped store chains and replace entries with their
10002 : pattern variant if any. */
10003 :
10004 : static void
10005 614383 : vect_fixup_store_groups_with_patterns (vec_info *vinfo)
10006 : {
10007 614383 : stmt_vec_info first_element;
10008 614383 : unsigned i;
10009 :
10010 1508085 : FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
10011 : {
10012 : /* We also have CTORs in this array. */
10013 893702 : if (!STMT_VINFO_GROUPED_ACCESS (first_element))
10014 0 : continue;
10015 893702 : if (STMT_VINFO_IN_PATTERN_P (first_element))
10016 : {
10017 252 : stmt_vec_info orig = first_element;
10018 252 : first_element = STMT_VINFO_RELATED_STMT (first_element);
10019 252 : DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
10020 252 : DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
10021 252 : DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
10022 252 : DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
10023 252 : vinfo->grouped_stores[i] = first_element;
10024 : }
10025 893702 : stmt_vec_info prev = first_element;
10026 2511675 : while (DR_GROUP_NEXT_ELEMENT (prev))
10027 : {
10028 1617973 : stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
10029 1617973 : if (STMT_VINFO_IN_PATTERN_P (elt))
10030 : {
10031 849 : stmt_vec_info orig = elt;
10032 849 : elt = STMT_VINFO_RELATED_STMT (elt);
10033 849 : DR_GROUP_NEXT_ELEMENT (prev) = elt;
10034 849 : DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
10035 849 : DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
10036 : }
10037 1617973 : DR_GROUP_FIRST_ELEMENT (elt) = first_element;
10038 1617973 : prev = elt;
10039 : }
10040 : }
10041 614383 : }
10042 :
10043 : /* Check if the region described by BB_VINFO can be vectorized, returning
10044 : true if so. When returning false, set FATAL to true if the same failure
10045 : would prevent vectorization at other vector sizes, false if it is still
10046 : worth trying other sizes. N_STMTS is the number of statements in the
10047 : region. */
10048 :
10049 : static bool
10050 2205447 : vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
10051 : vec<int> *dataref_groups)
10052 : {
10053 2205447 : DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
10054 :
10055 2205447 : slp_instance instance;
10056 2205447 : int i;
10057 :
10058 : /* The first group of checks is independent of the vector size. */
10059 2205447 : fatal = true;
10060 :
10061 : /* Analyze the data references. */
10062 :
10063 2205447 : if (!vect_analyze_data_refs (bb_vinfo, NULL))
10064 : {
10065 0 : if (dump_enabled_p ())
10066 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10067 : "not vectorized: unhandled data-ref in basic "
10068 : "block.\n");
10069 0 : return false;
10070 : }
10071 :
10072 2205447 : if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
10073 : {
10074 0 : if (dump_enabled_p ())
10075 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10076 : "not vectorized: unhandled data access in "
10077 : "basic block.\n");
10078 0 : return false;
10079 : }
10080 :
10081 2205447 : vect_slp_check_for_roots (bb_vinfo);
10082 :
10083 : /* If there are no grouped stores and no constructors in the region
10084 : there is no need to continue with pattern recog as vect_analyze_slp
10085 : will fail anyway. */
10086 2205447 : if (bb_vinfo->grouped_stores.is_empty ()
10087 1862201 : && bb_vinfo->roots.is_empty ())
10088 : {
10089 1591064 : if (dump_enabled_p ())
10090 1022 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10091 : "not vectorized: no grouped stores in "
10092 : "basic block.\n");
10093 1591064 : return false;
10094 : }
10095 :
10096 : /* While the rest of the analysis below depends on it in some way. */
10097 614383 : fatal = false;
10098 :
10099 614383 : vect_pattern_recog (bb_vinfo);
10100 :
10101 : /* Update store groups from pattern processing. */
10102 614383 : vect_fixup_store_groups_with_patterns (bb_vinfo);
10103 :
10104 : /* Check the SLP opportunities in the basic block, analyze and build SLP
10105 : trees. */
10106 614383 : if (!vect_analyze_slp (bb_vinfo, n_stmts, false))
10107 : {
10108 0 : if (dump_enabled_p ())
10109 : {
10110 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10111 : "Failed to SLP the basic block.\n");
10112 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10113 : "not vectorized: failed to find SLP opportunities "
10114 : "in basic block.\n");
10115 : }
10116 0 : return false;
10117 : }
10118 :
10119 : /* Optimize permutations. */
10120 614383 : vect_optimize_slp (bb_vinfo);
10121 :
10122 : /* Gather the loads reachable from the SLP graph entries. */
10123 614383 : vect_gather_slp_loads (bb_vinfo);
10124 :
10125 614383 : vect_record_base_alignments (bb_vinfo);
10126 :
10127 : /* Analyze and verify the alignment of data references and the
10128 : dependence in the SLP instances. */
10129 1406625 : for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
10130 : {
10131 792242 : vect_location = instance->location ();
10132 792242 : if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
10133 792242 : || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
10134 : {
10135 8790 : slp_tree node = SLP_INSTANCE_TREE (instance);
10136 8790 : stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
10137 8790 : if (dump_enabled_p ())
10138 4 : dump_printf_loc (MSG_NOTE, vect_location,
10139 : "removing SLP instance operations starting from: %G",
10140 : stmt_info->stmt);
10141 8790 : vect_free_slp_instance (instance);
10142 8790 : BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
10143 8790 : continue;
10144 8790 : }
10145 :
10146 : /* Mark all the statements that we want to vectorize as relevant. */
10147 783452 : vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
10148 :
10149 783452 : i++;
10150 : }
10151 2235637 : if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
10152 : return false;
10153 :
10154 266190 : if (!vect_slp_analyze_operations (bb_vinfo))
10155 : {
10156 30190 : if (dump_enabled_p ())
10157 87 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10158 : "not vectorized: bad operation in basic block.\n");
10159 30190 : return false;
10160 : }
10161 :
10162 : /* Mark all the statements that we vectorize. */
10163 236000 : vect_bb_slp_mark_stmts_vectorized (bb_vinfo);
10164 :
10165 : /* Compute vectorizable live stmts. */
10166 236000 : vect_bb_slp_mark_live_stmts (bb_vinfo);
10167 :
10168 236000 : vect_bb_partition_graph (bb_vinfo);
10169 :
10170 236000 : return true;
10171 : }
10172 :
10173 : /* Subroutine of vect_slp_bb. Try to vectorize the statements for all
10174 : basic blocks in BBS, returning true on success.
10175 : The region has N_STMTS statements and has the datarefs given by DATAREFS. */
10176 :
10177 : static bool
10178 1883769 : vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
10179 : vec<int> *dataref_groups, unsigned int n_stmts,
10180 : loop_p orig_loop)
10181 : {
10182 1883769 : bb_vec_info bb_vinfo;
10183 1883769 : auto_vector_modes vector_modes;
10184 :
10185 : /* Autodetect first vector size we try. */
10186 1883769 : machine_mode next_vector_mode = VOIDmode;
10187 1883769 : targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
10188 1883769 : unsigned int mode_i = 0;
10189 :
10190 1883769 : vec_info_shared shared;
10191 :
10192 1883769 : machine_mode autodetected_vector_mode = VOIDmode;
10193 2527125 : while (1)
10194 : {
10195 2205447 : bool vectorized = false;
10196 2205447 : bool fatal = false;
10197 2205447 : bb_vinfo = new _bb_vec_info (bbs, &shared);
10198 :
10199 2205447 : bool first_time_p = shared.datarefs.is_empty ();
10200 2205447 : BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
10201 2205447 : if (first_time_p)
10202 1906626 : bb_vinfo->shared->save_datarefs ();
10203 : else
10204 298821 : bb_vinfo->shared->check_datarefs ();
10205 2205447 : bb_vinfo->vector_mode = next_vector_mode;
10206 :
10207 2205447 : if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
10208 : {
10209 236000 : if (dump_enabled_p ())
10210 : {
10211 1506 : dump_printf_loc (MSG_NOTE, vect_location,
10212 : "***** Analysis succeeded with vector mode"
10213 753 : " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
10214 753 : dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
10215 : }
10216 :
10217 236000 : bb_vinfo->shared->check_datarefs ();
10218 :
10219 236000 : bool force_clear = false;
10220 236000 : auto_vec<slp_instance> profitable_subgraphs;
10221 1391102 : for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo))
10222 : {
10223 683102 : if (instance->subgraph_entries.is_empty ())
10224 221683 : continue;
10225 :
10226 662259 : dump_user_location_t saved_vect_location = vect_location;
10227 662259 : vect_location = instance->location ();
10228 662259 : if (!unlimited_cost_model (NULL)
10229 658901 : && !param_vect_allow_possibly_not_worthwhile_vectorizations
10230 1321155 : && !vect_bb_vectorization_profitable_p
10231 658896 : (bb_vinfo, instance->subgraph_entries, orig_loop))
10232 : {
10233 179997 : if (dump_enabled_p ())
10234 28 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10235 : "not vectorized: vectorization is not "
10236 : "profitable.\n");
10237 179997 : vect_location = saved_vect_location;
10238 179997 : continue;
10239 : }
10240 :
10241 482262 : vect_location = saved_vect_location;
10242 482262 : if (!dbg_cnt (vect_slp))
10243 : {
10244 0 : force_clear = true;
10245 0 : continue;
10246 : }
10247 :
10248 482262 : profitable_subgraphs.safe_push (instance);
10249 : }
10250 :
10251 : /* When we're vectorizing an if-converted loop body make sure
10252 : we vectorized all if-converted code. */
10253 395384 : if ((!profitable_subgraphs.is_empty () || force_clear) && orig_loop)
10254 : {
10255 106 : gcc_assert (bb_vinfo->nbbs == 1);
10256 212 : for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]);
10257 4388 : !gsi_end_p (gsi); gsi_next (&gsi))
10258 : {
10259 : /* The costing above left us with DCEable vectorized scalar
10260 : stmts having the visited flag set on profitable
10261 : subgraphs. Do the delayed clearing of the flag here. */
10262 4282 : if (gimple_visited_p (gsi_stmt (gsi)))
10263 : {
10264 1260 : gimple_set_visited (gsi_stmt (gsi), false);
10265 1260 : continue;
10266 : }
10267 3022 : if (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED)
10268 813 : continue;
10269 :
10270 6334 : if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi)))
10271 2666 : if (gimple_assign_rhs_code (ass) == COND_EXPR)
10272 : {
10273 69 : if (!profitable_subgraphs.is_empty ()
10274 31 : && dump_enabled_p ())
10275 0 : dump_printf_loc (MSG_NOTE, vect_location,
10276 : "not profitable because of "
10277 : "unprofitable if-converted scalar "
10278 : "code\n");
10279 38 : profitable_subgraphs.truncate (0);
10280 : }
10281 : }
10282 : }
10283 :
10284 : /* Finally schedule the profitable subgraphs. */
10285 1036984 : for (slp_instance instance : profitable_subgraphs)
10286 : {
10287 482216 : if (!vectorized && dump_enabled_p ())
10288 728 : dump_printf_loc (MSG_NOTE, vect_location,
10289 : "Basic block will be vectorized "
10290 : "using SLP\n");
10291 482216 : vectorized = true;
10292 :
10293 : /* Dump before scheduling as store vectorization will remove
10294 : the original stores and mess with the instance tree
10295 : so querying its location will eventually ICE. */
10296 482216 : if (flag_checking)
10297 1940004 : for (slp_instance sub : instance->subgraph_entries)
10298 493356 : gcc_assert (SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub)));
10299 482216 : unsigned HOST_WIDE_INT bytes;
10300 482216 : if (dump_enabled_p ())
10301 3465 : for (slp_instance sub : instance->subgraph_entries)
10302 : {
10303 918 : tree vtype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub));
10304 1836 : if (GET_MODE_SIZE (TYPE_MODE (vtype)).is_constant (&bytes))
10305 918 : dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
10306 918 : sub->location (),
10307 : "basic block part vectorized using %wu "
10308 : "byte vectors\n", bytes);
10309 : else
10310 : dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
10311 : sub->location (),
10312 : "basic block part vectorized using "
10313 : "variable length vectors\n");
10314 : }
10315 :
10316 482216 : dump_user_location_t saved_vect_location = vect_location;
10317 482216 : vect_location = instance->location ();
10318 :
10319 482216 : vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
10320 :
10321 482216 : vect_location = saved_vect_location;
10322 : }
10323 :
10324 :
10325 : /* Generate the invariant statements. */
10326 236000 : if (!gimple_seq_empty_p (bb_vinfo->inv_pattern_def_seq))
10327 : {
10328 23 : if (dump_enabled_p ())
10329 0 : dump_printf_loc (MSG_NOTE, vect_location,
10330 : "------>generating invariant statements\n");
10331 :
10332 23 : bb_vinfo->insert_seq_on_entry (NULL,
10333 : bb_vinfo->inv_pattern_def_seq);
10334 : }
10335 236000 : }
10336 : else
10337 : {
10338 1969447 : if (dump_enabled_p ())
10339 1314 : dump_printf_loc (MSG_NOTE, vect_location,
10340 : "***** Analysis failed with vector mode %s\n",
10341 1314 : GET_MODE_NAME (bb_vinfo->vector_mode));
10342 : }
10343 :
10344 2205447 : if (mode_i == 0)
10345 1883769 : autodetected_vector_mode = bb_vinfo->vector_mode;
10346 :
10347 2205447 : if (!fatal)
10348 3154940 : while (mode_i < vector_modes.length ()
10349 1765589 : && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
10350 : {
10351 335110 : if (dump_enabled_p ())
10352 1658 : dump_printf_loc (MSG_NOTE, vect_location,
10353 : "***** The result for vector mode %s would"
10354 : " be the same\n",
10355 829 : GET_MODE_NAME (vector_modes[mode_i]));
10356 335110 : mode_i += 1;
10357 : }
10358 :
10359 2205447 : delete bb_vinfo;
10360 :
10361 2205447 : if (mode_i < vector_modes.length ()
10362 2027632 : && VECTOR_MODE_P (autodetected_vector_mode)
10363 2003166 : && (related_vector_mode (vector_modes[mode_i],
10364 : GET_MODE_INNER (autodetected_vector_mode))
10365 1001583 : == autodetected_vector_mode)
10366 4233079 : && (related_vector_mode (autodetected_vector_mode,
10367 520154 : GET_MODE_INNER (vector_modes[mode_i]))
10368 1040308 : == vector_modes[mode_i]))
10369 : {
10370 520154 : if (dump_enabled_p ())
10371 205 : dump_printf_loc (MSG_NOTE, vect_location,
10372 : "***** Skipping vector mode %s, which would"
10373 : " repeat the analysis for %s\n",
10374 205 : GET_MODE_NAME (vector_modes[mode_i]),
10375 205 : GET_MODE_NAME (autodetected_vector_mode));
10376 520154 : mode_i += 1;
10377 : }
10378 :
10379 2205447 : if (vectorized
10380 2046094 : || mode_i == vector_modes.length ()
10381 1868324 : || autodetected_vector_mode == VOIDmode
10382 : /* If vect_slp_analyze_bb_1 signaled that analysis for all
10383 : vector sizes will fail do not bother iterating. */
10384 3047722 : || fatal)
10385 3767538 : return vectorized;
10386 :
10387 : /* Try the next biggest vector size. */
10388 321678 : next_vector_mode = vector_modes[mode_i++];
10389 321678 : if (dump_enabled_p ())
10390 218 : dump_printf_loc (MSG_NOTE, vect_location,
10391 : "***** Re-trying analysis with vector mode %s\n",
10392 218 : GET_MODE_NAME (next_vector_mode));
10393 321678 : }
10394 1883769 : }
10395 :
10396 :
10397 : /* Main entry for the BB vectorizer. Analyze and transform BBS, returns
10398 : true if anything in the basic-block was vectorized. */
10399 :
10400 : static bool
10401 1883769 : vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop)
10402 : {
10403 1883769 : vec<data_reference_p> datarefs = vNULL;
10404 1883769 : auto_vec<int> dataref_groups;
10405 1883769 : int insns = 0;
10406 1883769 : int current_group = 0;
10407 :
10408 12471296 : for (unsigned i = 0; i < bbs.length (); i++)
10409 : {
10410 10587527 : basic_block bb = bbs[i];
10411 89725375 : for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
10412 79137848 : gsi_next (&gsi))
10413 : {
10414 79137848 : gimple *stmt = gsi_stmt (gsi);
10415 79137848 : if (is_gimple_debug (stmt))
10416 49414821 : continue;
10417 :
10418 29723027 : insns++;
10419 :
10420 29723027 : if (gimple_location (stmt) != UNKNOWN_LOCATION)
10421 26676015 : vect_location = stmt;
10422 :
10423 29723027 : if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
10424 : &dataref_groups, current_group))
10425 5113055 : ++current_group;
10426 : }
10427 : /* New BBs always start a new DR group. */
10428 10587527 : ++current_group;
10429 : }
10430 :
10431 1883769 : return vect_slp_region (bbs, datarefs, &dataref_groups, insns, orig_loop);
10432 1883769 : }
10433 :
10434 : /* Special entry for the BB vectorizer. Analyze and transform a single
10435 : if-converted BB with ORIG_LOOPs body being the not if-converted
10436 : representation. Returns true if anything in the basic-block was
10437 : vectorized. */
10438 :
10439 : bool
10440 19332 : vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop)
10441 : {
10442 19332 : auto_vec<basic_block> bbs;
10443 19332 : bbs.safe_push (bb);
10444 19332 : return vect_slp_bbs (bbs, orig_loop);
10445 19332 : }
10446 :
10447 : /* Main entry for the BB vectorizer. Analyze and transform BB, returns
10448 : true if anything in the basic-block was vectorized. */
10449 :
10450 : bool
10451 910766 : vect_slp_function (function *fun)
10452 : {
10453 910766 : bool r = false;
10454 910766 : int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
10455 910766 : auto_bitmap exit_bbs;
10456 910766 : bitmap_set_bit (exit_bbs, EXIT_BLOCK);
10457 910766 : edge entry = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (fun));
10458 910766 : unsigned n = rev_post_order_and_mark_dfs_back_seme (fun, entry, exit_bbs,
10459 910766 : true, rpo, NULL);
10460 :
10461 : /* For the moment split the function into pieces to avoid making
10462 : the iteration on the vector mode moot. Split at points we know
10463 : to not handle well which is CFG merges (SLP discovery doesn't
10464 : handle non-loop-header PHIs) and loop exits. Since pattern
10465 : recog requires reverse iteration to visit uses before defs
10466 : simply chop RPO into pieces. */
10467 910766 : auto_vec<basic_block> bbs;
10468 11509907 : for (unsigned i = 0; i < n; i++)
10469 : {
10470 10599141 : basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
10471 10599141 : bool split = false;
10472 :
10473 : /* Split when a BB is not dominated by the first block. */
10474 19980938 : if (!bbs.is_empty ()
10475 9381797 : && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
10476 : {
10477 666941 : if (dump_enabled_p ())
10478 146 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10479 : "splitting region at dominance boundary bb%d\n",
10480 : bb->index);
10481 : split = true;
10482 : }
10483 : /* Split when the loop determined by the first block
10484 : is exited. This is because we eventually insert
10485 : invariants at region begin. */
10486 18647056 : else if (!bbs.is_empty ()
10487 8714856 : && bbs[0]->loop_father != bb->loop_father
10488 2270550 : && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
10489 : {
10490 3827 : if (dump_enabled_p ())
10491 6 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10492 : "splitting region at loop %d exit at bb%d\n",
10493 3 : bbs[0]->loop_father->num, bb->index);
10494 : split = true;
10495 : }
10496 9928373 : else if (!bbs.is_empty ()
10497 8711029 : && bb->loop_father->header == bb
10498 470204 : && bb->loop_father->dont_vectorize)
10499 : {
10500 7271 : if (dump_enabled_p ())
10501 72 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10502 : "splitting region at dont-vectorize loop %d "
10503 : "entry at bb%d\n",
10504 : bb->loop_father->num, bb->index);
10505 : split = true;
10506 : }
10507 :
10508 11277180 : if (split && !bbs.is_empty ())
10509 : {
10510 678039 : r |= vect_slp_bbs (bbs, NULL);
10511 678039 : bbs.truncate (0);
10512 : }
10513 :
10514 10599141 : if (bbs.is_empty ())
10515 : {
10516 : /* We need to be able to insert at the head of the region which
10517 : we cannot for region starting with a returns-twice call. */
10518 1895383 : if (gcall *first = safe_dyn_cast <gcall *> (first_stmt (bb)))
10519 404420 : if (gimple_call_flags (first) & ECF_RETURNS_TWICE)
10520 : {
10521 303 : if (dump_enabled_p ())
10522 2 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10523 : "skipping bb%d as start of region as it "
10524 : "starts with returns-twice call\n",
10525 : bb->index);
10526 30946 : continue;
10527 : }
10528 : /* If the loop this BB belongs to is marked as not to be vectorized
10529 : honor that also for BB vectorization. */
10530 1895080 : if (bb->loop_father->dont_vectorize)
10531 30643 : continue;
10532 : }
10533 :
10534 10568195 : bbs.safe_push (bb);
10535 :
10536 : /* When we have a stmt ending this block and defining a
10537 : value we have to insert on edges when inserting after it for
10538 : a vector containing its definition. Avoid this for now. */
10539 21136390 : if (gimple *last = *gsi_last_bb (bb))
10540 8573611 : if (gimple_get_lhs (last)
10541 8573611 : && is_ctrl_altering_stmt (last))
10542 : {
10543 275639 : if (dump_enabled_p ())
10544 2 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10545 : "splitting region at control altering "
10546 : "definition %G", last);
10547 275639 : r |= vect_slp_bbs (bbs, NULL);
10548 275639 : bbs.truncate (0);
10549 : }
10550 : }
10551 :
10552 910766 : if (!bbs.is_empty ())
10553 910759 : r |= vect_slp_bbs (bbs, NULL);
10554 :
10555 910766 : free (rpo);
10556 :
10557 910766 : return r;
10558 910766 : }
10559 :
10560 : /* Build a variable-length vector in which the elements in ELTS are repeated
10561 : to a fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
10562 : RESULTS and add any new instructions to SEQ.
10563 :
10564 : The approach we use is:
10565 :
10566 : (1) Find a vector mode VM with integer elements of mode IM.
10567 :
10568 : (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
10569 : ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
10570 : from small vectors to IM.
10571 :
10572 : (3) Duplicate each ELTS'[I] into a vector of mode VM.
10573 :
10574 : (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
10575 : correct byte contents.
10576 :
10577 : (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
10578 :
10579 : We try to find the largest IM for which this sequence works, in order
10580 : to cut down on the number of interleaves. */
10581 :
10582 : void
10583 0 : duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
10584 : const vec<tree> &elts, unsigned int nresults,
10585 : vec<tree> &results)
10586 : {
10587 0 : unsigned int nelts = elts.length ();
10588 0 : tree element_type = TREE_TYPE (vector_type);
10589 :
10590 : /* (1) Find a vector mode VM with integer elements of mode IM. */
10591 0 : unsigned int nvectors = 1;
10592 0 : tree new_vector_type;
10593 0 : tree permutes[2];
10594 0 : if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
10595 : &nvectors, &new_vector_type,
10596 : permutes))
10597 0 : gcc_unreachable ();
10598 :
10599 : /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
10600 0 : unsigned int partial_nelts = nelts / nvectors;
10601 0 : tree partial_vector_type = build_vector_type (element_type, partial_nelts);
10602 :
10603 0 : tree_vector_builder partial_elts;
10604 0 : auto_vec<tree, 32> pieces (nvectors * 2);
10605 0 : pieces.quick_grow_cleared (nvectors * 2);
10606 0 : for (unsigned int i = 0; i < nvectors; ++i)
10607 : {
10608 : /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
10609 : ELTS' has mode IM. */
10610 0 : partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
10611 0 : for (unsigned int j = 0; j < partial_nelts; ++j)
10612 0 : partial_elts.quick_push (elts[i * partial_nelts + j]);
10613 0 : tree t = gimple_build_vector (seq, &partial_elts);
10614 0 : t = gimple_build (seq, VIEW_CONVERT_EXPR,
10615 0 : TREE_TYPE (new_vector_type), t);
10616 :
10617 : /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
10618 0 : pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
10619 : }
10620 :
10621 : /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
10622 : correct byte contents.
10623 :
10624 : Conceptually, we need to repeat the following operation log2(nvectors)
10625 : times, where hi_start = nvectors / 2:
10626 :
10627 : out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
10628 : out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
10629 :
10630 : However, if each input repeats every N elements and the VF is
10631 : a multiple of N * 2, the HI result is the same as the LO result.
10632 : This will be true for the first N1 iterations of the outer loop,
10633 : followed by N2 iterations for which both the LO and HI results
10634 : are needed. I.e.:
10635 :
10636 : N1 + N2 = log2(nvectors)
10637 :
10638 : Each "N1 iteration" doubles the number of redundant vectors and the
10639 : effect of the process as a whole is to have a sequence of nvectors/2**N1
10640 : vectors that repeats 2**N1 times. Rather than generate these redundant
10641 : vectors, we halve the number of vectors for each N1 iteration. */
10642 : unsigned int in_start = 0;
10643 : unsigned int out_start = nvectors;
10644 : unsigned int new_nvectors = nvectors;
10645 0 : for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
10646 : {
10647 0 : unsigned int hi_start = new_nvectors / 2;
10648 0 : unsigned int out_i = 0;
10649 0 : for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i)
10650 : {
10651 0 : if ((in_i & 1) != 0
10652 0 : && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
10653 : 2 * in_repeat))
10654 0 : continue;
10655 :
10656 0 : tree output = make_ssa_name (new_vector_type);
10657 0 : tree input1 = pieces[in_start + (in_i / 2)];
10658 0 : tree input2 = pieces[in_start + (in_i / 2) + hi_start];
10659 0 : gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
10660 : input1, input2,
10661 : permutes[in_i & 1]);
10662 0 : gimple_seq_add_stmt (seq, stmt);
10663 0 : pieces[out_start + out_i] = output;
10664 0 : out_i += 1;
10665 : }
10666 0 : std::swap (in_start, out_start);
10667 0 : new_nvectors = out_i;
10668 : }
10669 :
10670 : /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
10671 0 : results.reserve (nresults);
10672 0 : for (unsigned int i = 0; i < nresults; ++i)
10673 0 : if (i < new_nvectors)
10674 0 : results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
10675 0 : pieces[in_start + i]));
10676 : else
10677 0 : results.quick_push (results[i - new_nvectors]);
10678 0 : }
10679 :
10680 :
10681 : /* For constant and loop invariant defs in OP_NODE this function creates
10682 : vector defs that will be used in the vectorized stmts and stores them
10683 : to SLP_TREE_VEC_DEFS of OP_NODE.
      :
      : VINFO is the vectorization context. It supplies the number of vectors
      : to create (vect_get_num_copies), is queried for defs that live inside
      : the region when it is a bb_vec_info, and receives the generated
      : statements when no such def constrains the insert location.
      :
      : The scalar operands in OP_NODE->ops are walked from last to first, so
      : the vectors are built back to front and reversed before being stored. */
10684 :
10685 : static void
10686 491495 : vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
10687 : {
10688 491495 : unsigned HOST_WIDE_INT nunits;
10689 491495 : tree vec_cst;
10690 491495 : unsigned j, number_of_places_left_in_vector;
10691 491495 : tree vector_type;
10692 491495 : tree vop;
10693 491495 : int group_size = op_node->ops.length ();
10694 491495 : unsigned int vec_num, i;
10695 491495 : unsigned number_of_copies = 1;
10696 491495 : bool constant_p;
10697 491495 : gimple_seq ctor_seq = NULL;
10698 491495 : auto_vec<tree, 16> permute_results;
10699 :
10700 : /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
10701 491495 : vector_type = SLP_TREE_VECTYPE (op_node);
10702 :
10703 491495 : unsigned int number_of_vectors = vect_get_num_copies (vinfo, op_node);
10704 491495 : SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
10705 491495 : auto_vec<tree> voprnds (number_of_vectors);
10706 :
10707 : /* NUMBER_OF_COPIES is the number of times we need to use the same values in
10708 : created vectors. It is greater than 1 if unrolling is performed.
10709 :
10710 : For example, we have two scalar operands, s1 and s2 (e.g., group of
10711 : strided accesses of size two), while NUNITS is four (i.e., four scalars
10712 : of this type can be packed in a vector). The output vector will contain
10713 : two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
10714 : will be 2).
10715 :
10716 : If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
10717 : containing the operands.
10718 :
10719 : For example, NUNITS is four as before, and the group size is 8
10720 : (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
10721 : {s5, s6, s7, s8}. */
10722 :
10723 : /* When using duplicate_and_interleave, we just need one element for
10724 : each scalar statement. That is, when the number of elements of
      : VECTOR_TYPE is not a compile-time constant, NUNITS is set to
      : GROUP_SIZE. */
10725 491495 : if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
10726 : nunits = group_size;
10727 :
10728 491495 : number_of_copies = nunits * number_of_vectors / group_size;
10729 :
10730 491495 : number_of_places_left_in_vector = nunits;
10731 491495 : constant_p = true;
10732 491495 : tree uniform_elt = NULL_TREE;
10733 491495 : tree_vector_builder elts (vector_type, nunits, 1);
10734 491495 : elts.quick_grow (nunits);
10735 491495 : stmt_vec_info insert_after = NULL;
10736 1464084 : for (j = 0; j < number_of_copies; j++)
10737 : {
10738 972589 : tree op;
10739 3731964 : for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
10740 : {
10741 : /* Create 'vect_ = {op0,op1,...,opn}'. UNIFORM_ELT stays set
      : while every operand placed in the current vector compares equal
      : (operand_equal_p) to the first one placed; a matching operand
      : reuses the element stored just before it in ELTS. */
10742 1786786 : tree orig_op = op;
10743 1786786 : if (number_of_places_left_in_vector == nunits)
10744 : uniform_elt = op;
10745 1165550 : else if (uniform_elt && operand_equal_p (uniform_elt, op))
10746 740162 : op = elts[number_of_places_left_in_vector];
10747 : else
10748 : uniform_elt = NULL_TREE;
10749 1786786 : number_of_places_left_in_vector--;
10750 1786786 : if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
10751 : {
10752 276070 : if (CONSTANT_CLASS_P (op))
10753 : {
10754 100787 : if (VECTOR_BOOLEAN_TYPE_P (vector_type))
10755 : {
10756 : /* Can't use VIEW_CONVERT_EXPR for booleans because
10757 : of possibly different sizes of scalar value and
10758 : vector element. Zero maps to a zero element and one
      : to an all-ones element; any other constant is
      : treated as unreachable. */
10759 51 : if (integer_zerop (op))
10760 51 : op = build_int_cst (TREE_TYPE (vector_type), 0);
10761 0 : else if (integer_onep (op))
10762 0 : op = build_all_ones_cst (TREE_TYPE (vector_type));
10763 : else
10764 0 : gcc_unreachable ();
10765 : }
10766 : else
10767 100736 : op = fold_unary (VIEW_CONVERT_EXPR,
10768 : TREE_TYPE (vector_type), op);
10769 100787 : gcc_assert (op && CONSTANT_CLASS_P (op));
10770 : }
10771 : else
10772 : {
10773 175283 : tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
10774 175283 : gimple *init_stmt;
10775 175283 : if (VECTOR_BOOLEAN_TYPE_P (vector_type))
10776 : {
10777 403 : tree true_val
10778 403 : = build_all_ones_cst (TREE_TYPE (vector_type));
10779 403 : tree false_val
10780 403 : = build_zero_cst (TREE_TYPE (vector_type));
10781 403 : gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
10782 403 : init_stmt = gimple_build_assign (new_temp, COND_EXPR,
10783 : op, true_val,
10784 : false_val);
10785 : }
10786 : else
10787 : {
10788 174880 : op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
10789 : op);
10790 174880 : init_stmt
10791 174880 : = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
10792 : op);
10793 : }
10794 175283 : gimple_seq_add_stmt (&ctor_seq, init_stmt);
10795 175283 : op = new_temp;
10796 : }
10797 : }
10798 1786786 : elts[number_of_places_left_in_vector] = op;
10799 1786786 : if (!CONSTANT_CLASS_P (op))
10800 317003 : constant_p = false;
10801 : /* For BB vectorization we have to compute an insert location
10802 : when a def is inside the analyzed region since we cannot
10803 : simply insert at the BB start in this case. INSERT_AFTER
      : accumulates, via get_later_stmt, over all such defs used by
      : the vector currently being built. */
10804 1786786 : stmt_vec_info opdef;
10805 1786786 : if (TREE_CODE (orig_op) == SSA_NAME
10806 181966 : && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
10807 162156 : && is_a <bb_vec_info> (vinfo)
10808 1891345 : && (opdef = vinfo->lookup_def (orig_op)))
10809 : {
10810 85536 : if (!insert_after)
10811 : insert_after = opdef;
10812 : else
10813 47162 : insert_after = get_later_stmt (insert_after, opdef);
10814 : }
10815 :
10816 1786786 : if (number_of_places_left_in_vector == 0)
10817 : {
10818 621236 : auto type_nunits = TYPE_VECTOR_SUBPARTS (vector_type);
10819 621236 : if (uniform_elt)
10820 646912 : vec_cst = gimple_build_vector_from_val (&ctor_seq, vector_type,
10821 323456 : elts[0]);
10822 595560 : else if (constant_p
10823 595560 : ? multiple_p (type_nunits, nunits)
10824 109421 : : known_eq (type_nunits, nunits))
10825 297780 : vec_cst = gimple_build_vector (&ctor_seq, &elts);
10826 : else
10827 : {
10828 0 : if (permute_results.is_empty ())
10829 0 : duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
10830 : elts, number_of_vectors,
10831 : permute_results);
10832 0 : vec_cst = permute_results[number_of_vectors - j - 1];
10833 : }
10834 621236 : if (!gimple_seq_empty_p (ctor_seq))
10835 : {
10836 136692 : if (insert_after)
10837 : {
10838 38374 : gimple_stmt_iterator gsi;
10839 38374 : if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
10840 : {
10841 630 : gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
10842 630 : gsi_insert_seq_before (&gsi, ctor_seq,
10843 : GSI_CONTINUE_LINKING);
10844 : }
10845 37744 : else if (!stmt_ends_bb_p (insert_after->stmt))
10846 : {
10847 37744 : gsi = gsi_for_stmt (insert_after->stmt);
10848 37744 : gsi_insert_seq_after (&gsi, ctor_seq,
10849 : GSI_CONTINUE_LINKING);
10850 : }
10851 : else
10852 : {
10853 : /* When we want to insert after a def where the
10854 : defining stmt throws then insert on the fallthru
10855 : edge. The insertion is asserted not to create a
      : new basic block. */
10856 0 : edge e = find_fallthru_edge
10857 0 : (gimple_bb (insert_after->stmt)->succs);
10858 0 : basic_block new_bb
10859 0 : = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
10860 0 : gcc_assert (!new_bb);
10861 : }
10862 : }
10863 : else
10864 98318 : vinfo->insert_seq_on_entry (NULL, ctor_seq);
10865 136692 : ctor_seq = NULL;
10866 : }
10867 621236 : voprnds.quick_push (vec_cst);
10868 621236 : insert_after = NULL;
10869 621236 : number_of_places_left_in_vector = nunits;
10870 621236 : constant_p = true;
10871 621236 : elts.new_vector (vector_type, nunits, 1);
10872 621236 : elts.quick_grow (nunits);
10873 : }
10874 : }
10875 : }
10876 :
10877 : /* Since the vectors are created in the reverse order, we should invert
10878 : them. */
10879 491495 : vec_num = voprnds.length ();
10880 1112731 : for (j = vec_num; j != 0; j--)
10881 : {
10882 621236 : vop = voprnds[j - 1];
10883 621236 : SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
10884 : }
10885 :
10886 : /* In case that VF is greater than the unrolling factor needed for the SLP
10887 : group of stmts, NUMBER_OF_VECTORS to be created is greater than
10888 : NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
10889 : to replicate the vectors. The first VEC_NUM defs are appended again
      : until NUMBER_OF_VECTORS defs are present.
      : NOTE(review): each pass of the inner loop pushes up to VEC_NUM defs
      : without rechecking the outer bound; presumably NUMBER_OF_VECTORS is
      : always a multiple of VEC_NUM here -- confirm. */
10890 491495 : while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
10891 491495 : for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
10892 : i++)
10893 0 : SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
10894 491495 : }
10895 :
10896 : /* Get the scalar definition of the Nth lane from SLP_NODE or NULL_TREE
10897 : if there is no definition for it in the scalar IL or it is not known.
      :
      : For a vect_internal_def node the result is the LHS of the scalar
      : statement recorded for lane N; NULL_TREE is returned when the node
      : has no scalar statements or the entry for lane N is null. For any
      : other def type the result is entry N of SLP_TREE_SCALAR_OPS. N is
      : used directly as an index and is not range-checked here. */
10898 :
10899 : tree
10900 2665 : vect_get_slp_scalar_def (slp_tree slp_node, unsigned n)
10901 : {
10902 2665 : if (SLP_TREE_DEF_TYPE (slp_node) == vect_internal_def)
10903 : {
10904 2653 : if (!SLP_TREE_SCALAR_STMTS (slp_node).exists ())
10905 : return NULL_TREE;
10906 2653 : stmt_vec_info def = SLP_TREE_SCALAR_STMTS (slp_node)[n];
10907 2653 : if (!def)
10908 : return NULL_TREE;
10909 2653 : return gimple_get_lhs (STMT_VINFO_STMT (def));
10910 : }
10911 : else
10912 12 : return SLP_TREE_SCALAR_OPS (slp_node)[n];
10913 : }
10914 :
10915 : /* Get the Ith vectorized definition from SLP_NODE, i.e. element I of
      : SLP_TREE_VEC_DEFS (SLP_NODE). I is used directly as an index. */
10916 :
10917 : tree
10918 145669 : vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
10919 : {
10920 145669 : return SLP_TREE_VEC_DEFS (slp_node)[i];
10921 : }
10922 :
10923 : /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS.
      :
      : *VEC_DEFS is set up with create (), sized to the number of entries in
      : SLP_TREE_VEC_DEFS (SLP_NODE), and those entries are then spliced in.
      : Nothing is released here.
      : NOTE(review): presumably the caller owns *VEC_DEFS and passes it in
      : empty/unallocated -- confirm. */
10924 :
10925 : void
10926 931113 : vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
10927 : {
10928 1862226 : vec_defs->create (SLP_TREE_VEC_DEFS (slp_node).length ());
10929 931113 : vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
10930 931113 : }
10931 :
10932 : /* Get N vectorized definitions for SLP_NODE.
      :
      : For each of the first N children of SLP_NODE, a vector holding that
      : child's vectorized definitions is pushed onto *VEC_OPRNDS, in child
      : order. N == -1U selects all children. The vec_info argument is
      : unnamed and unused.
      : NOTE(review): the vectors are added with quick_push, so the caller
      : presumably has to have reserved room for N more elements in
      : *VEC_OPRNDS -- confirm against vec.h. */
10933 :
10934 : void
10935 2965 : vect_get_slp_defs (vec_info *,
10936 : slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
10937 : {
10938 2965 : if (n == -1U)
10939 2965 : n = SLP_TREE_CHILDREN (slp_node).length ();
10940 :
10941 10681 : for (unsigned i = 0; i < n; ++i)
10942 : {
10943 7716 : slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
10944 7716 : vec<tree> vec_defs = vNULL;
10945 7716 : vect_get_slp_defs (child, &vec_defs);
10946 7716 : vec_oprnds->quick_push (vec_defs);
10947 : }
10948 2965 : }
10949 :
10950 : /* A subroutine of vect_transform_slp_perm_load with two extra arguments:
10951 : - PERM gives the permutation that the caller wants to use for NODE,
10952 : which might be different from SLP_LOAD_PERMUTATION.
10953 : - DUMP_P controls whether the function dumps information.
      :
      : Returns false when the permutation cannot be handled:
      : - a separate mask per vector statement is needed but the number of
      : vector elements or VF is not a compile-time constant;
      : - one output vector would need lanes from three or more input
      : vectors; or
      : - the target rejects a mask (can_vec_perm_const_p).
      : The last two are asserted to happen only when ANALYZE_ONLY.
      :
      : *N_PERMS is reset to zero on entry and counts the permutes needed.
      : If N_LOADS is nonnull, *N_LOADS is set on success. When not
      : ANALYZE_ONLY, the results are stored in SLP_TREE_VEC_DEFS (NODE);
      : identity masks reuse the DR_CHAIN entry instead of emitting a permute.
      :
      : If DCE_CHAIN, each DR_CHAIN entry not recorded as used by a generated
      : result has its defining statement removed, continuing through
      : VIEW_CONVERT_EXPR and CONSTRUCTOR assignments to the definition of
      : their single SSA operand.
      : NOTE(review): that removal is not guarded by !ANALYZE_ONLY; presumably
      : callers never pass DCE_CHAIN when only analyzing -- confirm. */
10954 :
10955 : static bool
10956 129808 : vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node,
10957 : load_permutation_t &perm,
10958 : const vec<tree> &dr_chain,
10959 : gimple_stmt_iterator *gsi, poly_uint64 vf,
10960 : bool analyze_only, bool dump_p,
10961 : unsigned *n_perms, unsigned int *n_loads,
10962 : bool dce_chain)
10963 : {
10964 129808 : stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
10965 129808 : int vec_index = 0;
10966 129808 : tree vectype = SLP_TREE_VECTYPE (node);
10967 129808 : unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
10968 129808 : unsigned int mask_element;
10969 129808 : unsigned dr_group_size;
10970 129808 : machine_mode mode;
10971 :
10972 129808 : if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
10973 : {
10974 : /* We have both splats of the same non-grouped load and groups
10975 : of distinct invariant loads entering here. The group size is
      : taken to be the largest index in PERM plus one. */
10976 1603 : unsigned max_idx = 0;
10977 8819 : for (auto idx : perm)
10978 4010 : max_idx = idx > max_idx ? idx : max_idx;
10979 1603 : dr_group_size = max_idx + 1;
10980 : }
10981 : else
10982 : {
10983 128205 : stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
10984 128205 : dr_group_size = DR_GROUP_SIZE (stmt_info);
10985 : }
10986 :
10987 129808 : mode = TYPE_MODE (vectype);
10988 129808 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
10989 129808 : unsigned int nstmts = vect_get_num_copies (vinfo, node);
10990 :
10991 : /* Initialize the vect stmts of NODE to properly insert the generated
10992 : stmts later. */
10993 129808 : if (! analyze_only)
10994 57144 : for (unsigned i = SLP_TREE_VEC_DEFS (node).length (); i < nstmts; i++)
10995 22052 : SLP_TREE_VEC_DEFS (node).quick_push (NULL_TREE);
10996 :
10997 : /* Generate permutation masks for every NODE. Number of masks for each NODE
10998 : is equal to GROUP_SIZE.
10999 : E.g., we have a group of three nodes with three loads from the same
11000 : location in each node, and the vector size is 4. I.e., we have a
11001 : a0b0c0a1b1c1... sequence and we need to create the following vectors:
11002 : for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
11003 : for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
11004 : ...
11005 :
11006 : The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
11007 : The last mask is illegal since we assume two operands for permute
11008 : operation, and the mask element values can't be outside that range.
11009 : Hence, the last mask must be converted into {2,5,5,5}.
11010 : For the first two permutations we need the first and the second input
11011 : vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
11012 : we need the second and the third vectors: {b1,c1,a2,b2} and
11013 : {c2,a3,b3,c3}. */
11014 :
11015 129808 : int vect_stmts_counter = 0;
11016 129808 : unsigned int index = 0;
11017 129808 : int first_vec_index = -1;
11018 129808 : int second_vec_index = -1;
11019 129808 : bool noop_p = true;
11020 129808 : *n_perms = 0;
11021 :
11022 129808 : vec_perm_builder mask;
11023 129808 : unsigned int nelts_to_build;
11024 129808 : unsigned int nvectors_per_build;
11025 129808 : unsigned int in_nlanes;
11026 129808 : bool repeating_p = (group_size == dr_group_size
11027 164782 : && multiple_p (nunits, group_size));
11028 129808 : if (repeating_p)
11029 : {
11030 : /* A single vector contains a whole number of copies of the node, so:
11031 : (a) all permutes can use the same mask; and
11032 : (b) the permutes only need a single vector input. */
11033 32754 : mask.new_vector (nunits, group_size, 3);
11034 32754 : nelts_to_build = mask.encoded_nelts ();
11035 : /* It's possible to obtain zero nstmts during analyze_only, so make
11036 : it at least one to ensure the later computation for n_perms
11037 : proceeds. */
11038 32754 : nvectors_per_build = nstmts > 0 ? nstmts : 1;
11039 32754 : in_nlanes = dr_group_size * 3;
11040 : }
11041 : else
11042 : {
11043 : /* We need to construct a separate mask for each vector statement.
      : This requires both NUNITS and VF to be compile-time constants. */
11044 97054 : unsigned HOST_WIDE_INT const_nunits, const_vf;
11045 97054 : if (!nunits.is_constant (&const_nunits)
11046 97054 : || !vf.is_constant (&const_vf))
11047 : return false;
11048 97054 : mask.new_vector (const_nunits, const_nunits, 1);
11049 97054 : nelts_to_build = const_vf * group_size;
11050 97054 : nvectors_per_build = 1;
11051 97054 : in_nlanes = const_vf * dr_group_size;
11052 : }
11053 129808 : auto_sbitmap used_in_lanes (in_nlanes);
11054 129808 : bitmap_clear (used_in_lanes);
11055 129808 : auto_bitmap used_defs;
11056 :
11057 129808 : unsigned int count = mask.encoded_nelts ();
11058 129808 : mask.quick_grow (count);
11059 129808 : vec_perm_indices indices;
11060 :
11061 689287 : for (unsigned int j = 0; j < nelts_to_build; j++)
11062 : {
11063 569097 : unsigned int iter_num = j / group_size;
11064 569097 : unsigned int stmt_num = j % group_size;
11065 569097 : unsigned int i = (iter_num * dr_group_size + perm[stmt_num]);
11066 569097 : bitmap_set_bit (used_in_lanes, i);
11067 569097 : if (repeating_p)
11068 : {
11069 : first_vec_index = 0;
11070 : mask_element = i;
11071 : }
11072 : else
11073 : {
11074 : /* Enforced before the loop when !repeating_p. */
11075 359163 : unsigned int const_nunits = nunits.to_constant ();
11076 359163 : vec_index = i / const_nunits;
11077 359163 : mask_element = i % const_nunits;
11078 359163 : if (vec_index == first_vec_index
11079 359163 : || first_vec_index == -1)
11080 : {
11081 : first_vec_index = vec_index;
11082 : }
11083 143777 : else if (vec_index == second_vec_index
11084 143777 : || second_vec_index == -1)
11085 : {
11086 137684 : second_vec_index = vec_index;
11087 137684 : mask_element += const_nunits;
11088 : }
11089 : else
11090 : {
11091 6093 : if (dump_p)
11092 280 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11093 : "permutation requires at "
11094 : "least three vectors %G",
11095 : stmt_info->stmt);
11096 6093 : gcc_assert (analyze_only);
11097 : return false;
11098 : }
11099 :
11100 353070 : gcc_assert (mask_element < 2 * const_nunits);
11101 : }
11102 :
11103 563004 : if (mask_element != index)
11104 362644 : noop_p = false;
11105 563004 : mask[index++] = mask_element;
11106 :
11107 563004 : if (index == count)
11108 : {
11109 153073 : if (!noop_p)
11110 : {
11111 210617 : indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
11112 124624 : if (!can_vec_perm_const_p (mode, mode, indices))
11113 : {
11114 3525 : if (dump_p)
11115 : {
11116 79 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11117 : "unsupported vect permute { ");
11118 669 : for (i = 0; i < count; ++i)
11119 : {
11120 590 : dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
11121 590 : dump_printf (MSG_MISSED_OPTIMIZATION, " ");
11122 : }
11123 79 : dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
11124 : }
11125 3525 : gcc_assert (analyze_only);
11126 : return false;
11127 : }
11128 :
11129 121099 : tree mask_vec = NULL_TREE;
11130 121099 : if (!analyze_only)
11131 20371 : mask_vec = vect_gen_perm_mask_checked (vectype, indices);
11132 :
11133 121099 : if (second_vec_index == -1)
11134 36685 : second_vec_index = first_vec_index;
11135 :
11136 245072 : for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
11137 : {
11138 123973 : ++*n_perms;
11139 123973 : if (analyze_only)
11140 103320 : continue;
11141 : /* Generate the permute statement if necessary. */
11142 20653 : tree first_vec = dr_chain[first_vec_index + ri];
11143 20653 : tree second_vec = dr_chain[second_vec_index + ri];
11144 20653 : gassign *stmt = as_a<gassign *> (stmt_info->stmt);
11145 20653 : tree perm_dest
11146 20653 : = vect_create_destination_var (gimple_assign_lhs (stmt),
11147 : vectype);
11148 20653 : perm_dest = make_ssa_name (perm_dest);
11149 20653 : gimple *perm_stmt
11150 20653 : = gimple_build_assign (perm_dest, VEC_PERM_EXPR, first_vec,
11151 : second_vec, mask_vec);
11152 20653 : vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
11153 : gsi);
11154 20653 : if (dce_chain)
11155 : {
11156 19964 : bitmap_set_bit (used_defs, first_vec_index + ri);
11157 19964 : bitmap_set_bit (used_defs, second_vec_index + ri);
11158 : }
11159 :
11160 : /* Store the vector statement in NODE. */
11161 20653 : SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = perm_dest;
11162 : }
11163 : }
11164 28449 : else if (!analyze_only)
11165 : {
11166 2798 : for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
11167 : {
11168 1399 : tree first_vec = dr_chain[first_vec_index + ri];
11169 : /* If mask was NULL_TREE generate the requested
11170 : identity transform. Here that means reusing the
      : DR_CHAIN entry directly; no statement is emitted. */
11171 1399 : if (dce_chain)
11172 1392 : bitmap_set_bit (used_defs, first_vec_index + ri);
11173 :
11174 : /* Store the vector statement in NODE. */
11175 1399 : SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = first_vec;
11176 : }
11177 : }
11178 :
11179 : index = 0;
11180 : first_vec_index = -1;
11181 : second_vec_index = -1;
11182 : noop_p = true;
11183 : }
11184 : }
11185 :
11186 120190 : if (n_loads)
11187 : {
11188 81612 : if (repeating_p)
11189 10602 : *n_loads = nstmts;
11190 : else
11191 : {
11192 : /* Enforced above when !repeating_p. Count the CONST_NUNITS-sized
      : chunks of input lanes that contain at least one lane marked
      : in USED_IN_LANES; each such chunk counts as one load. */
11193 71010 : unsigned int const_nunits = nunits.to_constant ();
11194 71010 : *n_loads = 0;
11195 71010 : bool load_seen = false;
11196 991368 : for (unsigned i = 0; i < in_nlanes; ++i)
11197 : {
11198 920358 : if (i % const_nunits == 0)
11199 : {
11200 389597 : if (load_seen)
11201 110626 : *n_loads += 1;
11202 : load_seen = false;
11203 : }
11204 920358 : if (bitmap_bit_p (used_in_lanes, i))
11205 253382 : load_seen = true;
11206 : }
11207 71010 : if (load_seen)
11208 48451 : *n_loads += 1;
11209 : }
11210 : }
11211 :
11212 120190 : if (dce_chain)
11213 218752 : for (unsigned i = 0; i < dr_chain.length (); ++i)
11214 71954 : if (!bitmap_bit_p (used_defs, i))
11215 : {
11216 39323 : tree def = dr_chain[i];
11217 39670 : do
11218 : {
11219 39670 : gimple *stmt = SSA_NAME_DEF_STMT (def);
11220 39670 : if (is_gimple_assign (stmt)
11221 39670 : && (gimple_assign_rhs_code (stmt) == VIEW_CONVERT_EXPR
11222 39670 : || gimple_assign_rhs_code (stmt) == CONSTRUCTOR))
11223 4916 : def = single_ssa_tree_operand (stmt, SSA_OP_USE);
11224 : else
11225 : def = NULL;
11226 39670 : gimple_stmt_iterator rgsi = gsi_for_stmt (stmt);
11227 39670 : gsi_remove (&rgsi, true);
11228 39670 : release_defs (stmt);
11229 : }
11230 39670 : while (def);
11231 : }
11232 :
11233 : return true;
11234 129808 : }
11235 :
11236 : /* Generate vector permute statements from a list of loads in DR_CHAIN.
11237 : If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
11238 : permute statements for the SLP node NODE. Store the number of vector
11239 : permute instructions in *N_PERMS and the number of vector load
11240 : instructions in *N_LOADS. If DCE_CHAIN is true, remove all definitions
11241 : that were not needed.
      :
      : This is a wrapper around vect_transform_slp_perm_load_1 that passes
      : SLP_TREE_LOAD_PERMUTATION (NODE) as the permutation and
      : dump_enabled_p () as the dump flag; the return value is that of the
      : subroutine. */
11242 :
11243 : bool
11244 90294 : vect_transform_slp_perm_load (vec_info *vinfo,
11245 : slp_tree node, const vec<tree> &dr_chain,
11246 : gimple_stmt_iterator *gsi, poly_uint64 vf,
11247 : bool analyze_only, unsigned *n_perms,
11248 : unsigned int *n_loads, bool dce_chain)
11249 : {
11250 90294 : return vect_transform_slp_perm_load_1 (vinfo, node,
11251 90294 : SLP_TREE_LOAD_PERMUTATION (node),
11252 : dr_chain, gsi, vf, analyze_only,
11253 : dump_enabled_p (), n_perms, n_loads,
11254 90294 : dce_chain);
11255 : }
11256 :
11257 : /* Produce the next vector result for SLP permutation NODE by adding a vector
11258 : statement at GSI. If MASK_VEC is nonnull, add:
11259 :
11260 : <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC>
11261 :
11262 : otherwise add:
11263 :
11264 : <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF,
11265 : { N, N+1, N+2, ... }>
11266 :
11267 : where N == IDENTITY_OFFSET which is either zero or equal to the
11268 : number of elements of the result.
      :
      : In the MASK_VEC-less case the identity permute is not emitted as a
      : literal VEC_PERM_EXPR. Depending on how the type of the selected input
      : relates to the vector type of NODE, the statement is a plain copy of
      : that input, a BIT_FIELD_REF extracting part of it, or a two-element
      : CONSTRUCTOR of FIRST_DEF and SECOND_DEF.
      :
      : The new statement is recorded in NODE with push_vec_def.
      : NOTE(review): the conversion of SECOND_DEF below is conditioned on the
      : size of FIRST_DEF's type, not SECOND_DEF's -- confirm this is
      : intended. */
11269 :
11270 : static void
11271 31263 : vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
11272 : slp_tree node, tree first_def, tree second_def,
11273 : tree mask_vec, poly_uint64 identity_offset)
11274 : {
11275 31263 : tree vectype = SLP_TREE_VECTYPE (node);
11276 :
11277 : /* ??? We SLP match existing vector element extracts but
11278 : allow punning which we need to re-instantiate at uses
11279 : but have no good way of explicitly representing. Here a
      : same-sized but incompatible FIRST_DEF is converted to VECTYPE
      : with a VIEW_CONVERT_EXPR. */
11280 31263 : if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)), TYPE_SIZE (vectype))
11281 31263 : && !types_compatible_p (TREE_TYPE (first_def), vectype))
11282 : {
11283 14 : gassign *conv_stmt
11284 14 : = gimple_build_assign (make_ssa_name (vectype),
11285 : build1 (VIEW_CONVERT_EXPR, vectype, first_def));
11286 14 : vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
11287 14 : first_def = gimple_assign_lhs (conv_stmt);
11288 : }
11289 31263 : gassign *perm_stmt;
11290 31263 : tree perm_dest = make_ssa_name (vectype);
11291 31263 : if (mask_vec)
11292 : {
11293 27957 : if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)),
11294 27957 : TYPE_SIZE (vectype))
11295 27957 : && !types_compatible_p (TREE_TYPE (second_def), vectype))
11296 : {
11297 8 : gassign *conv_stmt
11298 8 : = gimple_build_assign (make_ssa_name (vectype),
11299 : build1 (VIEW_CONVERT_EXPR,
11300 : vectype, second_def));
11301 8 : vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
11302 8 : second_def = gimple_assign_lhs (conv_stmt);
11303 : }
11304 27957 : perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
11305 : first_def, second_def,
11306 : mask_vec);
11307 : }
11308 : else
11309 : {
11310 3306 : auto def_nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (first_def));
11311 3306 : unsigned HOST_WIDE_INT vecno;
11312 3306 : poly_uint64 eltno;
11313 3306 : if (!can_div_trunc_p (poly_uint64 (identity_offset), def_nunits,
11314 : &vecno, &eltno))
11315 : gcc_unreachable ();
11316 3306 : tree def = vecno & 1 ? second_def : first_def;
11317 3306 : if (!types_compatible_p (TREE_TYPE (def), vectype))
11318 : {
11319 : /* For identity permutes we still need to handle the case
11320 : of offsetted extracts or concats. An input with at least
      : as many elements as VECTYPE is narrowed with a
      : BIT_FIELD_REF at element ELTNO; an input with half as
      : many elements is concatenated with the other input. */
11321 261 : unsigned HOST_WIDE_INT c;
11322 261 : if (known_le (TYPE_VECTOR_SUBPARTS (vectype), def_nunits))
11323 : {
11324 257 : unsigned HOST_WIDE_INT elsz
11325 257 : = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (def))));
11326 514 : tree lowpart = build3 (BIT_FIELD_REF, vectype, def,
11327 257 : TYPE_SIZE (vectype),
11328 257 : bitsize_int (eltno * elsz));
11329 257 : perm_stmt = gimple_build_assign (perm_dest, lowpart);
11330 : }
11331 4 : else if (constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype),
11332 4 : def_nunits, &c) && c == 2)
11333 : {
11334 4 : gcc_assert (known_eq (identity_offset, 0U));
11335 4 : tree ctor = build_constructor_va (vectype, 2,
11336 : NULL_TREE, first_def,
11337 : NULL_TREE, second_def);
11338 4 : perm_stmt = gimple_build_assign (perm_dest, ctor);
11339 : }
11340 : else
11341 0 : gcc_unreachable ();
11342 : }
11343 : else
11344 : {
11345 : /* We need a copy here in case the def was external. */
11346 3045 : gcc_assert (known_eq (eltno, 0U));
11347 3045 : perm_stmt = gimple_build_assign (perm_dest, def);
11348 : }
11349 : }
11350 31263 : vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
11351 : /* Store the vector statement in NODE. */
11352 31263 : node->push_vec_def (perm_stmt);
11353 31263 : }
11354 :
11355 : /* Subroutine of vectorizable_slp_permutation. Check whether the target
11356 : can perform permutation PERM on the (1 or 2) input nodes in CHILDREN.
11357 : If GSI is nonnull, emit the permutation there.
11358 :
11359 : When GSI is null, the only purpose of NODE is to give properties
11360 : of the result, such as the vector type and number of SLP lanes.
11361 : The node does not need to be a VEC_PERM_EXPR.
11362 :
11363 : If the target supports the operation, return the number of individual
11364 : VEC_PERM_EXPRs needed, otherwise return -1. Print information to the
11365 : dump file if DUMP_P is true. */
11366 :
11367 : static int
11368 490300 : vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
11369 : slp_tree node, lane_permutation_t &perm,
11370 : vec<slp_tree> &children, bool dump_p)
11371 : {
11372 490300 : tree vectype = SLP_TREE_VECTYPE (node);
11373 :
11374 : /* ??? We currently only support all same vector input types
11375 : while the SLP IL should really do a concat + select and thus accept
11376 : arbitrary mismatches. */
11377 490300 : slp_tree child;
11378 490300 : unsigned i;
11379 490300 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
11380 490300 : bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node));
11381 : /* True if we're permuting a single input of 2N vectors down
11382 : to N vectors. This case doesn't generalize beyond 2 since
11383 : VEC_PERM_EXPR only takes 2 inputs. */
11384 490300 : bool pack_p = false;
11385 : /* If we're permuting inputs of N vectors each into X*N outputs,
11386 : this is the value of X, otherwise it is 1. */
11387 490300 : unsigned int unpack_factor = 1;
11388 490300 : tree op_vectype = NULL_TREE;
11389 491865 : FOR_EACH_VEC_ELT (children, i, child)
11390 491790 : if (SLP_TREE_VECTYPE (child))
11391 : {
11392 : op_vectype = SLP_TREE_VECTYPE (child);
11393 : break;
11394 : }
11395 490300 : if (!op_vectype)
11396 75 : op_vectype = vectype;
11397 1064748 : FOR_EACH_VEC_ELT (children, i, child)
11398 : {
11399 574448 : if ((SLP_TREE_DEF_TYPE (child) != vect_internal_def
11400 10467 : && !vect_maybe_update_slp_op_vectype (child, op_vectype))
11401 574448 : || !types_compatible_p (SLP_TREE_VECTYPE (child), op_vectype)
11402 1148896 : || !types_compatible_p (TREE_TYPE (vectype), TREE_TYPE (op_vectype)))
11403 : {
11404 0 : if (dump_p)
11405 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11406 : "Unsupported vector types in lane permutation\n");
11407 0 : return -1;
11408 : }
11409 574448 : auto op_nunits = TYPE_VECTOR_SUBPARTS (op_vectype);
11410 574448 : unsigned int this_unpack_factor;
11411 : /* Detect permutations of external, pre-existing vectors. The external
11412 : node's SLP_TREE_LANES stores the total number of units in the vector,
11413 : or zero if the vector has variable length.
11414 :
11415 : We are expected to keep the original VEC_PERM_EXPR for such cases.
11416 : There is no repetition to model. */
11417 574448 : if (SLP_TREE_DEF_TYPE (child) == vect_external_def
11418 574448 : && SLP_TREE_SCALAR_OPS (child).is_empty ())
11419 : repeating_p = false;
11420 : /* Check whether the input has twice as many lanes per vector. */
11421 566540 : else if (children.length () == 1
11422 566540 : && known_eq (SLP_TREE_LANES (child) * nunits,
11423 : SLP_TREE_LANES (node) * op_nunits * 2))
11424 : pack_p = true;
11425 : /* Check whether the output has N times as many lanes per vector. */
11426 574448 : else if (constant_multiple_p (SLP_TREE_LANES (node) * op_nunits,
11427 522767 : SLP_TREE_LANES (child) * nunits,
11428 : &this_unpack_factor)
11429 487833 : && (i == 0 || unpack_factor == this_unpack_factor))
11430 : unpack_factor = this_unpack_factor;
11431 : else
11432 : repeating_p = false;
11433 : }
11434 :
11435 980600 : gcc_assert (perm.length () == SLP_TREE_LANES (node));
11436 :
11437 : /* Load-lanes permute. This permute only acts as a forwarder to
11438 : select the correct vector def of the load-lanes load which
11439 : has the permuted vectors in its vector defs like
11440 : { v0, w0, r0, v1, w1, r1 ... } for a ld3. All costs are
11441 : accounted for in the costing for the actual load so we
11442 : return zero here. */
11443 490300 : if (node->ldst_lanes)
11444 : {
11445 0 : gcc_assert (children.length () == 1);
11446 0 : if (!gsi)
11447 : /* This is a trivial op always supported. */
11448 : return 0;
11449 0 : slp_tree child = children[0];
11450 0 : unsigned vec_idx = (SLP_TREE_LANE_PERMUTATION (node)[0].second
11451 0 : / SLP_TREE_LANES (node));
11452 0 : unsigned vec_num = SLP_TREE_LANES (child) / SLP_TREE_LANES (node);
11453 0 : unsigned nvectors = vect_get_num_copies (vinfo, node);
11454 0 : for (unsigned i = 0; i < nvectors; ++i)
11455 : {
11456 0 : tree def = SLP_TREE_VEC_DEFS (child)[i * vec_num + vec_idx];
11457 0 : node->push_vec_def (def);
11458 : }
11459 : return 0;
11460 : }
11461 :
11462 : /* Set REPEATING_P to true if the permutations are cyclical wrt UNPACK_FACTOR
11463 : and if we can generate the vectors in a vector-length agnostic way.
11464 : This requires UNPACK_STEP == NUNITS / UNPACK_FACTOR to be known at
11465 : compile time.
11466 :
11467 : The significance of UNPACK_STEP is that, when PACK_P is false,
11468 : output vector I operates on a window of UNPACK_STEP elements from each
11469 : input, starting at lane UNPACK_STEP * (I % UNPACK_FACTOR). For example,
11470 : when UNPACK_FACTOR is 2, the first output vector operates on lanes
11471 : [0, NUNITS / 2 - 1] of each input vector and the second output vector
11472 : operates on lanes [NUNITS / 2, NUNITS - 1] of each input vector.
11473 :
11474 : When REPEATING_P is true, NOUTPUTS holds the total number of outputs
11475 : that we actually need to generate. */
11476 490300 : uint64_t noutputs = 0;
11477 490300 : poly_uint64 unpack_step = 0;
11478 490300 : loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo);
11479 182124 : if (!linfo
11480 529416 : || !multiple_p (nunits, unpack_factor, &unpack_step)
11481 181190 : || !constant_multiple_p (LOOP_VINFO_VECT_FACTOR (linfo)
11482 181190 : * SLP_TREE_LANES (node), nunits, &noutputs))
11483 : repeating_p = false;
11484 :
11485 : /* We can handle the conditions described for REPEATING_P above for
11486 : both variable- and constant-length vectors. The fallback requires
11487 : us to generate every element of every permute vector explicitly,
11488 : which is only possible for constant-length permute vectors.
11489 :
11490 : Set:
11491 :
11492 : - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
11493 : mask vectors that we want to build.
11494 :
11495 : - NCOPIES to the number of copies of PERM that we need in order
11496 : to build the necessary permute mask vectors. */
11497 181190 : uint64_t npatterns;
11498 181190 : unsigned nelts_per_pattern;
11499 181190 : uint64_t ncopies;
11500 181190 : if (repeating_p)
11501 : {
11502 : /* We need permute mask vectors that have the form:
11503 :
11504 : { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }
11505 :
11506 : In other words, the original n-element permute in PERM is
11507 : "unrolled" to fill a full vector. The stepped vector encoding
11508 : that we use for permutes requires 3n elements. */
11509 142074 : npatterns = SLP_TREE_LANES (node);
11510 142074 : nelts_per_pattern = ncopies = 3;
11511 : }
11512 : else
11513 : {
11514 : /* Calculate every element of every permute mask vector explicitly,
11515 : instead of relying on the pattern described above. */
11516 348226 : if (!nunits.is_constant (&npatterns)
11517 348226 : || !TYPE_VECTOR_SUBPARTS (op_vectype).is_constant ())
11518 : {
11519 : if (dump_p)
11520 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11521 : "unsupported permutation %p on variable-length"
11522 : " vectors\n", (void *) node);
11523 : return -1;
11524 : }
11525 348226 : nelts_per_pattern = ncopies = 1;
11526 348226 : if (linfo && !LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
11527 : {
11528 : if (dump_p)
11529 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11530 : "unsupported permutation %p for variable VF\n",
11531 : (void *) node);
11532 : return -1;
11533 : }
11534 : pack_p = false;
11535 : unpack_factor = 1;
11536 : }
11537 490300 : unsigned olanes = unpack_factor * ncopies * SLP_TREE_LANES (node);
11538 490300 : gcc_assert (repeating_p || multiple_p (olanes, nunits));
11539 :
11540 : /* Compute the { { SLP operand, vector index}, lane } permutation sequence
11541 : from the { SLP operand, scalar lane } permutation as recorded in the
11542 : SLP node as intermediate step. This part should already work
11543 : with SLP children with arbitrary number of lanes. */
11544 490300 : auto_vec<std::pair<std::pair<unsigned, unsigned>, poly_uint64>> vperm;
11545 490300 : auto_vec<poly_uint64> active_lane;
11546 490300 : vperm.create (olanes);
11547 490300 : active_lane.safe_grow_cleared (children.length (), true);
11548 988827 : for (unsigned int ui = 0; ui < unpack_factor; ++ui)
11549 : {
11550 2178810 : for (unsigned j = 0; j < children.length (); ++j)
11551 590878 : active_lane[j] = ui * unpack_step;
11552 1397593 : for (unsigned i = 0; i < ncopies; ++i)
11553 : {
11554 5597910 : for (unsigned pi = 0; pi < perm.length (); ++pi)
11555 : {
11556 1899889 : std::pair<unsigned, unsigned> p = perm[pi];
11557 1899889 : tree vtype = SLP_TREE_VECTYPE (children[p.first]);
11558 1899889 : if (repeating_p)
11559 827625 : vperm.quick_push ({{p.first, 0},
11560 827625 : p.second + active_lane[p.first]});
11561 : else
11562 : {
11563 : /* We checked above that the vectors are constant-length. */
11564 1072264 : unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype)
11565 1072264 : .to_constant ();
11566 1072264 : unsigned lane = active_lane[p.first].to_constant ();
11567 1072264 : unsigned vi = (lane + p.second) / vnunits;
11568 1072264 : unsigned vl = (lane + p.second) % vnunits;
11569 1072264 : vperm.quick_push ({{p.first, vi}, vl});
11570 : }
11571 : }
11572 : /* Advance to the next group. */
11573 1954347 : for (unsigned j = 0; j < children.length (); ++j)
11574 1055281 : active_lane[j] += SLP_TREE_LANES (children[j]);
11575 : }
11576 : }
11577 :
11578 490300 : if (dump_p)
11579 : {
11580 8909 : dump_printf_loc (MSG_NOTE, vect_location,
11581 : "vectorizing permutation %p", (void *)node);
11582 32209 : for (unsigned i = 0; i < perm.length (); ++i)
11583 23300 : dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
11584 8909 : if (repeating_p)
11585 7502 : dump_printf (MSG_NOTE, " (repeat %d)", SLP_TREE_LANES (node));
11586 8909 : dump_printf (MSG_NOTE, "\n");
11587 8909 : dump_printf_loc (MSG_NOTE, vect_location, "as");
11588 89301 : for (unsigned i = 0; i < vperm.length (); ++i)
11589 : {
11590 80392 : if (i != 0
11591 80392 : && (repeating_p
11592 54232 : ? multiple_p (i, npatterns)
11593 59784 : : multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype))))
11594 24113 : dump_printf (MSG_NOTE, ",");
11595 80392 : dump_printf (MSG_NOTE, " vops%u[%u][",
11596 80392 : vperm[i].first.first, vperm[i].first.second);
11597 80392 : dump_dec (MSG_NOTE, vperm[i].second);
11598 80392 : dump_printf (MSG_NOTE, "]");
11599 : }
11600 8909 : dump_printf (MSG_NOTE, "\n");
11601 : }
11602 :
11603 : /* We can only handle two-vector permutes, everything else should
11604 : be lowered on the SLP level. The following is closely inspired
11605 : by vect_transform_slp_perm_load and is supposed to eventually
11606 : replace it.
11607 : ??? As intermediate step do code-gen in the SLP tree representation
11608 : somehow? */
11609 490300 : std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
11610 490300 : std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
11611 490300 : unsigned int index = 0;
11612 490300 : poly_uint64 mask_element;
11613 490300 : vec_perm_builder mask;
11614 490300 : mask.new_vector (nunits, npatterns, nelts_per_pattern);
11615 490300 : unsigned int count = mask.encoded_nelts ();
11616 490300 : mask.quick_grow (count);
11617 490300 : vec_perm_indices indices;
11618 490300 : unsigned nperms = 0;
11619 : /* When REPEATING_P is true, we only have UNPACK_FACTOR unique permute
11620 : vectors to check during analysis, but we need to generate NOUTPUTS
11621 : vectors during transformation. */
11622 490300 : unsigned total_nelts = olanes;
11623 490300 : unsigned process_nelts = olanes;
11624 490300 : if (repeating_p)
11625 : {
11626 142074 : total_nelts = (total_nelts / unpack_factor) * noutputs;
11627 142074 : if (gsi)
11628 9805 : process_nelts = total_nelts;
11629 : }
11630 490300 : unsigned last_ei = (total_nelts - 1) % process_nelts;
11631 2399472 : for (unsigned i = 0; i < process_nelts; ++i)
11632 : {
11633 : /* VI is the input vector index when generating code for REPEATING_P. */
11634 1916513 : unsigned vi = i / olanes * (pack_p ? 2 : 1);
11635 1916513 : unsigned ei = i % olanes;
11636 1916513 : mask_element = vperm[ei].second;
11637 1916513 : if (pack_p)
11638 : {
11639 : /* In this case, we have N outputs and the single child provides 2N
11640 : inputs. Output X permutes inputs 2X and 2X+1.
11641 :
11642 : The mask indices are taken directly from the SLP permutation node.
11643 : Index X selects from the first vector if (X / NUNITS) % 2 == 0;
11644 : X selects from the second vector otherwise. These conditions
11645 : are only known at compile time for constant-length vectors. */
11646 : first_vec = std::make_pair (0, 0);
11647 : second_vec = std::make_pair (0, 1);
11648 : }
11649 1747877 : else if (first_vec.first == -1U
11650 1747877 : || first_vec == vperm[ei].first)
11651 1515665 : first_vec = vperm[ei].first;
11652 232212 : else if (second_vec.first == -1U
11653 232212 : || second_vec == vperm[ei].first)
11654 : {
11655 231815 : second_vec = vperm[ei].first;
11656 231815 : mask_element += nunits;
11657 : }
11658 : else
11659 : {
11660 397 : if (dump_p)
11661 7 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11662 : "permutation requires at "
11663 : "least three vectors\n");
11664 397 : gcc_assert (!gsi);
11665 : return -1;
11666 : }
11667 :
11668 1916116 : mask[index++] = mask_element;
11669 :
11670 1916116 : if (index == count)
11671 : {
11672 806993 : indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
11673 : TYPE_VECTOR_SUBPARTS (op_vectype));
11674 632810 : bool identity_p = (indices.series_p (0, 1, mask[0], 1)
11675 968846 : && constant_multiple_p (mask[0], nunits));
11676 632810 : machine_mode vmode = TYPE_MODE (vectype);
11677 632810 : machine_mode op_vmode = TYPE_MODE (op_vectype);
11678 632810 : unsigned HOST_WIDE_INT c;
11679 632810 : if ((!identity_p
11680 589488 : && !can_vec_perm_const_p (vmode, op_vmode, indices))
11681 632810 : || (identity_p
11682 43322 : && !known_le (nunits,
11683 : TYPE_VECTOR_SUBPARTS (op_vectype))
11684 6952 : && (!constant_multiple_p (nunits,
11685 8 : TYPE_VECTOR_SUBPARTS (op_vectype),
11686 8 : &c) || c != 2)))
11687 : {
11688 6944 : if (dump_p)
11689 : {
11690 152 : dump_printf_loc (MSG_MISSED_OPTIMIZATION,
11691 : vect_location,
11692 : "unsupported vect permute { ");
11693 1586 : for (i = 0; i < count; ++i)
11694 : {
11695 1434 : dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
11696 1434 : dump_printf (MSG_MISSED_OPTIMIZATION, " ");
11697 : }
11698 152 : dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
11699 : }
11700 6944 : gcc_assert (!gsi);
11701 7341 : return -1;
11702 : }
11703 :
11704 625866 : if (!identity_p)
11705 582544 : nperms += CEIL (total_nelts, process_nelts) - (ei > last_ei);
11706 625866 : if (gsi)
11707 : {
11708 31263 : if (second_vec.first == -1U)
11709 7079 : second_vec = first_vec;
11710 :
11711 31263 : slp_tree
11712 31263 : first_node = children[first_vec.first],
11713 31263 : second_node = children[second_vec.first];
11714 :
11715 31263 : tree mask_vec = NULL_TREE;
11716 31263 : if (!identity_p)
11717 27957 : mask_vec = vect_gen_perm_mask_checked (vectype, indices);
11718 :
11719 31263 : tree first_def
11720 31263 : = vect_get_slp_vect_def (first_node, first_vec.second + vi);
11721 31263 : tree second_def
11722 31263 : = vect_get_slp_vect_def (second_node, second_vec.second + vi);
11723 31263 : vect_add_slp_permutation (vinfo, gsi, node, first_def,
11724 31263 : second_def, mask_vec, mask[0]);
11725 : }
11726 :
11727 : index = 0;
11728 : first_vec = std::make_pair (-1U, -1U);
11729 : second_vec = std::make_pair (-1U, -1U);
11730 : }
11731 : }
11732 :
11733 482959 : return nperms;
11734 490300 : }
11735 :
11736 : /* Vectorize the SLP permutations in NODE as specified
11737 : in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
11738 : child number and lane number.
11739 : Interleaving of two two-lane two-child SLP subtrees (not supported):
11740 : [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
11741 : A blend of two four-lane two-child SLP subtrees:
11742 : [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
11743 : Highpart of a four-lane one-child SLP subtree (not supported):
11744 : [ { 0, 2 }, { 0, 3 } ]
11745 : Where currently only a subset is supported by code generating below. */
11746 :
11747 : bool
11748 139166 : vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
11749 : slp_tree node, stmt_vector_for_cost *cost_vec)
11750 : {
11751 139166 : tree vectype = SLP_TREE_VECTYPE (node);
11752 139166 : lane_permutation_t &perm = SLP_TREE_LANE_PERMUTATION (node);
11753 139166 : int nperms = vectorizable_slp_permutation_1 (vinfo, gsi, node, perm,
11754 139166 : SLP_TREE_CHILDREN (node),
11755 : dump_enabled_p ());
11756 139166 : if (nperms < 0)
11757 : return false;
11758 :
11759 137839 : if (!gsi && nperms != 0)
11760 115857 : record_stmt_cost (cost_vec, nperms, vec_perm, node, vectype, 0, vect_body);
11761 :
11762 : return true;
11763 : }
11764 :
11765 : /* Vectorize SLP NODE. */
11766 :
11767 : static void
11768 1472867 : vect_schedule_slp_node (vec_info *vinfo,
11769 : slp_tree node, slp_instance instance)
11770 : {
11771 1472867 : gimple_stmt_iterator si;
11772 1472867 : int i;
11773 1472867 : slp_tree child;
11774 :
11775 : /* Vectorize externals and constants. */
11776 1472867 : if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
11777 1472867 : || SLP_TREE_DEF_TYPE (node) == vect_external_def)
11778 : {
11779 : /* ??? vectorizable_shift can end up using a scalar operand which is
11780 : currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
11781 : node in this case. */
11782 499294 : if (!SLP_TREE_VECTYPE (node))
11783 499294 : return;
11784 :
11785 : /* There are two reasons vector defs might already exist. The first
11786 : is that we are vectorizing an existing vector def. The second is
11787 : when performing BB vectorization shared constant/external nodes
11788 : are not split apart during partitioning so during the code-gen
11789 : DFS walk we can end up visiting them twice. */
11790 492318 : if (! SLP_TREE_VEC_DEFS (node).exists ())
11791 491495 : vect_create_constant_vectors (vinfo, node);
11792 492318 : return;
11793 : }
11794 :
11795 973573 : stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
11796 :
11797 973573 : gcc_assert (SLP_TREE_VEC_DEFS (node).is_empty ());
11798 973573 : if (SLP_TREE_VECTYPE (node))
11799 973567 : SLP_TREE_VEC_DEFS (node).create (vect_get_num_copies (vinfo, node));
11800 :
11801 973573 : if (!SLP_TREE_PERMUTE_P (node) && STMT_VINFO_DATA_REF (stmt_info))
11802 : {
11803 : /* Vectorized loads go before the first scalar load to make it
11804 : ready early, vectorized stores go before the last scalar
11805 : stmt which is where all uses are ready. */
11806 713130 : stmt_vec_info last_stmt_info = NULL;
11807 713130 : if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
11808 166233 : last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
11809 : else /* DR_IS_WRITE */
11810 546897 : last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
11811 713130 : si = gsi_for_stmt (last_stmt_info->stmt);
11812 713130 : }
11813 260443 : else if (!SLP_TREE_PERMUTE_P (node)
11814 243976 : && (SLP_TREE_TYPE (node) == cycle_phi_info_type
11815 : || SLP_TREE_TYPE (node) == induc_vec_info_type
11816 : || SLP_TREE_TYPE (node) == phi_info_type))
11817 : {
11818 : /* For PHI node vectorization we do not use the insertion iterator. */
11819 53945 : si = gsi_none ();
11820 : }
11821 : else
11822 : {
11823 : /* Emit other stmts after the children vectorized defs which is
11824 : earliest possible. */
11825 : gimple *last_stmt = NULL;
11826 : bool seen_vector_def = false;
11827 574218 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
11828 367720 : if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
11829 : {
11830 : /* For fold-left reductions we are retaining the scalar
11831 : reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
11832 : set so the representation isn't perfect. Resort to the
11833 : last scalar def here. */
11834 294724 : if (SLP_TREE_VEC_DEFS (child).is_empty ())
11835 : {
11836 878 : gcc_assert (SLP_TREE_TYPE (child) == cycle_phi_info_type);
11837 878 : gphi *phi = as_a <gphi *>
11838 878 : (vect_find_last_scalar_stmt_in_slp (child)->stmt);
11839 878 : if (!last_stmt)
11840 : last_stmt = phi;
11841 662 : else if (vect_stmt_dominates_stmt_p (last_stmt, phi))
11842 : last_stmt = phi;
11843 651 : else if (vect_stmt_dominates_stmt_p (phi, last_stmt))
11844 : ;
11845 : else
11846 0 : gcc_unreachable ();
11847 : }
11848 : /* We are emitting all vectorized stmts in the same place and
11849 : the last one is the last.
11850 : ??? Unless we have a load permutation applied and that
11851 : figures to re-use an earlier generated load. */
11852 : unsigned j;
11853 : tree vdef;
11854 697047 : FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
11855 : {
11856 402323 : gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
11857 402323 : if (!last_stmt)
11858 : last_stmt = vstmt;
11859 206563 : else if (vect_stmt_dominates_stmt_p (last_stmt, vstmt))
11860 : last_stmt = vstmt;
11861 45219 : else if (vect_stmt_dominates_stmt_p (vstmt, last_stmt))
11862 : ;
11863 : else
11864 0 : gcc_unreachable ();
11865 : }
11866 : }
11867 72996 : else if (!SLP_TREE_VECTYPE (child))
11868 : {
11869 : /* For externals we use unvectorized at all scalar defs. */
11870 : unsigned j;
11871 : tree def;
11872 14831 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
11873 8491 : if (TREE_CODE (def) == SSA_NAME
11874 8491 : && !SSA_NAME_IS_DEFAULT_DEF (def))
11875 : {
11876 295 : gimple *stmt = SSA_NAME_DEF_STMT (def);
11877 295 : if (gimple_uid (stmt) == -1u)
11878 : /* If the stmt is not inside the region do not
11879 : use it as possible insertion point. */
11880 : ;
11881 285 : else if (!last_stmt)
11882 : last_stmt = stmt;
11883 261 : else if (vect_stmt_dominates_stmt_p (last_stmt, stmt))
11884 : last_stmt = stmt;
11885 159 : else if (vect_stmt_dominates_stmt_p (stmt, last_stmt))
11886 : ;
11887 : else
11888 0 : gcc_unreachable ();
11889 : }
11890 : }
11891 : else
11892 : {
11893 : /* For externals we have to look at all defs since their
11894 : insertion place is decided per vector. But beware
11895 : of pre-existing vectors where we need to make sure
11896 : we do not insert before the region boundary. */
11897 66656 : if (SLP_TREE_SCALAR_OPS (child).is_empty ()
11898 650 : && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
11899 : seen_vector_def = true;
11900 : else
11901 : {
11902 : unsigned j;
11903 : tree vdef;
11904 528822 : FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
11905 94563 : if (TREE_CODE (vdef) == SSA_NAME
11906 94563 : && !SSA_NAME_IS_DEFAULT_DEF (vdef))
11907 : {
11908 19780 : gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
11909 19780 : if (!last_stmt)
11910 : last_stmt = vstmt;
11911 11005 : else if (vect_stmt_dominates_stmt_p (last_stmt, vstmt))
11912 : last_stmt = vstmt;
11913 8738 : else if (vect_stmt_dominates_stmt_p (vstmt, last_stmt))
11914 : ;
11915 : else
11916 0 : gcc_unreachable ();
11917 : }
11918 : }
11919 : }
11920 : /* This can happen when all children are pre-existing vectors or
11921 : constants. */
11922 206498 : if (!last_stmt)
11923 1723 : last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
11924 1723 : if (!last_stmt)
11925 : {
11926 0 : gcc_assert (seen_vector_def);
11927 0 : si = gsi_after_labels (vinfo->bbs[0]);
11928 : }
11929 206498 : else if (is_ctrl_altering_stmt (last_stmt))
11930 : {
11931 : /* We split regions to vectorize at control altering stmts
11932 : with a definition so this must be an external which
11933 : we can insert at the start of the region. */
11934 0 : si = gsi_after_labels (vinfo->bbs[0]);
11935 : }
11936 206498 : else if (is_a <bb_vec_info> (vinfo)
11937 18190 : && !SLP_TREE_PERMUTE_P (node)
11938 16704 : && gimple_bb (last_stmt) != gimple_bb (stmt_info->stmt)
11939 207883 : && gimple_could_trap_p (stmt_info->stmt))
11940 : {
11941 : /* We've constrained possibly trapping operations to all come
11942 : from the same basic-block, if vectorized defs would allow earlier
11943 : scheduling still force vectorized stmts to the original block.
11944 : This is only necessary for BB vectorization since for loop vect
11945 : all operations are in a single BB and scalar stmt based
11946 : placement doesn't play well with epilogue vectorization. */
11947 53 : gcc_assert (dominated_by_p (CDI_DOMINATORS,
11948 : gimple_bb (stmt_info->stmt),
11949 : gimple_bb (last_stmt)));
11950 53 : si = gsi_after_labels (gimple_bb (stmt_info->stmt));
11951 : }
11952 206445 : else if (is_a <gphi *> (last_stmt))
11953 14439 : si = gsi_after_labels (gimple_bb (last_stmt));
11954 : else
11955 : {
11956 192006 : si = gsi_for_stmt (last_stmt);
11957 192006 : gsi_next (&si);
11958 :
11959 192006 : if (auto loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
11960 : {
11961 : /* Avoid scheduling stmts to random places in the CFG, any
11962 : stmt dominance check we performed is possibly wrong as UIDs
11963 : are not initialized for all of the function for loop
11964 : vectorization. Instead append to the loop preheader. */
11965 174085 : if ((LOOP_VINFO_LOOP (loop_vinfo)->header
11966 174085 : != gimple_bb (last_stmt))
11967 177302 : && dominated_by_p (CDI_DOMINATORS,
11968 : LOOP_VINFO_LOOP (loop_vinfo)->header,
11969 3217 : gimple_bb (last_stmt)))
11970 1406 : si = gsi_end_bb (loop_preheader_edge
11971 703 : (LOOP_VINFO_LOOP (loop_vinfo))->src);
11972 : /* Avoid scheduling internal defs outside of the loop when
11973 : we might have only implicitly tracked loop mask/len defs. */
11974 74 : if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
11975 174085 : || LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
11976 : {
11977 74 : gimple_stmt_iterator si2
11978 74 : = gsi_after_labels (LOOP_VINFO_LOOP (loop_vinfo)->header);
11979 74 : if ((gsi_end_p (si2)
11980 0 : && (LOOP_VINFO_LOOP (loop_vinfo)->header
11981 0 : != gimple_bb (last_stmt))
11982 0 : && dominated_by_p (CDI_DOMINATORS,
11983 : LOOP_VINFO_LOOP (loop_vinfo)->header,
11984 0 : gimple_bb (last_stmt)))
11985 74 : || (!gsi_end_p (si2)
11986 74 : && last_stmt != *si2
11987 72 : && vect_stmt_dominates_stmt_p (last_stmt, *si2)))
11988 3 : si = si2;
11989 : }
11990 : }
11991 : }
11992 : }
11993 :
11994 973573 : if (dump_enabled_p ())
11995 : {
11996 71489 : if (stmt_info)
11997 71436 : dump_printf_loc (MSG_NOTE, vect_location,
11998 : "------>vectorizing SLP node starting from: %G",
11999 : stmt_info->stmt);
12000 : else
12001 : {
12002 53 : dump_printf_loc (MSG_NOTE, vect_location,
12003 : "------>vectorizing SLP node:\n");
12004 53 : vect_print_slp_tree (MSG_NOTE, vect_location, node);
12005 : }
12006 : }
12007 973573 : vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
12008 : }
12009 :
12010 : /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
12011 : For loop vectorization this is done in vectorizable_call, but for SLP
12012 : it needs to be deferred until end of vect_schedule_slp, because multiple
12013 : SLP instances may refer to the same scalar stmt. */
12014 :
12015 : static void
12016 598713 : vect_remove_slp_scalar_calls (vec_info *vinfo,
12017 : slp_tree node, hash_set<slp_tree> &visited)
12018 : {
12019 598713 : gimple *new_stmt;
12020 598713 : gimple_stmt_iterator gsi;
12021 598713 : int i;
12022 598713 : slp_tree child;
12023 598713 : tree lhs;
12024 598713 : stmt_vec_info stmt_info;
12025 :
12026 598713 : if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
12027 187549 : return;
12028 :
12029 454485 : if (visited.add (node))
12030 : return;
12031 :
12032 920222 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
12033 509058 : vect_remove_slp_scalar_calls (vinfo, child, visited);
12034 :
12035 1301816 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
12036 : {
12037 483620 : if (!stmt_info)
12038 3974 : continue;
12039 479646 : stmt_info = vect_orig_stmt (stmt_info);
12040 479646 : gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
12041 5239 : if (!stmt || gimple_bb (stmt) == NULL)
12042 474453 : continue;
12043 5193 : lhs = gimple_call_lhs (stmt);
12044 5193 : if (lhs)
12045 4585 : new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
12046 : else
12047 608 : new_stmt = gimple_build_nop ();
12048 5193 : unlink_stmt_vdef (stmt_info->stmt);
12049 5193 : gsi = gsi_for_stmt (stmt);
12050 5193 : vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
12051 5193 : if (lhs)
12052 4585 : SSA_NAME_DEF_STMT (lhs) = new_stmt;
12053 : }
12054 : }
12055 :
12056 : static void
12057 89655 : vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
12058 : {
12059 89655 : hash_set<slp_tree> visited;
12060 89655 : vect_remove_slp_scalar_calls (vinfo, node, visited);
12061 89655 : }
12062 :
12063 : /* Vectorize the instance root. */
12064 :
12065 : void
12066 10935 : vectorize_slp_instance_root_stmt (vec_info *vinfo, slp_tree node, slp_instance instance)
12067 : {
12068 10935 : gassign *rstmt = NULL;
12069 :
12070 10935 : if (instance->kind == slp_inst_kind_ctor)
12071 : {
12072 5236 : if (SLP_TREE_VEC_DEFS (node).length () == 1)
12073 : {
12074 5199 : tree vect_lhs = SLP_TREE_VEC_DEFS (node)[0];
12075 5199 : tree root_lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
12076 5199 : if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
12077 5199 : TREE_TYPE (vect_lhs)))
12078 0 : vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
12079 : vect_lhs);
12080 5199 : rstmt = gimple_build_assign (root_lhs, vect_lhs);
12081 : }
12082 : else
12083 : {
12084 37 : gcc_assert (SLP_TREE_VEC_DEFS (node).length () > 1);
12085 37 : tree child_def;
12086 37 : int j;
12087 37 : vec<constructor_elt, va_gc> *v;
12088 37 : vec_alloc (v, SLP_TREE_VEC_DEFS (node).length ());
12089 :
12090 : /* A CTOR can handle V16HI composition from VNx8HI so we
12091 : do not need to convert vector elements if the types
12092 : do not match. */
12093 111 : FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (node), j, child_def)
12094 74 : CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, child_def);
12095 37 : tree lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
12096 37 : tree rtype
12097 37 : = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmts[0]->stmt));
12098 37 : tree r_constructor = build_constructor (rtype, v);
12099 37 : rstmt = gimple_build_assign (lhs, r_constructor);
12100 : }
12101 : }
12102 5699 : else if (instance->kind == slp_inst_kind_bb_reduc)
12103 : {
12104 : /* Largely inspired by reduction chain epilogue handling in
12105 : vect_create_epilog_for_reduction. */
12106 4131 : vec<tree> vec_defs = vNULL;
12107 4131 : vect_get_slp_defs (node, &vec_defs);
12108 4131 : enum tree_code reduc_code
12109 4131 : = gimple_assign_rhs_code (instance->root_stmts[0]->stmt);
12110 : /* ??? We actually have to reflect signs somewhere. */
12111 4131 : if (reduc_code == MINUS_EXPR)
12112 0 : reduc_code = PLUS_EXPR;
12113 4131 : gimple_seq epilogue = NULL;
12114 : /* We may end up with more than one vector result, reduce them
12115 : to one vector. */
12116 4131 : tree vec_def = vec_defs[0];
12117 4131 : tree vectype = TREE_TYPE (vec_def);
12118 4131 : tree compute_vectype = vectype;
12119 4131 : bool pun_for_overflow_p = (ANY_INTEGRAL_TYPE_P (vectype)
12120 3932 : && TYPE_OVERFLOW_UNDEFINED (vectype)
12121 6896 : && operation_can_overflow (reduc_code));
12122 2622 : if (pun_for_overflow_p)
12123 : {
12124 2622 : compute_vectype = unsigned_type_for (vectype);
12125 2622 : vec_def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
12126 : compute_vectype, vec_def);
12127 : }
12128 6519 : for (unsigned i = 1; i < vec_defs.length (); ++i)
12129 : {
12130 2388 : tree def = vec_defs[i];
12131 2388 : if (pun_for_overflow_p)
12132 2285 : def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
12133 : compute_vectype, def);
12134 2388 : vec_def = gimple_build (&epilogue, reduc_code, compute_vectype,
12135 : vec_def, def);
12136 : }
12137 4131 : vec_defs.release ();
12138 : /* ??? Support other schemes than direct internal fn. */
12139 4131 : internal_fn reduc_fn;
12140 4131 : if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
12141 4131 : || reduc_fn == IFN_LAST)
12142 0 : gcc_unreachable ();
12143 4131 : tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
12144 4131 : TREE_TYPE (compute_vectype), vec_def);
12145 4131 : if (!SLP_INSTANCE_REMAIN_DEFS (instance).is_empty ())
12146 : {
12147 2565 : tree rem_def = NULL_TREE;
12148 11907 : for (auto def : SLP_INSTANCE_REMAIN_DEFS (instance))
12149 : {
12150 9342 : def = gimple_convert (&epilogue, TREE_TYPE (scalar_def), def);
12151 9342 : if (!rem_def)
12152 : rem_def = def;
12153 : else
12154 6777 : rem_def = gimple_build (&epilogue, reduc_code,
12155 6777 : TREE_TYPE (scalar_def),
12156 : rem_def, def);
12157 : }
12158 2565 : scalar_def = gimple_build (&epilogue, reduc_code,
12159 2565 : TREE_TYPE (scalar_def),
12160 : scalar_def, rem_def);
12161 : }
12162 4131 : scalar_def = gimple_convert (&epilogue,
12163 4131 : TREE_TYPE (vectype), scalar_def);
12164 4131 : gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
12165 4131 : gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT);
12166 4131 : gimple_assign_set_rhs_from_tree (&rgsi, scalar_def);
12167 4131 : update_stmt (gsi_stmt (rgsi));
12168 4131 : return;
12169 : }
12170 1568 : else if (instance->kind == slp_inst_kind_gcond)
12171 : {
12172 : /* Only support a single root for now as we can't codegen CFG yet and so we
12173 : can't support lane > 1 at this time. */
12174 1568 : gcc_assert (instance->root_stmts.length () == 1);
12175 1568 : auto root_stmt_info = instance->root_stmts[0];
12176 1568 : auto last_stmt = STMT_VINFO_STMT (vect_orig_stmt (root_stmt_info));
12177 1568 : gimple_stmt_iterator rgsi = gsi_for_stmt (last_stmt);
12178 1568 : gcc_assert (!SLP_TREE_VEC_DEFS (node).is_empty ());
12179 1568 : bool res = vectorizable_early_exit (as_a <loop_vec_info> (vinfo),
12180 : root_stmt_info, &rgsi, node, NULL);
12181 1568 : gcc_assert (res);
12182 1568 : return;
12183 : }
12184 : else
12185 0 : gcc_unreachable ();
12186 :
12187 5236 : gcc_assert (rstmt);
12188 :
12189 5236 : gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
12190 5236 : gsi_replace (&rgsi, rstmt, true);
12191 : }
12192 :
/* Per-node state of the SCC-collecting DFS walk done by
   vect_schedule_scc.  */

struct slp_scc_info
{
  /* True while the node is on the DFS stack, i.e. it has been visited
     but not scheduled yet.  */
  bool on_stack;
  /* The DFS number assigned when the node was first visited.  */
  int dfs;
  /* Starts out equal to DFS and is lowered to the minimum of the
     LOWLINK of children visited from this node and the DFS of children
     that are still on the stack.  A node whose LOWLINK still equals its
     DFS once its children are processed gets scheduled, alone or as
     the root of an SCC.  */
  int lowlink;
};
12199 :
12200 : /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs.

   NODE is the SLP node visited by this invocation; it must not have an
   entry in SCC_INFO yet.  SCC_INFO maps each visited node to its walk
   state, MAXDFS is the next DFS number to hand out and STACK holds the
   visited internal nodes that have not been scheduled yet.  Nodes are
   code generated via vect_schedule_slp_node: leaves immediately,
   internal nodes once the SCC they belong to is complete.  Within a
   multi-node SCC the cycle is broken at PHI nodes; the PHI arguments
   that come from internal-def children are filled in after the SCC
   has been scheduled.  */
12201 :
12202 : static void
12203 1472867 : vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
12204 : hash_map<slp_tree, slp_scc_info> &scc_info,
12205 : int &maxdfs, vec<slp_tree> &stack)
12206 : {
12207 1472867 : bool existed_p;
12208 1472867 : slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
12209 1472867 : gcc_assert (!existed_p); /* The visible callers only walk into nodes without an entry.  */
12210 1472867 : info->dfs = maxdfs;
12211 1472867 : info->lowlink = maxdfs;
12212 1472867 : maxdfs++;
12213 :
12214 : /* Leaf.  Anything but an internal def is scheduled right away and
   never enters STACK.  */
12215 1472867 : if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
12216 : {
12217 499294 : info->on_stack = false;
12218 499294 : vect_schedule_slp_node (vinfo, node, instance);
12219 1030021 : return;
12220 : }
12221 :
12222 973573 : info->on_stack = true;
12223 973573 : stack.safe_push (node);
12224 :
12225 973573 : unsigned i;
12226 973573 : slp_tree child;
12227 : /* DFS recurse.  NULL children are skipped and children that were
   visited before but are no longer on STACK do not affect the
   lowlink of NODE.  */
12228 2008299 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
12229 : {
12230 1034726 : if (!child)
12231 55074 : continue;
12232 979652 : slp_scc_info *child_info = scc_info.get (child);
12233 979652 : if (!child_info)
12234 : {
12235 889963 : vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
12236 : /* Recursion inserts new SCC_INFO entries which might have
   re-allocated the one of NODE, so look both entries up again.  */
12237 889963 : info = scc_info.get (node);
12238 889963 : child_info = scc_info.get (child);
12239 889963 : info->lowlink = MIN (info->lowlink, child_info->lowlink);
12240 : }
12241 89689 : else if (child_info->on_stack)
12242 25289 : info->lowlink = MIN (info->lowlink, child_info->dfs);
12243 : }
12244 973573 : if (info->lowlink != info->dfs)
12245 : return; /* NODE does not root an SCC; it stays on STACK until the root of its SCC schedules it.  */
12246 :
12247 942140 : auto_vec<slp_tree, 4> phis_to_fixup; /* PHI nodes scheduled below whose arguments still need filling in.  */
12248 :
12249 : /* Singleton.  NODE is the top of STACK, so nothing else belongs to
   its SCC.  */
12250 942140 : if (stack.last () == node)
12251 : {
12252 918526 : stack.pop ();
12253 918526 : info->on_stack = false;
12254 918526 : vect_schedule_slp_node (vinfo, node, instance);
12255 918526 : if (!SLP_TREE_PERMUTE_P (node)
12256 918526 : && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
12257 30458 : phis_to_fixup.quick_push (node);
12258 : }
12259 : else
12260 : {
12261 : /* SCC.  Its members are the STACK entries from NODE up to the
   top of STACK; find the index of NODE first.  */
12262 23614 : int last_idx = stack.length () - 1;
12263 55047 : while (stack[last_idx] != node)
12264 31433 : last_idx--;
12265 : /* We can break the cycle at PHIs who have at least one child
12266 : code generated. Then we could re-start the DFS walk until
12267 : all nodes in the SCC are covered (we might have new entries
12268 : for only back-reachable nodes). But it's simpler to just
12269 : iterate and schedule those that are ready. */
12270 23614 : unsigned todo = stack.length () - last_idx; /* Number of SCC members still to schedule.  */
12271 23953 : do
12272 : {
12273 104737 : for (int idx = stack.length () - 1; idx >= last_idx; --idx)
12274 : {
12275 56831 : slp_tree entry = stack[idx];
12276 56831 : if (!entry)
12277 956 : continue; /* Already scheduled in an earlier sweep, see the NULL store below.  */
12278 55875 : bool phi = (!SLP_TREE_PERMUTE_P (entry)
12279 55875 : && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
12280 55875 : bool ready = !phi; /* A non-PHI is ready unless one of its children is
   still on STACK.  A PHI becomes ready once it has a NULL child or a
   child that is no longer on STACK.  Only PHIs may have NULL children
   here.  */
12281 141383 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
12282 110366 : if (!child)
12283 : {
12284 22736 : gcc_assert (phi);
12285 : ready = true;
12286 : break;
12287 : }
12288 87630 : else if (scc_info.get (child)->on_stack)
12289 : {
12290 23823 : if (!phi)
12291 : {
12292 : ready = false;
12293 : break;
12294 : }
12295 : }
12296 : else
12297 : {
12298 63807 : if (phi)
12299 : {
12300 : ready = true;
12301 : break;
12302 : }
12303 : }
12304 33139 : if (ready)
12305 : {
12306 55047 : vect_schedule_slp_node (vinfo, entry, instance);
12307 55047 : scc_info.get (entry)->on_stack = false;
12308 55047 : stack[idx] = NULL; /* Mark the slot as done; the slots are removed by the truncate below.  */
12309 55047 : todo--;
12310 55047 : if (phi)
12311 24060 : phis_to_fixup.safe_push (entry);
12312 : }
12313 : }
12314 : }
12315 23953 : while (todo != 0);
12316 :
12317 : /* Pop the SCC. */
12318 23614 : stack.truncate (last_idx);
12319 : }
12320 :
12321 : /* Now fixup the backedge def of the vectorized PHIs in this SCC.
   For each incoming edge of the scalar PHI the child at the edge's
   destination index is used; only internal-def children are handled
   here.  */
12322 : slp_tree phi_node;
12323 1938798 : FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
12324 : {
12325 54518 : gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
12326 54518 : edge_iterator ei;
12327 54518 : edge e;
12328 172209 : FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
12329 : {
12330 117691 : unsigned dest_idx = e->dest_idx;
12331 117691 : child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
12332 117691 : if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
12333 66092 : continue;
12334 51599 : unsigned n = SLP_TREE_VEC_DEFS (phi_node).length ();
12335 : /* Simply fill all args: the PHI defining vector def I of PHI_NODE
   gets vector def I of CHILD as argument on edge E.  */
12336 51599 : if (STMT_VINFO_DEF_TYPE (SLP_TREE_REPRESENTATIVE (phi_node))
12337 : != vect_first_order_recurrence)
12338 110856 : for (unsigned i = 0; i < n; ++i)
12339 : {
12340 59300 : tree phidef = SLP_TREE_VEC_DEFS (phi_node)[i];
12341 59300 : gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef)); /* NOTE(review): this
   PHI shadows the scalar PHI declared above, so the location passed
   below is read from the vector PHI being filled, while the
   recurrence case reads it from the scalar PHI -- confirm this is
   intended.  */
12342 59300 : add_phi_arg (phi, vect_get_slp_vect_def (child, i),
12343 : e, gimple_phi_arg_location (phi, dest_idx));
12344 : }
12345 : else
12346 : {
12347 : /* Unless it is a first order recurrence which needs
12348 : args filled in for both the PHI node and the permutes.
   The PHI defining the first operand of the first permute gets the
   last vector def of CHILD as argument on edge E.  Permute I gets
   vector def I - 1 of CHILD as first operand (for I > 0) and vector
   def I of CHILD as second operand.  */
12349 43 : gimple *perm
12350 43 : = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[0]);
12351 43 : gimple *rphi = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (perm));
12352 43 : add_phi_arg (as_a <gphi *> (rphi),
12353 : vect_get_slp_vect_def (child, n - 1),
12354 : e, gimple_phi_arg_location (phi, dest_idx));
12355 123 : for (unsigned i = 0; i < n; ++i)
12356 : {
12357 80 : gimple *perm
12358 80 : = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[i]);
12359 80 : if (i > 0)
12360 37 : gimple_assign_set_rhs1 (perm,
12361 : vect_get_slp_vect_def (child, i - 1));
12362 80 : gimple_assign_set_rhs2 (perm,
12363 : vect_get_slp_vect_def (child, i));
12364 80 : update_stmt (perm);
12365 : }
12366 : }
12367 : }
12368 : }
12369 942140 : }
12370 :
12371 : /* Generate vector code for SLP_INSTANCES in the loop/basic block.

   VINFO is the vectorization info passed on to the scheduling
   helpers.  In a first walk each instance tree is scheduled via
   vect_schedule_scc and the root stmts of the instance, if any, are
   vectorized.  In a second walk scalar calls are removed (loop
   vectorization only) and the scalar stmts of root nodes that are
   stores are removed.  */
12372 :
12373 : void
12374 543782 : vect_schedule_slp (vec_info *vinfo, const vec<slp_instance> &slp_instances)
12375 : {
12376 543782 : slp_instance instance;
12377 543782 : unsigned int i;
12378 :
12379 543782 : hash_map<slp_tree, slp_scc_info> scc_info; /* Walk state shared by all instances, so nodes already visited via an earlier instance are not scheduled again.  */
12380 543782 : int maxdfs = 0;
12381 1126793 : FOR_EACH_VEC_ELT (slp_instances, i, instance)
12382 : {
12383 583011 : slp_tree node = SLP_INSTANCE_TREE (instance);
12384 583011 : if (dump_enabled_p ())
12385 : {
12386 16034 : dump_printf_loc (MSG_NOTE, vect_location,
12387 : "Vectorizing SLP tree:\n");
12388 : /* ??? Dump all?  Only the first root stmt is printed.  */
12389 16034 : if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
12390 465 : dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
12391 465 : SLP_INSTANCE_ROOT_STMTS (instance)[0]->stmt);
12392 16034 : vect_print_slp_graph (MSG_NOTE, vect_location,
12393 : SLP_INSTANCE_TREE (instance));
12394 : }
12395 : /* Schedule the tree of INSTANCE, scheduling SCCs in a way to
12396 : have a PHI be the node breaking the cycle.  The walk stack is
   private to each instance; the tree is skipped when its root
   was already visited.  */
12397 583011 : auto_vec<slp_tree> stack;
12398 583011 : if (!scc_info.get (node))
12399 582904 : vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
12400 :
12401 583011 : if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
12402 10935 : vectorize_slp_instance_root_stmt (vinfo, node, instance);
12403 :
12404 583011 : if (dump_enabled_p ())
12405 16034 : dump_printf_loc (MSG_NOTE, vect_location,
12406 : "vectorizing stmts using SLP.\n");
12407 583011 : }
12408 :
12409 1670575 : FOR_EACH_VEC_ELT (slp_instances, i, instance)
12410 : {
12411 583011 : slp_tree root = SLP_INSTANCE_TREE (instance);
12412 583011 : stmt_vec_info store_info;
12413 583011 : unsigned int j;
12414 :
12415 : /* Remove scalar call stmts. Do not do this for basic-block
12416 : vectorization as not all uses may be vectorized.
12417 : ??? Why should this be necessary? DCE should be able to
12418 : remove the stmts itself.
12419 : ??? For BB vectorization we can as well remove scalar
12420 : stmts starting from the SLP tree root if they have no
12421 : uses. */
12422 583011 : if (is_a <loop_vec_info> (vinfo))
12423 89655 : vect_remove_slp_scalar_calls (vinfo, root);
12424 :
12425 : /* Remove the original scalar stmts of vectorized stores.  Stop at
   the first scalar stmt of ROOT that is missing, has no data
   reference or whose data reference is not a write.  */
12426 2603408 : for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
12427 : {
12428 1473500 : if (!store_info
12429 1473486 : || !STMT_VINFO_DATA_REF (store_info)
12430 1445992 : || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
12431 : break;
12432 :
12433 1437386 : store_info = vect_orig_stmt (store_info);
12434 : /* Free the attached stmt_vec_info and remove the stmt. */
12435 1437386 : vinfo->remove_stmt (store_info);
12436 :
12437 : /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it
12438 : to not crash in vect_free_slp_tree later. */
12439 1437386 : if (SLP_TREE_REPRESENTATIVE (root) == store_info)
12440 546566 : SLP_TREE_REPRESENTATIVE (root) = NULL;
12441 : }
12442 : }
12443 543782 : }
|