Line data Source code
1 : /* SLP - Basic Block Vectorization
2 : Copyright (C) 2007-2026 Free Software Foundation, Inc.
3 : Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 : and Ira Rosen <irar@il.ibm.com>
5 :
6 : This file is part of GCC.
7 :
8 : GCC is free software; you can redistribute it and/or modify it under
9 : the terms of the GNU General Public License as published by the Free
10 : Software Foundation; either version 3, or (at your option) any later
11 : version.
12 :
13 : GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 : WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 : FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 : for more details.
17 :
18 : You should have received a copy of the GNU General Public License
19 : along with GCC; see the file COPYING3. If not see
20 : <http://www.gnu.org/licenses/>. */
21 :
22 : #include "config.h"
23 : #define INCLUDE_ALGORITHM
24 : #include "system.h"
25 : #include "coretypes.h"
26 : #include "backend.h"
27 : #include "target.h"
28 : #include "rtl.h"
29 : #include "tree.h"
30 : #include "gimple.h"
31 : #include "tree-pass.h"
32 : #include "ssa.h"
33 : #include "optabs-tree.h"
34 : #include "insn-config.h"
35 : #include "recog.h" /* FIXME: for insn_data */
36 : #include "fold-const.h"
37 : #include "stor-layout.h"
38 : #include "gimple-iterator.h"
39 : #include "cfgloop.h"
40 : #include "tree-vectorizer.h"
41 : #include "langhooks.h"
42 : #include "gimple-walk.h"
43 : #include "dbgcnt.h"
44 : #include "tree-vector-builder.h"
45 : #include "vec-perm-indices.h"
46 : #include "gimple-fold.h"
47 : #include "internal-fn.h"
48 : #include "dump-context.h"
49 : #include "cfganal.h"
50 : #include "tree-eh.h"
51 : #include "tree-cfg.h"
52 : #include "alloc-pool.h"
53 : #include "sreal.h"
54 : #include "predict.h"
55 :
/* Accessor for the first element of a reduction group; checks that the
   stmt has no data reference, i.e. is not part of a load/store group.  */
#define REDUC_GROUP_FIRST_ELEMENT(S) \
  (gcc_checking_assert (!(S)->dr_aux.dr), (S)->first_element)

/* Forward declarations for routines defined later in this file.  */
static bool vect_transform_slp_perm_load_1 (vec_info *, slp_tree,
					    load_permutation_t &,
					    const vec<tree> &,
					    gimple_stmt_iterator *,
					    poly_uint64, bool, bool,
					    unsigned *,
					    unsigned * = nullptr,
					    bool = false);
static int vectorizable_slp_permutation_1 (vec_info *, gimple_stmt_iterator *,
					   slp_tree, lane_permutation_t &,
					   vec<slp_tree> &, bool);
static void vect_print_slp_tree (dump_flags_t, dump_location_t, slp_tree);
static bool vect_slp_can_convert_to_external (const vec<stmt_vec_info> &);

/* Allocation pool backing all SLP nodes, and the head of an intrusive
   doubly-linked list of all live nodes (see _slp_tree's ctor/dtor);
   vect_slp_fini uses the list to reclaim leftover nodes.  */
static object_allocator<_slp_tree> *slp_tree_pool;
static slp_tree slp_first_node;
75 :
/* Create the allocation pool that backs all SLP node allocations.
   Must be paired with a later call to vect_slp_fini.  */

void
vect_slp_init (void)
{
  slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
}
81 :
/* Delete all still-live SLP nodes and destroy the allocation pool.
   Deleting a node unlinks it from the slp_first_node list, so the
   loop terminates once the list is empty.  */

void
vect_slp_fini (void)
{
  while (slp_first_node)
    delete slp_first_node;
  delete slp_tree_pool;
  slp_tree_pool = NULL;
}
90 :
/* Allocate storage for a single _slp_tree from the SLP node pool.  */

void *
_slp_tree::operator new (size_t n)
{
  gcc_assert (n == sizeof (_slp_tree));
  return slp_tree_pool->allocate_raw ();
}
97 :
/* Return a _slp_tree's storage to the SLP node pool.  */

void
_slp_tree::operator delete (void *node, size_t n)
{
  gcc_assert (n == sizeof (_slp_tree));
  slp_tree_pool->remove_raw (node);
}
104 :
105 :
/* Initialize a SLP node: link it at the head of the global list of live
   nodes and give every member its neutral initial value.  */

_slp_tree::_slp_tree ()
{
  /* Link the node at the head of the intrusive list of all live nodes
     so vect_slp_fini can reclaim leftovers.  */
  this->prev_node = NULL;
  if (slp_first_node)
    slp_first_node->prev_node = this;
  this->next_node = slp_first_node;
  slp_first_node = this;
  /* All vectors start out empty; they are populated during SLP build.  */
  SLP_TREE_SCALAR_STMTS (this) = vNULL;
  SLP_TREE_SCALAR_OPS (this) = vNULL;
  SLP_TREE_VEC_DEFS (this) = vNULL;
  SLP_TREE_CHILDREN (this) = vNULL;
  SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
  SLP_TREE_LANE_PERMUTATION (this) = vNULL;
  SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
  SLP_TREE_CODE (this) = ERROR_MARK;
  SLP_TREE_GS_SCALE (this) = 0;
  SLP_TREE_GS_BASE (this) = NULL_TREE;
  this->ldst_lanes = false;
  this->avoid_stlf_fail = false;
  SLP_TREE_VECTYPE (this) = NULL_TREE;
  SLP_TREE_REPRESENTATIVE (this) = NULL;
  /* -1 marks "no cycle" / "no reduction operand".  */
  this->cycle_info.id = -1;
  this->cycle_info.reduc_idx = -1;
  /* A fresh node starts with a single reference, owned by the creator.  */
  SLP_TREE_REF_COUNT (this) = 1;
  this->failed = NULL;
  this->max_nunits = 1;
  this->lanes = 0;
  SLP_TREE_TYPE (this) = undef_vec_info_type;
  this->data = NULL;
}
138 :
/* Tear down a SLP node: unlink it from the global live-node list and
   release all vectors and auxiliary data it owns.  */

_slp_tree::~_slp_tree ()
{
  /* Unlink from the doubly-linked list of all nodes, updating the list
     head when this node is the first element.  */
  if (this->prev_node)
    this->prev_node->next_node = this->next_node;
  else
    slp_first_node = this->next_node;
  if (this->next_node)
    this->next_node->prev_node = this->prev_node;
  SLP_TREE_CHILDREN (this).release ();
  SLP_TREE_SCALAR_STMTS (this).release ();
  SLP_TREE_SCALAR_OPS (this).release ();
  SLP_TREE_VEC_DEFS (this).release ();
  SLP_TREE_LOAD_PERMUTATION (this).release ();
  SLP_TREE_LANE_PERMUTATION (this).release ();
  /* The failure reason, if any, was malloc-allocated.  */
  if (this->failed)
    free (failed);
  if (this->data)
    delete this->data;
}
160 :
/* Push the single SSA definition in DEF to the vector of vector defs.  */

void
_slp_tree::push_vec_def (gimple *def)
{
  if (gphi *phi = dyn_cast <gphi *> (def))
    /* PHIs define exactly one SSA name, the PHI result.  */
    vec_defs.quick_push (gimple_phi_result (phi));
  else
    {
      /* Otherwise pick out the (asserted single) SSA def of DEF.  */
      def_operand_p defop = single_ssa_def_operand (def, SSA_OP_ALL_DEFS);
      vec_defs.quick_push (get_def_from_ptr (defop));
    }
}
174 :
/* Recursively free the memory allocated for the SLP tree rooted at NODE.
   Nodes are reference counted; only the last reference triggers the
   actual teardown.  */

void
vect_free_slp_tree (slp_tree node)
{
  int i;
  slp_tree child;

  /* Still referenced from elsewhere in the SLP graph - nothing to do.  */
  if (--SLP_TREE_REF_COUNT (node) != 0)
    return;

  /* Recurse into children, skipping unset (NULL) child slots.  */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (child)
      vect_free_slp_tree (child);

  /* If the node defines any SLP only patterns then those patterns are no
     longer valid and should be removed.  */
  stmt_vec_info rep_stmt_info = SLP_TREE_REPRESENTATIVE (node);
  if (rep_stmt_info && STMT_VINFO_SLP_VECT_ONLY_PATTERN (rep_stmt_info))
    {
      stmt_vec_info stmt_info = vect_orig_stmt (rep_stmt_info);
      STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
      STMT_SLP_TYPE (stmt_info) = STMT_SLP_TYPE (rep_stmt_info);
    }

  delete node;
}
202 :
203 : /* Return a location suitable for dumpings related to the SLP instance. */
204 :
205 : dump_user_location_t
206 3373462 : _slp_instance::location () const
207 : {
208 3373462 : if (!root_stmts.is_empty ())
209 314978 : return root_stmts[0]->stmt;
210 : else
211 3058484 : return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
212 : }
213 :
214 :
215 : /* Free the memory allocated for the SLP instance. */
216 :
217 : void
218 1451065 : vect_free_slp_instance (slp_instance instance)
219 : {
220 1451065 : vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
221 1451065 : SLP_INSTANCE_LOADS (instance).release ();
222 1451065 : SLP_INSTANCE_ROOT_STMTS (instance).release ();
223 1451065 : SLP_INSTANCE_REMAIN_DEFS (instance).release ();
224 1451065 : instance->subgraph_entries.release ();
225 1451065 : instance->cost_vec.release ();
226 1451065 : free (instance);
227 1451065 : }
228 :
229 :
230 : /* Create an SLP node for SCALAR_STMTS. */
231 :
232 : slp_tree
233 86729 : vect_create_new_slp_node (unsigned nops, tree_code code)
234 : {
235 86729 : slp_tree node = new _slp_tree;
236 86729 : SLP_TREE_SCALAR_STMTS (node) = vNULL;
237 86729 : SLP_TREE_CHILDREN (node).create (nops);
238 86729 : SLP_TREE_DEF_TYPE (node) = vect_internal_def;
239 86729 : SLP_TREE_CODE (node) = code;
240 86729 : return node;
241 : }
242 : /* Create an SLP node for SCALAR_STMTS. */
243 :
244 : static slp_tree
245 3339883 : vect_create_new_slp_node (slp_tree node,
246 : vec<stmt_vec_info> scalar_stmts, unsigned nops)
247 : {
248 3339883 : SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
249 3339883 : SLP_TREE_CHILDREN (node).create (nops);
250 3339883 : SLP_TREE_DEF_TYPE (node) = vect_internal_def;
251 3339883 : SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
252 3339883 : SLP_TREE_LANES (node) = scalar_stmts.length ();
253 3339883 : return node;
254 : }
255 :
256 : /* Create an SLP node for SCALAR_STMTS. */
257 :
258 : static slp_tree
259 6276 : vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
260 : {
261 6276 : return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
262 : }
263 :
264 : /* Create an SLP node for OPS. */
265 :
266 : static slp_tree
267 1752640 : vect_create_new_slp_node (slp_tree node, vec<tree> ops)
268 : {
269 1752640 : SLP_TREE_SCALAR_OPS (node) = ops;
270 1752640 : SLP_TREE_DEF_TYPE (node) = vect_external_def;
271 0 : SLP_TREE_LANES (node) = ops.length ();
272 1752640 : return node;
273 : }
274 :
275 : /* Create an SLP node for OPS. */
276 :
277 : static slp_tree
278 1752640 : vect_create_new_slp_node (vec<tree> ops)
279 : {
280 1752640 : return vect_create_new_slp_node (new _slp_tree, ops);
281 : }
282 :
283 :
/* This structure is used in creation of an SLP tree.  Each instance
   corresponds to the same operand in a group of scalar stmts in an SLP
   node.  */
typedef struct _slp_oprnd_info
{
  /* Def-stmts for the operands, one per lane (NULL for external defs).  */
  vec<stmt_vec_info> def_stmts;
  /* Operands, one per lane.  */
  vec<tree> ops;
  /* Information about the first statement, its vector def-type, type, the
     operand itself in case it's constant, and an indication if it's a pattern
     stmt and gather/scatter info.  */
  tree first_op_type;
  enum vect_def_type first_dt;
  /* Whether any def-stmt is a pattern statement.  */
  bool any_pattern;
  /* Whether the first stmt's operand is a gather/scatter offset, and if
     so the gather/scatter info recorded for it.  */
  bool first_gs_p;
  gather_scatter_info first_gs_info;
} *slp_oprnd_info;
302 :
303 :
304 : /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
305 : operand. */
306 : static vec<slp_oprnd_info>
307 2981204 : vect_create_oprnd_info (int nops, int group_size)
308 : {
309 2981204 : int i;
310 2981204 : slp_oprnd_info oprnd_info;
311 2981204 : vec<slp_oprnd_info> oprnds_info;
312 :
313 2981204 : oprnds_info.create (nops);
314 10653248 : for (i = 0; i < nops; i++)
315 : {
316 4690840 : oprnd_info = XNEW (struct _slp_oprnd_info);
317 4690840 : oprnd_info->def_stmts.create (group_size);
318 4690840 : oprnd_info->ops.create (group_size);
319 4690840 : oprnd_info->first_dt = vect_uninitialized_def;
320 4690840 : oprnd_info->first_op_type = NULL_TREE;
321 4690840 : oprnd_info->any_pattern = false;
322 4690840 : oprnd_info->first_gs_p = false;
323 4690840 : oprnds_info.quick_push (oprnd_info);
324 : }
325 :
326 2981204 : return oprnds_info;
327 : }
328 :
329 :
330 : /* Free operands info. */
331 :
332 : static void
333 2981204 : vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
334 : {
335 2981204 : int i;
336 2981204 : slp_oprnd_info oprnd_info;
337 :
338 7672044 : FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
339 : {
340 4690840 : oprnd_info->def_stmts.release ();
341 4690840 : oprnd_info->ops.release ();
342 4690840 : XDELETE (oprnd_info);
343 : }
344 :
345 2981204 : oprnds_info.release ();
346 2981204 : }
347 :
348 : /* Return the execution frequency of NODE (so that a higher value indicates
349 : a "more important" node when optimizing for speed). */
350 :
351 : static sreal
352 3139585 : vect_slp_node_weight (slp_tree node)
353 : {
354 3139585 : stmt_vec_info stmt_info = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (node));
355 3139585 : basic_block bb = gimple_bb (stmt_info->stmt);
356 3139585 : return bb->count.to_sreal_scale (ENTRY_BLOCK_PTR_FOR_FN (cfun)->count);
357 : }
358 :
359 : /* Return true if STMTS contains a pattern statement. */
360 :
361 : static bool
362 22303 : vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
363 : {
364 22303 : stmt_vec_info stmt_info;
365 22303 : unsigned int i;
366 72067 : FOR_EACH_VEC_ELT (stmts, i, stmt_info)
367 52069 : if (stmt_info && is_pattern_stmt_p (stmt_info))
368 : return true;
369 : return false;
370 : }
371 :
372 : /* Return true when all lanes in the external or constant NODE have
373 : the same value. */
374 :
375 : static bool
376 591195 : vect_slp_tree_uniform_p (slp_tree node)
377 : {
378 591195 : gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
379 : || SLP_TREE_DEF_TYPE (node) == vect_external_def);
380 :
381 : /* Pre-exsting vectors. */
382 1041084 : if (SLP_TREE_SCALAR_OPS (node).is_empty ())
383 : return false;
384 :
385 : unsigned i;
386 : tree op, first = NULL_TREE;
387 1353495 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
388 1212189 : if (!first)
389 : first = op;
390 620994 : else if (!operand_equal_p (first, op, 0))
391 : return false;
392 :
393 : return true;
394 : }
395 :
396 : /* Find the place of the data-ref in STMT_INFO in the interleaving chain
397 : that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
398 : of the chain. */
399 :
400 : int
401 660328 : vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
402 : stmt_vec_info first_stmt_info)
403 : {
404 660328 : stmt_vec_info next_stmt_info = first_stmt_info;
405 660328 : int result = 0;
406 :
407 660328 : if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
408 : return -1;
409 :
410 1660588 : do
411 : {
412 1660588 : if (next_stmt_info == stmt_info)
413 : return result;
414 1000260 : next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
415 1000260 : if (next_stmt_info)
416 1000260 : result += DR_GROUP_GAP (next_stmt_info);
417 : }
418 1000260 : while (next_stmt_info);
419 :
420 : return -1;
421 : }
422 :
/* Check whether it is possible to load COUNT elements of type ELT_TYPE
   using the method implemented by duplicate_and_interleave.  Return true
   if so, returning the number of intermediate vectors in *NVECTORS_OUT
   (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
   (if nonnull).  */

bool
can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
				tree elt_type, unsigned int *nvectors_out,
				tree *vector_type_out,
				tree *permutes)
{
  tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
  if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
    return false;

  machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
  poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
  unsigned int nvectors = 1;
  /* Iterate with ever larger integer element types (and thus ever more
     intermediate vectors) until a suitable pair of permutes is found or
     the element size can no longer be doubled.  */
  for (;;)
    {
      scalar_int_mode int_mode;
      poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
      if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
	{
	  /* Get the natural vector type for this SLP group size.  */
	  tree int_type = build_nonstandard_integer_type
	    (GET_MODE_BITSIZE (int_mode), 1);
	  tree vector_type
	    = get_vectype_for_scalar_type (vinfo, int_type, count);
	  poly_int64 half_nelts;
	  if (vector_type
	      && VECTOR_MODE_P (TYPE_MODE (vector_type))
	      && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
			   GET_MODE_SIZE (base_vector_mode))
	      && multiple_p (GET_MODE_NUNITS (TYPE_MODE (vector_type)),
			     2, &half_nelts))
	    {
	      /* Try fusing consecutive sequences of COUNT / NVECTORS elements
		 together into elements of type INT_TYPE and using the result
		 to build NVECTORS vectors.  */
	      poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
	      /* Encode the two interleave selectors as series with two
	         vectors per pattern and three elements per pattern.  */
	      vec_perm_builder sel1 (nelts, 2, 3);
	      vec_perm_builder sel2 (nelts, 2, 3);

	      for (unsigned int i = 0; i < 3; ++i)
		{
		  sel1.quick_push (i);
		  sel1.quick_push (i + nelts);
		  sel2.quick_push (half_nelts + i);
		  sel2.quick_push (half_nelts + i + nelts);
		}
	      vec_perm_indices indices1 (sel1, 2, nelts);
	      vec_perm_indices indices2 (sel2, 2, nelts);
	      machine_mode vmode = TYPE_MODE (vector_type);
	      if (can_vec_perm_const_p (vmode, vmode, indices1)
		  && can_vec_perm_const_p (vmode, vmode, indices2))
		{
		  if (nvectors_out)
		    *nvectors_out = nvectors;
		  if (vector_type_out)
		    *vector_type_out = vector_type;
		  if (permutes)
		    {
		      permutes[0] = vect_gen_perm_mask_checked (vector_type,
								indices1);
		      permutes[1] = vect_gen_perm_mask_checked (vector_type,
								indices2);
		    }
		  return true;
		}
	    }
	}
      /* Halve the element size and double the number of intermediate
	 vectors for the next attempt.  */
      if (!multiple_p (elt_bytes, 2, &elt_bytes))
	return false;
      nvectors *= 2;
      /* We need to be able to fuse COUNT / NVECTORS elements together.  */
      if (!multiple_p (count, nvectors))
	return false;
    }
}
504 :
505 : /* Return true if DTA and DTB match. */
506 :
507 : static bool
508 16788186 : vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
509 : {
510 16788186 : return (dta == dtb
511 341072 : || ((dta == vect_external_def || dta == vect_constant_def)
512 213062 : && (dtb == vect_external_def || dtb == vect_constant_def)));
513 : }
514 :
/* Special operand-map index denoting the offset of a gather/scatter as
   analyzed by vect_check_gather_scatter (see vect_get_operand_map).  */
#define GATHER_SCATTER_OFFSET (-3)

/* Operand maps used by vect_get_operand_map for statements whose gimple
   arguments do not map one-to-one to SLP children.  Each map starts with
   the number of child nodes, followed by the argument index for each
   child.  */
static const int no_arg_map[] = { 0 };
static const int arg0_map[] = { 1, 0 };
static const int arg2_map[] = { 1, 2 };
static const int arg2_arg3_map[] = { 2, 2, 3 };
static const int arg2_arg4_map[] = { 2, 2, 4 };
static const int arg2_arg5_arg6_map[] = { 3, 2, 5, 6 };
static const int arg2_arg4_arg5_map[] = { 3, 2, 4, 5 };
static const int arg3_arg2_map[] = { 2, 3, 2 };
static const int op1_op0_map[] = { 2, 1, 0 };
static const int off_map[] = { 1, GATHER_SCATTER_OFFSET };
static const int off_op0_map[] = { 2, GATHER_SCATTER_OFFSET, 0 };
static const int off_arg2_arg3_map[] = { 3, GATHER_SCATTER_OFFSET, 2, 3 };
static const int off_arg3_arg2_map[] = { 3, GATHER_SCATTER_OFFSET, 3, 2 };
/* Maps for IFN_MASK_CALL with 2 to 7 call arguments: all arguments but
   the called function (argument 0) become children.  */
static const int mask_call_maps[6][7] = {
  { 1, 1, },
  { 2, 1, 2, },
  { 3, 1, 2, 3, },
  { 4, 1, 2, 3, 4, },
  { 5, 1, 2, 3, 4, 5, },
  { 6, 1, 2, 3, 4, 5, 6 },
};
538 :
/* For most SLP statements, there is a one-to-one mapping between
   gimple arguments and child nodes.  If that is not true for STMT,
   return an array that contains:

   - the number of child nodes, followed by
   - for each child node, the index of the argument associated with that node.
     The special index -1 is the first operand of an embedded comparison and
     the special index -2 is the second operand of an embedded comparison.
     The special index -3 is the offset of a gather as analyzed by
     vect_check_gather_scatter.

   SWAP is as for vect_get_and_check_slp_defs.  */

static const int *
vect_get_operand_map (const gimple *stmt, bool gather_scatter_p = false,
		      unsigned char swap = 0)
{
  if (auto assign = dyn_cast<const gassign *> (stmt))
    {
      /* COND_EXPRs with embedded comparisons are no longer expected
	 here.  */
      if (gimple_assign_rhs_code (assign) == COND_EXPR
	  && COMPARISON_CLASS_P (gimple_assign_rhs1 (assign)))
	gcc_unreachable ();
      /* Honor a requested operand swap for comparisons and commutative
	 operations.  */
      if ((TREE_CODE_CLASS (gimple_assign_rhs_code (assign)) == tcc_comparison
	   || commutative_tree_code (gimple_assign_rhs_code (assign)))
	  && swap)
	return op1_op0_map;
      if (gather_scatter_p)
	/* Scatters (non-SSA lhs) also need the stored value as child.  */
	return (TREE_CODE (gimple_assign_lhs (assign)) != SSA_NAME
		? off_op0_map : off_map);
    }
  gcc_assert (!swap);
  if (auto call = dyn_cast<const gcall *> (stmt))
    {
      if (gimple_call_internal_p (call))
	switch (gimple_call_internal_fn (call))
	  {
	  case IFN_MASK_LOAD:
	    return gather_scatter_p ? off_arg2_arg3_map : arg2_arg3_map;

	  case IFN_GATHER_LOAD:
	    return arg2_map;

	  case IFN_MASK_GATHER_LOAD:
	  case IFN_MASK_LEN_GATHER_LOAD:
	    return arg2_arg5_arg6_map;

	  case IFN_SCATTER_STORE:
	    return arg2_arg4_map;

	  case IFN_MASK_SCATTER_STORE:
	  case IFN_MASK_LEN_SCATTER_STORE:
	    return arg2_arg4_arg5_map;

	  case IFN_MASK_STORE:
	    return gather_scatter_p ? off_arg3_arg2_map : arg3_arg2_map;

	  case IFN_MASK_CALL:
	    {
	      /* All arguments but the mask-call target map to children.  */
	      unsigned nargs = gimple_call_num_args (call);
	      if (nargs >= 2 && nargs <= 7)
		return mask_call_maps[nargs-2];
	      else
		return nullptr;
	    }

	  case IFN_CLZ:
	  case IFN_CTZ:
	    return arg0_map;

	  case IFN_GOMP_SIMD_LANE:
	    return no_arg_map;

	  default:
	    break;
	  }
    }
  /* Default: one-to-one argument-to-child mapping.  */
  return nullptr;
}
617 :
618 : /* Return the SLP node child index for operand OP of STMT. */
619 :
620 : int
621 1325462 : vect_slp_child_index_for_operand (const gimple *stmt, int op,
622 : bool gather_scatter_p)
623 : {
624 1325462 : const int *opmap = vect_get_operand_map (stmt, gather_scatter_p);
625 1325462 : if (!opmap)
626 : return op;
627 18015 : for (int i = 1; i < 1 + opmap[0]; ++i)
628 18015 : if (opmap[i] == op)
629 9882 : return i - 1;
630 0 : gcc_unreachable ();
631 : }
632 :
633 : /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
634 : they are of a valid type and that they match the defs of the first stmt of
635 : the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
636 : by swapping operands of STMTS[STMT_NUM] when possible. Non-zero SWAP
637 : indicates swap is required for cond_expr stmts. Specifically, SWAP
638 : is 1 if STMT is cond and operands of comparison need to be swapped;
639 : SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
640 :
641 : If there was a fatal error return -1; if the error could be corrected by
642 : swapping operands of father node of this one, return 1; if everything is
643 : ok return 0. */
644 : static int
645 12216205 : vect_get_and_check_slp_defs (vec_info *vinfo, tree vectype, unsigned char swap,
646 : bool *skip_args,
647 : vec<stmt_vec_info> stmts, unsigned stmt_num,
648 : vec<slp_oprnd_info> *oprnds_info)
649 : {
650 12216205 : stmt_vec_info stmt_info = stmts[stmt_num];
651 12216205 : tree oprnd;
652 12216205 : unsigned int i, number_of_oprnds;
653 12216205 : enum vect_def_type dt = vect_uninitialized_def;
654 12216205 : slp_oprnd_info oprnd_info;
655 12216205 : gather_scatter_info gs_info;
656 12216205 : unsigned int gs_op = -1u;
657 12216205 : unsigned int commutative_op = -1U;
658 12216205 : bool first = stmt_num == 0;
659 :
660 12216205 : if (!stmt_info)
661 : {
662 0 : for (auto oi : *oprnds_info)
663 : {
664 0 : oi->def_stmts.quick_push (NULL);
665 0 : oi->ops.quick_push (NULL_TREE);
666 : }
667 : return 0;
668 : }
669 :
670 12216205 : if (!is_a<gcall *> (stmt_info->stmt)
671 : && !is_a<gassign *> (stmt_info->stmt)
672 : && !is_a<gphi *> (stmt_info->stmt))
673 : return -1;
674 :
675 12216205 : number_of_oprnds = gimple_num_args (stmt_info->stmt);
676 12216205 : const int *map
677 24432410 : = vect_get_operand_map (stmt_info->stmt,
678 12216205 : STMT_VINFO_GATHER_SCATTER_P (stmt_info), swap);
679 12216205 : if (map)
680 69662 : number_of_oprnds = *map++;
681 12216205 : if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
682 : {
683 40096 : if (gimple_call_internal_p (stmt))
684 : {
685 24252 : internal_fn ifn = gimple_call_internal_fn (stmt);
686 24252 : commutative_op = first_commutative_argument (ifn);
687 24252 : if (internal_gather_scatter_fn_p (ifn))
688 : {
689 0 : vect_describe_gather_scatter_call
690 0 : (stmt_info,
691 0 : first ? &(*oprnds_info)[0]->first_gs_info : &gs_info);
692 0 : if (first)
693 0 : (*oprnds_info)[0]->first_gs_p = true;
694 : gs_op = 0;
695 : }
696 : }
697 : }
698 12176109 : else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
699 : {
700 14238804 : if (commutative_tree_code (gimple_assign_rhs_code (stmt)))
701 8177067 : commutative_op = 0;
702 : }
703 :
704 12216205 : bool swapped = (swap != 0);
705 12216205 : bool backedge = false;
706 12216205 : enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
707 33832805 : for (i = 0; i < number_of_oprnds; i++)
708 : {
709 21617709 : oprnd_info = (*oprnds_info)[i];
710 21617709 : int opno = map ? map[i] : int (i);
711 21617709 : if (opno == GATHER_SCATTER_OFFSET)
712 : {
713 22050 : gcc_assert (STMT_VINFO_GATHER_SCATTER_P (stmt_info));
714 22050 : if (!is_a <loop_vec_info> (vinfo)
715 22050 : || !vect_check_gather_scatter (stmt_info, vectype,
716 : as_a <loop_vec_info> (vinfo),
717 : first ? &oprnd_info->first_gs_info
718 : : &gs_info))
719 1109 : return -1;
720 :
721 22050 : if (first)
722 : {
723 21813 : oprnd_info->first_gs_p = true;
724 21813 : oprnd = oprnd_info->first_gs_info.offset;
725 : }
726 : else
727 : {
728 237 : gs_op = i;
729 237 : oprnd = gs_info.offset;
730 : }
731 : }
732 21595659 : else if (opno < 0)
733 0 : oprnd = TREE_OPERAND (gimple_arg (stmt_info->stmt, 0), -1 - opno);
734 : else
735 : {
736 21595659 : oprnd = gimple_arg (stmt_info->stmt, opno);
737 21595659 : if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
738 : {
739 1090364 : edge e = gimple_phi_arg_edge (stmt, opno);
740 2180728 : backedge = (is_a <bb_vec_info> (vinfo)
741 1629296 : ? e->flags & EDGE_DFS_BACK
742 538932 : : dominated_by_p (CDI_DOMINATORS, e->src,
743 538932 : gimple_bb (stmt_info->stmt)));
744 : }
745 : }
746 21617709 : if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
747 2650 : oprnd = TREE_OPERAND (oprnd, 0);
748 :
749 21617709 : stmt_vec_info def_stmt_info;
750 21617709 : if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
751 : {
752 957 : if (dump_enabled_p ())
753 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
754 : "Build SLP failed: can't analyze def for %T\n",
755 : oprnd);
756 :
757 957 : return -1;
758 : }
759 :
760 21616752 : if (skip_args[i])
761 : {
762 445318 : oprnd_info->def_stmts.quick_push (NULL);
763 445318 : oprnd_info->ops.quick_push (NULL_TREE);
764 445318 : oprnd_info->first_dt = vect_uninitialized_def;
765 445318 : continue;
766 : }
767 :
768 21171434 : oprnd_info->def_stmts.quick_push (def_stmt_info);
769 21171434 : oprnd_info->ops.quick_push (oprnd);
770 :
771 21171434 : if (def_stmt_info
772 21171434 : && is_pattern_stmt_p (def_stmt_info))
773 : {
774 345226 : if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
775 : != def_stmt_info)
776 248007 : oprnd_info->any_pattern = true;
777 : else
778 : /* If we promote this to external use the original stmt def. */
779 97219 : oprnd_info->ops.last ()
780 194438 : = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
781 : }
782 :
783 : /* If there's a extern def on a backedge make sure we can
784 : code-generate at the region start.
785 : ??? This is another case that could be fixed by adjusting
786 : how we split the function but at the moment we'd have conflicting
787 : goals there. */
788 21171434 : if (backedge
789 126901 : && dts[i] == vect_external_def
790 173 : && is_a <bb_vec_info> (vinfo)
791 173 : && TREE_CODE (oprnd) == SSA_NAME
792 152 : && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
793 21171586 : && !dominated_by_p (CDI_DOMINATORS, vinfo->bbs[0],
794 152 : gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
795 : {
796 152 : if (dump_enabled_p ())
797 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
798 : "Build SLP failed: extern def %T only defined "
799 : "on backedge\n", oprnd);
800 152 : return -1;
801 : }
802 :
803 21171282 : if (first)
804 : {
805 4269103 : tree type = TREE_TYPE (oprnd);
806 4269103 : dt = dts[i];
807 :
808 : /* For the swapping logic below force vect_reduction_def
809 : for the reduction op in a SLP reduction group. */
810 4269103 : if (!STMT_VINFO_DATA_REF (stmt_info)
811 3181119 : && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
812 3288 : && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
813 4270723 : && def_stmt_info)
814 1620 : dts[i] = dt = vect_reduction_def;
815 :
816 : /* Check the types of the definition. */
817 4269103 : switch (dt)
818 : {
819 4269103 : case vect_external_def:
820 4269103 : case vect_constant_def:
821 4269103 : case vect_internal_def:
822 4269103 : case vect_reduction_def:
823 4269103 : case vect_double_reduction_def:
824 4269103 : case vect_induction_def:
825 4269103 : case vect_nested_cycle:
826 4269103 : case vect_first_order_recurrence:
827 4269103 : break;
828 :
829 0 : default:
830 : /* FORNOW: Not supported. */
831 0 : if (dump_enabled_p ())
832 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
833 : "Build SLP failed: illegal type of def %T\n",
834 : oprnd);
835 0 : return -1;
836 : }
837 :
838 4269103 : oprnd_info->first_dt = dt;
839 4269103 : oprnd_info->first_op_type = type;
840 : }
841 : }
842 12215096 : if (first)
843 : return 0;
844 :
845 : /* Now match the operand definition types to that of the first stmt. */
846 25871678 : for (i = 0; i < number_of_oprnds;)
847 : {
848 16898349 : if (skip_args[i])
849 : {
850 27772 : ++i;
851 27772 : continue;
852 : }
853 :
854 16870577 : oprnd_info = (*oprnds_info)[i];
855 16870577 : dt = dts[i];
856 16870577 : stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
857 16870577 : oprnd = oprnd_info->ops[stmt_num];
858 16870577 : tree type = TREE_TYPE (oprnd);
859 :
860 16870577 : if (!types_compatible_p (oprnd_info->first_op_type, type))
861 : {
862 88604 : if (dump_enabled_p ())
863 107 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
864 : "Build SLP failed: different operand types\n");
865 88604 : return 1;
866 : }
867 :
868 16781973 : if ((gs_op == i) != oprnd_info->first_gs_p)
869 : {
870 0 : if (dump_enabled_p ())
871 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
872 : "Build SLP failed: mixed gather and non-gather\n");
873 0 : return 1;
874 : }
875 16781973 : else if (gs_op == i)
876 : {
877 207 : if (!operand_equal_p (oprnd_info->first_gs_info.base,
878 207 : gs_info.base))
879 : {
880 16 : if (dump_enabled_p ())
881 6 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
882 : "Build SLP failed: different gather base\n");
883 16 : return 1;
884 : }
885 191 : if (oprnd_info->first_gs_info.scale != gs_info.scale)
886 : {
887 8 : if (dump_enabled_p ())
888 2 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
889 : "Build SLP failed: different gather scale\n");
890 8 : return 1;
891 : }
892 : }
893 :
894 : /* Not first stmt of the group, check that the def-stmt/s match
895 : the def-stmt/s of the first stmt. Allow different definition
896 : types for reduction chains: the first stmt must be a
897 : vect_reduction_def (a phi node), and the rest
898 : end in the reduction chain. */
899 16781949 : if ((!vect_def_types_match (oprnd_info->first_dt, dt)
900 284723 : && !(oprnd_info->first_dt == vect_reduction_def
901 2777 : && !STMT_VINFO_DATA_REF (stmt_info)
902 2777 : && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
903 2767 : && def_stmt_info
904 2767 : && !STMT_VINFO_DATA_REF (def_stmt_info)
905 2767 : && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
906 : == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
907 16499993 : || (!STMT_VINFO_DATA_REF (stmt_info)
908 15223902 : && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
909 5814 : && ((!def_stmt_info
910 5652 : || STMT_VINFO_DATA_REF (def_stmt_info)
911 10379 : || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
912 : != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
913 5814 : != (oprnd_info->first_dt != vect_reduction_def))))
914 : {
915 : /* Try swapping operands if we got a mismatch. For BB
916 : vectorization only in case it will clearly improve things. */
917 283887 : if (i == commutative_op && !swapped
918 281956 : && (!is_a <bb_vec_info> (vinfo)
919 4983 : || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
920 4983 : dts[i+1])
921 1108 : && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
922 : || vect_def_types_match
923 146 : ((*oprnds_info)[i+1]->first_dt, dts[i])))))
924 : {
925 1931 : if (dump_enabled_p ())
926 144 : dump_printf_loc (MSG_NOTE, vect_location,
927 : "trying swapped operands\n");
928 1931 : std::swap (dts[i], dts[i+1]);
929 1931 : std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
930 1931 : (*oprnds_info)[i+1]->def_stmts[stmt_num]);
931 1931 : std::swap ((*oprnds_info)[i]->ops[stmt_num],
932 1931 : (*oprnds_info)[i+1]->ops[stmt_num]);
933 : /* After swapping some operands we lost track whether an
934 : operand has any pattern defs so be conservative here. */
935 1931 : if ((*oprnds_info)[i]->any_pattern
936 1931 : || (*oprnds_info)[i+1]->any_pattern)
937 4 : (*oprnds_info)[i]->any_pattern
938 2 : = (*oprnds_info)[i+1]->any_pattern = true;
939 1931 : swapped = true;
940 1931 : continue;
941 : }
942 :
943 280025 : if (is_a <bb_vec_info> (vinfo)
944 269538 : && !oprnd_info->any_pattern
945 549325 : && number_of_oprnds > 1)
946 : {
947 : /* Now for commutative ops we should see whether we can
948 : make the other operand matching. */
949 104259 : if (dump_enabled_p ())
950 149 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
951 : "treating operand as external\n");
952 104259 : oprnd_info->first_dt = dt = vect_external_def;
953 : }
954 : else
955 : {
956 175766 : if (dump_enabled_p ())
957 406 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
958 : "Build SLP failed: different types\n");
959 175766 : return 1;
960 : }
961 : }
962 :
963 : /* Make sure to demote the overall operand to external. */
964 16604252 : if (dt == vect_external_def)
965 329846 : oprnd_info->first_dt = vect_external_def;
966 : /* For a SLP reduction chain we want to duplicate the reduction to
967 : each of the chain members. That gets us a sane SLP graph (still
968 : the stmts are not 100% correct wrt the initial values). */
969 16274406 : else if ((dt == vect_internal_def
970 16274406 : || dt == vect_reduction_def)
971 15370735 : && oprnd_info->first_dt == vect_reduction_def
972 64716 : && !STMT_VINFO_DATA_REF (stmt_info)
973 64716 : && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
974 2767 : && !STMT_VINFO_DATA_REF (def_stmt_info)
975 16277173 : && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
976 : == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
977 : {
978 2767 : oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
979 2767 : oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
980 : }
981 :
982 16604252 : ++i;
983 : }
984 :
985 : /* Swap operands. */
986 8973329 : if (swapped)
987 : {
988 39972 : if (dump_enabled_p ())
989 432 : dump_printf_loc (MSG_NOTE, vect_location,
990 : "swapped operands to match def types in %G",
991 : stmt_info->stmt);
992 : }
993 :
994 : return 0;
995 : }
996 :
997 : /* Return true if call statements CALL1 and CALL2 are similar enough
998 : to be combined into the same SLP group. */
999 :
1000 : bool
1001 20900 : compatible_calls_p (gcall *call1, gcall *call2, bool allow_two_operators)
1002 : {
1003 20900 : unsigned int nargs = gimple_call_num_args (call1);
1004 20900 : if (nargs != gimple_call_num_args (call2))
1005 : return false;
1006 :
1007 18964 : auto cfn1 = gimple_call_combined_fn (call1);
1008 18964 : auto cfn2 = gimple_call_combined_fn (call2);
1009 18964 : if (cfn1 != cfn2
1010 2 : && (!allow_two_operators
1011 2 : || !((cfn1 == CFN_FMA || cfn1 == CFN_FMS)
1012 2 : && (cfn2 == CFN_FMA || cfn2 == CFN_FMS))))
1013 : return false;
1014 :
1015 18964 : if (gimple_call_internal_p (call1))
1016 : {
1017 7084 : if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
1018 7084 : TREE_TYPE (gimple_call_lhs (call2))))
1019 : return false;
1020 14393 : for (unsigned int i = 0; i < nargs; ++i)
1021 7309 : if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
1022 7309 : TREE_TYPE (gimple_call_arg (call2, i))))
1023 : return false;
1024 : }
1025 : else
1026 : {
1027 11880 : if (!operand_equal_p (gimple_call_fn (call1),
1028 11880 : gimple_call_fn (call2), 0))
1029 : return false;
1030 :
1031 25884 : if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
1032 : return false;
1033 : }
1034 :
1035 : /* Check that any unvectorized arguments are equal. */
1036 15712 : if (const int *map = vect_get_operand_map (call1))
1037 : {
1038 15 : unsigned int nkept = *map++;
1039 15 : unsigned int mapi = 0;
1040 57 : for (unsigned int i = 0; i < nargs; ++i)
1041 42 : if (mapi < nkept && map[mapi] == int (i))
1042 27 : mapi += 1;
1043 15 : else if (!operand_equal_p (gimple_call_arg (call1, i),
1044 15 : gimple_call_arg (call2, i)))
1045 : return false;
1046 : }
1047 :
1048 : return true;
1049 : }
1050 :
1051 : /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
1052 : caller's attempt to find the vector type in STMT_INFO with the narrowest
1053 : element type. Return true if VECTYPE is nonnull and if it is valid
1054 : for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
1055 : number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
1056 : vect_build_slp_tree. */
1057 :
1058 : static bool
1059 4966443 : vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
1060 : unsigned int group_size,
1061 : tree vectype, poly_uint64 *max_nunits)
1062 : {
1063 4966443 : if (!vectype)
1064 : {
1065 4489 : if (dump_enabled_p ())
1066 7 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1067 : "Build SLP failed: unsupported data-type in %G\n",
1068 : stmt_info->stmt);
1069 : /* Fatal mismatch. */
1070 4489 : return false;
1071 : }
1072 :
1073 : /* If populating the vector type requires unrolling then fail
1074 : before adjusting *max_nunits for basic-block vectorization. */
1075 4961954 : if (is_a <bb_vec_info> (vinfo)
1076 4961954 : && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
1077 : {
1078 140914 : if (dump_enabled_p ())
1079 34 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1080 : "Build SLP failed: unrolling required "
1081 : "in basic block SLP\n");
1082 : /* Fatal mismatch. */
1083 140914 : return false;
1084 : }
1085 :
1086 : /* In case of multiple types we need to detect the smallest type. */
1087 4821040 : vect_update_max_nunits (max_nunits, vectype);
1088 4821040 : return true;
1089 : }
1090 :
1091 : /* Verify if the scalar stmts STMTS are isomorphic, require data
1092 : permutation or are of unsupported types of operation. Return
1093 : true if they are, otherwise return false and indicate in *MATCHES
1094 : which stmts are not isomorphic to the first one. If MATCHES[0]
1095 : is false then this indicates the comparison could not be
1096 : carried out or the stmts will never be vectorized by SLP.
1097 :
1098 : Note COND_EXPR is possibly isomorphic to another one after swapping its
1099 : operands. Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
1100 : the first stmt by swapping the two operands of comparison; set SWAP[i]
1101 : to 2 if stmt I is isormorphic to the first stmt by inverting the code
1102 : of comparison. Take A1 >= B1 ? X1 : Y1 as an exmple, it can be swapped
1103 : to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1. */
1104 :
static bool
vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
		       vec<stmt_vec_info> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits, bool *matches,
		       bool *two_operators, tree *node_vectype)
{
  unsigned int i;
  stmt_vec_info first_stmt_info = stmts[0];
  code_helper first_stmt_code = ERROR_MARK;
  code_helper alt_stmt_code = ERROR_MARK;
  code_helper first_cond_code = ERROR_MARK;
  bool need_same_oprnds = false;
  tree first_lhs = NULL_TREE;
  tree first_op1 = NULL_TREE;
  stmt_vec_info first_load = NULL, prev_first_load = NULL;
  bool first_stmt_ldst_p = false, first_stmt_ldst_masklen_p = false;
  bool first_stmt_phi_p = false;
  int first_reduc_idx = -1;
  bool maybe_soft_fail = false;
  tree soft_fail_nunits_vectype = NULL_TREE;

  /* Derive the vector types for the node from the group's first stmt.  */
  tree vectype, nunits_vectype;
  if (!vect_get_vector_types_for_stmt (vinfo, first_stmt_info, &vectype,
				       &nunits_vectype, group_size))
    {
      /* Fatal mismatch.  */
      matches[0] = false;
      return false;
    }
  if (is_a <bb_vec_info> (vinfo)
      && known_le (TYPE_VECTOR_SUBPARTS (vectype), 1U))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "Build SLP failed: not using single lane "
			 "vector type %T\n", vectype);
      matches[0] = false;
      return false;
    }
  /* Record nunits required but continue analysis, producing matches[]
     as if nunits was not an issue.  This allows splitting of groups
     to happen.  */
  if (nunits_vectype
      && !vect_record_max_nunits (vinfo, first_stmt_info, group_size,
				  nunits_vectype, max_nunits))
    {
      gcc_assert (is_a <bb_vec_info> (vinfo));
      maybe_soft_fail = true;
      soft_fail_nunits_vectype = nunits_vectype;
    }

  gcc_assert (vectype || !gimple_get_lhs (first_stmt_info->stmt));
  *node_vectype = vectype;

  /* For every stmt in NODE find its def stmt/s.  */
  stmt_vec_info stmt_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    {
      bool ldst_p = false;
      bool ldst_masklen_p = false;
      bool phi_p = false;
      code_helper rhs_code = ERROR_MARK;

      /* Assume a mismatch and no operand swapping until proven
	 otherwise.  */
      swap[i] = 0;
      matches[i] = false;
      if (!stmt_info)
	{
	  /* An absent lane trivially matches.  */
	  matches[i] = true;
	  continue;
	}

      gimple *stmt = stmt_info->stmt;
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);

      /* Fail to vectorize statements marked as unvectorizable, throw
	 or are volatile.  */
      if (!STMT_VINFO_VECTORIZABLE (stmt_info)
	  || stmt_can_throw_internal (cfun, stmt)
	  || gimple_has_volatile_ops (stmt))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: unvectorizable statement %G",
			     stmt);
	  /* ??? For BB vectorization we want to commutate operands in a way
	     to shuffle all unvectorizable defs into one operand and have
	     the other still vectorized.  The following doesn't reliably
	     work for this though but it's the easiest we can do here.  */
	  if (is_a <bb_vec_info> (vinfo) && i != 0)
	    continue;
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      gcall *call_stmt = dyn_cast <gcall *> (stmt);
      tree lhs = gimple_get_lhs (stmt);
      if (lhs == NULL_TREE && !call_stmt)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: not GIMPLE_ASSIGN nor "
			     "GIMPLE_CALL %G", stmt);
	  if (is_a <bb_vec_info> (vinfo) && i != 0)
	    continue;
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      /* Classify the stmt: load/store-ness, mask/len style and PHI-ness,
	 and compute RHS_CODE as the code to compare lanes with.  */
      if (call_stmt)
	{
	  combined_fn cfn = gimple_call_combined_fn (call_stmt);
	  if (cfn != CFN_LAST && cfn != CFN_MASK_CALL)
	    rhs_code = cfn;
	  else
	    rhs_code = CALL_EXPR;

	  if (cfn == CFN_GATHER_LOAD
	      || cfn == CFN_SCATTER_STORE)
	    ldst_p = true;
	  else if (cfn == CFN_MASK_LOAD
		   || cfn == CFN_MASK_GATHER_LOAD
		   || cfn == CFN_MASK_LEN_GATHER_LOAD
		   || cfn == CFN_MASK_SCATTER_STORE
		   || cfn == CFN_MASK_LEN_SCATTER_STORE)
	    {
	      ldst_p = true;
	      ldst_masklen_p = true;
	    }
	  else if (cfn == CFN_MASK_STORE)
	    {
	      ldst_p = true;
	      ldst_masklen_p = true;
	      rhs_code = CFN_MASK_STORE;
	    }
	  else if (cfn == CFN_GOMP_SIMD_LANE)
	    ;
	  else if ((cfn != CFN_LAST
		    && cfn != CFN_MASK_CALL
		    && internal_fn_p (cfn)
		    && !vectorizable_internal_fn_p (as_internal_fn (cfn)))
		   || gimple_call_tail_p (call_stmt)
		   || gimple_call_noreturn_p (call_stmt)
		   || gimple_call_chain (call_stmt))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: unsupported call type %G",
				 (gimple *) call_stmt);
	      if (is_a <bb_vec_info> (vinfo) && i != 0)
		continue;
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }
	}
      else if (gimple_code (stmt) == GIMPLE_PHI)
	{
	  rhs_code = ERROR_MARK;
	  phi_p = true;
	}
      else
	{
	  rhs_code = gimple_assign_rhs_code (stmt);
	  ldst_p = STMT_VINFO_DATA_REF (stmt_info) != nullptr;
	}

      /* Check the operation.  */
      if (i == 0)
	{
	  /* The first lane defines what the rest have to match.  */
	  first_lhs = lhs;
	  first_stmt_code = rhs_code;
	  first_stmt_ldst_p = ldst_p;
	  first_stmt_ldst_masklen_p = ldst_masklen_p;
	  first_stmt_phi_p = phi_p;
	  first_reduc_idx = STMT_VINFO_REDUC_IDX (stmt_info);

	  /* Shift arguments should be equal in all the packed stmts for a
	     vector shift with scalar shift operand.  */
	  if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
	      || rhs_code == LROTATE_EXPR
	      || rhs_code == RROTATE_EXPR)
	    {
	      /* First see if we have a vector/vector shift.  */
	      if (!directly_supported_p (rhs_code, vectype, optab_vector))
		{
		  /* No vector/vector shift, try for a vector/scalar shift.  */
		  if (!directly_supported_p (rhs_code, vectype, optab_scalar))
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
					 "Build SLP failed: "
					 "op not supported by target.\n");
		      if (is_a <bb_vec_info> (vinfo) && i != 0)
			continue;
		      /* Fatal mismatch.  */
		      matches[0] = false;
		      return false;
		    }
		  need_same_oprnds = true;
		  first_op1 = gimple_assign_rhs2 (stmt);
		}
	    }
	  else if (rhs_code == WIDEN_LSHIFT_EXPR)
	    {
	      need_same_oprnds = true;
	      first_op1 = gimple_assign_rhs2 (stmt);
	    }
	  else if (!ldst_p
		   && rhs_code == BIT_FIELD_REF)
	    {
	      tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
	      if (!is_a <bb_vec_info> (vinfo)
		  || TREE_CODE (vec) != SSA_NAME
		  /* When the element types are not compatible we pun the
		     source to the target vectype which requires equal size.  */
		  || ((!VECTOR_TYPE_P (TREE_TYPE (vec))
		       || !types_compatible_p (TREE_TYPE (vectype),
					       TREE_TYPE (TREE_TYPE (vec))))
		      && !operand_equal_p (TYPE_SIZE (vectype),
					   TYPE_SIZE (TREE_TYPE (vec)))))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: "
				     "BIT_FIELD_REF not supported\n");
		  /* Fatal mismatch.  */
		  matches[0] = false;
		  return false;
		}
	    }
	  else if (rhs_code == CFN_DIV_POW2)
	    {
	      need_same_oprnds = true;
	      first_op1 = gimple_call_arg (call_stmt, 1);
	    }
	  else if (rhs_code == CFN_GOMP_SIMD_LANE)
	    {
	      need_same_oprnds = true;
	      first_op1 = gimple_call_arg (call_stmt, 1);
	    }
	}
      else
	{
	  if (first_reduc_idx != STMT_VINFO_REDUC_IDX (stmt_info)
	      /* For SLP reduction groups the index isn't necessarily
		 uniform but only that of the first stmt matters.  */
	      && !(first_reduc_idx != -1
		   && STMT_VINFO_REDUC_IDX (stmt_info) != -1
		   && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
	      && !(first_reduc_idx != -1
		   && STMT_VINFO_REDUC_IDX (stmt_info) != -1
		   && rhs_code.is_tree_code ()
		   && commutative_tree_code (tree_code (rhs_code))
		   && first_reduc_idx == 1 - STMT_VINFO_REDUC_IDX (stmt_info)))
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: different reduc_idx "
				   "%d instead of %d in %G",
				   STMT_VINFO_REDUC_IDX (stmt_info),
				   first_reduc_idx, stmt);
		}
	      /* Mismatch.  */
	      continue;
	    }
	  /* Remember the first code differing from the first stmt as a
	     candidate second operator of a two-operator node.  */
	  if (!ldst_p
	      && first_stmt_code != rhs_code
	      && alt_stmt_code == ERROR_MARK)
	    alt_stmt_code = rhs_code;
	  if ((!ldst_p
	       && first_stmt_code != rhs_code
	       && (first_stmt_code != IMAGPART_EXPR
		   || rhs_code != REALPART_EXPR)
	       && (first_stmt_code != REALPART_EXPR
		   || rhs_code != IMAGPART_EXPR)
	       /* Handle mismatches in plus/minus by computing both
		  and merging the results.  */
	       && !((((first_stmt_code == PLUS_EXPR
		       || first_stmt_code == MINUS_EXPR)
		      && (alt_stmt_code == PLUS_EXPR
			  || alt_stmt_code == MINUS_EXPR))
		     || ((first_stmt_code == CFN_FMA
			  || first_stmt_code == CFN_FMS)
			 && (alt_stmt_code == CFN_FMA
			     || alt_stmt_code == CFN_FMS)))
		    && rhs_code == alt_stmt_code)
	       && !(first_stmt_code.is_tree_code ()
		    && rhs_code.is_tree_code ()
		    && (TREE_CODE_CLASS (tree_code (first_stmt_code))
			== tcc_comparison)
		    && (swap_tree_comparison (tree_code (first_stmt_code))
			== tree_code (rhs_code))
		    && (first_reduc_idx == -1
			|| REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
	      || (ldst_p
		  && (STMT_VINFO_GROUPED_ACCESS (stmt_info)
		      != STMT_VINFO_GROUPED_ACCESS (first_stmt_info)))
	      || (ldst_p
		  && (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
		      != STMT_VINFO_GATHER_SCATTER_P (first_stmt_info)))
	      || first_stmt_ldst_p != ldst_p
	      || (ldst_p && first_stmt_ldst_masklen_p != ldst_masklen_p)
	      || first_stmt_phi_p != phi_p)
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: different operation "
				   "in stmt %G", stmt);
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "original stmt %G", first_stmt_info->stmt);
		}
	      /* Mismatch.  */
	      continue;
	    }

	  if (!ldst_p
	      && first_stmt_code == BIT_FIELD_REF
	      && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
		  != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different BIT_FIELD_REF "
				 "arguments in %G", stmt);
	      /* Mismatch.  */
	      continue;
	    }

	  if (call_stmt
	      && first_stmt_code != CFN_MASK_LOAD
	      && first_stmt_code != CFN_MASK_STORE)
	    {
	      if (!is_a <gcall *> (stmts[0]->stmt)
		  || !compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
					  call_stmt, true))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different calls in %G",
				     stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }

	  if ((phi_p || gimple_could_trap_p (stmt_info->stmt))
	      && (gimple_bb (first_stmt_info->stmt)
		  != gimple_bb (stmt_info->stmt)))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different BB for PHI "
				 "or possibly trapping operation in %G", stmt);
	      /* Mismatch.  */
	      continue;
	    }

	  if (need_same_oprnds)
	    {
	      tree other_op1 = gimple_arg (stmt, 1);
	      if (!operand_equal_p (first_op1, other_op1, 0))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different shift "
				     "arguments in %G", stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }

	  if (first_lhs
	      && lhs
	      && !types_compatible_p (TREE_TYPE (lhs), TREE_TYPE (first_lhs)))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different vector type "
				 "in %G", stmt);
	      /* Mismatch.  */
	      continue;
	    }
	}

      /* Grouped store or load.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
	{
	  gcc_assert (ldst_p);
	  if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
	    {
	      /* Store.  */
	      gcc_assert (rhs_code == CFN_MASK_STORE
			  || REFERENCE_CLASS_P (lhs)
			  || DECL_P (lhs));
	    }
	  else
	    {
	      /* Load.  */
	      first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
	      if (prev_first_load)
		{
		  /* Check that there are no loads from different interleaving
		     chains in the same node.  */
		  if (prev_first_load != first_load)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION,
					 vect_location,
					 "Build SLP failed: different "
					 "interleaving chains in one node %G",
					 stmt);
		      /* Mismatch.  */
		      continue;
		    }
		}
	      else
		prev_first_load = first_load;
	    }
	}
      /* Non-grouped store or load.  */
      else if (ldst_p)
	{
	  if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))
	      && rhs_code != CFN_GATHER_LOAD
	      && rhs_code != CFN_MASK_GATHER_LOAD
	      && rhs_code != CFN_MASK_LEN_GATHER_LOAD
	      && rhs_code != CFN_SCATTER_STORE
	      && rhs_code != CFN_MASK_SCATTER_STORE
	      && rhs_code != CFN_MASK_LEN_SCATTER_STORE
	      && !STMT_VINFO_GATHER_SCATTER_P (stmt_info)
	      /* Not grouped loads are handled as externals for BB
		 vectorization.  For loop vectorization we can handle
		 splats the same we handle single element interleaving.
		 Likewise we can handle a collection of invariant refs.  */
	      && (is_a <bb_vec_info> (vinfo)
		  || (stmt_info != first_stmt_info
		      && !(integer_zerop (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)))
			   && integer_zerop (DR_STEP (STMT_VINFO_DATA_REF
							(first_stmt_info)))))))
	    {
	      /* Not grouped load.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: not grouped load %G", stmt);

	      if (i != 0)
		continue;
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }
	}
      /* Not memory operation.  */
      else
	{
	  if (!phi_p
	      && rhs_code.is_tree_code ()
	      && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_binary
	      && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_unary
	      && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_expression
	      && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_comparison
	      && rhs_code != VIEW_CONVERT_EXPR
	      && rhs_code != CALL_EXPR
	      && rhs_code != BIT_FIELD_REF
	      && rhs_code != SSA_NAME)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: operation unsupported %G",
				 stmt);
	      if (is_a <bb_vec_info> (vinfo) && i != 0)
		continue;
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }

	  if (rhs_code == COND_EXPR)
	    {
	      tree cond_expr = gimple_assign_rhs1 (stmt);
	      enum tree_code cond_code = TREE_CODE (cond_expr);
	      enum tree_code swap_code = ERROR_MARK;
	      enum tree_code invert_code = ERROR_MARK;

	      if (i == 0)
		first_cond_code = TREE_CODE (cond_expr);
	      else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
		{
		  bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
		  swap_code = swap_tree_comparison (cond_code);
		  invert_code = invert_tree_comparison (cond_code, honor_nans);
		}

	      if (first_cond_code == cond_code)
		;
	      /* Isomorphic can be achieved by swapping.  */
	      else if (first_cond_code == swap_code)
		swap[i] = 1;
	      /* Isomorphic can be achieved by inverting.  */
	      else if (first_cond_code == invert_code)
		swap[i] = 2;
	      else
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different"
				     " operation %G", stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }

	  /* A comparison matching the first stmt's swapped comparison
	     is made isomorphic by swapping its operands.  */
	  if (i != 0
	      && first_stmt_code != rhs_code
	      && first_stmt_code.is_tree_code ()
	      && rhs_code.is_tree_code ()
	      && TREE_CODE_CLASS ((tree_code)first_stmt_code) == tcc_comparison
	      && (swap_tree_comparison ((tree_code)first_stmt_code)
		  == (tree_code)rhs_code))
	    swap[i] = 1;

	  if (i != 0
	      && first_reduc_idx != STMT_VINFO_REDUC_IDX (stmt_info)
	      && first_reduc_idx != -1
	      && STMT_VINFO_REDUC_IDX (stmt_info) != -1
	      && rhs_code.is_tree_code ()
	      && commutative_tree_code (tree_code (rhs_code))
	      && first_reduc_idx == 1 - STMT_VINFO_REDUC_IDX (stmt_info))
	    swap[i] = 1;
	}

      /* This lane is isomorphic to the first one.  */
      matches[i] = true;
    }

  /* Fail if any lane mismatched.  */
  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      return false;

  /* If we allowed a two-operation SLP node verify the target can cope
     with the permute we are going to use.  */
  if (alt_stmt_code != ERROR_MARK
      && (!alt_stmt_code.is_tree_code ()
	  || (TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_reference
	      && TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_comparison)))
    {
      *two_operators = true;
    }

  if (maybe_soft_fail)
    {
      unsigned HOST_WIDE_INT const_nunits;
      if (!TYPE_VECTOR_SUBPARTS
	    (soft_fail_nunits_vectype).is_constant (&const_nunits)
	  || const_nunits > group_size)
	matches[0] = false;
      else
	{
	  /* With constant vector elements simulate a mismatch at the
	     point we need to split.  */
	  unsigned tail = group_size & (const_nunits - 1);
	  memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
	}
      return false;
    }

  return true;
}
1677 :
1678 : /* Traits for the hash_set to record failed SLP builds for a stmt set.
1679 : Note we never remove apart from at destruction time so we do not
1680 : need a special value for deleted that differs from empty. */
struct bst_traits
{
  typedef vec <stmt_vec_info> value_type;
  typedef vec <stmt_vec_info> compare_type;
  static inline hashval_t hash (value_type);
  static inline bool equal (value_type existing, value_type candidate);
  /* A slot is unused iff its embedded vec was never allocated; since
     entries are only removed at destruction time, empty and deleted
     can share that representation.  */
  static inline bool is_empty (value_type x) { return !x.exists (); }
  static inline bool is_deleted (value_type x) { return !x.exists (); }
  static const bool empty_zero_p = true;
  /* Marking a slot empty/deleted and removing it all release the vec.  */
  static inline void mark_empty (value_type &x) { x.release (); }
  static inline void mark_deleted (value_type &x) { x.release (); }
  static inline void remove (value_type &x) { x.release (); }
};
1694 : inline hashval_t
1695 84046806 : bst_traits::hash (value_type x)
1696 : {
1697 84046806 : inchash::hash h;
1698 398965932 : for (unsigned i = 0; i < x.length (); ++i)
1699 314919126 : h.add_int (x[i] ? gimple_uid (x[i]->stmt) : -1);
1700 84046806 : return h.end ();
1701 : }
1702 : inline bool
1703 73414547 : bst_traits::equal (value_type existing, value_type candidate)
1704 : {
1705 220243641 : if (existing.length () != candidate.length ())
1706 : return false;
1707 75225775 : for (unsigned i = 0; i < existing.length (); ++i)
1708 71393620 : if (existing[i] != candidate[i])
1709 : return false;
1710 : return true;
1711 : }
1712 :
1713 : typedef hash_map <vec <stmt_vec_info>, slp_tree,
1714 : simple_hashmap_traits <bst_traits, slp_tree> >
1715 : scalar_stmts_to_slp_tree_map_t;
1716 :
1717 : /* Release BST_MAP. */
1718 :
1719 : static void
1720 1661254 : release_scalar_stmts_to_slp_tree_map (scalar_stmts_to_slp_tree_map_t *bst_map)
1721 : {
1722 : /* The map keeps a reference on SLP nodes built, release that. */
1723 10037696 : for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
1724 18414138 : it != bst_map->end (); ++it)
1725 8376442 : if ((*it).second)
1726 8376442 : vect_free_slp_tree ((*it).second);
1727 1661254 : delete bst_map;
1728 1661254 : }
1729 :
1730 : /* ??? This was std::pair<std::pair<tree_code, vect_def_type>, tree>
1731 : but then vec::insert does memmove and that's not compatible with
1732 : std::pair. */
struct chain_op_t
{
  chain_op_t (tree_code code_, vect_def_type dt_, tree op_)
      : code (code_), dt (dt_), op (op_) {}
  /* Operation code OP enters the linearized chain with.  */
  tree_code code;
  /* Definition type of OP.  */
  vect_def_type dt;
  /* The operand itself.  */
  tree op;
};
1741 :
1742 : /* Comparator for sorting associatable chains. */
1743 :
1744 : static int
1745 8448485 : dt_sort_cmp (const void *op1_, const void *op2_, void *)
1746 : {
1747 8448485 : auto *op1 = (const chain_op_t *) op1_;
1748 8448485 : auto *op2 = (const chain_op_t *) op2_;
1749 8448485 : if (op1->dt != op2->dt)
1750 1032834 : return (int)op1->dt - (int)op2->dt;
1751 7415651 : return (int)op1->code - (int)op2->code;
1752 : }
1753 :
/* Linearize the associatable expression chain at START with the
   associatable operation CODE (where PLUS_EXPR also allows MINUS_EXPR),
   filling CHAIN with the result and using WORKLIST as intermediate storage.
   CODE_STMT and ALT_CODE_STMT are filled with the first stmt using CODE
   or MINUS_EXPR.  *CHAIN_STMTS if not NULL is filled with all computation
   stmts, starting with START.  */

static void
vect_slp_linearize_chain (vec_info *vinfo,
			  vec<std::pair<tree_code, gimple *> > &worklist,
			  vec<chain_op_t> &chain,
			  enum tree_code code, gimple *start,
			  gimple *&code_stmt, gimple *&alt_code_stmt,
			  vec<gimple *> *chain_stmts)
{
  /* For each lane linearize the addition/subtraction (or other
     uniform associatable operation) expression tree.  */
  worklist.safe_push (std::make_pair (code, start));
  while (!worklist.is_empty ())
    {
      auto entry = worklist.pop ();
      gassign *stmt = as_a <gassign *> (entry.second);
      /* IN_CODE is the operation with which this stmt's whole result
	 enters the chain; MINUS_EXPR here means the result is negated.  */
      enum tree_code in_code = entry.first;
      enum tree_code this_code = gimple_assign_rhs_code (stmt);
      /* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE.  */
      if (!code_stmt
	  && gimple_assign_rhs_code (stmt) == code)
	code_stmt = stmt;
      else if (!alt_code_stmt
	       && gimple_assign_rhs_code (stmt) == MINUS_EXPR)
	alt_code_stmt = stmt;
      if (chain_stmts)
	chain_stmts->safe_push (stmt);
      /* Process both operands of the binary stmt.  */
      for (unsigned opnum = 1; opnum <= 2; ++opnum)
	{
	  tree op = gimple_op (stmt, opnum);
	  vect_def_type dt;
	  stmt_vec_info def_stmt_info;
	  bool res = vect_is_simple_use (op, vinfo, &dt, &def_stmt_info);
	  gcc_assert (res);
	  /* For pattern-replaced defs look at the pattern stmt result
	     instead of the original SSA name.  */
	  if (dt == vect_internal_def
	      && is_pattern_stmt_p (def_stmt_info))
	    op = gimple_get_lhs (def_stmt_info->stmt);
	  gimple *use_stmt;
	  use_operand_p use_p;
	  /* Recurse into a single-used internal def that is itself a
	     CODE (or, for PLUS_EXPR chains, MINUS_EXPR) stmt; otherwise
	     record OP as a leaf of the chain.  */
	  if (dt == vect_internal_def
	      && single_imm_use (op, &use_p, &use_stmt)
	      && is_gimple_assign (def_stmt_info->stmt)
	      && (gimple_assign_rhs_code (def_stmt_info->stmt) == code
		  || (code == PLUS_EXPR
		      && (gimple_assign_rhs_code (def_stmt_info->stmt)
			  == MINUS_EXPR))))
	    {
	      /* Compute the sign with which the def enters the chain:
		 the first operand of a MINUS_EXPR is effectively added,
		 and a negated context (IN_CODE == MINUS_EXPR) flips the
		 sign again.  */
	      tree_code op_def_code = this_code;
	      if (op_def_code == MINUS_EXPR && opnum == 1)
		op_def_code = PLUS_EXPR;
	      if (in_code == MINUS_EXPR)
		op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
	      worklist.safe_push (std::make_pair (op_def_code,
						  def_stmt_info->stmt));
	    }
	  else
	    {
	      /* Same sign computation as above, but record OP as a
		 chain leaf rather than recursing.  */
	      tree_code op_def_code = this_code;
	      if (op_def_code == MINUS_EXPR && opnum == 1)
		op_def_code = PLUS_EXPR;
	      if (in_code == MINUS_EXPR)
		op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
	      chain.safe_push (chain_op_t (op_def_code, dt, op));
	    }
	}
    }
}
1827 :
1828 : static slp_tree
1829 : vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1830 : vec<stmt_vec_info> stmts, unsigned int group_size,
1831 : poly_uint64 *max_nunits,
1832 : bool *matches, unsigned *limit, unsigned *tree_size,
1833 : scalar_stmts_to_slp_tree_map_t *bst_map);
1834 :
/* Caching wrapper around vect_build_slp_tree_2.  Build an SLP node for
   the GROUP_SIZE scalar stmts STMTS, looking up and recording discovery
   results (both successes and failures) in BST_MAP.  Update *MAX_NUNITS
   and *TREE_SIZE, and decrement the discovery budget *LIMIT for
   multi-lane groups.  On failure return NULL with MATCHES set to
   indicate mismatched lanes.  */

static slp_tree
vect_build_slp_tree (vec_info *vinfo,
		     vec<stmt_vec_info> stmts, unsigned int group_size,
		     poly_uint64 *max_nunits,
		     bool *matches, unsigned *limit, unsigned *tree_size,
		     scalar_stmts_to_slp_tree_map_t *bst_map)
{
  /* Re-use a previous discovery result for the same scalar stmts if
     there is one, whether it succeeded or failed.  */
  if (slp_tree *leader = bst_map->get (stmts))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
			 !(*leader)->failed ? "" : "failed ",
			 (void *) *leader);
      if (!(*leader)->failed)
	{
	  /* Hand out another reference to the cached node.  */
	  SLP_TREE_REF_COUNT (*leader)++;
	  vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
	  stmts.release ();
	  return *leader;
	}
      /* Replay the recorded per-lane failure state.  */
      memcpy (matches, (*leader)->failed, sizeof (bool) * group_size);
      return NULL;
    }

  /* Single-lane SLP doesn't have the chance of run-away, do not account
     it to the limit.  */
  if (stmts.length () > 1)
    {
      if (*limit == 0)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "SLP discovery limit exceeded\n");
	  memset (matches, 0, sizeof (bool) * group_size);
	  return NULL;
	}
      --*limit;
    }

  /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
     so we can pick up backedge destinations during discovery.  */
  slp_tree res = new _slp_tree;
  SLP_TREE_DEF_TYPE (res) = vect_internal_def;
  SLP_TREE_SCALAR_STMTS (res) = stmts;
  bst_map->put (stmts.copy (), res);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "starting SLP discovery for node %p\n", (void *) res);

  poly_uint64 this_max_nunits = 1;
  slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
					 &this_max_nunits,
					 matches, limit, tree_size, bst_map);
  if (!res_)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "SLP discovery for node %p failed\n", (void *) res);
      /* Mark the node invalid so we can detect those when still in use
	 as backedge destinations.  */
      SLP_TREE_SCALAR_STMTS (res) = vNULL;
      SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
      /* Record the failure pattern so re-discovery attempts can replay
	 MATCHES from the cache.  */
      res->failed = XNEWVEC (bool, group_size);
      if (flag_checking)
	{
	  /* Verify at least one lane was reported as not matching.  */
	  unsigned i;
	  for (i = 0; i < group_size; ++i)
	    if (!matches[i])
	      break;
	  gcc_assert (i < group_size);
	}
      memcpy (res->failed, matches, sizeof (bool) * group_size);
    }
  else
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "SLP discovery for node %p succeeded\n",
			 (void *) res);
      gcc_assert (res_ == res);
      res->max_nunits = this_max_nunits;
      vect_update_max_nunits (max_nunits, this_max_nunits);
      /* Keep a reference for the bst_map use.  */
      SLP_TREE_REF_COUNT (res)++;
    }
  return res_;
}
1923 :
1924 : /* Helper for building an associated SLP node chain. */
1925 :
1926 : static void
1927 122 : vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype,
1928 : slp_tree op0, slp_tree op1,
1929 : stmt_vec_info oper1, stmt_vec_info oper2,
1930 : vec<std::pair<unsigned, unsigned> > lperm)
1931 : {
1932 122 : unsigned group_size = SLP_TREE_LANES (op1);
1933 :
1934 122 : slp_tree child1 = new _slp_tree;
1935 122 : SLP_TREE_DEF_TYPE (child1) = vect_internal_def;
1936 122 : SLP_TREE_VECTYPE (child1) = vectype;
1937 122 : SLP_TREE_LANES (child1) = group_size;
1938 122 : SLP_TREE_CHILDREN (child1).create (2);
1939 122 : SLP_TREE_CHILDREN (child1).quick_push (op0);
1940 122 : SLP_TREE_CHILDREN (child1).quick_push (op1);
1941 122 : SLP_TREE_REPRESENTATIVE (child1) = oper1;
1942 :
1943 122 : slp_tree child2 = new _slp_tree;
1944 122 : SLP_TREE_DEF_TYPE (child2) = vect_internal_def;
1945 122 : SLP_TREE_VECTYPE (child2) = vectype;
1946 122 : SLP_TREE_LANES (child2) = group_size;
1947 122 : SLP_TREE_CHILDREN (child2).create (2);
1948 122 : SLP_TREE_CHILDREN (child2).quick_push (op0);
1949 122 : SLP_TREE_REF_COUNT (op0)++;
1950 122 : SLP_TREE_CHILDREN (child2).quick_push (op1);
1951 122 : SLP_TREE_REF_COUNT (op1)++;
1952 122 : SLP_TREE_REPRESENTATIVE (child2) = oper2;
1953 :
1954 122 : SLP_TREE_DEF_TYPE (perm) = vect_internal_def;
1955 122 : SLP_TREE_CODE (perm) = VEC_PERM_EXPR;
1956 122 : SLP_TREE_VECTYPE (perm) = vectype;
1957 122 : SLP_TREE_LANES (perm) = group_size;
1958 : /* ??? We should set this NULL but that's not expected. */
1959 122 : SLP_TREE_REPRESENTATIVE (perm) = oper1;
1960 122 : SLP_TREE_LANE_PERMUTATION (perm) = lperm;
1961 122 : SLP_TREE_CHILDREN (perm).quick_push (child1);
1962 122 : SLP_TREE_CHILDREN (perm).quick_push (child2);
1963 122 : }
1964 :
1965 : /* Recursively build an SLP tree starting from NODE.
1966 : Fail (and return a value not equal to zero) if def-stmts are not
1967 : isomorphic, require data permutation or are of unsupported types of
1968 : operation. Otherwise, return 0.
1969 : The value returned is the depth in the SLP tree where a mismatch
1970 : was found. */
1971 :
1972 : static slp_tree
1973 5257591 : vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1974 : vec<stmt_vec_info> stmts, unsigned int group_size,
1975 : poly_uint64 *max_nunits,
1976 : bool *matches, unsigned *limit, unsigned *tree_size,
1977 : scalar_stmts_to_slp_tree_map_t *bst_map)
1978 : {
1979 5257591 : unsigned nops, i, this_tree_size = 0;
1980 5257591 : poly_uint64 this_max_nunits = *max_nunits;
1981 :
1982 5257591 : matches[0] = false;
1983 :
1984 5257591 : stmt_vec_info stmt_info = stmts[0];
1985 5257591 : if (!is_a<gcall *> (stmt_info->stmt)
1986 : && !is_a<gassign *> (stmt_info->stmt)
1987 : && !is_a<gphi *> (stmt_info->stmt))
1988 : return NULL;
1989 :
1990 5257520 : nops = gimple_num_args (stmt_info->stmt);
1991 5257520 : if (const int *map = vect_get_operand_map (stmt_info->stmt,
1992 5257520 : STMT_VINFO_GATHER_SCATTER_P
1993 : (stmt_info)))
1994 28783 : nops = map[0];
1995 :
1996 : /* If the SLP node is a PHI (induction or reduction), terminate
1997 : the recursion. */
1998 5257520 : bool *skip_args = XALLOCAVEC (bool, nops);
1999 5257520 : memset (skip_args, 0, sizeof (bool) * nops);
2000 5257520 : if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2001 2329600 : if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
2002 : {
2003 252596 : tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
2004 252596 : tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
2005 : group_size);
2006 252596 : if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
2007 : max_nunits))
2008 : return NULL;
2009 :
2010 248107 : vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
2011 248107 : if (def_type == vect_induction_def)
2012 : {
2013 : /* Induction PHIs are not cycles but walk the initial
2014 : value. Only for inner loops through, for outer loops
2015 : we need to pick up the value from the actual PHIs
2016 : to more easily support peeling and epilogue vectorization. */
2017 172772 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2018 172772 : if (!nested_in_vect_loop_p (loop, stmt_info))
2019 172029 : skip_args[loop_preheader_edge (loop)->dest_idx] = true;
2020 : else
2021 : loop = loop->inner;
2022 172772 : skip_args[loop_latch_edge (loop)->dest_idx] = true;
2023 : }
2024 75335 : else if (def_type == vect_reduction_def
2025 : || def_type == vect_double_reduction_def
2026 : || def_type == vect_nested_cycle
2027 75335 : || def_type == vect_first_order_recurrence)
2028 : {
2029 : /* Else def types have to match. */
2030 : stmt_vec_info other_info;
2031 : bool all_same = true;
2032 166728 : FOR_EACH_VEC_ELT (stmts, i, other_info)
2033 : {
2034 92527 : if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
2035 1709972 : return NULL;
2036 92525 : if (other_info != stmt_info)
2037 15673 : all_same = false;
2038 : }
2039 74201 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2040 : /* Reduction initial values are not explicitly represented. */
2041 74201 : if (def_type != vect_first_order_recurrence
2042 74201 : && gimple_bb (stmt_info->stmt) == loop->header)
2043 71331 : skip_args[loop_preheader_edge (loop)->dest_idx] = true;
2044 : /* Reduction chain backedge defs are filled manually.
2045 : ??? Need a better way to identify a SLP reduction chain PHI.
2046 : Or a better overall way to SLP match those. */
2047 74201 : if (stmts.length () > 1
2048 74201 : && all_same && def_type == vect_reduction_def)
2049 1414 : skip_args[loop_latch_edge (loop)->dest_idx] = true;
2050 : }
2051 1132 : else if (def_type != vect_internal_def)
2052 : return NULL;
2053 : }
2054 :
2055 :
2056 5253029 : bool two_operators = false;
2057 5253029 : unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
2058 5253029 : tree vectype = NULL_TREE;
2059 5253029 : if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
2060 : &this_max_nunits, matches, &two_operators,
2061 : &vectype))
2062 : return NULL;
2063 :
2064 : /* If the SLP node is a load, terminate the recursion unless masked. */
2065 3758071 : if (STMT_VINFO_DATA_REF (stmt_info)
2066 1854957 : && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2067 : {
2068 797224 : if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2069 : gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
2070 : else
2071 : {
2072 779127 : *max_nunits = this_max_nunits;
2073 779127 : (*tree_size)++;
2074 779127 : node = vect_create_new_slp_node (node, stmts, 0);
2075 779127 : SLP_TREE_VECTYPE (node) = vectype;
2076 : /* And compute the load permutation. Whether it is actually
2077 : a permutation depends on the unrolling factor which is
2078 : decided later. */
2079 779127 : vec<unsigned> load_permutation;
2080 779127 : int j;
2081 779127 : stmt_vec_info load_info;
2082 779127 : load_permutation.create (group_size);
2083 779127 : stmt_vec_info first_stmt_info
2084 779127 : = STMT_VINFO_GROUPED_ACCESS (stmt_info)
2085 779127 : ? DR_GROUP_FIRST_ELEMENT (stmt_info) : stmt_info;
2086 779127 : bool any_permute = false;
2087 1894265 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
2088 : {
2089 1115138 : int load_place;
2090 1115138 : if (! load_info)
2091 : {
2092 39421 : if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2093 : load_place = j;
2094 : else
2095 : load_place = 0;
2096 : }
2097 1075717 : else if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2098 660328 : load_place = vect_get_place_in_interleaving_chain
2099 660328 : (load_info, first_stmt_info);
2100 : else
2101 : /* Recognize the splat case as { 0, 0, ... } but make
2102 : sure to use the appropriate refs for collections
2103 : of invariant refs. */
2104 415389 : load_place = (load_info == stmt_info) ? 0 : j;
2105 699906 : gcc_assert (load_place != -1);
2106 1115138 : any_permute |= load_place != j;
2107 1115138 : load_permutation.quick_push (load_place);
2108 : }
2109 :
2110 779127 : if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
2111 : {
2112 2350 : gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD));
2113 2350 : bool has_gaps = false;
2114 2350 : if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2115 209 : for (stmt_vec_info si = DR_GROUP_NEXT_ELEMENT (first_stmt_info);
2116 1346 : si; si = DR_GROUP_NEXT_ELEMENT (si))
2117 1137 : if (DR_GROUP_GAP (si) != 1)
2118 160 : has_gaps = true;
2119 : /* We cannot handle permuted masked loads directly, see
2120 : PR114375. We cannot handle strided masked loads or masked
2121 : loads with gaps unless the mask is uniform. */
2122 2350 : if ((STMT_VINFO_GROUPED_ACCESS (stmt_info)
2123 209 : && (DR_GROUP_GAP (first_stmt_info) != 0
2124 149 : || (has_gaps
2125 55 : && STMT_VINFO_SLP_VECT_ONLY (first_stmt_info))))
2126 4605 : || STMT_VINFO_STRIDED_P (stmt_info))
2127 : {
2128 108 : load_permutation.release ();
2129 108 : matches[0] = false;
2130 776929 : return NULL;
2131 : }
2132 :
2133 : /* For permuted masked loads do an unpermuted masked load of
2134 : the whole group followed by a SLP permute node. */
2135 2242 : if (any_permute
2136 2242 : || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2137 84 : && DR_GROUP_SIZE (first_stmt_info) != group_size))
2138 : {
2139 : /* Discover the whole unpermuted load. */
2140 44 : vec<stmt_vec_info> stmts2;
2141 44 : unsigned dr_group_size = STMT_VINFO_GROUPED_ACCESS (stmt_info)
2142 78 : ? DR_GROUP_SIZE (first_stmt_info) : 1;
2143 44 : stmts2.create (dr_group_size);
2144 44 : stmts2.quick_grow_cleared (dr_group_size);
2145 44 : unsigned i = 0;
2146 44 : for (stmt_vec_info si = first_stmt_info;
2147 594 : si; si = DR_GROUP_NEXT_ELEMENT (si))
2148 : {
2149 550 : if (si != first_stmt_info)
2150 2106 : for (unsigned k = 1; k < DR_GROUP_GAP (si); ++k)
2151 1600 : stmts2[i++] = NULL;
2152 550 : stmts2[i++] = si;
2153 : }
2154 44 : bool *matches2 = XALLOCAVEC (bool, dr_group_size);
2155 44 : slp_tree unperm_load
2156 44 : = vect_build_slp_tree (vinfo, stmts2, dr_group_size,
2157 : &this_max_nunits, matches2, limit,
2158 44 : &this_tree_size, bst_map);
2159 : /* When we are able to do the full masked load emit that
2160 : followed by 'node' being the desired final permutation. */
2161 44 : if (unperm_load)
2162 : {
2163 16 : gcc_assert
2164 : (!SLP_TREE_LOAD_PERMUTATION (unperm_load).exists ());
2165 16 : lane_permutation_t lperm;
2166 16 : lperm.create (group_size);
2167 56 : for (unsigned j = 0; j < load_permutation.length (); ++j)
2168 40 : lperm.quick_push
2169 40 : (std::make_pair (0, load_permutation[j]));
2170 16 : SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2171 16 : SLP_TREE_CHILDREN (node).safe_push (unperm_load);
2172 16 : SLP_TREE_LANE_PERMUTATION (node) = lperm;
2173 16 : load_permutation.release ();
2174 16 : return node;
2175 : }
2176 28 : stmts2.release ();
2177 28 : load_permutation.release ();
2178 28 : matches[0] = false;
2179 28 : return NULL;
2180 : }
2181 2198 : load_permutation.release ();
2182 : }
2183 : else
2184 : {
2185 776777 : if (!any_permute
2186 676520 : && STMT_VINFO_GROUPED_ACCESS (stmt_info)
2187 1054505 : && group_size == DR_GROUP_SIZE (first_stmt_info))
2188 120450 : load_permutation.release ();
2189 776777 : SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
2190 776777 : return node;
2191 : }
2192 : }
2193 : }
2194 2960847 : else if (gimple_assign_single_p (stmt_info->stmt)
2195 2121890 : && !gimple_vuse (stmt_info->stmt)
2196 2968556 : && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
2197 : {
2198 : /* vect_build_slp_tree_2 determined all BIT_FIELD_REFs reference
2199 : the same SSA name vector of a compatible type to vectype. */
2200 2391 : vec<std::pair<unsigned, unsigned> > lperm = vNULL;
2201 2391 : tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
2202 2391 : stmt_vec_info estmt_info;
2203 7531 : FOR_EACH_VEC_ELT (stmts, i, estmt_info)
2204 : {
2205 5287 : gassign *estmt = as_a <gassign *> (estmt_info->stmt);
2206 5287 : tree bfref = gimple_assign_rhs1 (estmt);
2207 5287 : HOST_WIDE_INT lane;
2208 5287 : if (!known_eq (bit_field_size (bfref),
2209 : tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
2210 10427 : || !constant_multiple_p (bit_field_offset (bfref),
2211 5140 : bit_field_size (bfref), &lane))
2212 : {
2213 147 : lperm.release ();
2214 147 : matches[0] = false;
2215 147 : return NULL;
2216 : }
2217 5140 : lperm.safe_push (std::make_pair (0, (unsigned)lane));
2218 : }
2219 2244 : slp_tree vnode = vect_create_new_slp_node (vNULL);
2220 2244 : if (operand_equal_p (TYPE_SIZE (vectype), TYPE_SIZE (TREE_TYPE (vec))))
2221 : /* ??? We record vectype here but we hide eventually necessary
2222 : punning and instead rely on code generation to materialize
2223 : VIEW_CONVERT_EXPRs as necessary. We instead should make
2224 : this explicit somehow. */
2225 710 : SLP_TREE_VECTYPE (vnode) = vectype;
2226 : else
2227 : {
2228 : /* For different size but compatible elements we can still
2229 : use VEC_PERM_EXPR without punning. */
2230 1534 : gcc_assert (VECTOR_TYPE_P (TREE_TYPE (vec))
2231 : && types_compatible_p (TREE_TYPE (vectype),
2232 : TREE_TYPE (TREE_TYPE (vec))));
2233 1534 : SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
2234 : }
2235 2244 : auto nunits = TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (vnode));
2236 2244 : unsigned HOST_WIDE_INT const_nunits;
2237 2244 : if (nunits.is_constant (&const_nunits))
2238 2244 : SLP_TREE_LANES (vnode) = const_nunits;
2239 2244 : SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
2240 : /* We are always building a permutation node even if it is an identity
2241 : permute to shield the rest of the vectorizer from the odd node
2242 : representing an actual vector without any scalar ops.
2243 : ??? We could hide it completely with making the permute node
2244 : external? */
2245 2244 : node = vect_create_new_slp_node (node, stmts, 1);
2246 2244 : SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2247 2244 : SLP_TREE_LANE_PERMUTATION (node) = lperm;
2248 2244 : SLP_TREE_VECTYPE (node) = vectype;
2249 2244 : SLP_TREE_CHILDREN (node).quick_push (vnode);
2250 2244 : return node;
2251 : }
2252 : /* When discovery reaches an associatable operation see whether we can
2253 : improve that to match up lanes in a way superior to the operand
2254 : swapping code which at most looks at two defs.
2255 : ??? For BB vectorization we cannot do the brute-force search
2256 : for matching as we can succeed by means of builds from scalars
2257 : and have no good way to "cost" one build against another. */
2258 2958456 : else if (is_a <loop_vec_info> (vinfo)
2259 : /* Do not bother for single-lane SLP. */
2260 1627427 : && group_size > 1
2261 : /* ??? We don't handle !vect_internal_def defs below. */
2262 80180 : && STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2263 : /* ??? Do not associate a reduction, this will wreck REDUC_IDX
2264 : mapping as long as that exists on the stmt_info level. */
2265 63620 : && STMT_VINFO_REDUC_IDX (stmt_info) == -1
2266 58477 : && is_gimple_assign (stmt_info->stmt)
2267 58209 : && (associative_tree_code (gimple_assign_rhs_code (stmt_info->stmt))
2268 40641 : || gimple_assign_rhs_code (stmt_info->stmt) == MINUS_EXPR)
2269 2977613 : && ((FLOAT_TYPE_P (vectype) && flag_associative_math)
2270 11656 : || (INTEGRAL_TYPE_P (TREE_TYPE (vectype))
2271 9713 : && TYPE_OVERFLOW_WRAPS (TREE_TYPE (vectype)))))
2272 : {
2273 : /* See if we have a chain of (mixed) adds or subtracts or other
2274 : associatable ops. */
2275 13653 : enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
2276 13653 : if (code == MINUS_EXPR)
2277 686 : code = PLUS_EXPR;
2278 13653 : stmt_vec_info other_op_stmt_info = NULL;
2279 13653 : stmt_vec_info op_stmt_info = NULL;
2280 13653 : unsigned chain_len = 0;
2281 13653 : auto_vec<chain_op_t> chain;
2282 13653 : auto_vec<std::pair<tree_code, gimple *> > worklist;
2283 13653 : auto_vec<vec<chain_op_t> > chains (group_size);
2284 13653 : auto_vec<slp_tree, 4> children;
2285 13653 : bool hard_fail = true;
2286 14538 : for (unsigned lane = 0; lane < group_size; ++lane)
2287 : {
2288 14269 : if (!stmts[lane])
2289 : {
2290 : /* ??? Below we require lane zero is present. */
2291 0 : if (lane == 0)
2292 : {
2293 : hard_fail = false;
2294 13384 : break;
2295 : }
2296 0 : chains.quick_push (vNULL);
2297 0 : continue;
2298 : }
2299 : /* For each lane linearize the addition/subtraction (or other
2300 : uniform associatable operation) expression tree. */
2301 14269 : gimple *op_stmt = NULL, *other_op_stmt = NULL;
2302 14269 : vect_slp_linearize_chain (vinfo, worklist, chain, code,
2303 14269 : stmts[lane]->stmt, op_stmt, other_op_stmt,
2304 : NULL);
2305 14269 : if (!op_stmt_info && op_stmt)
2306 13123 : op_stmt_info = vinfo->lookup_stmt (op_stmt);
2307 14269 : if (!other_op_stmt_info && other_op_stmt)
2308 722 : other_op_stmt_info = vinfo->lookup_stmt (other_op_stmt);
2309 14269 : if (chain.length () == 2)
2310 : {
2311 : /* In a chain of just two elements resort to the regular
2312 : operand swapping scheme. Likewise if we run into a
2313 : length mismatch process regularly as well as we did not
2314 : process the other lanes we cannot report a good hint what
2315 : lanes to try swapping in the parent. */
2316 : hard_fail = false;
2317 : break;
2318 : }
2319 888 : else if (chain_len == 0)
2320 309 : chain_len = chain.length ();
2321 1158 : else if (chain.length () != chain_len)
2322 : {
2323 : /* ??? Here we could slip in magic to compensate with
2324 : neutral operands. */
2325 3 : matches[lane] = false;
2326 3 : if (lane != group_size - 1)
2327 3 : matches[0] = false;
2328 : break;
2329 : }
2330 885 : chains.quick_push (chain.copy ());
2331 885 : chain.truncate (0);
2332 : }
2333 27306 : if (chains.length () == group_size)
2334 : {
2335 : /* We cannot yet use SLP_TREE_CODE to communicate the operation. */
2336 269 : if (!op_stmt_info)
2337 : {
2338 2 : hard_fail = false;
2339 2 : goto out;
2340 : }
2341 : /* Now we have a set of chains with the same length. */
2342 : /* 1. pre-sort according to def_type and operation. */
2343 1042 : for (unsigned lane = 0; lane < group_size; ++lane)
2344 1550 : chains[lane].stablesort (dt_sort_cmp, vinfo);
2345 267 : if (dump_enabled_p ())
2346 : {
2347 145 : dump_printf_loc (MSG_NOTE, vect_location,
2348 : "pre-sorted chains of %s\n",
2349 : get_tree_code_name (code));
2350 649 : for (unsigned lane = 0; lane < group_size; ++lane)
2351 : {
2352 504 : if (!stmts[lane])
2353 0 : dump_printf (MSG_NOTE, "--");
2354 : else
2355 2326 : for (unsigned opnum = 0; opnum < chain_len; ++opnum)
2356 3644 : dump_printf (MSG_NOTE, "%s %T ",
2357 1822 : get_tree_code_name (chains[lane][opnum].code),
2358 1822 : chains[lane][opnum].op);
2359 504 : dump_printf (MSG_NOTE, "\n");
2360 : }
2361 : }
2362 : /* 2. try to build children nodes, associating as necessary. */
2363 : /* 2a. prepare and perform early checks to avoid eating into
2364 : discovery limit unnecessarily. */
2365 267 : vect_def_type *dts = XALLOCAVEC (vect_def_type, chain_len);
2366 1135 : for (unsigned n = 0; n < chain_len; ++n)
2367 : {
2368 868 : vect_def_type dt = chains[0][n].dt;
2369 868 : unsigned lane;
2370 3535 : for (lane = 0; lane < group_size; ++lane)
2371 5334 : if (stmts[lane] && chains[lane][n].dt != dt)
2372 : {
2373 0 : if (dt == vect_constant_def
2374 0 : && chains[lane][n].dt == vect_external_def)
2375 : dt = vect_external_def;
2376 0 : else if (dt == vect_external_def
2377 0 : && chains[lane][n].dt == vect_constant_def)
2378 : ;
2379 : else
2380 : break;
2381 : }
2382 868 : if (lane != group_size)
2383 : {
2384 0 : if (dump_enabled_p ())
2385 0 : dump_printf_loc (MSG_NOTE, vect_location,
2386 : "giving up on chain due to mismatched "
2387 : "def types\n");
2388 0 : matches[lane] = false;
2389 0 : if (lane != group_size - 1)
2390 0 : matches[0] = false;
2391 0 : goto out;
2392 : }
2393 868 : dts[n] = dt;
2394 868 : if (dt == vect_constant_def
2395 868 : || dt == vect_external_def)
2396 : {
2397 : /* Check whether we can build the invariant. If we can't
2398 : we never will be able to. */
2399 77 : tree type = TREE_TYPE (chains[0][n].op);
2400 868 : if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
2401 : && (TREE_CODE (type) == BOOLEAN_TYPE
2402 : || !can_duplicate_and_interleave_p (vinfo, group_size,
2403 : type)))
2404 : {
2405 : matches[0] = false;
2406 : goto out;
2407 : }
2408 : }
2409 791 : else if (dt != vect_internal_def)
2410 : {
2411 : /* Not sure, we might need sth special.
2412 : gcc.dg/vect/pr96854.c,
2413 : gfortran.dg/vect/fast-math-pr37021.f90
2414 : and gfortran.dg/vect/pr61171.f trigger. */
2415 : /* Soft-fail for now. */
2416 0 : hard_fail = false;
2417 0 : goto out;
2418 : }
2419 : }
2420 : /* 2b. do the actual build. */
2421 1081 : for (unsigned n = 0; n < chain_len; ++n)
2422 : {
2423 833 : vect_def_type dt = dts[n];
2424 833 : unsigned lane;
2425 833 : if (dt == vect_constant_def
2426 833 : || dt == vect_external_def)
2427 : {
2428 77 : vec<tree> ops;
2429 77 : ops.create (group_size);
2430 397 : for (lane = 0; lane < group_size; ++lane)
2431 243 : if (stmts[lane])
2432 243 : ops.quick_push (chains[lane][n].op);
2433 : else
2434 0 : ops.quick_push (NULL_TREE);
2435 77 : slp_tree child = vect_create_new_slp_node (ops);
2436 77 : SLP_TREE_DEF_TYPE (child) = dt;
2437 77 : children.safe_push (child);
2438 : }
2439 : else
2440 : {
2441 756 : vec<stmt_vec_info> op_stmts;
2442 756 : op_stmts.create (group_size);
2443 756 : slp_tree child = NULL;
2444 : /* Brute-force our way. We have to consider a lane
2445 : failing after fixing an earlier fail up in the
2446 : SLP discovery recursion. So track the current
2447 : permute per lane. */
2448 756 : unsigned *perms = XALLOCAVEC (unsigned, group_size);
2449 756 : memset (perms, 0, sizeof (unsigned) * group_size);
2450 835 : do
2451 : {
2452 835 : op_stmts.truncate (0);
2453 4248 : for (lane = 0; lane < group_size; ++lane)
2454 2578 : if (stmts[lane])
2455 2578 : op_stmts.quick_push
2456 2578 : (vinfo->lookup_def (chains[lane][n].op));
2457 : else
2458 0 : op_stmts.quick_push (NULL);
2459 835 : child = vect_build_slp_tree (vinfo, op_stmts,
2460 : group_size, &this_max_nunits,
2461 : matches, limit,
2462 : &this_tree_size, bst_map);
2463 : /* ??? We're likely getting too many fatal mismatches
2464 : here so maybe we want to ignore them (but then we
2465 : have no idea which lanes fatally mismatched). */
2466 835 : if (child || !matches[0])
2467 : break;
2468 : /* Swap another lane we have not yet matched up into
2469 : lanes that did not match. If we run out of
2470 : permute possibilities for a lane terminate the
2471 : search. */
2472 257 : bool term = false;
2473 257 : for (lane = 1; lane < group_size; ++lane)
2474 178 : if (!matches[lane])
2475 : {
2476 150 : if (n + perms[lane] + 1 == chain_len)
2477 : {
2478 : term = true;
2479 : break;
2480 : }
2481 131 : if (dump_enabled_p ())
2482 113 : dump_printf_loc (MSG_NOTE, vect_location,
2483 : "swapping operand %d and %d "
2484 : "of lane %d\n",
2485 : n, n + perms[lane] + 1, lane);
2486 262 : std::swap (chains[lane][n],
2487 131 : chains[lane][n + perms[lane] + 1]);
2488 131 : perms[lane]++;
2489 : }
2490 98 : if (term)
2491 : break;
2492 : }
2493 : while (1);
2494 756 : if (!child)
2495 : {
2496 19 : if (dump_enabled_p ())
2497 18 : dump_printf_loc (MSG_NOTE, vect_location,
2498 : "failed to match up op %d\n", n);
2499 19 : op_stmts.release ();
2500 19 : if (lane != group_size - 1)
2501 9 : matches[0] = false;
2502 : else
2503 10 : matches[lane] = false;
2504 19 : goto out;
2505 : }
2506 737 : if (dump_enabled_p ())
2507 : {
2508 397 : dump_printf_loc (MSG_NOTE, vect_location,
2509 : "matched up op %d to\n", n);
2510 397 : vect_print_slp_tree (MSG_NOTE, vect_location, child);
2511 : }
2512 737 : children.safe_push (child);
2513 : }
2514 : }
2515 : /* 3. build SLP nodes to combine the chain. */
2516 950 : for (unsigned lane = 0; lane < group_size; ++lane)
2517 1416 : if (stmts[lane] && chains[lane][0].code != code)
2518 : {
2519 : /* See if there's any alternate all-PLUS entry. */
2520 : unsigned n;
2521 6 : for (n = 1; n < chain_len; ++n)
2522 : {
2523 30 : for (lane = 0; lane < group_size; ++lane)
2524 48 : if (stmts[lane] && chains[lane][n].code != code)
2525 : break;
2526 6 : if (lane == group_size)
2527 : break;
2528 : }
2529 6 : if (n != chain_len)
2530 : {
2531 : /* Swap that in at first position. */
2532 6 : std::swap (children[0], children[n]);
2533 30 : for (lane = 0; lane < group_size; ++lane)
2534 24 : if (stmts[lane])
2535 24 : std::swap (chains[lane][0], chains[lane][n]);
2536 : }
2537 : else
2538 : {
2539 : /* ??? When this triggers and we end up with two
2540 : vect_constant/external_def up-front things break (ICE)
2541 : spectacularly finding an insertion place for the
2542 : all-constant op. We should have a fully
2543 : vect_internal_def operand though(?) so we can swap
2544 : that into first place and then prepend the all-zero
2545 : constant. */
2546 0 : if (dump_enabled_p ())
2547 0 : dump_printf_loc (MSG_NOTE, vect_location,
2548 : "inserting constant zero to compensate "
2549 : "for (partially) negated first "
2550 : "operand\n");
2551 0 : chain_len++;
2552 0 : for (lane = 0; lane < group_size; ++lane)
2553 0 : if (stmts[lane])
2554 0 : chains[lane].safe_insert
2555 0 : (0, chain_op_t (code, vect_constant_def, NULL_TREE));
2556 0 : vec<tree> zero_ops;
2557 0 : zero_ops.create (group_size);
2558 0 : zero_ops.quick_push (build_zero_cst (TREE_TYPE (vectype)));
2559 0 : for (lane = 1; lane < group_size; ++lane)
2560 0 : if (stmts[lane])
2561 0 : zero_ops.quick_push (zero_ops[0]);
2562 : else
2563 0 : zero_ops.quick_push (NULL_TREE);
2564 0 : slp_tree zero = vect_create_new_slp_node (zero_ops);
2565 0 : SLP_TREE_DEF_TYPE (zero) = vect_constant_def;
2566 0 : children.safe_insert (0, zero);
2567 : }
2568 : break;
2569 : }
2570 809 : for (unsigned i = 1; i < children.length (); ++i)
2571 : {
2572 561 : slp_tree op0 = children[i - 1];
2573 561 : slp_tree op1 = children[i];
2574 561 : bool this_two_op = false;
2575 2169 : for (unsigned lane = 0; lane < group_size; ++lane)
2576 3460 : if (stmts[lane] && chains[lane][i].code != chains[0][i].code)
2577 : {
2578 : this_two_op = true;
2579 : break;
2580 : }
2581 561 : slp_tree child;
2582 561 : if (i == children.length () - 1)
2583 248 : child = vect_create_new_slp_node (node, stmts, 2);
2584 : else
2585 313 : child = vect_create_new_slp_node (2, ERROR_MARK);
2586 561 : if (this_two_op)
2587 : {
2588 122 : vec<std::pair<unsigned, unsigned> > lperm;
2589 122 : lperm.create (group_size);
2590 462 : for (unsigned lane = 0; lane < group_size; ++lane)
2591 680 : lperm.quick_push (std::make_pair
2592 340 : (chains[lane][i].code != chains[0][i].code, lane));
2593 244 : vect_slp_build_two_operator_nodes (child, vectype, op0, op1,
2594 122 : (chains[0][i].code == code
2595 : ? op_stmt_info
2596 : : other_op_stmt_info),
2597 122 : (chains[0][i].code == code
2598 : ? other_op_stmt_info
2599 : : op_stmt_info),
2600 : lperm);
2601 : }
2602 : else
2603 : {
2604 439 : SLP_TREE_DEF_TYPE (child) = vect_internal_def;
2605 439 : SLP_TREE_VECTYPE (child) = vectype;
2606 439 : SLP_TREE_LANES (child) = group_size;
2607 439 : SLP_TREE_CHILDREN (child).quick_push (op0);
2608 439 : SLP_TREE_CHILDREN (child).quick_push (op1);
2609 439 : SLP_TREE_REPRESENTATIVE (child)
2610 878 : = (chains[0][i].code == code
2611 439 : ? op_stmt_info : other_op_stmt_info);
2612 : }
2613 561 : children[i] = child;
2614 : }
2615 248 : *tree_size += this_tree_size + 1;
2616 248 : *max_nunits = this_max_nunits;
2617 1244 : while (!chains.is_empty ())
2618 726 : chains.pop ().release ();
2619 : return node;
2620 : }
2621 13384 : out:
2622 13405 : if (dump_enabled_p ())
2623 2775 : dump_printf_loc (MSG_NOTE, vect_location,
2624 : "failed to line up SLP graph by re-associating "
2625 : "operations in lanes%s\n",
2626 : !hard_fail ? " trying regular discovery" : "");
2627 13410 : while (!children.is_empty ())
2628 5 : vect_free_slp_tree (children.pop ());
2629 13564 : while (!chains.is_empty ())
2630 159 : chains.pop ().release ();
2631 : /* Hard-fail, otherwise we might run into quadratic processing of the
2632 : chains starting one stmt into the chain again. */
2633 13405 : if (hard_fail)
2634 : return NULL;
2635 : /* Fall thru to normal processing. */
2636 13653 : }
2637 :
2638 : /* Get at the operands, verifying they are compatible. */
2639 2978481 : vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
2640 2978481 : slp_oprnd_info oprnd_info;
2641 15193577 : FOR_EACH_VEC_ELT (stmts, i, stmt_info)
2642 : {
2643 24432410 : int res = vect_get_and_check_slp_defs (vinfo, vectype,
2644 12216205 : swap[i], skip_args,
2645 : stmts, i, &oprnds_info);
2646 12216205 : if (res != 0)
2647 529897 : matches[(res == -1) ? 0 : i] = false;
2648 12216205 : if (!matches[0])
2649 : break;
2650 : }
2651 14893616 : for (i = 0; i < group_size; ++i)
2652 12125587 : if (!matches[i])
2653 : {
2654 210452 : vect_free_oprnd_info (oprnds_info);
2655 210452 : return NULL;
2656 : }
2657 8304087 : swap = NULL;
2658 :
2659 8304087 : bool has_two_operators_perm = false;
2660 16608174 : auto_vec<unsigned> two_op_perm_indices[2];
2661 2768029 : vec<stmt_vec_info> two_op_scalar_stmts[2] = {vNULL, vNULL};
2662 :
2663 2780193 : if (two_operators && oprnds_info.length () == 2 && group_size > 2)
2664 : {
2665 2723 : unsigned idx = 0;
2666 2723 : hash_map<gimple *, unsigned> seen;
2667 2723 : vec<slp_oprnd_info> new_oprnds_info
2668 2723 : = vect_create_oprnd_info (1, group_size);
2669 2723 : bool success = true;
2670 :
2671 2723 : enum tree_code code = ERROR_MARK;
2672 2723 : if (oprnds_info[0]->def_stmts[0]
2673 2723 : && is_a<gassign *> (oprnds_info[0]->def_stmts[0]->stmt))
2674 2665 : code = gimple_assign_rhs_code (oprnds_info[0]->def_stmts[0]->stmt);
2675 2723 : basic_block bb = nullptr;
2676 :
2677 5992 : for (unsigned j = 0; j < group_size; ++j)
2678 : {
2679 14323 : FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2680 : {
2681 11054 : stmt_vec_info stmt_info = oprnd_info->def_stmts[j];
2682 11054 : if (!stmt_info
2683 10843 : || !is_a<gassign *> (stmt_info->stmt)
2684 10840 : || gimple_assign_rhs_code (stmt_info->stmt) != code
2685 19783 : || skip_args[i])
2686 : {
2687 : success = false;
2688 2329 : break;
2689 : }
2690 : /* Avoid mixing lanes with defs in different basic-blocks. */
2691 8729 : if (!bb)
2692 2821 : bb = gimple_bb (vect_orig_stmt (stmt_info)->stmt);
2693 7428 : else if (gimple_bb (vect_orig_stmt (stmt_info)->stmt) != bb)
2694 : {
2695 : success = false;
2696 : break;
2697 : }
2698 :
2699 8725 : bool exists;
2700 8725 : unsigned &stmt_idx
2701 8725 : = seen.get_or_insert (stmt_info->stmt, &exists);
2702 :
2703 8725 : if (!exists)
2704 : {
2705 7676 : new_oprnds_info[0]->def_stmts.safe_push (stmt_info);
2706 7676 : new_oprnds_info[0]->ops.safe_push (oprnd_info->ops[j]);
2707 7676 : stmt_idx = idx;
2708 7676 : idx++;
2709 : }
2710 :
2711 8725 : two_op_perm_indices[i].safe_push (stmt_idx);
2712 : }
2713 :
2714 5598 : if (!success)
2715 : break;
2716 : }
2717 :
2718 2723 : if (success && idx == group_size)
2719 : {
2720 56 : if (dump_enabled_p ())
2721 : {
2722 0 : dump_printf_loc (MSG_NOTE, vect_location,
2723 : "Replace two_operators operands:\n");
2724 :
2725 0 : FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2726 : {
2727 0 : dump_printf_loc (MSG_NOTE, vect_location,
2728 : "Operand %u:\n", i);
2729 0 : for (unsigned j = 0; j < group_size; j++)
2730 0 : dump_printf_loc (MSG_NOTE, vect_location, "\tstmt %u %G",
2731 0 : j, oprnd_info->def_stmts[j]->stmt);
2732 : }
2733 :
2734 0 : dump_printf_loc (MSG_NOTE, vect_location,
2735 : "With a single operand:\n");
2736 0 : for (unsigned j = 0; j < group_size; j++)
2737 0 : dump_printf_loc (MSG_NOTE, vect_location, "\tstmt %u %G",
2738 0 : j, new_oprnds_info[0]->def_stmts[j]->stmt);
2739 : }
2740 :
2741 56 : two_op_scalar_stmts[0].safe_splice (oprnds_info[0]->def_stmts);
2742 56 : two_op_scalar_stmts[1].safe_splice (oprnds_info[1]->def_stmts);
2743 :
2744 56 : new_oprnds_info[0]->first_op_type = oprnds_info[0]->first_op_type;
2745 56 : new_oprnds_info[0]->first_dt = oprnds_info[0]->first_dt;
2746 56 : new_oprnds_info[0]->any_pattern = oprnds_info[0]->any_pattern;
2747 56 : new_oprnds_info[0]->first_gs_p = oprnds_info[0]->first_gs_p;
2748 56 : new_oprnds_info[0]->first_gs_info = oprnds_info[0]->first_gs_info;
2749 :
2750 56 : vect_free_oprnd_info (oprnds_info);
2751 56 : oprnds_info = new_oprnds_info;
2752 56 : nops = 1;
2753 56 : has_two_operators_perm = true;
2754 : }
2755 : else
2756 2667 : vect_free_oprnd_info (new_oprnds_info);
2757 2723 : }
2758 :
2759 5536058 : auto_vec<slp_tree, 4> children;
2760 :
2761 2768029 : stmt_info = stmts[0];
2762 :
2763 2768029 : int reduc_idx = -1;
2764 2768029 : int gs_scale = 0;
2765 2768029 : tree gs_base = NULL_TREE;
2766 :
2767 : /* Create SLP_TREE nodes for the definition node/s. */
2768 7074900 : FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2769 : {
2770 4394467 : slp_tree child = nullptr;
2771 4394467 : unsigned int j;
2772 :
2773 : /* We're skipping certain operands from processing, for example
2774 : outer loop reduction initial defs. */
2775 4394467 : if (skip_args[i])
2776 : {
2777 417546 : children.safe_push (NULL);
2778 4724417 : continue;
2779 : }
2780 :
2781 3976921 : if (oprnd_info->first_dt == vect_uninitialized_def)
2782 : {
2783 : /* COND_EXPR have one too many eventually if the condition
2784 : is a SSA name. */
2785 0 : gcc_assert (i == 3 && nops == 4);
2786 0 : continue;
2787 : }
2788 :
2789 3976921 : if (oprnd_info->first_gs_p)
2790 : {
2791 21765 : gs_scale = oprnd_info->first_gs_info.scale;
2792 21765 : gs_base = oprnd_info->first_gs_info.base;
2793 : }
2794 :
2795 3976921 : if (is_a <bb_vec_info> (vinfo)
2796 1564464 : && oprnd_info->first_dt == vect_internal_def
2797 4788378 : && !oprnd_info->any_pattern)
2798 : {
2799 : /* For BB vectorization, if all defs are the same do not
2800 : bother to continue the build along the single-lane
2801 : graph but use a splat of the scalar value. */
2802 768525 : stmt_vec_info first_def = oprnd_info->def_stmts[0];
2803 824490 : for (j = 1; j < group_size; ++j)
2804 784394 : if (oprnd_info->def_stmts[j] != first_def)
2805 : break;
2806 768525 : if (j == group_size
2807 : /* But avoid doing this for loads where we may be
2808 : able to CSE things, unless the stmt is not
2809 : vectorizable. */
2810 768525 : && (!STMT_VINFO_VECTORIZABLE (first_def)
2811 49356 : || !gimple_vuse (first_def->stmt)))
2812 : {
2813 30833 : if (dump_enabled_p ())
2814 93 : dump_printf_loc (MSG_NOTE, vect_location,
2815 : "Using a splat of the uniform operand %G",
2816 : first_def->stmt);
2817 30833 : oprnd_info->first_dt = vect_external_def;
2818 : }
2819 : }
2820 :
2821 3976921 : if (oprnd_info->first_dt == vect_external_def
2822 3976921 : || oprnd_info->first_dt == vect_constant_def)
2823 : {
2824 1388551 : if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ())
2825 : {
2826 : tree op0;
2827 : tree uniform_val = op0 = oprnd_info->ops[0];
2828 : for (j = 1; j < oprnd_info->ops.length (); ++j)
2829 : if (oprnd_info->ops[j]
2830 : && !operand_equal_p (uniform_val, oprnd_info->ops[j]))
2831 : {
2832 : uniform_val = NULL_TREE;
2833 : break;
2834 : }
2835 : if (!uniform_val
2836 : && !can_duplicate_and_interleave_p (vinfo,
2837 : oprnd_info->ops.length (),
2838 : TREE_TYPE (op0)))
2839 : {
2840 : matches[j] = false;
2841 : if (dump_enabled_p ())
2842 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2843 : "Build SLP failed: invalid type of def "
2844 : "for variable-length SLP %T\n", op0);
2845 : goto fail;
2846 : }
2847 : }
2848 1388551 : slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
2849 1388551 : SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
2850 1388551 : oprnd_info->ops = vNULL;
2851 1388551 : children.safe_push (invnode);
2852 1388551 : continue;
2853 1388551 : }
2854 :
2855 : /* See which SLP operand a reduction chain continues on. We want
2856 : to chain even PHIs but not backedges. */
2857 2588370 : if (STMT_VINFO_REDUC_DEF (oprnd_info->def_stmts[0])
2858 2588370 : || STMT_VINFO_REDUC_IDX (oprnd_info->def_stmts[0]) != -1)
2859 : {
2860 160550 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
2861 : {
2862 638 : if (oprnd_info->first_dt == vect_double_reduction_def)
2863 319 : reduc_idx = i;
2864 : }
2865 159912 : else if (is_a <gphi *> (stmt_info->stmt)
2866 159912 : && gimple_phi_num_args
2867 70241 : (as_a <gphi *> (stmt_info->stmt)) != 1)
2868 : ;
2869 89995 : else if (STMT_VINFO_REDUC_IDX (stmt_info) == -1
2870 324 : && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2871 : ;
2872 89995 : else if (reduc_idx == -1)
2873 85679 : reduc_idx = i;
2874 : else
2875 : /* For .COND_* reduction operations the else value can be the
2876 : same as one of the operation operands. The other def
2877 : stmts have been moved, so we can't check easily. Check
2878 : it's a call at least. */
2879 4316 : gcc_assert (is_a <gcall *> (stmt_info->stmt));
2880 : }
2881 :
2882 : /* When we have a masked load with uniform mask discover this
2883 : as a single-lane mask with a splat permute. This way we can
2884 : recognize this as a masked load-lane by stripping the splat. */
2885 2588370 : if (is_a <gcall *> (STMT_VINFO_STMT (stmt_info))
2886 34757 : && gimple_call_internal_p (STMT_VINFO_STMT (stmt_info),
2887 : IFN_MASK_LOAD)
2888 4737 : && STMT_VINFO_GROUPED_ACCESS (stmt_info)
2889 2588447 : && ! STMT_VINFO_SLP_VECT_ONLY (DR_GROUP_FIRST_ELEMENT (stmt_info)))
2890 : {
2891 35 : vec<stmt_vec_info> def_stmts2;
2892 35 : def_stmts2.create (1);
2893 35 : def_stmts2.quick_push (oprnd_info->def_stmts[0]);
2894 35 : child = vect_build_slp_tree (vinfo, def_stmts2, 1,
2895 : &this_max_nunits,
2896 : matches, limit,
2897 : &this_tree_size, bst_map);
2898 35 : if (child)
2899 : {
2900 35 : slp_tree pnode = vect_create_new_slp_node (1, VEC_PERM_EXPR);
2901 35 : SLP_TREE_VECTYPE (pnode) = SLP_TREE_VECTYPE (child);
2902 35 : SLP_TREE_LANES (pnode) = group_size;
2903 35 : SLP_TREE_SCALAR_STMTS (pnode).create (group_size);
2904 35 : SLP_TREE_LANE_PERMUTATION (pnode).create (group_size);
2905 210 : for (unsigned k = 0; k < group_size; ++k)
2906 : {
2907 175 : SLP_TREE_SCALAR_STMTS (pnode)
2908 175 : .quick_push (oprnd_info->def_stmts[0]);
2909 175 : SLP_TREE_LANE_PERMUTATION (pnode)
2910 175 : .quick_push (std::make_pair (0u, 0u));
2911 : }
2912 35 : SLP_TREE_CHILDREN (pnode).quick_push (child);
2913 35 : pnode->max_nunits = child->max_nunits;
2914 35 : children.safe_push (pnode);
2915 35 : oprnd_info->def_stmts = vNULL;
2916 35 : continue;
2917 35 : }
2918 : else
2919 0 : def_stmts2.release ();
2920 : }
2921 :
2922 2588335 : if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2923 : group_size, &this_max_nunits,
2924 : matches, limit,
2925 : &this_tree_size, bst_map)) != NULL)
2926 : {
2927 2135375 : oprnd_info->def_stmts = vNULL;
2928 2135375 : children.safe_push (child);
2929 2135375 : continue;
2930 : }
2931 :
2932 : /* If the SLP build for operand zero failed and operand zero
2933 : and one can be commutated try that for the scalar stmts
2934 : that failed the match. */
2935 452960 : if (i == 0
2936 : /* A first scalar stmt mismatch signals a fatal mismatch. */
2937 356413 : && matches[0]
2938 : /* ??? For COND_EXPRs we can swap the comparison operands
2939 : as well as the arms under some constraints. */
2940 168514 : && (nops == 2 || nops == 3)
2941 101227 : && oprnds_info[1]->first_dt == vect_internal_def
2942 55262 : && (is_gimple_assign (stmt_info->stmt)
2943 11437 : || is_gimple_call (stmt_info->stmt))
2944 : /* Swapping operands for reductions breaks assumptions later on. */
2945 496798 : && STMT_VINFO_REDUC_IDX (stmt_info) == -1)
2946 : {
2947 : /* See whether we can swap the matching or the non-matching
2948 : stmt operands. */
2949 : bool swap_not_matching = true;
2950 49259 : do
2951 : {
2952 7033900 : for (j = 0; j < group_size; ++j)
2953 : {
2954 6998334 : if (matches[j] != !swap_not_matching)
2955 64107 : continue;
2956 6934227 : stmt_vec_info stmt_info = stmts[j];
2957 : /* Verify if we can swap operands of this stmt. */
2958 6934227 : if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
2959 : {
2960 6934201 : tree_code code = gimple_assign_rhs_code (stmt);
2961 6934201 : if (! commutative_tree_code (code)
2962 6934201 : && ! commutative_ternary_tree_code (code))
2963 : {
2964 13669 : if (!swap_not_matching)
2965 6279 : goto fail;
2966 : swap_not_matching = false;
2967 : break;
2968 : }
2969 : }
2970 6984667 : else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
2971 : {
2972 26 : internal_fn fn = (gimple_call_internal_p (call)
2973 26 : ? gimple_call_internal_fn (call)
2974 : : IFN_LAST);
2975 26 : if ((! commutative_binary_fn_p (fn)
2976 26 : && ! commutative_ternary_fn_p (fn))
2977 28 : || first_commutative_argument (fn) != 0)
2978 : {
2979 24 : if (!swap_not_matching)
2980 12 : goto fail;
2981 : swap_not_matching = false;
2982 : break;
2983 : }
2984 : }
2985 : }
2986 : }
2987 42968 : while (j != group_size);
2988 :
2989 : /* Swap mismatched definition stmts. */
2990 35566 : if (dump_enabled_p ())
2991 345 : dump_printf_loc (MSG_NOTE, vect_location,
2992 : "Re-trying with swapped operands of stmts ");
2993 7012000 : for (j = 0; j < group_size; ++j)
2994 6976434 : if (matches[j] == !swap_not_matching)
2995 : {
2996 13840756 : std::swap (oprnds_info[0]->def_stmts[j],
2997 6920378 : oprnds_info[1]->def_stmts[j]);
2998 13840756 : std::swap (oprnds_info[0]->ops[j],
2999 6920378 : oprnds_info[1]->ops[j]);
3000 6920378 : if (dump_enabled_p ())
3001 938 : dump_printf (MSG_NOTE, "%d ", j);
3002 : }
3003 35566 : if (dump_enabled_p ())
3004 345 : dump_printf (MSG_NOTE, "\n");
3005 : /* After swapping some operands we lost track whether an
3006 : operand has any pattern defs so be conservative here. */
3007 67903 : if (oprnds_info[0]->any_pattern || oprnds_info[1]->any_pattern)
3008 3273 : oprnds_info[0]->any_pattern = oprnds_info[1]->any_pattern = true;
3009 : /* And try again with scratch 'matches' ... */
3010 35566 : bool *tem = XALLOCAVEC (bool, group_size);
3011 35566 : if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
3012 : group_size, &this_max_nunits,
3013 : tem, limit,
3014 : &this_tree_size, bst_map)) != NULL)
3015 : {
3016 5592 : oprnd_info->def_stmts = vNULL;
3017 5592 : children.safe_push (child);
3018 5592 : continue;
3019 : }
3020 : }
3021 447368 : fail:
3022 :
3023 : /* If the SLP build failed and we analyze a basic-block
3024 : simply treat nodes we fail to build as externally defined
3025 : (and thus build vectors from the scalar defs).
3026 : The cost model will reject outright expensive cases.
3027 : ??? This doesn't treat cases where permutation ultimatively
3028 : fails (or we don't try permutation below). Ideally we'd
3029 : even compute a permutation that will end up with the maximum
3030 : SLP tree size... */
3031 447368 : if (is_a <bb_vec_info> (vinfo)
3032 : /* ??? Rejecting patterns this way doesn't work. We'd have to
3033 : do extra work to cancel the pattern so the uses see the
3034 : scalar version. */
3035 394201 : && !is_pattern_stmt_p (stmt_info)
3036 817622 : && !oprnd_info->any_pattern)
3037 : {
3038 : /* But if there's a leading vector sized set of matching stmts
3039 : fail here so we can split the group. This matches the condition
3040 : vect_analyze_slp_instance uses. */
3041 : /* ??? We might want to split here and combine the results to support
3042 : multiple vector sizes better. */
3043 580447 : for (j = 0; j < group_size; ++j)
3044 580447 : if (!matches[j])
3045 : break;
3046 369993 : if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype))
3047 369964 : && vect_slp_can_convert_to_external (oprnd_info->def_stmts))
3048 : {
3049 359772 : if (dump_enabled_p ())
3050 501 : dump_printf_loc (MSG_NOTE, vect_location,
3051 : "Building vector operands from scalars\n");
3052 359772 : this_tree_size++;
3053 359772 : child = vect_create_new_slp_node (oprnd_info->ops);
3054 359772 : children.safe_push (child);
3055 359772 : oprnd_info->ops = vNULL;
3056 359772 : continue;
3057 : }
3058 : }
3059 :
3060 87596 : gcc_assert (child == NULL);
3061 98481 : FOR_EACH_VEC_ELT (children, j, child)
3062 10885 : if (child)
3063 10885 : vect_free_slp_tree (child);
3064 87596 : vect_free_oprnd_info (oprnds_info);
3065 87596 : return NULL;
3066 : }
3067 :
3068 2680433 : vect_free_oprnd_info (oprnds_info);
3069 :
3070 : /* If we have all children of a child built up from uniform scalars
3071 : or does more than one possibly expensive vector construction then
3072 : just throw that away, causing it built up from scalars.
3073 : The exception is the SLP node for the vector store. */
3074 2680433 : if (is_a <bb_vec_info> (vinfo)
3075 1090288 : && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
3076 : /* ??? Rejecting patterns this way doesn't work. We'd have to
3077 : do extra work to cancel the pattern so the uses see the
3078 : scalar version. */
3079 3113888 : && !is_pattern_stmt_p (stmt_info))
3080 : {
3081 : slp_tree child;
3082 : unsigned j;
3083 : bool all_uniform_p = true;
3084 : unsigned n_vector_builds = 0;
3085 1231550 : FOR_EACH_VEC_ELT (children, j, child)
3086 : {
3087 823332 : if (!child)
3088 : ;
3089 823332 : else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3090 : all_uniform_p = false;
3091 587755 : else if (!vect_slp_tree_uniform_p (child))
3092 : {
3093 447893 : all_uniform_p = false;
3094 447893 : if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
3095 413831 : n_vector_builds++;
3096 : }
3097 : }
3098 408218 : if (all_uniform_p
3099 408218 : || n_vector_builds > 1
3100 692787 : || (n_vector_builds == children.length ()
3101 30290 : && is_a <gphi *> (stmt_info->stmt)))
3102 : {
3103 : /* Roll back. */
3104 128445 : matches[0] = false;
3105 408342 : FOR_EACH_VEC_ELT (children, j, child)
3106 279897 : if (child)
3107 279897 : vect_free_slp_tree (child);
3108 :
3109 128445 : if (dump_enabled_p ())
3110 129 : dump_printf_loc (MSG_NOTE, vect_location,
3111 : "Building parent vector operands from "
3112 : "scalars instead\n");
3113 128445 : return NULL;
3114 : }
3115 : }
3116 :
3117 2551988 : *tree_size += this_tree_size + 1;
3118 2551988 : *max_nunits = this_max_nunits;
3119 :
3120 2551988 : if (two_operators)
3121 : {
3122 : /* ??? We'd likely want to either cache in bst_map sth like
3123 : { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
3124 : the true { a+b, a+b, a+b, a+b } ... but there we don't have
3125 : explicit stmts to put in so the keying on 'stmts' doesn't
3126 : work (but we have the same issue with nodes that use 'ops'). */
3127 :
3128 5908 : if (has_two_operators_perm)
3129 : {
3130 22 : slp_tree child = children[0];
3131 22 : children.truncate (0);
3132 66 : for (i = 0; i < 2; i++)
3133 : {
3134 44 : slp_tree pnode
3135 44 : = vect_create_new_slp_node (two_op_scalar_stmts[i], 2);
3136 44 : SLP_TREE_CODE (pnode) = VEC_PERM_EXPR;
3137 44 : SLP_TREE_VECTYPE (pnode) = vectype;
3138 44 : SLP_TREE_CHILDREN (pnode).quick_push (child);
3139 44 : SLP_TREE_CHILDREN (pnode).quick_push (child);
3140 44 : lane_permutation_t& perm = SLP_TREE_LANE_PERMUTATION (pnode);
3141 44 : children.safe_push (pnode);
3142 :
3143 476 : for (unsigned j = 0; j < stmts.length (); j++)
3144 432 : perm.safe_push (std::make_pair (0, two_op_perm_indices[i][j]));
3145 : }
3146 :
3147 22 : SLP_TREE_REF_COUNT (child) += 4;
3148 : }
3149 :
3150 5908 : slp_tree one = new _slp_tree;
3151 5908 : slp_tree two = new _slp_tree;
3152 5908 : SLP_TREE_DEF_TYPE (one) = vect_internal_def;
3153 5908 : SLP_TREE_DEF_TYPE (two) = vect_internal_def;
3154 5908 : SLP_TREE_VECTYPE (one) = vectype;
3155 5908 : SLP_TREE_VECTYPE (two) = vectype;
3156 5908 : SLP_TREE_CHILDREN (one).safe_splice (children);
3157 5908 : SLP_TREE_CHILDREN (two).safe_splice (children);
3158 5908 : slp_tree child;
3159 23634 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
3160 11818 : SLP_TREE_REF_COUNT (child)++;
3161 :
3162 : /* Here we record the original defs since this
3163 : node represents the final lane configuration. */
3164 5908 : node = vect_create_new_slp_node (node, stmts, 2);
3165 5908 : SLP_TREE_VECTYPE (node) = vectype;
3166 5908 : SLP_TREE_CODE (node) = VEC_PERM_EXPR;
3167 5908 : SLP_TREE_CHILDREN (node).quick_push (one);
3168 5908 : SLP_TREE_CHILDREN (node).quick_push (two);
3169 5908 : enum tree_code code0 = ERROR_MARK;
3170 5908 : enum tree_code ocode = ERROR_MARK;
3171 5908 : if (gassign *stmt = dyn_cast <gassign *> (stmts[0]->stmt))
3172 5906 : code0 = gimple_assign_rhs_code (stmt);
3173 5908 : stmt_vec_info ostmt_info;
3174 5908 : unsigned j = 0;
3175 22009 : FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
3176 : {
3177 16101 : int op = 0;
3178 16101 : if (gassign *ostmt = dyn_cast <gassign *> (ostmt_info->stmt))
3179 : {
3180 16097 : if (gimple_assign_rhs_code (ostmt) != code0)
3181 : {
3182 8083 : ocode = gimple_assign_rhs_code (ostmt);
3183 : op = 1;
3184 : j = i;
3185 : }
3186 : }
3187 : else
3188 : {
3189 8 : if (gimple_call_combined_fn (stmts[0]->stmt)
3190 4 : != gimple_call_combined_fn (ostmt_info->stmt))
3191 : {
3192 2 : op = 1;
3193 2 : j = i;
3194 : }
3195 : }
3196 16101 : SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (op, i));
3197 : }
3198 5908 : SLP_TREE_CODE (one) = code0;
3199 5908 : SLP_TREE_CODE (two) = ocode;
3200 5908 : SLP_TREE_LANES (one) = stmts.length ();
3201 5908 : SLP_TREE_LANES (two) = stmts.length ();
3202 5908 : SLP_TREE_REPRESENTATIVE (one) = stmts[0];
3203 5908 : SLP_TREE_REPRESENTATIVE (two) = stmts[j];
3204 :
3205 5908 : return node;
3206 : }
3207 :
3208 2546080 : node = vect_create_new_slp_node (node, stmts, nops);
3209 2546080 : SLP_TREE_VECTYPE (node) = vectype;
3210 2546080 : SLP_TREE_CHILDREN (node).splice (children);
3211 2546080 : SLP_TREE_GS_SCALE (node) = gs_scale;
3212 2546080 : SLP_TREE_GS_BASE (node) = gs_base;
3213 2546080 : if (reduc_idx != -1)
3214 : {
3215 80848 : gcc_assert (STMT_VINFO_REDUC_IDX (stmt_info) != -1
3216 : || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
3217 : || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def);
3218 80848 : SLP_TREE_REDUC_IDX (node) = reduc_idx;
3219 80848 : node->cycle_info.id = SLP_TREE_CHILDREN (node)[reduc_idx]->cycle_info.id;
3220 : }
3221 : /* When reaching the reduction PHI, create a vect_reduc_info. */
3222 2465232 : else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
3223 2465232 : || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
3224 2465232 : && is_a <gphi *> (STMT_VINFO_STMT (stmt_info)))
3225 : {
3226 71331 : loop_vec_info loop_vinfo = as_a <loop_vec_info> (vinfo);
3227 71331 : gcc_assert (STMT_VINFO_REDUC_IDX (stmt_info) == -1);
3228 71331 : node->cycle_info.id = loop_vinfo->reduc_infos.length ();
3229 71331 : vect_reduc_info reduc_info = new vect_reduc_info_s ();
3230 71331 : loop_vinfo->reduc_infos.safe_push (reduc_info);
3231 71331 : stmt_vec_info reduc_phi = stmt_info;
3232 : /* ??? For double reductions vect_is_simple_reduction stores the
3233 : reduction type and code on the inner loop header PHI. */
3234 71331 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
3235 : {
3236 319 : use_operand_p use_p;
3237 319 : gimple *use_stmt;
3238 319 : bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
3239 : &use_p, &use_stmt);
3240 319 : gcc_assert (res);
3241 319 : reduc_phi = loop_vinfo->lookup_stmt (use_stmt);
3242 : }
3243 71331 : VECT_REDUC_INFO_DEF_TYPE (reduc_info) = STMT_VINFO_DEF_TYPE (stmt_info);
3244 71331 : VECT_REDUC_INFO_TYPE (reduc_info) = STMT_VINFO_REDUC_TYPE (reduc_phi);
3245 71331 : VECT_REDUC_INFO_CODE (reduc_info) = STMT_VINFO_REDUC_CODE (reduc_phi);
3246 71331 : VECT_REDUC_INFO_FN (reduc_info) = IFN_LAST;
3247 : }
3248 : return node;
3249 8304087 : }
3250 :
3251 : /* Dump a single SLP tree NODE. */
3252 :
3253 : static void
3254 438634 : vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
3255 : slp_tree node)
3256 : {
3257 438634 : unsigned i, j;
3258 438634 : slp_tree child;
3259 438634 : stmt_vec_info stmt_info;
3260 438634 : tree op;
3261 :
 : /* Header line: definition kind tag, node address, estimated
 : max_nunits and reference count, followed by the optional
 : vectype and per-node flags. */
3262 438634 : dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
3263 438634 : dump_user_location_t user_loc = loc.get_user_location ();
3264 438634 : dump_printf_loc (metadata, user_loc,
3265 : "node%s %p (max_nunits=" HOST_WIDE_INT_PRINT_UNSIGNED
3266 : ", refcnt=%u)",
3267 438634 : SLP_TREE_DEF_TYPE (node) == vect_external_def
3268 : ? " (external)"
3269 : : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
3270 423395 : ? " (constant)"
3271 : : ""), (void *) node,
3272 438634 : estimated_poly_value (node->max_nunits),
3273 : SLP_TREE_REF_COUNT (node));
3274 438634 : if (SLP_TREE_VECTYPE (node))
3275 372258 : dump_printf (metadata, " %T", SLP_TREE_VECTYPE (node));
3276 438634 : dump_printf (metadata, "%s",
3277 438634 : node->avoid_stlf_fail ? " (avoid-stlf-fail)" : "");
 : /* Reduction-cycle membership, if the node is part of one. */
3278 438634 : if (node->cycle_info.id != -1 || node->cycle_info.reduc_idx != -1)
3279 23089 : dump_printf (metadata, " cycle %d, link %d", node->cycle_info.id,
3280 : node->cycle_info.reduc_idx);
3281 438634 : dump_printf (metadata, "\n");
 : /* For internal nodes show the operation: either a VEC_PERM_EXPR
 : marker or the representative stmt acting as op template. */
3282 438634 : if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
3283 : {
3284 356896 : if (SLP_TREE_PERMUTE_P (node))
3285 13548 : dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
3286 : else
3287 343348 : dump_printf_loc (metadata, user_loc, "op template: %G",
3288 343348 : SLP_TREE_REPRESENTATIVE (node)->stmt);
3289 : }
 : /* Lanes: scalar stmts when present ("---" for gap lanes, "[l] "
 : prefix for live stmts), otherwise the scalar operand list. */
3290 438634 : if (SLP_TREE_SCALAR_STMTS (node).exists ())
3291 854804 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3292 505949 : if (stmt_info)
3293 500668 : dump_printf_loc (metadata, user_loc, "\t%sstmt %u %G",
3294 500668 : STMT_VINFO_LIVE_P (stmt_info) ? "[l] " : "",
3295 : i, stmt_info->stmt);
3296 : else
3297 5281 : dump_printf_loc (metadata, user_loc, "\tstmt %u ---\n", i);
3298 : else
3299 : {
3300 89779 : dump_printf_loc (metadata, user_loc, "\t{ ");
3301 287496 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
3302 107938 : dump_printf (metadata, "%T%s ", op,
3303 107938 : i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
3304 89779 : dump_printf (metadata, "}\n");
3305 : }
 : /* Load and lane permutations, when recorded on the node. */
3306 438634 : if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
3307 : {
3308 62702 : dump_printf_loc (metadata, user_loc, "\tload permutation {");
3309 205772 : FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
3310 80368 : dump_printf (dump_kind, " %u", j);
3311 62702 : dump_printf (dump_kind, " }\n");
3312 : }
3313 438634 : if (SLP_TREE_LANE_PERMUTATION (node).exists ())
3314 : {
3315 13556 : dump_printf_loc (metadata, user_loc, "\tlane permutation {");
3316 64464 : for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
3317 37352 : dump_printf (dump_kind, " %u[%u]",
3318 37352 : SLP_TREE_LANE_PERMUTATION (node)[i].first,
3319 37352 : SLP_TREE_LANE_PERMUTATION (node)[i].second);
3320 13556 : dump_printf (dump_kind, " }%s\n",
3321 13556 : node->ldst_lanes ? " (load-lanes)" : "");
3322 : }
 : /* Finally the child node pointers, if any. */
3323 438634 : if (SLP_TREE_CHILDREN (node).is_empty ())
3324 166609 : return;
3325 272025 : dump_printf_loc (metadata, user_loc, "\tchildren");
3326 990121 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3327 446071 : dump_printf (dump_kind, " %p", (void *)child);
3328 272025 : dump_printf (dump_kind, "%s\n",
3329 272025 : node->ldst_lanes && !SLP_TREE_LANE_PERMUTATION (node).exists ()
3330 : ? " (store-lanes)" : "");
3331 : }
3332 :
 : /* Debugger entry point: dump single NODE; debug_dump_context
 : redirects the dump machinery's output for the duration. */
3333 : DEBUG_FUNCTION void
3334 0 : debug (slp_tree node)
3335 : {
3336 0 : debug_dump_context ctx;
3337 0 : vect_print_slp_tree (MSG_NOTE,
3338 0 : dump_location_t::from_location_t (UNKNOWN_LOCATION),
3339 : node);
3340 0 : }
3341 :
3342 : /* Recursive helper for the dot producer below. */
3343 :
3344 : static void
3345 0 : dot_slp_tree (FILE *f, slp_tree node, hash_set<slp_tree> &visited)
3346 : {
 : /* Each node is emitted once; VISITED also breaks graph cycles. */
3347 0 : if (visited.add (node))
3348 : return;
3349 :
 : /* Emit the node with its textual dump as the DOT label.
 : NOTE(review): the label text presumably reaches F via the
 : caller's debug_dump_context (F) redirection — confirm against
 : the fname overloads below. */
3350 0 : fprintf (f, "\"%p\" [label=\"", (void *)node);
3351 0 : vect_print_slp_tree (MSG_NOTE,
3352 0 : dump_location_t::from_location_t (UNKNOWN_LOCATION),
3353 : node);
3354 0 : fprintf (f, "\"];\n");
3355 :
3356 :
 : /* Edges to all children, including NULL slots (printed as-is);
 : only non-NULL children are recursed into. */
3357 0 : for (slp_tree child : SLP_TREE_CHILDREN (node))
3358 0 : fprintf (f, "\"%p\" -> \"%p\";", (void *)node, (void *)child);
3359 :
3360 0 : for (slp_tree child : SLP_TREE_CHILDREN (node))
3361 0 : if (child)
3362 0 : dot_slp_tree (f, child, visited);
3363 : }
3364 :
 : /* Debugger entry point: write the SLP graph rooted at NODE to
 : FNAME in Graphviz DOT format.
 : NOTE(review): the fopen result is not checked; a bad FNAME
 : would pass NULL to fprintf. Acceptable for a debug-only
 : helper but worth confirming. */
3365 : DEBUG_FUNCTION void
3366 0 : dot_slp_tree (const char *fname, slp_tree node)
3367 : {
3368 0 : FILE *f = fopen (fname, "w");
3369 0 : fprintf (f, "digraph {\n");
3370 0 : fflush (f);
 : /* Scope the dump-redirection context so it is torn down before
 : the closing brace is written. */
3371 0 : {
3372 0 : debug_dump_context ctx (f);
3373 0 : hash_set<slp_tree> visited;
3374 0 : dot_slp_tree (f, node, visited);
3375 0 : }
3376 0 : fflush (f);
3377 0 : fprintf (f, "}\n");
3378 0 : fclose (f);
3379 0 : }
3380 :
 : /* Debugger entry point: write the union of all SLP_INSTANCES'
 : graphs to FNAME in Graphviz DOT format; the shared VISITED set
 : keeps nodes reachable from several instances unique.
 : NOTE(review): fopen result unchecked, as in the single-node
 : overload above. */
3381 : DEBUG_FUNCTION void
3382 0 : dot_slp_tree (const char *fname, const vec<slp_instance> &slp_instances)
3383 : {
3384 0 : FILE *f = fopen (fname, "w");
3385 0 : fprintf (f, "digraph {\n");
3386 0 : fflush (f);
3387 0 : {
3388 0 : debug_dump_context ctx (f);
3389 0 : hash_set<slp_tree> visited;
3390 0 : for (auto inst : slp_instances)
3391 0 : dot_slp_tree (f, SLP_INSTANCE_TREE (inst), visited);
3392 0 : }
3393 0 : fflush (f);
3394 0 : fprintf (f, "}\n");
3395 0 : fclose (f);
3396 0 : }
3397 :
3398 : /* Dump a slp tree NODE using flags specified in DUMP_KIND. */
3399 :
3400 : static void
3401 477768 : vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
3402 : slp_tree node, hash_set<slp_tree> &visited)
3403 : {
3404 477768 : unsigned i;
3405 477768 : slp_tree child;
3406 :
 : /* Dump each node only once; VISITED also terminates cycles. */
3407 477768 : if (visited.add (node))
3408 477768 : return;
3409 :
3410 438184 : vect_print_slp_tree (dump_kind, loc, node);
3411 :
 : /* Recurse into non-NULL children (NULL marks skipped operands). */
3412 1321925 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3413 445557 : if (child)
3414 403596 : vect_print_slp_graph (dump_kind, loc, child, visited);
3415 : }
3416 :
3417 : static void
3418 45709 : vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
3419 : slp_tree entry)
3420 : {
3421 45709 : hash_set<slp_tree> visited;
3422 45709 : vect_print_slp_graph (dump_kind, loc, entry, visited);
3423 45709 : }
3424 :
3425 : DEBUG_FUNCTION void
3426 0 : debug (slp_instance instance)
3427 : {
3428 0 : debug_dump_context ctx;
3429 0 : vect_print_slp_graph (MSG_NOTE,
3430 0 : dump_location_t::from_location_t (UNKNOWN_LOCATION),
3431 : SLP_INSTANCE_TREE (instance));
3432 0 : }
3433 :
3434 : /* Mark the tree rooted at NODE with PURE_SLP. */
3435 :
3436 : static void
3437 2325104 : vect_mark_slp_stmts (vec_info *vinfo, slp_tree node,
3438 : hash_set<slp_tree> &visited)
3439 : {
3440 2325104 : int i;
3441 2325104 : stmt_vec_info stmt_info;
3442 2325104 : slp_tree child;
3443 :
3444 2325104 : if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
3445 : return;
3446 :
3447 1368327 : if (visited.add (node))
3448 : return;
3449 :
3450 4266228 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3451 3004134 : if (stmt_info)
3452 : {
3453 3004134 : STMT_SLP_TYPE (stmt_info) = pure_slp;
3454 : /* ??? For .MASK_LOAD and .MASK_STORE detected as load/store-lanes
3455 : when there is the mask_conversion pattern applied we have lost the
3456 : alternate lanes of the uniform mask which nevertheless
3457 : have separate pattern defs. To not confuse hybrid
3458 : analysis we mark those as covered as well here. */
3459 3004134 : if (node->ldst_lanes)
3460 3004134 : if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
3461 0 : if (gimple_call_internal_p (call, IFN_MASK_LOAD)
3462 0 : || gimple_call_internal_p (call, IFN_MASK_STORE))
3463 : {
3464 0 : tree mask = gimple_call_arg (call,
3465 : internal_fn_mask_index
3466 0 : (gimple_call_internal_fn (call)));
3467 0 : if (TREE_CODE (mask) == SSA_NAME)
3468 0 : if (stmt_vec_info mask_info = vinfo->lookup_def (mask))
3469 : {
3470 0 : mask_info = vect_stmt_to_vectorize (mask_info);
3471 0 : STMT_SLP_TYPE (mask_info) = pure_slp;
3472 : }
3473 : }
3474 : }
3475 :
3476 2811520 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3477 1549426 : if (child)
3478 1549426 : vect_mark_slp_stmts (vinfo, child, visited);
3479 : }
3480 :
3481 : static void
3482 775678 : vect_mark_slp_stmts (vec_info *vinfo, slp_tree node)
3483 : {
3484 775678 : hash_set<slp_tree> visited;
3485 775678 : vect_mark_slp_stmts (vinfo, node, visited);
3486 775678 : }
3487 :
3488 : /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
3489 :
3490 : static void
3491 2325104 : vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
3492 : {
3493 2325104 : int i;
3494 2325104 : stmt_vec_info stmt_info;
3495 2325104 : slp_tree child;
3496 :
3497 2325104 : if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
3498 : return;
3499 :
3500 1368327 : if (visited.add (node))
3501 : return;
3502 :
3503 4266228 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3504 3004134 : if (stmt_info)
3505 : {
3506 3004134 : gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
3507 : || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
3508 3004134 : STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
3509 : }
3510 :
3511 2811520 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3512 1549426 : if (child)
3513 1549426 : vect_mark_slp_stmts_relevant (child, visited);
3514 : }
3515 :
3516 : static void
3517 775678 : vect_mark_slp_stmts_relevant (slp_tree node)
3518 : {
3519 775678 : hash_set<slp_tree> visited;
3520 775678 : vect_mark_slp_stmts_relevant (node, visited);
3521 775678 : }
3522 :
3523 :
3524 : /* Gather loads in the SLP graph NODE and populate the INST loads array. */
3525 :
3526 : static void
3527 9207471 : vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
3528 : hash_set<slp_tree> &visited)
3529 : {
3530 9207471 : if (!node || visited.add (node))
3531 1409878 : return;
3532 :
3533 7797593 : if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
3534 : return;
3535 :
3536 5684240 : if (!SLP_TREE_PERMUTE_P (node))
3537 : {
3538 5507132 : stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
3539 5507132 : if (STMT_VINFO_DATA_REF (stmt_info)
3540 2449845 : && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
3541 1360004 : loads.safe_push (node);
3542 : }
3543 :
3544 : unsigned i;
3545 : slp_tree child;
3546 12846482 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3547 7162242 : vect_gather_slp_loads (loads, child, visited);
3548 : }
3549 :
3550 :
3551 : /* Find the last store in SLP INSTANCE. */
3552 :
3553 : stmt_vec_info
3554 2717662 : vect_find_last_scalar_stmt_in_slp (slp_tree node)
3555 : {
3556 2717662 : stmt_vec_info last = NULL;
3557 2717662 : stmt_vec_info stmt_vinfo;
3558 :
3559 9902102 : for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
3560 7184440 : if (stmt_vinfo)
3561 : {
3562 7184440 : stmt_vinfo = vect_orig_stmt (stmt_vinfo);
3563 7184440 : last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
3564 : }
3565 :
3566 2717662 : return last;
3567 : }
3568 :
3569 : /* Find the first stmt in NODE. */
3570 :
3571 : stmt_vec_info
3572 530923 : vect_find_first_scalar_stmt_in_slp (slp_tree node)
3573 : {
3574 530923 : stmt_vec_info first = NULL;
3575 530923 : stmt_vec_info stmt_vinfo;
3576 :
3577 1796305 : for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
3578 1265382 : if (stmt_vinfo)
3579 : {
3580 1262688 : stmt_vinfo = vect_orig_stmt (stmt_vinfo);
3581 1262688 : if (!first
3582 1262688 : || get_later_stmt (stmt_vinfo, first) == first)
3583 : first = stmt_vinfo;
3584 : }
3585 :
3586 530923 : return first;
3587 : }
3588 :
/* Splits a group of stores, currently beginning at FIRST_VINFO, into
   two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
   (also containing the first GROUP1_SIZE stmts, since stores are
   consecutive), the second containing the remainder.
   Return the first stmt in the second group.  */

static stmt_vec_info
vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
{
  /* Only a group leader may be split, and the split must leave both
     halves non-empty.  */
  gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
  gcc_assert (group1_size > 0);
  int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
  gcc_assert (group2_size > 0);
  DR_GROUP_SIZE (first_vinfo) = group1_size;

  /* Walk to the GROUP1_SIZE'th element; the gap asserts check the
     group is densely packed (gap 1 between consecutive elements).  */
  stmt_vec_info stmt_info = first_vinfo;
  for (unsigned i = group1_size; i > 1; i--)
    {
      stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
      gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
    }
  /* STMT is now the last element of the first group.  */
  stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
  /* Terminate the first group's chain.  */
  DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;

  /* Re-parent the second group's members onto its new leader GROUP2.  */
  DR_GROUP_SIZE (group2) = group2_size;
  for (stmt_info = group2; stmt_info;
       stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
    {
      DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
      gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
    }

  /* For the second group, the DR_GROUP_GAP is that before the original group,
     plus skipping over the first vector.  */
  DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;

  /* DR_GROUP_GAP of the first group now has to skip over the second group too.  */
  DR_GROUP_GAP (first_vinfo) += group2_size;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
		     group1_size, group2_size);

  return group2;
}
3635 :
3636 : /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
3637 : statements and a vector of NUNITS elements. */
3638 :
3639 : static poly_uint64
3640 3673452 : calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
3641 : {
3642 3673452 : return exact_div (common_multiple (nunits, group_size), group_size);
3643 : }
3644 :
3645 : /* Helper that checks to see if a node is a load node. */
3646 :
3647 : static inline bool
3648 54 : vect_is_slp_load_node (slp_tree root)
3649 : {
3650 54 : return (!SLP_TREE_PERMUTE_P (root)
3651 54 : && SLP_TREE_DEF_TYPE (root) == vect_internal_def
3652 48 : && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
3653 94 : && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root))));
3654 : }
3655 :
3656 :
/* Helper function of optimize_load_redistribution that performs the operation
   recursively.  Returns the replacement node for ROOT if ROOT was rewritten
   into a plain load node, NULL otherwise.  LOAD_MAP caches the per-node
   result so shared subtrees are processed once.  */

static slp_tree
optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
				vec_info *vinfo, unsigned int group_size,
				hash_map<slp_tree, slp_tree> *load_map,
				slp_tree root)
{
  /* Already processed (possibly with a NULL "no replacement" result).  */
  if (slp_tree *leader = load_map->get (root))
    return *leader;

  slp_tree node;
  unsigned i;

  /* For now, we don't know anything about externals so do not do anything.  */
  if (!root || SLP_TREE_DEF_TYPE (root) != vect_internal_def)
    return NULL;
  else if (SLP_TREE_PERMUTE_P (root))
    {
      /* First convert this node into a load node and add it to the leaves
         list and flatten the permute from a lane to a load one.  If it's
         unneeded it will be elided later.  */
      vec<stmt_vec_info> stmts;
      stmts.create (SLP_TREE_LANES (root));
      lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
      for (unsigned j = 0; j < lane_perm.length (); j++)
	{
	  std::pair<unsigned, unsigned> perm = lane_perm[j];
	  node = SLP_TREE_CHILDREN (root)[perm.first];

	  /* Every permute input must be a leaf load node, otherwise the
	     permute cannot be flattened; fall through to the generic
	     recursion below.  */
	  if (!vect_is_slp_load_node (node)
	      || SLP_TREE_CHILDREN (node).exists ())
	    {
	      stmts.release ();
	      goto next;
	    }

	  stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
	}

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "converting stmts on permute node %p\n",
			 (void *) root);

      /* Re-discover the permuted lanes as a single (CSEd) load node.  */
      bool *matches = XALLOCAVEC (bool, group_size);
      poly_uint64 max_nunits = 1;
      unsigned tree_size = 0, limit = 1;
      node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
				  matches, &limit, &tree_size, bst_map);
      if (!node)
	stmts.release ();

      /* Cache the result (possibly NULL on discovery failure).  */
      load_map->put (root, node);
      return node;
    }

  next:
  /* ROOT itself is not replaced; remember that and recurse into the
     children, splicing in any replacements found there.  */
  load_map->put (root, NULL);

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
    {
      slp_tree value
	= optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
					  node);
      if (value)
	{
	  SLP_TREE_REF_COUNT (value)++;
	  SLP_TREE_CHILDREN (root)[i] = value;
	  /* ??? We know the original leafs of the replaced nodes will
	     be referenced by bst_map, only the permutes created by
	     pattern matching are not.  */
	  if (SLP_TREE_REF_COUNT (node) == 1)
	    load_map->remove (node);
	  vect_free_slp_tree (node);
	}
    }

  return NULL;
}
3738 :
3739 : /* Temporary workaround for loads not being CSEd during SLP build. This
3740 : function will traverse the SLP tree rooted in ROOT for INSTANCE and find
3741 : VEC_PERM nodes that blend vectors from multiple nodes that all read from the
3742 : same DR such that the final operation is equal to a permuted load. Such
3743 : NODES are then directly converted into LOADS themselves. The nodes are
3744 : CSEd using BST_MAP. */
3745 :
3746 : static void
3747 2835 : optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
3748 : vec_info *vinfo, unsigned int group_size,
3749 : hash_map<slp_tree, slp_tree> *load_map,
3750 : slp_tree root)
3751 : {
3752 2835 : slp_tree node;
3753 2835 : unsigned i;
3754 :
3755 6586 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
3756 : {
3757 3751 : slp_tree value
3758 3751 : = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
3759 : node);
3760 3751 : if (value)
3761 : {
3762 0 : SLP_TREE_REF_COUNT (value)++;
3763 0 : SLP_TREE_CHILDREN (root)[i] = value;
3764 : /* ??? We know the original leafs of the replaced nodes will
3765 : be referenced by bst_map, only the permutes created by
3766 : pattern matching are not. */
3767 0 : if (SLP_TREE_REF_COUNT (node) == 1)
3768 0 : load_map->remove (node);
3769 0 : vect_free_slp_tree (node);
3770 : }
3771 : }
3772 2835 : }
3773 :
3774 : /* Helper function of vect_match_slp_patterns.
3775 :
3776 : Attempts to match patterns against the slp tree rooted in REF_NODE using
3777 : VINFO. Patterns are matched in post-order traversal.
3778 :
3779 : If matching is successful the value in REF_NODE is updated and returned, if
3780 : not then it is returned unchanged. */
3781 :
3782 : static bool
3783 5467318 : vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
3784 : slp_tree_to_load_perm_map_t *perm_cache,
3785 : slp_compat_nodes_map_t *compat_cache,
3786 : hash_set<slp_tree> *visited)
3787 : {
3788 5467318 : unsigned i;
3789 5467318 : slp_tree node = *ref_node;
3790 5467318 : bool found_p = false;
3791 5467318 : if (!node || visited->add (node))
3792 723008 : return false;
3793 :
3794 : slp_tree child;
3795 8763405 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3796 4019095 : found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
3797 : vinfo, perm_cache, compat_cache,
3798 : visited);
3799 :
3800 14232930 : for (unsigned x = 0; x < num__slp_patterns; x++)
3801 : {
3802 9488620 : vect_pattern *pattern
3803 9488620 : = slp_patterns[x] (perm_cache, compat_cache, ref_node);
3804 9488620 : if (pattern)
3805 : {
3806 1081 : pattern->build (vinfo);
3807 1081 : delete pattern;
3808 1081 : found_p = true;
3809 : }
3810 : }
3811 :
3812 : return found_p;
3813 : }
3814 :
3815 : /* Applies pattern matching to the given SLP tree rooted in REF_NODE using
3816 : vec_info VINFO.
3817 :
3818 : The modified tree is returned. Patterns are tried in order and multiple
3819 : patterns may match. */
3820 :
3821 : static bool
3822 1448223 : vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
3823 : hash_set<slp_tree> *visited,
3824 : slp_tree_to_load_perm_map_t *perm_cache,
3825 : slp_compat_nodes_map_t *compat_cache)
3826 : {
3827 1448223 : DUMP_VECT_SCOPE ("vect_match_slp_patterns");
3828 1448223 : slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
3829 :
3830 1448223 : if (dump_enabled_p ())
3831 29623 : dump_printf_loc (MSG_NOTE, vect_location,
3832 : "Analyzing SLP tree %p for patterns\n",
3833 29623 : (void *) SLP_INSTANCE_TREE (instance));
3834 :
3835 1448223 : return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, compat_cache,
3836 1448223 : visited);
3837 : }
3838 :
3839 : /* STMT_INFO is a store group of size GROUP_SIZE that we are considering
3840 : vectorizing with VECTYPE that might be NULL. MASKED_P indicates whether
3841 : the stores are masked.
3842 : Return true if we could use IFN_STORE_LANES instead and if that appears
3843 : to be the better approach. */
3844 :
3845 : static bool
3846 4866 : vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
3847 : tree vectype, bool masked_p,
3848 : unsigned int group_size,
3849 : unsigned int new_group_size)
3850 : {
3851 4866 : if (!vectype)
3852 : {
3853 4866 : tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3854 4866 : vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
3855 : }
3856 4866 : if (!vectype)
3857 : return false;
3858 : /* Allow the split if one of the two new groups would operate on full
3859 : vectors *within* rather than across one scalar loop iteration.
3860 : This is purely a heuristic, but it should work well for group
3861 : sizes of 3 and 4, where the possible splits are:
3862 :
3863 : 3->2+1: OK if the vector has exactly two elements
3864 : 4->2+2: Likewise
3865 : 4->3+1: Less clear-cut. */
3866 4866 : if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
3867 2537 : || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
3868 2346 : return false;
3869 2520 : return vect_store_lanes_supported (vectype, group_size, masked_p) != IFN_LAST;
3870 : }
3871 :
3872 : /* Analyze an SLP instance starting from a group of grouped stores. Call
3873 : vect_build_slp_tree to build a tree of packed stmts if possible.
3874 : Return FALSE if it's impossible to SLP any stmt in the loop. */
3875 :
3876 : static bool
3877 : vect_analyze_slp_instance (vec_info *vinfo,
3878 : scalar_stmts_to_slp_tree_map_t *bst_map,
3879 : stmt_vec_info stmt_info, slp_instance_kind kind,
3880 : unsigned max_tree_size, unsigned *limit,
3881 : bool force_single_lane);
3882 :
/* Build an interleaving scheme for the store sources RHS_NODES from
   SCALAR_STMTS.  Creates one store node whose children are VEC_PERM
   nodes that blend the corresponding operand of each RHS node into a
   single GROUP_SIZE-lane vector; multi-input permutes are then reduced
   pairwise so each permute has at most two inputs.  */

static slp_tree
vect_build_slp_store_interleaving (vec<slp_tree> &rhs_nodes,
				   vec<stmt_vec_info> &scalar_stmts,
				   poly_uint64 max_nunits)
{
  unsigned int group_size = scalar_stmts.length ();
  /* The store node gets one child (permute) per operand of the RHSes.  */
  slp_tree node = vect_create_new_slp_node (scalar_stmts,
					    SLP_TREE_CHILDREN
					      (rhs_nodes[0]).length ());
  SLP_TREE_VECTYPE (node) = SLP_TREE_VECTYPE (rhs_nodes[0]);
  node->max_nunits = max_nunits;
  for (unsigned l = 0;
       l < SLP_TREE_CHILDREN (rhs_nodes[0]).length (); ++l)
    {
      /* And a permute merging all RHS SLP trees.  */
      slp_tree perm = vect_create_new_slp_node (rhs_nodes.length (),
						VEC_PERM_EXPR);
      SLP_TREE_CHILDREN (node).quick_push (perm);
      SLP_TREE_LANE_PERMUTATION (perm).create (group_size);
      SLP_TREE_VECTYPE (perm) = SLP_TREE_VECTYPE (node);
      perm->max_nunits = max_nunits;
      SLP_TREE_LANES (perm) = group_size;
      /* ??? We should set this NULL but that's not expected.  */
      SLP_TREE_REPRESENTATIVE (perm)
	= SLP_TREE_REPRESENTATIVE (SLP_TREE_CHILDREN (rhs_nodes[0])[l]);
      /* Collect the l'th child of every RHS node as a permute input and
	 record one (input, lane) pair per scalar lane.  */
      for (unsigned j = 0; j < rhs_nodes.length (); ++j)
	{
	  SLP_TREE_CHILDREN (perm)
	    .quick_push (SLP_TREE_CHILDREN (rhs_nodes[j])[l]);
	  SLP_TREE_CHILDREN (rhs_nodes[j])[l]->refcnt++;
	  for (unsigned k = 0;
	       k < SLP_TREE_SCALAR_STMTS (rhs_nodes[j]).length (); ++k)
	    {
	      /* ??? We should populate SLP_TREE_SCALAR_STMTS
		 or SLP_TREE_SCALAR_OPS but then we might have
		 a mix of both in our children.  */
	      SLP_TREE_LANE_PERMUTATION (perm)
		.quick_push (std::make_pair (j, k));
	    }
	}

      /* Now we have a single permute node but we cannot code-generate
	 the case with more than two inputs.
	 Perform pairwise reduction, reducing the two inputs
	 with the least number of lanes to one and then repeat until
	 we end up with two inputs.  That scheme makes sure we end
	 up with permutes satisfying the restriction of requiring at
	 most two vector inputs to produce a single vector output
	 when the number of lanes is even.  */
      while (SLP_TREE_CHILDREN (perm).length () > 2)
	{
	  /* When we have three equal sized groups left the pairwise
	     reduction does not result in a scheme that avoids using
	     three vectors.  Instead merge the first two groups
	     to the final size with do-not-care elements (chosen
	     from the first group) and then merge with the third.
	       { A0, B0, x, A1, B1, x, ... }
	       -> { A0, B0, C0, A1, B1, C1, ... }
	     This handles group size of three (and at least
	     power-of-two multiples of that).  */
	  if (SLP_TREE_CHILDREN (perm).length () == 3
	      && (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[0])
		  == SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[1]))
	      && (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[0])
		  == SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[2])))
	    {
	      int ai = 0;
	      int bi = 1;
	      slp_tree a = SLP_TREE_CHILDREN (perm)[ai];
	      slp_tree b = SLP_TREE_CHILDREN (perm)[bi];
	      unsigned n = SLP_TREE_LANES (perm);

	      /* Merge A and B to full size N, padding with do-not-care
		 lanes taken from A.  */
	      slp_tree permab = vect_create_new_slp_node (2, VEC_PERM_EXPR);
	      SLP_TREE_LANES (permab) = n;
	      SLP_TREE_LANE_PERMUTATION (permab).create (n);
	      SLP_TREE_VECTYPE (permab) = SLP_TREE_VECTYPE (perm);
	      permab->max_nunits = max_nunits;
	      /* ??? Should be NULL but that's not expected.  */
	      SLP_TREE_REPRESENTATIVE (permab) = SLP_TREE_REPRESENTATIVE (perm);
	      SLP_TREE_CHILDREN (permab).quick_push (a);
	      for (unsigned k = 0; k < SLP_TREE_LANES (a); ++k)
		SLP_TREE_LANE_PERMUTATION (permab)
		  .quick_push (std::make_pair (0, k));
	      SLP_TREE_CHILDREN (permab).quick_push (b);
	      for (unsigned k = 0; k < SLP_TREE_LANES (b); ++k)
		SLP_TREE_LANE_PERMUTATION (permab)
		  .quick_push (std::make_pair (1, k));
	      /* Push the do-not-care lanes.  */
	      for (unsigned k = 0; k < SLP_TREE_LANES (a); ++k)
		SLP_TREE_LANE_PERMUTATION (permab)
		  .quick_push (std::make_pair (0, k));

	      /* Put the merged node into 'perm', in place of a.  */
	      SLP_TREE_CHILDREN (perm)[ai] = permab;
	      /* Adjust the references to b in the permutation
		 of perm and to the later children which we'll
		 remove.  */
	      for (unsigned k = 0; k < SLP_TREE_LANES (perm); ++k)
		{
		  std::pair<unsigned, unsigned> &p
		    = SLP_TREE_LANE_PERMUTATION (perm)[k];
		  if (p.first == (unsigned) bi)
		    {
		      p.first = ai;
		      p.second += SLP_TREE_LANES (a);
		    }
		  else if (p.first > (unsigned) bi)
		    p.first--;
		}
	      SLP_TREE_CHILDREN (perm).ordered_remove (bi);
	      break;
	    }

	  /* Pick the two nodes with the least number of lanes,
	     prefer the earliest candidate and maintain ai < bi.  */
	  int ai = -1;
	  int bi = -1;
	  for (unsigned ci = 0; ci < SLP_TREE_CHILDREN (perm).length (); ++ci)
	    {
	      if (ai == -1)
		ai = ci;
	      else if (bi == -1)
		bi = ci;
	      else if ((SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ci])
			< SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ai]))
		       || (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ci])
			   < SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[bi])))
		{
		  if (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ai])
		      <= SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[bi]))
		    bi = ci;
		  else
		    {
		      ai = bi;
		      bi = ci;
		    }
		}
	    }

	  /* Produce a merge of nodes ai and bi.  */
	  slp_tree a = SLP_TREE_CHILDREN (perm)[ai];
	  slp_tree b = SLP_TREE_CHILDREN (perm)[bi];
	  unsigned n = SLP_TREE_LANES (a) + SLP_TREE_LANES (b);
	  slp_tree permab = vect_create_new_slp_node (2, VEC_PERM_EXPR);
	  SLP_TREE_LANES (permab) = n;
	  SLP_TREE_LANE_PERMUTATION (permab).create (n);
	  SLP_TREE_VECTYPE (permab) = SLP_TREE_VECTYPE (perm);
	  permab->max_nunits = max_nunits;
	  /* ??? Should be NULL but that's not expected.  */
	  SLP_TREE_REPRESENTATIVE (permab) = SLP_TREE_REPRESENTATIVE (perm);
	  SLP_TREE_CHILDREN (permab).quick_push (a);
	  for (unsigned k = 0; k < SLP_TREE_LANES (a); ++k)
	    SLP_TREE_LANE_PERMUTATION (permab)
	      .quick_push (std::make_pair (0, k));
	  SLP_TREE_CHILDREN (permab).quick_push (b);
	  for (unsigned k = 0; k < SLP_TREE_LANES (b); ++k)
	    SLP_TREE_LANE_PERMUTATION (permab)
	      .quick_push (std::make_pair (1, k));

	  /* Put the merged node into 'perm', in place of a.  */
	  SLP_TREE_CHILDREN (perm)[ai] = permab;
	  /* Adjust the references to b in the permutation
	     of perm and to the later children which we'll
	     remove.  */
	  for (unsigned k = 0; k < SLP_TREE_LANES (perm); ++k)
	    {
	      std::pair<unsigned, unsigned> &p
		= SLP_TREE_LANE_PERMUTATION (perm)[k];
	      if (p.first == (unsigned) bi)
		{
		  p.first = ai;
		  p.second += SLP_TREE_LANES (a);
		}
	      else if (p.first > (unsigned) bi)
		p.first--;
	    }
	  SLP_TREE_CHILDREN (perm).ordered_remove (bi);
	}
    }

  return node;
}
4068 :
/* Analyze an SLP instance starting from SCALAR_STMTS which are a group
   of KIND.  Return true if successful.  SCALAR_STMTS is owned by this
   function, REMAIN and ROOT_STMT_INFOS ownership is transferred back to
   the caller upon failure.  */

static bool
vect_build_slp_instance (vec_info *vinfo,
			 slp_instance_kind kind,
			 vec<stmt_vec_info> &scalar_stmts,
			 vec<stmt_vec_info> &root_stmt_infos,
			 vec<tree> &remain,
			 unsigned max_tree_size, unsigned *limit,
			 scalar_stmts_to_slp_tree_map_t *bst_map,
			 bool force_single_lane)
{
  /* If there's no budget left bail out early.  */
  if (*limit == 0)
    {
      scalar_stmts.release ();
      return false;
    }

  if (kind == slp_inst_kind_ctor)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Analyzing vectorizable constructor: %G\n",
			 root_stmt_infos[0]->stmt);
    }
  else if (kind == slp_inst_kind_gcond)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Analyzing vectorizable control flow: %G",
			 root_stmt_infos[0]->stmt);
    }

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
		       "Starting SLP discovery for\n");
      for (unsigned i = 0; i < scalar_stmts.length (); ++i)
	dump_printf_loc (MSG_NOTE, vect_location,
			 "  %G", scalar_stmts[i]->stmt);
    }

  /* Build the tree for the SLP instance.  */
  unsigned int group_size = scalar_stmts.length ();
  bool *matches = XALLOCAVEC (bool, group_size);
  poly_uint64 max_nunits = 1;
  unsigned tree_size = 0;

  slp_tree node = NULL;
  /* When single-lane operation is forced, skip discovery and fake a
     mismatch after the first lane so the caller splits the group.  */
  if (group_size > 1 && force_single_lane)
    {
      matches[0] = true;
      matches[1] = false;
    }
  else
    node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
				&max_nunits, matches, limit,
				&tree_size, bst_map);
  if (node != NULL)
    {
      /* Calculate the unrolling factor based on the smallest type.  */
      poly_uint64 unrolling_factor
	= calculate_unrolling_factor (max_nunits, group_size);

      if (maybe_ne (unrolling_factor, 1U)
	  && is_a <bb_vec_info> (vinfo))
	{
	  unsigned HOST_WIDE_INT const_max_nunits;
	  if (!max_nunits.is_constant (&const_max_nunits)
	      || const_max_nunits > group_size)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: store group "
				 "size not a multiple of the vector size "
				 "in basic block SLP\n");
	      vect_free_slp_tree (node);
	      /* NOTE(review): this early return does not release
		 SCALAR_STMTS like the common failure path below does —
		 presumably ownership then stays with BST_MAP; verify.  */
	      return false;
	    }
	  /* Fatal mismatch.  */
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "SLP discovery succeeded but node needs "
			     "splitting\n");
	  memset (matches, true, group_size);
	  matches[group_size / const_max_nunits * const_max_nunits] = false;
	  vect_free_slp_tree (node);
	}
      else
	{
	  /* Create a new SLP instance.  */
	  slp_instance new_instance = XNEW (class _slp_instance);
	  SLP_INSTANCE_TREE (new_instance) = node;
	  SLP_INSTANCE_LOADS (new_instance) = vNULL;
	  SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
	  SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
	  SLP_INSTANCE_KIND (new_instance) = kind;
	  new_instance->reduc_phis = NULL;
	  new_instance->cost_vec = vNULL;
	  new_instance->subgraph_entries = vNULL;

	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "SLP size %u vs. limit %u.\n",
			     tree_size, max_tree_size);

	  vinfo->slp_instances.safe_push (new_instance);

	  /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
	     the number of scalar stmts in the root in a few places.
	     Verify that assumption holds.  */
	  gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
		      .length () == group_size);

	  if (dump_enabled_p ())
	    {
	      if (kind == slp_inst_kind_reduc_group)
		dump_printf_loc (MSG_NOTE, vect_location,
				 "SLP discovery of size %d reduction group "
				 "succeeded\n", group_size);
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "Final SLP tree for instance %p:\n",
			       (void *) new_instance);
	      vect_print_slp_graph (MSG_NOTE, vect_location,
				    SLP_INSTANCE_TREE (new_instance));
	    }

	  return true;
	}
    }
  /* Failed to SLP.  */

  /* While we arrive here even with slp_inst_kind_store we should only
     for group_size == 1.  The code to split store groups is only in
     vect_analyze_slp_instance now.  */
  gcc_assert (kind != slp_inst_kind_store || group_size == 1);

  /* Free the allocated memory.  */
  scalar_stmts.release ();

  /* Failed to SLP.  */
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
  return false;
}
4218 :
4219 : /* Analyze an SLP instance starting from a the start of a reduction chain.
4220 : Call vect_build_slp_tree to build a tree of packed stmts if possible.
4221 : Return FALSE if SLP build fails. */
4222 :
4223 : static bool
4224 42784 : vect_analyze_slp_reduc_chain (loop_vec_info vinfo,
4225 : scalar_stmts_to_slp_tree_map_t *bst_map,
4226 : stmt_vec_info scalar_stmt,
4227 : unsigned max_tree_size, unsigned *limit)
4228 : {
4229 42784 : vec<stmt_vec_info> scalar_stmts = vNULL;
4230 :
4231 42784 : bool fail = false;
4232 : /* ??? We could leave operation code checking to SLP discovery. */
4233 42784 : code_helper code = STMT_VINFO_REDUC_CODE (STMT_VINFO_REDUC_DEF
4234 : (vect_orig_stmt (scalar_stmt)));
4235 42784 : bool first = true;
4236 42784 : stmt_vec_info next_stmt = scalar_stmt;
4237 47928 : do
4238 : {
4239 47928 : stmt_vec_info stmt = next_stmt;
4240 47928 : gimple_match_op op;
4241 47928 : if (!gimple_extract_op (STMT_VINFO_STMT (stmt), &op))
4242 0 : gcc_unreachable ();
4243 95856 : tree reduc_def = gimple_arg (STMT_VINFO_STMT (stmt),
4244 47928 : STMT_VINFO_REDUC_IDX (stmt));
4245 47928 : next_stmt = vect_stmt_to_vectorize (vinfo->lookup_def (reduc_def));
4246 47928 : gcc_assert (is_a <gphi *> (STMT_VINFO_STMT (next_stmt))
4247 : || STMT_VINFO_REDUC_IDX (next_stmt) != -1);
4248 51332 : if (!gimple_extract_op (STMT_VINFO_STMT (vect_orig_stmt (stmt)), &op))
4249 0 : gcc_unreachable ();
4250 47928 : if (CONVERT_EXPR_CODE_P (op.code)
4251 2149 : && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0]))
4252 50065 : && (first
4253 1058 : || is_a <gphi *> (STMT_VINFO_STMT (next_stmt))))
4254 : ;
4255 45793 : else if (code != op.code)
4256 : {
4257 1718 : fail = true;
4258 1718 : break;
4259 : }
4260 : else
4261 44075 : scalar_stmts.safe_push (stmt);
4262 46210 : first = false;
4263 : }
4264 46210 : while (!is_a <gphi *> (STMT_VINFO_STMT (next_stmt)));
4265 42784 : if (fail)
4266 1718 : return false;
4267 :
4268 : /* Remember a stmt with the actual reduction operation. */
4269 41066 : stmt_vec_info reduc_scalar_stmt = scalar_stmts[0];
4270 :
4271 : /* When the SSA def chain through reduc-idx does not form a natural
4272 : reduction chain try to linearize an associative operation manually. */
4273 41066 : if (scalar_stmts.length () == 1
4274 39411 : && code.is_tree_code ()
4275 36025 : && associative_tree_code ((tree_code)code)
4276 : /* We may not associate if a fold-left reduction is required. */
4277 76220 : && !needs_fold_left_reduction_p (TREE_TYPE (gimple_get_lhs
4278 : (reduc_scalar_stmt->stmt)),
4279 : code))
4280 : {
4281 33330 : auto_vec<chain_op_t> chain;
4282 33330 : auto_vec<std::pair<tree_code, gimple *> > worklist;
4283 33330 : gimple *op_stmt = NULL, *other_op_stmt = NULL;
4284 33330 : vect_slp_linearize_chain (vinfo, worklist, chain, (tree_code)code,
4285 33330 : scalar_stmts[0]->stmt, op_stmt, other_op_stmt,
4286 : NULL);
4287 :
4288 33330 : scalar_stmts.truncate (0);
4289 33330 : stmt_vec_info tail = NULL;
4290 165891 : for (auto el : chain)
4291 : {
4292 66583 : if (el.dt == vect_external_def
4293 66583 : || el.dt == vect_constant_def
4294 66583 : || el.code != (tree_code) code)
4295 : {
4296 682 : scalar_stmts.release ();
4297 682 : return false;
4298 : }
4299 65901 : stmt_vec_info stmt = vinfo->lookup_def (el.op);
4300 65901 : if (STMT_VINFO_REDUC_IDX (stmt) != -1
4301 64934 : || STMT_VINFO_REDUC_DEF (stmt))
4302 : {
4303 32824 : gcc_assert (tail == NULL);
4304 32824 : tail = stmt;
4305 32824 : continue;
4306 : }
4307 33077 : scalar_stmts.safe_push (stmt);
4308 : }
4309 32648 : gcc_assert (tail);
4310 :
4311 : /* When this linearization didn't produce a chain see if stripping
4312 : a wrapping sign conversion produces one. */
4313 32648 : if (scalar_stmts.length () == 1
4314 32648 : && (code == PLUS_EXPR || code == MULT_EXPR || code == BIT_IOR_EXPR
4315 : || code == BIT_AND_EXPR || code == BIT_XOR_EXPR))
4316 : {
4317 31358 : gimple *stmt = scalar_stmts[0]->stmt;
4318 31358 : if (!is_gimple_assign (stmt)
4319 30318 : || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt))
4320 3917 : || TREE_CODE (gimple_assign_rhs1 (stmt)) != SSA_NAME
4321 35275 : || !tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
4322 3917 : TREE_TYPE (gimple_assign_rhs1 (stmt))))
4323 : {
4324 29878 : scalar_stmts.release ();
4325 29878 : return false;
4326 : }
4327 1480 : stmt = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (stmt));
4328 1480 : if (!is_gimple_assign (stmt)
4329 1480 : || gimple_assign_rhs_code (stmt) != (tree_code)code)
4330 : {
4331 1462 : scalar_stmts.release ();
4332 1462 : return false;
4333 : }
4334 18 : chain.truncate (0);
4335 18 : vect_slp_linearize_chain (vinfo, worklist, chain, (tree_code)code,
4336 : stmt, op_stmt, other_op_stmt, NULL);
4337 :
4338 18 : scalar_stmts.truncate (0);
4339 18 : tail = NULL;
4340 88 : for (auto el : chain)
4341 : {
4342 42 : if (el.dt == vect_external_def
4343 42 : || el.dt == vect_constant_def
4344 42 : || el.code != (tree_code) code)
4345 : {
4346 8 : scalar_stmts.release ();
4347 8 : return false;
4348 : }
4349 34 : stmt_vec_info stmt = vinfo->lookup_def (el.op);
4350 34 : if (STMT_VINFO_REDUC_IDX (stmt) != -1
4351 34 : || STMT_VINFO_REDUC_DEF (stmt))
4352 : {
4353 0 : gcc_assert (tail == NULL);
4354 0 : tail = stmt;
4355 0 : continue;
4356 : }
4357 34 : scalar_stmts.safe_push (stmt);
4358 : }
4359 : /* Unlike the above this does not include the reduction SSA
4360 : cycle. */
4361 10 : gcc_assert (!tail);
4362 : }
4363 :
4364 1300 : if (scalar_stmts.length () < 2)
4365 : {
4366 1207 : scalar_stmts.release ();
4367 1207 : return false;
4368 : }
4369 :
4370 93 : if (dump_enabled_p ())
4371 : {
4372 34 : dump_printf_loc (MSG_NOTE, vect_location,
4373 : "Starting SLP discovery of reduction chain for\n");
4374 140 : for (unsigned i = 0; i < scalar_stmts.length (); ++i)
4375 212 : dump_printf_loc (MSG_NOTE, vect_location,
4376 106 : " %G", scalar_stmts[i]->stmt);
4377 : }
4378 :
4379 93 : unsigned int group_size = scalar_stmts.length ();
4380 93 : bool *matches = XALLOCAVEC (bool, group_size);
4381 93 : poly_uint64 max_nunits = 1;
4382 93 : unsigned tree_size = 0;
4383 93 : slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
4384 : &max_nunits, matches, limit,
4385 93 : &tree_size, bst_map);
4386 93 : if (!node)
4387 : {
4388 37 : scalar_stmts.release ();
4389 37 : return false;
4390 : }
4391 :
4392 56 : unsigned cycle_id = vinfo->reduc_infos.length ();
4393 56 : vect_reduc_info reduc_info = new vect_reduc_info_s ();
4394 56 : vinfo->reduc_infos.safe_push (reduc_info);
4395 56 : VECT_REDUC_INFO_DEF_TYPE (reduc_info) = STMT_VINFO_DEF_TYPE (next_stmt);
4396 56 : VECT_REDUC_INFO_TYPE (reduc_info) = STMT_VINFO_REDUC_TYPE (next_stmt);
4397 56 : VECT_REDUC_INFO_CODE (reduc_info) = STMT_VINFO_REDUC_CODE (next_stmt);
4398 56 : VECT_REDUC_INFO_FN (reduc_info) = IFN_LAST;
4399 56 : reduc_info->is_reduc_chain = true;
4400 :
4401 : /* Build the node for the PHI and possibly the conversions. */
4402 56 : slp_tree phis = vect_create_new_slp_node (2, ERROR_MARK);
4403 56 : SLP_TREE_REPRESENTATIVE (phis) = next_stmt;
4404 56 : phis->cycle_info.id = cycle_id;
4405 56 : SLP_TREE_LANES (phis) = group_size;
4406 56 : if (reduc_scalar_stmt == scalar_stmt)
4407 52 : SLP_TREE_VECTYPE (phis) = SLP_TREE_VECTYPE (node);
4408 : else
4409 4 : SLP_TREE_VECTYPE (phis)
4410 4 : = signed_or_unsigned_type_for (TYPE_UNSIGNED
4411 : (TREE_TYPE (gimple_get_lhs
4412 : (scalar_stmt->stmt))),
4413 : SLP_TREE_VECTYPE (node));
4414 : /* ??? vect_cse_slp_nodes cannot cope with cycles without any
4415 : SLP_TREE_SCALAR_STMTS. */
4416 56 : SLP_TREE_SCALAR_STMTS (phis).create (group_size);
4417 235 : for (unsigned i = 0; i < group_size; ++i)
4418 179 : SLP_TREE_SCALAR_STMTS (phis).quick_push (next_stmt);
4419 :
4420 56 : slp_tree op_input = phis;
4421 56 : if (reduc_scalar_stmt != scalar_stmt)
4422 : {
4423 4 : slp_tree conv = vect_create_new_slp_node (1, ERROR_MARK);
4424 4 : SLP_TREE_REPRESENTATIVE (conv)
4425 4 : = vinfo->lookup_def (gimple_arg (reduc_scalar_stmt->stmt,
4426 4 : STMT_VINFO_REDUC_IDX
4427 : (reduc_scalar_stmt)));
4428 4 : SLP_TREE_CHILDREN (conv).quick_push (phis);
4429 4 : conv->cycle_info.id = cycle_id;
4430 4 : SLP_TREE_REDUC_IDX (conv) = 0;
4431 4 : SLP_TREE_LANES (conv) = group_size;
4432 4 : SLP_TREE_VECTYPE (conv) = SLP_TREE_VECTYPE (node);
4433 4 : SLP_TREE_SCALAR_STMTS (conv) = vNULL;
4434 4 : op_input = conv;
4435 : }
4436 :
4437 56 : slp_tree reduc = vect_create_new_slp_node (2, ERROR_MARK);
4438 56 : SLP_TREE_REPRESENTATIVE (reduc) = reduc_scalar_stmt;
4439 56 : SLP_TREE_CHILDREN (reduc).quick_push (op_input);
4440 56 : SLP_TREE_CHILDREN (reduc).quick_push (node);
4441 56 : reduc->cycle_info.id = cycle_id;
4442 56 : SLP_TREE_REDUC_IDX (reduc) = 0;
4443 56 : SLP_TREE_LANES (reduc) = group_size;
4444 56 : SLP_TREE_VECTYPE (reduc) = SLP_TREE_VECTYPE (node);
4445 : /* ??? For the reduction epilogue we need a live lane. */
4446 56 : SLP_TREE_SCALAR_STMTS (reduc).create (group_size);
4447 56 : SLP_TREE_SCALAR_STMTS (reduc).quick_push (reduc_scalar_stmt);
4448 179 : for (unsigned i = 1; i < group_size; ++i)
4449 123 : SLP_TREE_SCALAR_STMTS (reduc).quick_push (NULL);
4450 :
4451 56 : if (reduc_scalar_stmt != scalar_stmt)
4452 : {
4453 4 : slp_tree conv = vect_create_new_slp_node (1, ERROR_MARK);
4454 4 : SLP_TREE_REPRESENTATIVE (conv) = scalar_stmt;
4455 4 : SLP_TREE_CHILDREN (conv).quick_push (reduc);
4456 4 : conv->cycle_info.id = cycle_id;
4457 4 : SLP_TREE_REDUC_IDX (conv) = 0;
4458 4 : SLP_TREE_LANES (conv) = group_size;
4459 4 : SLP_TREE_VECTYPE (conv) = SLP_TREE_VECTYPE (phis);
4460 : /* ??? For the reduction epilogue we need a live lane. */
4461 4 : SLP_TREE_SCALAR_STMTS (conv).create (group_size);
4462 4 : SLP_TREE_SCALAR_STMTS (conv).quick_push (scalar_stmt);
4463 8 : for (unsigned i = 1; i < group_size; ++i)
4464 4 : SLP_TREE_SCALAR_STMTS (conv).quick_push (NULL);
4465 4 : reduc = conv;
4466 : }
4467 :
4468 56 : edge le = loop_latch_edge (LOOP_VINFO_LOOP (vinfo));
4469 56 : SLP_TREE_CHILDREN (phis).quick_push (NULL);
4470 56 : SLP_TREE_CHILDREN (phis).quick_push (NULL);
4471 56 : SLP_TREE_CHILDREN (phis)[le->dest_idx] = reduc;
4472 56 : SLP_TREE_REF_COUNT (reduc)++;
4473 :
4474 : /* Create a new SLP instance. */
4475 56 : slp_instance new_instance = XNEW (class _slp_instance);
4476 56 : SLP_INSTANCE_TREE (new_instance) = reduc;
4477 56 : SLP_INSTANCE_LOADS (new_instance) = vNULL;
4478 56 : SLP_INSTANCE_ROOT_STMTS (new_instance) = vNULL;
4479 56 : SLP_INSTANCE_REMAIN_DEFS (new_instance) = vNULL;
4480 56 : SLP_INSTANCE_KIND (new_instance) = slp_inst_kind_reduc_chain;
4481 56 : new_instance->reduc_phis = NULL;
4482 56 : new_instance->cost_vec = vNULL;
4483 56 : new_instance->subgraph_entries = vNULL;
4484 :
4485 56 : vinfo->slp_instances.safe_push (new_instance);
4486 :
4487 56 : if (dump_enabled_p ())
4488 : {
4489 24 : dump_printf_loc (MSG_NOTE, vect_location,
4490 : "Final SLP tree for instance %p:\n",
4491 : (void *) new_instance);
4492 24 : vect_print_slp_graph (MSG_NOTE, vect_location,
4493 : SLP_INSTANCE_TREE (new_instance));
4494 : }
4495 :
4496 56 : return true;
4497 33330 : }
4498 :
4499 7736 : if (scalar_stmts.length () <= 1)
4500 : {
4501 6081 : scalar_stmts.release ();
4502 6081 : return false;
4503 : }
4504 :
4505 1655 : scalar_stmts.reverse ();
4506 1655 : stmt_vec_info reduc_phi_info = next_stmt;
4507 :
4508 : /* Build the tree for the SLP instance. */
4509 1655 : vec<stmt_vec_info> root_stmt_infos = vNULL;
4510 1655 : vec<tree> remain = vNULL;
4511 :
4512 1655 : if (dump_enabled_p ())
4513 : {
4514 180 : dump_printf_loc (MSG_NOTE, vect_location,
4515 : "Starting SLP discovery of reduction chain for\n");
4516 966 : for (unsigned i = 0; i < scalar_stmts.length (); ++i)
4517 1572 : dump_printf_loc (MSG_NOTE, vect_location,
4518 786 : " %G", scalar_stmts[i]->stmt);
4519 : }
4520 :
4521 : /* Build the tree for the SLP instance. */
4522 1655 : unsigned int group_size = scalar_stmts.length ();
4523 1655 : bool *matches = XALLOCAVEC (bool, group_size);
4524 1655 : poly_uint64 max_nunits = 1;
4525 1655 : unsigned tree_size = 0;
4526 :
4527 : /* ??? We need this only for SLP discovery. */
4528 6315 : for (unsigned i = 0; i < scalar_stmts.length (); ++i)
4529 4660 : REDUC_GROUP_FIRST_ELEMENT (scalar_stmts[i]) = scalar_stmts[0];
4530 :
4531 1655 : slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
4532 : &max_nunits, matches, limit,
4533 1655 : &tree_size, bst_map);
4534 :
4535 6315 : for (unsigned i = 0; i < scalar_stmts.length (); ++i)
4536 4660 : REDUC_GROUP_FIRST_ELEMENT (scalar_stmts[i]) = NULL;
4537 :
4538 1655 : if (node != NULL)
4539 : {
4540 : /* Create a new SLP instance. */
4541 1395 : slp_instance new_instance = XNEW (class _slp_instance);
4542 1395 : SLP_INSTANCE_TREE (new_instance) = node;
4543 1395 : SLP_INSTANCE_LOADS (new_instance) = vNULL;
4544 1395 : SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
4545 1395 : SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
4546 1395 : SLP_INSTANCE_KIND (new_instance) = slp_inst_kind_reduc_chain;
4547 1395 : new_instance->reduc_phis = NULL;
4548 1395 : new_instance->cost_vec = vNULL;
4549 1395 : new_instance->subgraph_entries = vNULL;
4550 :
4551 1395 : vect_reduc_info reduc_info = info_for_reduction (vinfo, node);
4552 1395 : reduc_info->is_reduc_chain = true;
4553 :
4554 1395 : if (dump_enabled_p ())
4555 135 : dump_printf_loc (MSG_NOTE, vect_location,
4556 : "SLP size %u vs. limit %u.\n",
4557 : tree_size, max_tree_size);
4558 :
4559 : /* Fixup SLP reduction chains. If this is a reduction chain with
4560 : a conversion in front amend the SLP tree with a node for that. */
4561 1395 : gimple *scalar_def = STMT_VINFO_REDUC_DEF (reduc_phi_info)->stmt;
4562 1395 : if (is_gimple_assign (scalar_def)
4563 1395 : && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (scalar_def)))
4564 : {
4565 28 : stmt_vec_info conv_info = vect_stmt_to_vectorize
4566 28 : (STMT_VINFO_REDUC_DEF (reduc_phi_info));
4567 28 : scalar_stmts = vNULL;
4568 28 : scalar_stmts.create (group_size);
4569 90 : for (unsigned i = 0; i < group_size; ++i)
4570 62 : scalar_stmts.quick_push (conv_info);
4571 28 : slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
4572 28 : SLP_TREE_VECTYPE (conv)
4573 28 : = get_vectype_for_scalar_type (vinfo,
4574 28 : TREE_TYPE
4575 : (gimple_assign_lhs (scalar_def)),
4576 : group_size);
4577 28 : SLP_TREE_REDUC_IDX (conv) = 0;
4578 28 : conv->cycle_info.id = node->cycle_info.id;
4579 28 : SLP_TREE_CHILDREN (conv).quick_push (node);
4580 28 : SLP_INSTANCE_TREE (new_instance) = conv;
4581 : }
4582 : /* Fill the backedge child of the PHI SLP node. The
4583 : general matching code cannot find it because the
4584 : scalar code does not reflect how we vectorize the
4585 : reduction. */
4586 1395 : use_operand_p use_p;
4587 1395 : imm_use_iterator imm_iter;
4588 1395 : class loop *loop = LOOP_VINFO_LOOP (vinfo);
4589 6670 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
4590 : gimple_get_lhs (scalar_def))
4591 : /* There are exactly two non-debug uses, the reduction
4592 : PHI and the loop-closed PHI node. */
4593 3880 : if (!is_gimple_debug (USE_STMT (use_p))
4594 3880 : && gimple_bb (USE_STMT (use_p)) == loop->header)
4595 : {
4596 1395 : auto_vec<stmt_vec_info, 64> phis (group_size);
4597 1395 : stmt_vec_info phi_info = vinfo->lookup_stmt (USE_STMT (use_p));
4598 5386 : for (unsigned i = 0; i < group_size; ++i)
4599 3991 : phis.quick_push (phi_info);
4600 1395 : slp_tree *phi_node = bst_map->get (phis);
4601 1395 : unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
4602 2790 : SLP_TREE_CHILDREN (*phi_node)[dest_idx]
4603 1395 : = SLP_INSTANCE_TREE (new_instance);
4604 1395 : SLP_INSTANCE_TREE (new_instance)->refcnt++;
4605 1395 : }
4606 :
4607 1395 : vinfo->slp_instances.safe_push (new_instance);
4608 :
4609 : /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
4610 : the number of scalar stmts in the root in a few places.
4611 : Verify that assumption holds. */
4612 2790 : gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
4613 : .length () == group_size);
4614 :
4615 1395 : if (dump_enabled_p ())
4616 : {
4617 135 : dump_printf_loc (MSG_NOTE, vect_location,
4618 : "Final SLP tree for instance %p:\n",
4619 : (void *) new_instance);
4620 135 : vect_print_slp_graph (MSG_NOTE, vect_location,
4621 : SLP_INSTANCE_TREE (new_instance));
4622 : }
4623 :
4624 1395 : return true;
4625 : }
4626 :
4627 : /* Failed to SLP. */
4628 260 : scalar_stmts.release ();
4629 260 : if (dump_enabled_p ())
4630 45 : dump_printf_loc (MSG_NOTE, vect_location,
4631 : "SLP discovery of reduction chain failed\n");
4632 : return false;
4633 : }
4634 :
/* Analyze an SLP reduction starting from SCALAR_STMT, either as a
   reduction chain or as a single-lane reduction.  Return true if
   successful.  */
4637 :
4638 : static bool
4639 63593 : vect_analyze_slp_reduction (loop_vec_info vinfo,
4640 : stmt_vec_info scalar_stmt,
4641 : unsigned max_tree_size, unsigned *limit,
4642 : scalar_stmts_to_slp_tree_map_t *bst_map,
4643 : bool force_single_lane)
4644 : {
4645 63593 : slp_instance_kind kind = slp_inst_kind_reduc_group;
4646 :
4647 : /* If there's no budget left bail out early. */
4648 63593 : if (*limit == 0)
4649 : return false;
4650 :
4651 : /* Try to gather a reduction chain. */
4652 63593 : if (! force_single_lane
4653 43001 : && STMT_VINFO_DEF_TYPE (scalar_stmt) == vect_reduction_def
4654 106377 : && vect_analyze_slp_reduc_chain (vinfo, bst_map, scalar_stmt,
4655 : max_tree_size, limit))
4656 : return true;
4657 :
4658 62142 : vec<stmt_vec_info> scalar_stmts;
4659 62142 : scalar_stmts.create (1);
4660 62142 : scalar_stmts.quick_push (scalar_stmt);
4661 :
4662 62142 : if (dump_enabled_p ())
4663 : {
4664 3338 : dump_printf_loc (MSG_NOTE, vect_location,
4665 : "Starting SLP discovery for\n");
4666 6676 : for (unsigned i = 0; i < scalar_stmts.length (); ++i)
4667 6676 : dump_printf_loc (MSG_NOTE, vect_location,
4668 3338 : " %G", scalar_stmts[i]->stmt);
4669 : }
4670 :
4671 : /* Build the tree for the SLP instance. */
4672 62142 : unsigned int group_size = scalar_stmts.length ();
4673 62142 : bool *matches = XALLOCAVEC (bool, group_size);
4674 62142 : poly_uint64 max_nunits = 1;
4675 62142 : unsigned tree_size = 0;
4676 :
4677 62142 : slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
4678 : &max_nunits, matches, limit,
4679 : &tree_size, bst_map);
4680 62142 : if (node != NULL)
4681 : {
4682 : /* Create a new SLP instance. */
4683 59557 : slp_instance new_instance = XNEW (class _slp_instance);
4684 59557 : SLP_INSTANCE_TREE (new_instance) = node;
4685 59557 : SLP_INSTANCE_LOADS (new_instance) = vNULL;
4686 59557 : SLP_INSTANCE_ROOT_STMTS (new_instance) = vNULL;
4687 59557 : SLP_INSTANCE_REMAIN_DEFS (new_instance) = vNULL;
4688 59557 : SLP_INSTANCE_KIND (new_instance) = kind;
4689 59557 : new_instance->reduc_phis = NULL;
4690 59557 : new_instance->cost_vec = vNULL;
4691 59557 : new_instance->subgraph_entries = vNULL;
4692 :
4693 59557 : if (dump_enabled_p ())
4694 3222 : dump_printf_loc (MSG_NOTE, vect_location,
4695 : "SLP size %u vs. limit %u.\n",
4696 : tree_size, max_tree_size);
4697 :
4698 59557 : vinfo->slp_instances.safe_push (new_instance);
4699 :
4700 : /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
4701 : the number of scalar stmts in the root in a few places.
4702 : Verify that assumption holds. */
4703 119114 : gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
4704 : .length () == group_size);
4705 :
4706 59557 : if (dump_enabled_p ())
4707 : {
4708 3222 : dump_printf_loc (MSG_NOTE, vect_location,
4709 : "Final SLP tree for instance %p:\n",
4710 : (void *) new_instance);
4711 3222 : vect_print_slp_graph (MSG_NOTE, vect_location,
4712 : SLP_INSTANCE_TREE (new_instance));
4713 : }
4714 :
4715 59557 : return true;
4716 : }
4717 : /* Failed to SLP. */
4718 :
4719 : /* Free the allocated memory. */
4720 2585 : scalar_stmts.release ();
4721 :
4722 : /* Failed to SLP. */
4723 2585 : if (dump_enabled_p ())
4724 116 : dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
4725 : return false;
4726 : }
4727 :
4728 : /* Analyze a single SLP reduction group. If successful add a SLP instance
4729 : for it and return true, otherwise return false and have *MATCHES
4730 : populated. */
4731 :
4732 : static bool
4733 18143 : vect_analyze_slp_reduction_group (loop_vec_info loop_vinfo,
4734 : vec<stmt_vec_info> scalar_stmts,
4735 : scalar_stmts_to_slp_tree_map_t *bst_map,
4736 : unsigned max_tree_size, unsigned *limit,
4737 : bool *matches)
4738 : {
4739 : /* Try to form a reduction group. */
4740 18143 : unsigned int group_size = scalar_stmts.length ();
4741 18143 : if (!matches)
4742 7417 : matches = XALLOCAVEC (bool, group_size);
4743 18143 : poly_uint64 max_nunits = 1;
4744 18143 : unsigned tree_size = 0;
4745 18143 : slp_tree node = vect_build_slp_tree (loop_vinfo, scalar_stmts,
4746 : group_size,
4747 : &max_nunits, matches, limit,
4748 : &tree_size, bst_map);
4749 18143 : if (!node)
4750 : return false;
4751 :
4752 : /* Create a new SLP instance. */
4753 8601 : slp_instance new_instance = XNEW (class _slp_instance);
4754 8601 : SLP_INSTANCE_TREE (new_instance) = node;
4755 8601 : SLP_INSTANCE_LOADS (new_instance) = vNULL;
4756 8601 : SLP_INSTANCE_ROOT_STMTS (new_instance) = vNULL;
4757 8601 : SLP_INSTANCE_REMAIN_DEFS (new_instance) = vNULL;
4758 8601 : SLP_INSTANCE_KIND (new_instance) = slp_inst_kind_reduc_group;
4759 8601 : new_instance->reduc_phis = NULL;
4760 8601 : new_instance->cost_vec = vNULL;
4761 8601 : new_instance->subgraph_entries = vNULL;
4762 :
4763 8601 : if (dump_enabled_p ())
4764 544 : dump_printf_loc (MSG_NOTE, vect_location,
4765 : "SLP size %u vs. limit %u.\n",
4766 : tree_size, max_tree_size);
4767 :
4768 8601 : loop_vinfo->slp_instances.safe_push (new_instance);
4769 :
4770 : /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
4771 : the number of scalar stmts in the root in a few places.
4772 : Verify that assumption holds. */
4773 17202 : gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
4774 : .length () == group_size);
4775 :
4776 8601 : if (dump_enabled_p ())
4777 : {
4778 544 : dump_printf_loc (MSG_NOTE, vect_location,
4779 : "SLP discovery of size %d reduction group "
4780 : "succeeded\n", group_size);
4781 544 : dump_printf_loc (MSG_NOTE, vect_location,
4782 : "Final SLP tree for instance %p:\n",
4783 : (void *) new_instance);
4784 544 : vect_print_slp_graph (MSG_NOTE, vect_location,
4785 : SLP_INSTANCE_TREE (new_instance));
4786 : }
4787 :
4788 : return true;
4789 : }
4790 :
4791 : /* Analyze reductions in LOOP_VINFO and populate SLP instances
4792 : accordingly. Returns false if something fails. */
4793 :
static bool
vect_analyze_slp_reductions (loop_vec_info loop_vinfo,
			     unsigned max_tree_size, unsigned *limit,
			     scalar_stmts_to_slp_tree_map_t *bst_map,
			     bool force_single_lane)
{
  /* Nothing to do without reductions.  */
  if (loop_vinfo->reductions.is_empty ())
    return true;

  /* Collect reduction statements we can combine into
     a SLP reduction.  */
  vec<stmt_vec_info> scalar_stmts;
  scalar_stmts.create (loop_vinfo->reductions.length ());
  for (auto next_info : loop_vinfo->reductions)
    {
      next_info = vect_stmt_to_vectorize (next_info);
      if ((STMT_VINFO_RELEVANT_P (next_info)
	   || STMT_VINFO_LIVE_P (next_info))
	  /* ??? Make sure we didn't skip a conversion around a
	     reduction path.  In that case we'd have to reverse
	     engineer that conversion stmt following the chain using
	     reduc_idx and from the PHI using reduc_def.  */
	  && (STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def
	      || (STMT_VINFO_DEF_TYPE (next_info)
		  == vect_double_reduction_def)))
	{
	  /* Do not discover SLP reductions combining lane-reducing
	     ops, that will fail later.  */
	  if (!force_single_lane
	      && !lane_reducing_stmt_p (STMT_VINFO_STMT (next_info)))
	    scalar_stmts.quick_push (next_info);
	  /* Do SLP discovery for single-lane reductions.  */
	  else if (! vect_analyze_slp_reduction (loop_vinfo, next_info,
						 max_tree_size, limit,
						 bst_map,
						 force_single_lane))
	    {
	      scalar_stmts.release ();
	      return false;
	    }
	}
    }

  /* With at least two collected candidates first attempt discovery of
     one multi-lane reduction group covering all of them.  */
  if (scalar_stmts.length () > 1)
    {
      /* Try to form a reduction group.  */
      unsigned int group_size = scalar_stmts.length ();
      bool *matches = XALLOCAVEC (bool, group_size);
      /* On success the SLP tree took ownership of SCALAR_STMTS, so do
	 not release the vector on the true returns below.  */
      if (vect_analyze_slp_reduction_group (loop_vinfo, scalar_stmts, bst_map,
					    max_tree_size, limit, matches))
	return true;

      /* When analysis as a single SLP reduction group failed try to
	 form sub-groups by collecting matching lanes.  Do not recurse
	 that on failure (to limit compile-time costs), but recurse
	 for the initial non-matching parts.  Everything not covered
	 by a sub-group gets single-reduction treatment.  */
      vec<stmt_vec_info> cands = vNULL;
      /* MATCHES was filled by the failed group attempt above; lane 0
	 matching means there is a matching prefix worth trying.  */
      while (matches[0])
	{
	  /* Gather the lanes that matched into CANDS.  */
	  cands.truncate (0);
	  cands.reserve (group_size, true);
	  for (unsigned i = 0; i < group_size; ++i)
	    if (matches[i])
	      cands.quick_push (scalar_stmts[i]);

	  /* Try to form a reduction group.  */
	  if (vect_analyze_slp_reduction_group (loop_vinfo, cands, bst_map,
						max_tree_size, limit, NULL))
	    /* Success transferred ownership of CANDS to the SLP tree;
	       forget our reference.  */
	    cands = vNULL;
	  else
	    {
	      /* Do SLP discovery for single-lane reductions.  */
	      for (auto stmt_info : cands)
		if (! vect_analyze_slp_reduction (loop_vinfo,
						  vect_stmt_to_vectorize
						    (stmt_info),
						  max_tree_size, limit,
						  bst_map, force_single_lane))
		  {
		    scalar_stmts.release ();
		    cands.release ();
		    return false;
		  }
	    }
	  /* Remove the handled stmts from scalar_stmts and try again,
	     possibly repeating the above with updated matches[].  */
	  unsigned j = 0;
	  for (unsigned i = 0; i < group_size; ++i)
	    if (!matches[i])
	      {
		scalar_stmts[j] = scalar_stmts[i];
		++j;
	      }
	  scalar_stmts.truncate (j);
	  group_size = scalar_stmts.length ();
	  if (vect_analyze_slp_reduction_group (loop_vinfo, scalar_stmts,
						bst_map, max_tree_size, limit,
						matches))
	    return true;
	  /* NOTE(review): if the loop exits here via !matches[0] after
	     the single-lane path left CANDS allocated, CANDS does not
	     appear to be released — confirm whether that is a leak.  */
	}
    }
  /* Do SLP discovery for single-lane reductions.  */
  for (auto stmt_info : scalar_stmts)
    if (! vect_analyze_slp_reduction (loop_vinfo,
				      vect_stmt_to_vectorize (stmt_info),
				      max_tree_size, limit,
				      bst_map, force_single_lane))
      {
	scalar_stmts.release ();
	return false;
      }

  scalar_stmts.release ();
  return true;
}
4910 :
4911 : /* Analyze an SLP instance starting from a group of grouped stores. Call
4912 : vect_build_slp_tree to build a tree of packed stmts if possible.
4913 : Return FALSE if it's impossible to SLP any stmt in the group. */
4914 :
4915 : static bool
4916 1083263 : vect_analyze_slp_instance (vec_info *vinfo,
4917 : scalar_stmts_to_slp_tree_map_t *bst_map,
4918 : stmt_vec_info stmt_info,
4919 : slp_instance_kind kind,
4920 : unsigned max_tree_size, unsigned *limit,
4921 : bool force_single_lane)
4922 : {
4923 1083263 : vec<stmt_vec_info> scalar_stmts;
4924 :
4925 1083263 : if (is_a <bb_vec_info> (vinfo))
4926 1059535 : vect_location = stmt_info->stmt;
4927 :
4928 1083263 : gcc_assert (kind == slp_inst_kind_store);
4929 :
4930 : /* Collect the stores and store them in scalar_stmts. */
4931 1083263 : scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
4932 1083263 : stmt_vec_info next_info = stmt_info;
4933 5376605 : while (next_info)
4934 : {
4935 3210079 : scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
4936 3210079 : next_info = DR_GROUP_NEXT_ELEMENT (next_info);
4937 : }
4938 :
4939 1083263 : vec<stmt_vec_info> root_stmt_infos = vNULL;
4940 1083263 : vec<tree> remain = vNULL;
4941 :
4942 : /* Build the tree for the SLP instance. */
4943 :
4944 : /* If there's no budget left bail out early. */
4945 1083263 : if (*limit == 0)
4946 : return false;
4947 :
4948 1083240 : if (dump_enabled_p ())
4949 : {
4950 4109 : dump_printf_loc (MSG_NOTE, vect_location,
4951 : "Starting SLP discovery for\n");
4952 23674 : for (unsigned i = 0; i < scalar_stmts.length (); ++i)
4953 39130 : dump_printf_loc (MSG_NOTE, vect_location,
4954 19565 : " %G", scalar_stmts[i]->stmt);
4955 : }
4956 :
4957 : /* Build the tree for the SLP instance. */
4958 1083240 : unsigned int group_size = scalar_stmts.length ();
4959 1083240 : bool *matches = XALLOCAVEC (bool, group_size);
4960 1083240 : poly_uint64 max_nunits = 1;
4961 1083240 : unsigned tree_size = 0;
4962 1083240 : unsigned i;
4963 :
4964 1083240 : slp_tree node = NULL;
4965 1083240 : if (group_size > 1 && force_single_lane)
4966 : {
4967 1498 : matches[0] = true;
4968 1498 : matches[1] = false;
4969 : }
4970 : else
4971 1081742 : node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
4972 : &max_nunits, matches, limit,
4973 : &tree_size, bst_map);
4974 1083240 : if (node != NULL)
4975 : {
4976 : /* Calculate the unrolling factor based on the smallest type. */
4977 674116 : poly_uint64 unrolling_factor
4978 674116 : = calculate_unrolling_factor (max_nunits, group_size);
4979 :
4980 674116 : if (maybe_ne (unrolling_factor, 1U)
4981 674116 : && is_a <bb_vec_info> (vinfo))
4982 : {
4983 0 : unsigned HOST_WIDE_INT const_max_nunits;
4984 0 : if (!max_nunits.is_constant (&const_max_nunits)
4985 0 : || const_max_nunits > group_size)
4986 : {
4987 0 : if (dump_enabled_p ())
4988 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4989 : "Build SLP failed: store group "
4990 : "size not a multiple of the vector size "
4991 : "in basic block SLP\n");
4992 0 : vect_free_slp_tree (node);
4993 0 : return false;
4994 : }
4995 : /* Fatal mismatch. */
4996 0 : if (dump_enabled_p ())
4997 0 : dump_printf_loc (MSG_NOTE, vect_location,
4998 : "SLP discovery succeeded but node needs "
4999 : "splitting\n");
5000 0 : memset (matches, true, group_size);
5001 0 : matches[group_size / const_max_nunits * const_max_nunits] = false;
5002 0 : vect_free_slp_tree (node);
5003 : }
5004 : else
5005 : {
5006 : /* Create a new SLP instance. */
5007 674116 : slp_instance new_instance = XNEW (class _slp_instance);
5008 674116 : SLP_INSTANCE_TREE (new_instance) = node;
5009 674116 : SLP_INSTANCE_LOADS (new_instance) = vNULL;
5010 674116 : SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
5011 674116 : SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
5012 674116 : SLP_INSTANCE_KIND (new_instance) = kind;
5013 674116 : new_instance->reduc_phis = NULL;
5014 674116 : new_instance->cost_vec = vNULL;
5015 674116 : new_instance->subgraph_entries = vNULL;
5016 :
5017 674116 : if (dump_enabled_p ())
5018 3126 : dump_printf_loc (MSG_NOTE, vect_location,
5019 : "SLP size %u vs. limit %u.\n",
5020 : tree_size, max_tree_size);
5021 :
5022 674116 : vinfo->slp_instances.safe_push (new_instance);
5023 :
5024 : /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
5025 : the number of scalar stmts in the root in a few places.
5026 : Verify that assumption holds. */
5027 1348232 : gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
5028 : .length () == group_size);
5029 :
5030 674116 : if (dump_enabled_p ())
5031 : {
5032 3126 : dump_printf_loc (MSG_NOTE, vect_location,
5033 : "Final SLP tree for instance %p:\n",
5034 : (void *) new_instance);
5035 3126 : vect_print_slp_graph (MSG_NOTE, vect_location,
5036 : SLP_INSTANCE_TREE (new_instance));
5037 : }
5038 :
5039 674116 : return true;
5040 : }
5041 : }
5042 : /* Failed to SLP. */
5043 :
5044 : /* Try to break the group up into pieces. */
5045 409124 : if (*limit > 0 && kind == slp_inst_kind_store)
5046 : {
5047 : /* ??? We could delay all the actual splitting of store-groups
5048 : until after SLP discovery of the original group completed.
5049 : Then we can recurse to vect_build_slp_instance directly. */
5050 1071610 : for (i = 0; i < group_size; i++)
5051 1071610 : if (!matches[i])
5052 : break;
5053 :
5054 : /* For basic block SLP, try to break the group up into multiples of
5055 : a vector size. */
5056 409123 : if (is_a <bb_vec_info> (vinfo)
5057 409123 : && (i > 1 && i < group_size))
5058 : {
5059 : /* Free the allocated memory. */
5060 153686 : scalar_stmts.release ();
5061 :
5062 153686 : tree scalar_type
5063 153686 : = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
5064 307372 : tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
5065 153686 : 1 << floor_log2 (i));
5066 153686 : unsigned HOST_WIDE_INT const_nunits;
5067 153686 : if (vectype
5068 153686 : && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
5069 : {
5070 : /* Split into two groups at the first vector boundary. */
5071 153686 : gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
5072 153686 : unsigned group1_size = i & ~(const_nunits - 1);
5073 :
5074 153686 : if (dump_enabled_p ())
5075 59 : dump_printf_loc (MSG_NOTE, vect_location,
5076 : "Splitting SLP group at stmt %u\n", i);
5077 153686 : stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
5078 : group1_size);
5079 153686 : bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
5080 : kind, max_tree_size,
5081 : limit, false);
5082 : /* Split the rest at the failure point and possibly
5083 : re-analyze the remaining matching part if it has
5084 : at least two lanes. */
5085 153686 : if (group1_size < i
5086 5271 : && (i + 1 < group_size
5087 2901 : || i - group1_size > 1))
5088 : {
5089 2402 : stmt_vec_info rest2 = rest;
5090 2402 : rest = vect_split_slp_store_group (rest, i - group1_size);
5091 2402 : if (i - group1_size > 1)
5092 61 : res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
5093 : kind, max_tree_size,
5094 : limit, false);
5095 : }
5096 : /* Re-analyze the non-matching tail if it has at least
5097 : two lanes. */
5098 153686 : if (i + 1 < group_size)
5099 21730 : res |= vect_analyze_slp_instance (vinfo, bst_map,
5100 : rest, kind, max_tree_size,
5101 : limit, false);
5102 153686 : return res;
5103 : }
5104 : }
5105 :
5106 : /* For loop vectorization split the RHS into arbitrary pieces of
5107 : size >= 1. */
5108 255437 : else if (is_a <loop_vec_info> (vinfo)
5109 255437 : && (group_size != 1 && i < group_size))
5110 : {
5111 6434 : gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
5112 28 : bool masked_p = call
5113 28 : && gimple_call_internal_p (call)
5114 28 : && internal_fn_mask_index (gimple_call_internal_fn (call)) != -1;
5115 : /* There are targets that cannot do even/odd interleaving schemes
5116 : so they absolutely need to use load/store-lanes. For now
5117 : force single-lane SLP for them - they would be happy with
5118 : uniform power-of-two lanes (but depending on element size),
5119 : but even if we can use 'i' as indicator we would need to
5120 : backtrack when later lanes fail to discover with the same
5121 : granularity. We cannot turn any of strided or scatter store
5122 : into store-lanes. */
5123 : /* ??? If this is not in sync with what get_load_store_type
5124 : later decides the SLP representation is not good for other
5125 : store vectorization methods. */
5126 6434 : bool want_store_lanes
5127 6434 : = (! STMT_VINFO_GATHER_SCATTER_P (stmt_info)
5128 6434 : && ! STMT_VINFO_STRIDED_P (stmt_info)
5129 4893 : && ! STMT_VINFO_SLP_VECT_ONLY (stmt_info)
5130 4889 : && compare_step_with_zero (vinfo, stmt_info) > 0
5131 11300 : && vect_slp_prefer_store_lanes_p (vinfo, stmt_info, NULL_TREE,
5132 12868 : masked_p, group_size, i));
5133 6434 : if (want_store_lanes || force_single_lane)
5134 : i = 1;
5135 :
5136 : /* A fatal discovery fail doesn't always mean single-lane SLP
5137 : isn't a possibility, so try. */
5138 4936 : if (i == 0)
5139 : i = 1;
5140 :
5141 6434 : if (dump_enabled_p ())
5142 882 : dump_printf_loc (MSG_NOTE, vect_location,
5143 : "Splitting SLP group at stmt %u\n", i);
5144 :
5145 : /* Analyze the stored values and pinch them together with
5146 : a permute node so we can preserve the whole store group. */
5147 6434 : auto_vec<slp_tree> rhs_nodes;
5148 6434 : poly_uint64 max_nunits = 1;
5149 :
5150 6434 : unsigned int rhs_common_nlanes = 0;
5151 6434 : unsigned int start = 0, end = i;
5152 29167 : while (start < group_size)
5153 : {
5154 22963 : gcc_assert (end - start >= 1);
5155 22963 : vec<stmt_vec_info> substmts;
5156 22963 : substmts.create (end - start);
5157 69463 : for (unsigned j = start; j < end; ++j)
5158 46500 : substmts.quick_push (scalar_stmts[j]);
5159 22963 : max_nunits = 1;
5160 22963 : node = vect_build_slp_tree (vinfo, substmts, end - start,
5161 : &max_nunits,
5162 : matches, limit, &tree_size, bst_map);
5163 22963 : if (node)
5164 : {
5165 18270 : rhs_nodes.safe_push (node);
5166 18270 : vect_update_max_nunits (&max_nunits, node->max_nunits);
5167 18270 : if (start == 0)
5168 6208 : rhs_common_nlanes = SLP_TREE_LANES (node);
5169 12062 : else if (rhs_common_nlanes != SLP_TREE_LANES (node))
5170 1267 : rhs_common_nlanes = 0;
5171 18270 : start = end;
5172 18270 : if (want_store_lanes || force_single_lane)
5173 4532 : end = start + 1;
5174 : else
5175 : end = group_size;
5176 : }
5177 : else
5178 : {
5179 4693 : substmts.release ();
5180 4693 : if (end - start == 1)
5181 : {
5182 : /* Single-lane discovery failed. Free ressources. */
5183 244 : for (auto node : rhs_nodes)
5184 6 : vect_free_slp_tree (node);
5185 230 : scalar_stmts.release ();
5186 230 : if (dump_enabled_p ())
5187 38 : dump_printf_loc (MSG_NOTE, vect_location,
5188 : "SLP discovery failed\n");
5189 230 : return false;
5190 : }
5191 :
5192 : /* ??? It really happens that we soft-fail SLP
5193 : build at a mismatch but the matching part hard-fails
5194 : later. As we know we arrived here with a group
5195 : larger than one try a group of size one! */
5196 4463 : if (!matches[0])
5197 42 : end = start + 1;
5198 : else
5199 9934 : for (unsigned j = start; j < end; j++)
5200 9934 : if (!matches[j - start])
5201 : {
5202 : end = j;
5203 : break;
5204 : }
5205 : }
5206 : }
5207 :
5208 : /* Now re-assess whether we want store lanes in case the
5209 : discovery ended up producing all single-lane RHSs. */
5210 6204 : if (! want_store_lanes
5211 6204 : && rhs_common_nlanes == 1
5212 5339 : && ! STMT_VINFO_GATHER_SCATTER_P (stmt_info)
5213 5339 : && ! STMT_VINFO_STRIDED_P (stmt_info)
5214 4052 : && ! STMT_VINFO_SLP_VECT_ONLY (stmt_info)
5215 4049 : && compare_step_with_zero (vinfo, stmt_info) > 0
5216 10242 : && (vect_store_lanes_supported (SLP_TREE_VECTYPE (rhs_nodes[0]),
5217 : group_size, masked_p)
5218 : != IFN_LAST))
5219 : want_store_lanes = true;
5220 :
5221 : /* Now we assume we can build the root SLP node from all stores. */
5222 6204 : if (want_store_lanes)
5223 : {
5224 : /* For store-lanes feed the store node with all RHS nodes
5225 : in order. */
5226 0 : node = vect_create_new_slp_node (scalar_stmts,
5227 0 : SLP_TREE_CHILDREN
5228 : (rhs_nodes[0]).length ());
5229 0 : SLP_TREE_VECTYPE (node) = SLP_TREE_VECTYPE (rhs_nodes[0]);
5230 0 : node->max_nunits = max_nunits;
5231 0 : node->ldst_lanes = true;
5232 0 : SLP_TREE_CHILDREN (node)
5233 0 : .reserve_exact (SLP_TREE_CHILDREN (rhs_nodes[0]).length ()
5234 0 : + rhs_nodes.length () - 1);
5235 : /* First store value and possibly mask. */
5236 0 : SLP_TREE_CHILDREN (node)
5237 0 : .splice (SLP_TREE_CHILDREN (rhs_nodes[0]));
5238 : /* Rest of the store values. All mask nodes are the same,
5239 : this should be guaranteed by dataref group discovery. */
5240 0 : for (unsigned j = 1; j < rhs_nodes.length (); ++j)
5241 0 : SLP_TREE_CHILDREN (node)
5242 0 : .quick_push (SLP_TREE_CHILDREN (rhs_nodes[j])[0]);
5243 0 : for (slp_tree child : SLP_TREE_CHILDREN (node))
5244 0 : child->refcnt++;
5245 : }
5246 : else
5247 6204 : node = vect_build_slp_store_interleaving (rhs_nodes, scalar_stmts,
5248 : max_nunits);
5249 :
5250 24468 : while (!rhs_nodes.is_empty ())
5251 18264 : vect_free_slp_tree (rhs_nodes.pop ());
5252 :
5253 : /* Create a new SLP instance. */
5254 6204 : slp_instance new_instance = XNEW (class _slp_instance);
5255 6204 : SLP_INSTANCE_TREE (new_instance) = node;
5256 6204 : SLP_INSTANCE_LOADS (new_instance) = vNULL;
5257 6204 : SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
5258 6204 : SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
5259 6204 : SLP_INSTANCE_KIND (new_instance) = kind;
5260 6204 : new_instance->reduc_phis = NULL;
5261 6204 : new_instance->cost_vec = vNULL;
5262 6204 : new_instance->subgraph_entries = vNULL;
5263 :
5264 6204 : if (dump_enabled_p ())
5265 844 : dump_printf_loc (MSG_NOTE, vect_location,
5266 : "SLP size %u vs. limit %u.\n",
5267 : tree_size, max_tree_size);
5268 :
5269 6204 : vinfo->slp_instances.safe_push (new_instance);
5270 :
5271 : /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
5272 : the number of scalar stmts in the root in a few places.
5273 : Verify that assumption holds. */
5274 12408 : gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
5275 : .length () == group_size);
5276 :
5277 6204 : if (dump_enabled_p ())
5278 : {
5279 844 : dump_printf_loc (MSG_NOTE, vect_location,
5280 : "Final SLP tree for instance %p:\n",
5281 : (void *) new_instance);
5282 844 : vect_print_slp_graph (MSG_NOTE, vect_location,
5283 : SLP_INSTANCE_TREE (new_instance));
5284 : }
5285 6204 : return true;
5286 6434 : }
5287 : else
5288 : /* Free the allocated memory. */
5289 249003 : scalar_stmts.release ();
5290 :
5291 : /* Even though the first vector did not all match, we might be able to SLP
5292 : (some) of the remainder. FORNOW ignore this possibility. */
5293 : }
5294 : else
5295 : /* Free the allocated memory. */
5296 1 : scalar_stmts.release ();
5297 :
5298 : /* Failed to SLP. */
5299 249004 : if (dump_enabled_p ())
5300 42 : dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
5301 : return false;
5302 : }
5303 :
5304 : /* qsort comparator ordering SLP load nodes. */
5305 :
5306 : static int
5307 2243707 : vllp_cmp (const void *a_, const void *b_)
5308 : {
5309 2243707 : const slp_tree a = *(const slp_tree *)a_;
5310 2243707 : const slp_tree b = *(const slp_tree *)b_;
5311 2243707 : stmt_vec_info a0 = SLP_TREE_SCALAR_STMTS (a)[0];
5312 2243707 : stmt_vec_info b0 = SLP_TREE_SCALAR_STMTS (b)[0];
5313 2243707 : if (STMT_VINFO_GROUPED_ACCESS (a0)
5314 1374376 : && STMT_VINFO_GROUPED_ACCESS (b0)
5315 3557681 : && DR_GROUP_FIRST_ELEMENT (a0) == DR_GROUP_FIRST_ELEMENT (b0))
5316 : {
5317 : /* Same group, order after lanes used. */
5318 296477 : if (SLP_TREE_LANES (a) < SLP_TREE_LANES (b))
5319 : return 1;
5320 290446 : else if (SLP_TREE_LANES (a) > SLP_TREE_LANES (b))
5321 : return -1;
5322 : else
5323 : {
5324 : /* Try to order loads using the same lanes together, breaking
5325 : the tie with the lane number that first differs. */
5326 283788 : if (!SLP_TREE_LOAD_PERMUTATION (a).exists ()
5327 283788 : && !SLP_TREE_LOAD_PERMUTATION (b).exists ())
5328 : return 0;
5329 283788 : else if (SLP_TREE_LOAD_PERMUTATION (a).exists ()
5330 283788 : && !SLP_TREE_LOAD_PERMUTATION (b).exists ())
5331 : return 1;
5332 281194 : else if (!SLP_TREE_LOAD_PERMUTATION (a).exists ()
5333 281194 : && SLP_TREE_LOAD_PERMUTATION (b).exists ())
5334 : return -1;
5335 : else
5336 : {
5337 276414 : for (unsigned i = 0; i < SLP_TREE_LANES (a); ++i)
5338 276414 : if (SLP_TREE_LOAD_PERMUTATION (a)[i]
5339 276414 : != SLP_TREE_LOAD_PERMUTATION (b)[i])
5340 : {
5341 : /* In-order lane first, that's what the above case for
5342 : no permutation does. */
5343 275582 : if (SLP_TREE_LOAD_PERMUTATION (a)[i] == i)
5344 : return -1;
5345 167937 : else if (SLP_TREE_LOAD_PERMUTATION (b)[i] == i)
5346 : return 1;
5347 88830 : else if (SLP_TREE_LOAD_PERMUTATION (a)[i]
5348 88830 : < SLP_TREE_LOAD_PERMUTATION (b)[i])
5349 : return -1;
5350 : else
5351 : return 1;
5352 : }
5353 : return 0;
5354 : }
5355 : }
5356 : }
5357 : else /* Different groups or non-groups. */
5358 : {
5359 : /* Order groups as their first element to keep them together. */
5360 1947230 : if (STMT_VINFO_GROUPED_ACCESS (a0))
5361 1947230 : a0 = DR_GROUP_FIRST_ELEMENT (a0);
5362 1947230 : if (STMT_VINFO_GROUPED_ACCESS (b0))
5363 1947230 : b0 = DR_GROUP_FIRST_ELEMENT (b0);
5364 1947230 : if (a0 == b0)
5365 : return 0;
5366 : /* Tie using UID. */
5367 1947110 : else if (gimple_uid (STMT_VINFO_STMT (a0))
5368 1947110 : < gimple_uid (STMT_VINFO_STMT (b0)))
5369 : return -1;
5370 : else
5371 : {
5372 856954 : gcc_assert (gimple_uid (STMT_VINFO_STMT (a0))
5373 : != gimple_uid (STMT_VINFO_STMT (b0)));
5374 : return 1;
5375 : }
5376 : }
5377 : }
5378 :
5379 : /* Return whether if the load permutation of NODE is consecutive starting
5380 : with value START_VAL in the first element. If START_VAL is not given
5381 : the first element's value is used. */
5382 :
5383 : bool
5384 544354 : vect_load_perm_consecutive_p (slp_tree node, unsigned start_val)
5385 : {
5386 544354 : load_permutation_t perm = SLP_TREE_LOAD_PERMUTATION (node);
5387 :
5388 544354 : if (!perm.exists () || !perm.length ())
5389 : return false;
5390 :
5391 544354 : if (start_val == UINT_MAX)
5392 74008 : start_val = perm[0];
5393 :
5394 1076385 : for (unsigned int i = 0; i < perm.length (); i++)
5395 550147 : if (perm[i] != start_val + (unsigned int) i)
5396 : return false;
5397 :
5398 : return true;
5399 : }
5400 :
5401 : /* Process the set of LOADS that are all from the same dataref group. */
5402 :
static void
vect_lower_load_permutations (loop_vec_info loop_vinfo,
			      scalar_stmts_to_slp_tree_map_t *bst_map,
			      const array_slice<slp_tree> &loads,
			      bool force_single_lane)
{
  /* We at this point want to lower without a fixed VF or vector
     size in mind which means we cannot actually compute whether we
     need three or more vectors for a load permutation yet.  So always
     lower.  */
  stmt_vec_info first
    = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (loads[0])[0]);
  unsigned group_lanes = DR_GROUP_SIZE (first);

  /* Verify if all load permutations can be implemented with a suitably
     large element load-lanes operation.  LD_LANES_LANES is the common
     per-load lane count when that is possible and zero otherwise.  */
  unsigned ld_lanes_lanes = SLP_TREE_LANES (loads[0]);
  if (STMT_VINFO_STRIDED_P (first)
      || compare_step_with_zero (loop_vinfo, first) <= 0
      || exact_log2 (ld_lanes_lanes) == -1
      /* ??? For now only support the single-lane case as there is
	 missing support on the store-lane side and code generation
	 isn't up to the task yet.  */
      || ld_lanes_lanes != 1
      || vect_load_lanes_supported (SLP_TREE_VECTYPE (loads[0]),
				    group_lanes / ld_lanes_lanes,
				    false) == IFN_LAST)
    ld_lanes_lanes = 0;
  else
    /* Verify the loads access the same number of lanes aligned to
       ld_lanes_lanes.  */
    for (slp_tree load : loads)
      {
	if (SLP_TREE_LANES (load) != ld_lanes_lanes)
	  {
	    ld_lanes_lanes = 0;
	    break;
	  }
	/* The first accessed lane has to be aligned to the common
	   lane count ...  */
	unsigned first = SLP_TREE_LOAD_PERMUTATION (load)[0];
	if (first % ld_lanes_lanes != 0)
	  {
	    ld_lanes_lanes = 0;
	    break;
	  }
	/* ... and the accessed lanes have to be consecutive.  */
	if (!vect_load_perm_consecutive_p (load))
	  {
	    ld_lanes_lanes = 0;
	    break;
	  }
      }

  /* Only a power-of-two number of lanes matches interleaving with N levels.
     ??? An even number of lanes could be reduced to 1<<ceil_log2(N)-1 lanes
     at each step.  */
  if (ld_lanes_lanes == 0 && exact_log2 (group_lanes) == -1 && group_lanes != 3)
    return;

  for (slp_tree load : loads)
    {
      /* Leave masked or gather loads alone for now.  */
      if (!SLP_TREE_CHILDREN (load).is_empty ())
	continue;

      /* For single-element interleaving spanning multiple vectors avoid
	 lowering, we want to use VMAT_ELEMENTWISE later.  */
      if (ld_lanes_lanes == 0
	  && SLP_TREE_LANES (load) == 1
	  && !DR_GROUP_NEXT_ELEMENT (first)
	  && maybe_gt (group_lanes,
		       TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (load))))
	return;

      /* We want to pattern-match special cases here and keep those
	 alone.  Candidates are splats and load-lane.  */

      /* We need to lower only loads of less than half of the groups
	 lanes, including duplicate lanes.  Note this leaves nodes
	 with a non-1:1 load permutation around instead of canonicalizing
	 those into a load and a permute node.  Removing this early
	 check would do such canonicalization.  */
      if (SLP_TREE_LANES (load) >= (group_lanes + 1) / 2
	  && ld_lanes_lanes == 0)
	continue;

      /* Build the permute to get the original load permutation order.
	 FINAL_PERM maps each of LOAD's lanes to a lane of the (gradually
	 reduced) unpermuted group node and is updated as intermediate
	 permutes are created below.  */
      bool contiguous = vect_load_perm_consecutive_p (load);
      lane_permutation_t final_perm;
      final_perm.create (SLP_TREE_LANES (load));
      for (unsigned i = 0; i < SLP_TREE_LANES (load); ++i)
	final_perm.quick_push (
	  std::make_pair (0, SLP_TREE_LOAD_PERMUTATION (load)[i]));

      /* When the load permutation accesses a contiguous unpermuted,
	 power-of-two aligned and sized chunk leave the load alone.
	 We can likely (re-)load it more efficiently rather than
	 extracting it from the larger load.
	 ??? Long-term some of the lowering should move to where
	 the vector types involved are fixed.  */
      if (!force_single_lane
	  && ld_lanes_lanes == 0
	  && contiguous
	  && (SLP_TREE_LANES (load) > 1 || loads.size () == 1)
	  && pow2p_hwi (SLP_TREE_LANES (load))
	  && pow2p_hwi (group_lanes)
	  && SLP_TREE_LOAD_PERMUTATION (load)[0] % SLP_TREE_LANES (load) == 0
	  && group_lanes % SLP_TREE_LANES (load) == 0)
	{
	  final_perm.release ();
	  continue;
	}

      /* First build (and possibly re-use via bst_map) a load node for the
	 unpermuted group.  Gaps in the middle and on the end are
	 represented with NULL stmts.  */
      vec<stmt_vec_info> stmts;
      stmts.create (group_lanes);
      for (stmt_vec_info s = first; s; s = DR_GROUP_NEXT_ELEMENT (s))
	{
	  if (s != first)
	    for (unsigned i = 1; i < DR_GROUP_GAP (s); ++i)
	      stmts.quick_push (NULL);
	  stmts.quick_push (s);
	}
      /* The trailing gap of the group, if any.  */
      for (unsigned i = 0; i < DR_GROUP_GAP (first); ++i)
	stmts.quick_push (NULL);
      poly_uint64 max_nunits = 1;
      bool *matches = XALLOCAVEC (bool, group_lanes);
      unsigned limit = 1;
      unsigned tree_size = 0;
      slp_tree l0 = vect_build_slp_tree (loop_vinfo, stmts,
					 group_lanes,
					 &max_nunits, matches, &limit,
					 &tree_size, bst_map);
      gcc_assert (!SLP_TREE_LOAD_PERMUTATION (l0).exists ());

      if (ld_lanes_lanes != 0)
	{
	  /* ??? If this is not in sync with what get_load_store_type
	     later decides the SLP representation is not good for other
	     store vectorization methods.  */
	  l0->ldst_lanes = true;
	  load->ldst_lanes = true;
	}

      /* Iteratively halve the lanes of the unpermuted node L0 until
	 LOAD covers at least half of the remaining lanes (or load-lanes
	 is used which needs no reduction).  */
      while (1)
	{
	  unsigned group_lanes = SLP_TREE_LANES (l0);
	  if (ld_lanes_lanes != 0
	      || SLP_TREE_LANES (load) >= (group_lanes + 1) / 2)
	    break;

	  /* Try to lower by reducing the group to half its size using an
	     interleaving scheme.  For this try to compute whether all
	     elements needed for this load are in even or odd elements of
	     an even/odd decomposition with N consecutive elements.
	     Thus { e, e, o, o, e, e, o, o } woud be an even/odd decomposition
	     with N == 2.  */
	  /* ??? Only an even number of lanes can be handed this way, but the
	     fallback below could work for any number.  We have to make sure
	     to round up in that case.  */
	  gcc_assert ((group_lanes & 1) == 0 || group_lanes == 3);
	  /* EVEN/ODD end up as bitmasks of the N-element positions all
	     used lanes agree on (bit set means possible level N).  */
	  unsigned even = 0, odd = 0;
	  if ((group_lanes & 1) == 0)
	    {
	      even = (1 << ceil_log2 (group_lanes)) - 1;
	      odd = even;
	      for (auto l : final_perm)
		{
		  even &= ~l.second;
		  odd &= l.second;
		}
	    }

	  /* Now build an even or odd extraction from the unpermuted load.  */
	  lane_permutation_t perm;
	  perm.create ((group_lanes + 1) / 2);
	  /* The smallest N for which all lanes are even (resp. odd).  */
	  unsigned even_level = even ? 1 << ctz_hwi (even) : 0;
	  unsigned odd_level = odd ? 1 << ctz_hwi (odd) : 0;
	  if (even_level
	      && group_lanes % (2 * even_level) == 0
	      /* ??? When code generating permutes we do not try to pun
		 to larger component modes so level != 1 isn't a natural
		 even/odd extract.  Prefer one if possible.  */
	      && (even_level == 1 || !odd_level || odd_level != 1))
	    {
	      /* { 0, 1, ... 4, 5 ..., } */
	      for (unsigned i = 0; i < group_lanes / 2 / even_level; ++i)
		for (unsigned j = 0; j < even_level; ++j)
		  perm.quick_push (std::make_pair (0, 2 * i * even_level + j));
	    }
	  else if (odd_level)
	    {
	      /* { ..., 2, 3, ... 6, 7 } */
	      gcc_assert (group_lanes % (2 * odd_level) == 0);
	      for (unsigned i = 0; i < group_lanes / 2 / odd_level; ++i)
		for (unsigned j = 0; j < odd_level; ++j)
		  perm.quick_push
		    (std::make_pair (0, (2 * i + 1) * odd_level + j));
	    }
	  else
	    {
	      /* As fallback extract all used lanes and fill to half the
		 group size by repeating the last element.
		 ??? This is quite a bad strathegy for re-use - we could
		 brute force our way to find more optimal filling lanes to
		 maximize re-use when looking at all loads from the group.  */
	      auto_bitmap l;
	      for (auto p : final_perm)
		bitmap_set_bit (l, p.second);
	      unsigned i = 0;
	      bitmap_iterator bi;
	      EXECUTE_IF_SET_IN_BITMAP (l, 0, i, bi)
		perm.quick_push (std::make_pair (0, i));
	      while (perm.length () < (group_lanes + 1) / 2)
		perm.quick_push (perm.last ());
	    }

	  /* Update final_perm with the intermediate permute: each
	     referenced lane now lives at its position in PERM.  */
	  for (unsigned i = 0; i < final_perm.length (); ++i)
	    {
	      unsigned l = final_perm[i].second;
	      unsigned j;
	      for (j = 0; j < perm.length (); ++j)
		if (perm[j].second == l)
		  {
		    final_perm[i].second = j;
		    break;
		  }
	      /* Every used lane must be covered by the extraction.  */
	      gcc_assert (j < perm.length ());
	    }

	  /* And create scalar stmts.  */
	  vec<stmt_vec_info> perm_stmts;
	  perm_stmts.create (perm.length ());
	  for (unsigned i = 0; i < perm.length (); ++i)
	    perm_stmts.quick_push (SLP_TREE_SCALAR_STMTS (l0)[perm[i].second]);

	  slp_tree p = vect_create_new_slp_node (1, VEC_PERM_EXPR);
	  SLP_TREE_CHILDREN (p).quick_push (l0);
	  SLP_TREE_LANE_PERMUTATION (p) = perm;
	  SLP_TREE_VECTYPE (p) = SLP_TREE_VECTYPE (load);
	  SLP_TREE_LANES (p) = perm.length ();
	  SLP_TREE_REPRESENTATIVE (p) = SLP_TREE_REPRESENTATIVE (load);
	  /* ??? As we have scalar stmts for this intermediate permute we
	     could CSE it via bst_map but we do not want to pick up
	     another SLP node with a load permutation.  We instead should
	     have a "local" CSE map here.  */
	  SLP_TREE_SCALAR_STMTS (p) = perm_stmts;

	  /* We now have a node for (group_lanes + 1) / 2 lanes.  */
	  l0 = p;
	}

      /* And finally from the ordered reduction node create the
	 permute to shuffle the lanes into the original load-permutation
	 order.  We replace the original load node with this.  */
      SLP_TREE_CODE (load) = VEC_PERM_EXPR;
      SLP_TREE_LOAD_PERMUTATION (load).release ();
      SLP_TREE_LANE_PERMUTATION (load) = final_perm;
      SLP_TREE_CHILDREN (load).create (1);
      SLP_TREE_CHILDREN (load).quick_push (l0);
    }
}
5666 :
5667 : /* Transform SLP loads in the SLP graph created by SLP discovery to
5668 : group loads from the same group and lower load permutations that
5669 : are unlikely to be supported into a series of permutes.
5670 : In the degenerate case of having only single-lane SLP instances
5671 : this should result in a series of permute nodes emulating an
5672 : interleaving scheme. */
5673 :
5674 : static void
5675 405823 : vect_lower_load_permutations (loop_vec_info loop_vinfo,
5676 : scalar_stmts_to_slp_tree_map_t *bst_map,
5677 : bool force_single_lane)
5678 : {
5679 : /* Gather and sort loads across all instances. */
5680 405823 : hash_set<slp_tree> visited;
5681 405823 : auto_vec<slp_tree> loads;
5682 1879773 : for (auto inst : loop_vinfo->slp_instances)
5683 664226 : vect_gather_slp_loads (loads, SLP_INSTANCE_TREE (inst), visited);
5684 405823 : if (loads.is_empty ())
5685 71505 : return;
5686 334318 : loads.qsort (vllp_cmp);
5687 :
5688 : /* Now process each dataref group separately. */
5689 334318 : unsigned firsti = 0;
5690 621891 : for (unsigned i = 1; i < loads.length (); ++i)
5691 : {
5692 287573 : slp_tree first = loads[firsti];
5693 287573 : slp_tree next = loads[i];
5694 287573 : stmt_vec_info a0 = SLP_TREE_SCALAR_STMTS (first)[0];
5695 287573 : stmt_vec_info b0 = SLP_TREE_SCALAR_STMTS (next)[0];
5696 287573 : if (STMT_VINFO_GROUPED_ACCESS (a0)
5697 145112 : && STMT_VINFO_GROUPED_ACCESS (b0)
5698 419718 : && DR_GROUP_FIRST_ELEMENT (a0) == DR_GROUP_FIRST_ELEMENT (b0))
5699 54268 : continue;
5700 : /* Now we have one or multiple SLP loads of the same group from
5701 : firsti to i - 1. */
5702 233305 : if (STMT_VINFO_GROUPED_ACCESS (a0))
5703 90844 : vect_lower_load_permutations (loop_vinfo, bst_map,
5704 90844 : make_array_slice (&loads[firsti],
5705 : i - firsti),
5706 : force_single_lane);
5707 : firsti = i;
5708 : }
5709 668636 : if (firsti < loads.length ()
5710 668636 : && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (loads[firsti])[0]))
5711 60443 : vect_lower_load_permutations (loop_vinfo, bst_map,
5712 60443 : make_array_slice (&loads[firsti],
5713 60443 : loads.length () - firsti),
5714 : force_single_lane);
5715 405823 : }
5716 :
5717 : /* Check if there are stmts in the loop can be vectorized using SLP. Build SLP
5718 : trees of packed scalar stmts if SLP is possible. */
5719 :
5720 : opt_result
5721 1036359 : vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size,
5722 : bool force_single_lane)
5723 : {
5724 1036359 : loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5725 1036359 : unsigned int i;
5726 1036359 : stmt_vec_info first_element;
5727 1036359 : slp_instance instance;
5728 :
5729 1036359 : DUMP_VECT_SCOPE ("vect_analyze_slp");
5730 :
5731 1036359 : unsigned limit = max_tree_size;
5732 :
5733 1036359 : scalar_stmts_to_slp_tree_map_t *bst_map
5734 1036359 : = new scalar_stmts_to_slp_tree_map_t ();
5735 :
5736 : /* Find SLP sequences starting from groups of grouped stores. */
5737 2980266 : FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
5738 907786 : if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
5739 : slp_inst_kind_store, max_tree_size, &limit,
5740 : force_single_lane)
5741 907786 : && loop_vinfo)
5742 : {
5743 238 : release_scalar_stmts_to_slp_tree_map (bst_map);
5744 238 : return opt_result::failure_at (vect_location, "SLP build failed.\n");
5745 : }
5746 :
5747 : /* For loops also start SLP discovery from non-grouped stores. */
5748 1036121 : if (loop_vinfo)
5749 : {
5750 : data_reference_p dr;
5751 1371732 : FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr)
5752 948418 : if (DR_IS_WRITE (dr))
5753 : {
5754 287920 : stmt_vec_info stmt_info = vinfo->lookup_dr (dr)->stmt;
5755 : /* Grouped stores are already handled above. */
5756 287920 : if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
5757 76098 : continue;
5758 211822 : vec<stmt_vec_info> stmts;
5759 211822 : vec<stmt_vec_info> roots = vNULL;
5760 211822 : vec<tree> remain = vNULL;
5761 211822 : stmts.create (1);
5762 211822 : stmts.quick_push (stmt_info);
5763 211822 : if (! vect_build_slp_instance (vinfo, slp_inst_kind_store,
5764 : stmts, roots, remain, max_tree_size,
5765 : &limit, bst_map, force_single_lane))
5766 : {
5767 3718 : release_scalar_stmts_to_slp_tree_map (bst_map);
5768 3718 : return opt_result::failure_at (vect_location,
5769 : "SLP build failed.\n");
5770 : }
5771 : }
5772 :
5773 : stmt_vec_info stmt_info;
5774 423354 : FOR_EACH_VEC_ELT (LOOP_VINFO_ALTERNATE_DEFS (loop_vinfo), i, stmt_info)
5775 : {
5776 20 : vec<stmt_vec_info> stmts;
5777 20 : vec<stmt_vec_info> roots = vNULL;
5778 20 : vec<tree> remain = vNULL;
5779 20 : stmts.create (1);
5780 20 : stmts.quick_push (stmt_info);
5781 20 : if (! vect_build_slp_instance (vinfo, slp_inst_kind_store,
5782 : stmts, roots, remain, max_tree_size,
5783 : &limit, bst_map, force_single_lane))
5784 : {
5785 0 : release_scalar_stmts_to_slp_tree_map (bst_map);
5786 0 : return opt_result::failure_at (vect_location,
5787 : "SLP build failed.\n");
5788 : }
5789 : }
5790 : }
5791 :
5792 1032403 : if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
5793 : {
5794 1813553 : for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
5795 : {
5796 1204464 : vect_location = bb_vinfo->roots[i].roots[0]->stmt;
5797 : /* Apply patterns. */
5798 3767026 : for (unsigned j = 0; j < bb_vinfo->roots[i].stmts.length (); ++j)
5799 5125124 : bb_vinfo->roots[i].stmts[j]
5800 2636047 : = vect_stmt_to_vectorize (bb_vinfo->roots[i].stmts[j]);
5801 1204464 : if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
5802 1204464 : bb_vinfo->roots[i].stmts,
5803 1204464 : bb_vinfo->roots[i].roots,
5804 1204464 : bb_vinfo->roots[i].remain,
5805 : max_tree_size, &limit, bst_map, false))
5806 : {
5807 127167 : bb_vinfo->roots[i].roots = vNULL;
5808 127167 : bb_vinfo->roots[i].remain = vNULL;
5809 : }
5810 1204464 : bb_vinfo->roots[i].stmts = vNULL;
5811 : }
5812 : }
5813 :
5814 1032403 : if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
5815 : {
5816 : /* Find SLP sequences starting from groups of reductions. */
5817 423314 : if (!vect_analyze_slp_reductions (loop_vinfo, max_tree_size, &limit,
5818 : bst_map, force_single_lane))
5819 : {
5820 2585 : release_scalar_stmts_to_slp_tree_map (bst_map);
5821 2585 : return opt_result::failure_at (vect_location, "SLP build failed.\n");
5822 : }
5823 :
5824 : /* Make sure to vectorize only-live stmts, usually inductions. */
5825 1925740 : for (edge e : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
5826 1271841 : for (auto gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi);
5827 598796 : gsi_next (&gsi))
5828 : {
5829 608288 : gphi *lc_phi = *gsi;
5830 608288 : tree def = gimple_phi_arg_def_from_edge (lc_phi, e);
5831 608288 : stmt_vec_info stmt_info;
5832 608288 : if (TREE_CODE (def) == SSA_NAME
5833 497150 : && !virtual_operand_p (def)
5834 268710 : && (stmt_info = loop_vinfo->lookup_def (def))
5835 237887 : && ((stmt_info = vect_stmt_to_vectorize (stmt_info)), true)
5836 237887 : && STMT_VINFO_RELEVANT (stmt_info) == vect_used_only_live
5837 178157 : && STMT_VINFO_LIVE_P (stmt_info)
5838 178157 : && !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info))
5839 714672 : && STMT_VINFO_REDUC_IDX (stmt_info) == -1)
5840 : {
5841 106305 : vec<stmt_vec_info> stmts;
5842 106305 : vec<stmt_vec_info> roots = vNULL;
5843 106305 : vec<tree> remain = vNULL;
5844 106305 : stmts.create (1);
5845 106305 : stmts.quick_push (vect_stmt_to_vectorize (stmt_info));
5846 106305 : if (! vect_build_slp_instance (vinfo,
5847 : slp_inst_kind_reduc_group,
5848 : stmts, roots, remain,
5849 : max_tree_size, &limit,
5850 : bst_map, force_single_lane))
5851 : {
5852 9492 : release_scalar_stmts_to_slp_tree_map (bst_map);
5853 9492 : return opt_result::failure_at (vect_location,
5854 : "SLP build failed.\n");
5855 : }
5856 : }
5857 9492 : }
5858 :
5859 : /* Find SLP sequences starting from gconds. */
5860 1108605 : for (auto cond : LOOP_VINFO_LOOP_CONDS (loop_vinfo))
5861 : {
5862 274446 : auto cond_info = loop_vinfo->lookup_stmt (cond);
5863 :
5864 274446 : cond_info = vect_stmt_to_vectorize (cond_info);
5865 274446 : vec<stmt_vec_info> roots = vNULL;
5866 274446 : roots.safe_push (cond_info);
5867 274446 : gimple *stmt = STMT_VINFO_STMT (cond_info);
5868 274446 : tree args0 = gimple_cond_lhs (stmt);
5869 274446 : tree args1 = gimple_cond_rhs (stmt);
5870 :
5871 : /* These should be enforced by cond lowering, but if it failed
5872 : bail. */
5873 274446 : if (gimple_cond_code (stmt) != NE_EXPR
5874 273368 : || TREE_TYPE (args0) != boolean_type_node
5875 547166 : || !integer_zerop (args1))
5876 : {
5877 1726 : roots.release ();
5878 1726 : release_scalar_stmts_to_slp_tree_map (bst_map);
5879 1726 : return opt_result::failure_at (vect_location,
5880 : "SLP build failed.\n");
5881 : }
5882 :
      /* An argument without a loop def will be codegened from vectorizing the
	 root gcond itself.  As such we don't need to try to build an SLP tree
	 from it.  If both arguments have a def, the resulting SLP tree here is
	 highly likely to be incompatible, but we rely on it being split
	 later on.  */
5888 272720 : auto varg = loop_vinfo->lookup_def (args0);
5889 272720 : vec<stmt_vec_info> stmts;
5890 272720 : vec<tree> remain = vNULL;
5891 272720 : stmts.create (1);
5892 272720 : stmts.quick_push (vect_stmt_to_vectorize (varg));
5893 :
5894 272720 : if (! vect_build_slp_instance (vinfo, slp_inst_kind_gcond,
5895 : stmts, roots, remain,
5896 : max_tree_size, &limit,
5897 : bst_map, force_single_lane))
5898 : {
5899 3688 : roots.release ();
5900 3688 : release_scalar_stmts_to_slp_tree_map (bst_map);
5901 3688 : return opt_result::failure_at (vect_location,
5902 : "SLP build failed.\n");
5903 : }
5904 : }
5905 : }
5906 :
5907 1014912 : hash_set<slp_tree> visited_patterns;
5908 1014912 : slp_tree_to_load_perm_map_t perm_cache;
5909 1014912 : slp_compat_nodes_map_t compat_cache;
5910 :
5911 : /* See if any patterns can be found in the SLP tree. */
5912 1014912 : bool pattern_found = false;
5913 3478047 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
5914 1448223 : pattern_found |= vect_match_slp_patterns (instance, vinfo,
5915 : &visited_patterns, &perm_cache,
5916 : &compat_cache);
5917 :
5918 : /* If any were found optimize permutations of loads. */
5919 1014912 : if (pattern_found)
5920 : {
5921 202 : hash_map<slp_tree, slp_tree> load_map;
5922 3239 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
5923 : {
5924 2835 : slp_tree root = SLP_INSTANCE_TREE (instance);
5925 2835 : optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
5926 : &load_map, root);
5927 : }
5928 202 : }
5929 :
5930 : /* Check whether we should force some SLP instances to use load/store-lanes
5931 : and do so by forcing SLP re-discovery with single lanes. We used
5932 : to cancel SLP when this applied to all instances in a loop but now
5933 : we decide this per SLP instance. It's important to do this only
5934 : after SLP pattern recognition. */
5935 1014912 : if (is_a <loop_vec_info> (vinfo))
5936 1070049 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
5937 664226 : if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
5938 229875 : && !SLP_INSTANCE_TREE (instance)->ldst_lanes)
5939 : {
5940 229875 : slp_tree slp_root = SLP_INSTANCE_TREE (instance);
5941 229875 : unsigned int group_size = SLP_TREE_LANES (slp_root);
5942 229875 : tree vectype = SLP_TREE_VECTYPE (slp_root);
5943 :
5944 229875 : stmt_vec_info rep_info = SLP_TREE_REPRESENTATIVE (slp_root);
5945 229875 : gimple *rep = STMT_VINFO_STMT (rep_info);
5946 229875 : bool masked = (is_gimple_call (rep)
5947 1366 : && gimple_call_internal_p (rep)
5948 231221 : && internal_fn_mask_index
5949 1346 : (gimple_call_internal_fn (rep)) != -1);
5950 229855 : if (!STMT_VINFO_GROUPED_ACCESS (rep_info)
5951 23466 : || slp_root->ldst_lanes
5952 253341 : || (vect_store_lanes_supported (vectype, group_size, masked)
5953 : == IFN_LAST))
5954 229875 : continue;
5955 :
5956 0 : auto_vec<slp_tree> loads;
5957 0 : hash_set<slp_tree> visited;
5958 0 : vect_gather_slp_loads (loads, slp_root, visited);
5959 :
5960 : /* Check whether any load in the SLP instance is possibly
5961 : permuted. */
5962 0 : bool loads_permuted = false;
5963 0 : slp_tree load_node;
5964 0 : unsigned j;
5965 0 : FOR_EACH_VEC_ELT (loads, j, load_node)
5966 : {
5967 0 : if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
5968 0 : continue;
5969 : unsigned k;
5970 : stmt_vec_info load_info;
5971 0 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), k, load_info)
5972 0 : if (SLP_TREE_LOAD_PERMUTATION (load_node)[k] != k)
5973 : {
5974 : loads_permuted = true;
5975 : break;
5976 : }
5977 : }
5978 :
5979 : /* If the loads and stores can use load/store-lanes force re-discovery
5980 : with single lanes. */
5981 0 : if (loads_permuted)
5982 : {
5983 0 : bool can_use_lanes = true;
5984 : bool prefer_load_lanes = false;
5985 0 : FOR_EACH_VEC_ELT (loads, j, load_node)
5986 0 : if (STMT_VINFO_GROUPED_ACCESS
5987 : (SLP_TREE_REPRESENTATIVE (load_node)))
5988 : {
5989 0 : stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
5990 : (SLP_TREE_REPRESENTATIVE (load_node));
5991 0 : rep = STMT_VINFO_STMT (stmt_vinfo);
5992 0 : masked = (is_gimple_call (rep)
5993 0 : && gimple_call_internal_p (rep)
5994 0 : && internal_fn_mask_index
5995 0 : (gimple_call_internal_fn (rep)));
5996 : /* Use SLP for strided accesses (or if we can't
5997 : load-lanes). */
5998 0 : if (STMT_VINFO_STRIDED_P (stmt_vinfo)
5999 0 : || compare_step_with_zero (vinfo, stmt_vinfo) <= 0
6000 0 : || vect_load_lanes_supported
6001 0 : (SLP_TREE_VECTYPE (load_node),
6002 0 : DR_GROUP_SIZE (stmt_vinfo), masked) == IFN_LAST
6003 : /* ??? During SLP re-discovery with a single lane
6004 : a masked grouped load will appear permuted and
6005 : discovery will fail. We have to rework this
6006 : on the discovery side - for now avoid ICEing. */
6007 0 : || masked)
6008 : {
6009 : can_use_lanes = false;
6010 : break;
6011 : }
6012 : /* Make sure that the target would prefer store-lanes
6013 : for at least one of the loads.
6014 :
6015 : ??? Perhaps we should instead require this for
6016 : all loads? */
6017 0 : prefer_load_lanes
6018 : = (prefer_load_lanes
6019 0 : || SLP_TREE_LANES (load_node) == group_size
6020 0 : || (vect_slp_prefer_store_lanes_p
6021 0 : (vinfo, stmt_vinfo,
6022 : SLP_TREE_VECTYPE (load_node), masked,
6023 : group_size, SLP_TREE_LANES (load_node))));
6024 : }
6025 :
6026 0 : if (can_use_lanes && prefer_load_lanes)
6027 : {
6028 0 : if (dump_enabled_p ())
6029 0 : dump_printf_loc (MSG_NOTE, vect_location,
6030 : "SLP instance %p can use load/store-lanes,"
6031 : " re-discovering with single-lanes\n",
6032 : (void *) instance);
6033 :
6034 0 : stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (slp_root);
6035 :
6036 0 : vect_free_slp_instance (instance);
6037 0 : limit = max_tree_size;
6038 0 : bool res = vect_analyze_slp_instance (vinfo, bst_map,
6039 : stmt_info,
6040 : slp_inst_kind_store,
6041 : max_tree_size, &limit,
6042 : true);
6043 0 : gcc_assert (res);
6044 0 : auto new_inst = LOOP_VINFO_SLP_INSTANCES (vinfo).pop ();
6045 0 : LOOP_VINFO_SLP_INSTANCES (vinfo)[i] = new_inst;
6046 : }
6047 : }
6048 0 : }
6049 :
6050 : /* When we end up with load permutations that we cannot possibly handle,
6051 : like those requiring three vector inputs, lower them using interleaving
6052 : like schemes. */
6053 1014912 : if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
6054 : {
6055 405823 : vect_lower_load_permutations (loop_vinfo, bst_map, force_single_lane);
6056 405823 : if (dump_enabled_p ())
6057 : {
6058 19245 : dump_printf_loc (MSG_NOTE, vect_location,
6059 : "SLP graph after lowering permutations:\n");
6060 19245 : hash_set<slp_tree> visited;
6061 86042 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
6062 28332 : vect_print_slp_graph (MSG_NOTE, vect_location,
6063 : SLP_INSTANCE_TREE (instance), visited);
6064 19245 : }
6065 : }
6066 :
6067 1014912 : release_scalar_stmts_to_slp_tree_map (bst_map);
6068 :
6069 1014912 : if (pattern_found && dump_enabled_p ())
6070 : {
6071 14 : dump_printf_loc (MSG_NOTE, vect_location,
6072 : "Pattern matched SLP tree\n");
6073 14 : hash_set<slp_tree> visited;
6074 74 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
6075 32 : vect_print_slp_graph (MSG_NOTE, vect_location,
6076 : SLP_INSTANCE_TREE (instance), visited);
6077 14 : }
6078 :
6079 1014912 : return opt_result::success ();
6080 1014912 : }
6081 :
6082 : /* Estimates the cost of inserting layout changes into the SLP graph.
6083 : It can also say that the insertion is impossible. */
6084 :
struct slpg_layout_cost
{
  slpg_layout_cost () = default;
  slpg_layout_cost (sreal, bool);

  /* Return a cost that represents an impossible partition/layout
     combination (maximum depth).  */
  static slpg_layout_cost impossible () { return { sreal::max (), 0 }; }
  /* Return true if this cost does not represent an impossible
     combination.  */
  bool is_possible () const { return depth != sreal::max (); }

  bool operator== (const slpg_layout_cost &) const;
  bool operator!= (const slpg_layout_cost &) const;

  /* Return true if this cost is better than the argument; the bool is
     true when optimizing for size rather than speed.  */
  bool is_better_than (const slpg_layout_cost &, bool) const;

  void add_parallel_cost (const slpg_layout_cost &);
  void add_serial_cost (const slpg_layout_cost &);
  void split (unsigned int);

  /* The longest sequence of layout changes needed during any traversal
     of the partition dag, weighted by execution frequency.

     This is the most important metric when optimizing for speed, since
     it helps to ensure that we keep the number of operations on
     critical paths to a minimum.  */
  sreal depth = 0;

  /* An estimate of the total number of operations needed.  It is weighted by
     execution frequency when optimizing for speed but not when optimizing for
     size.  In order to avoid double-counting, a node with a fanout of N will
     distribute 1/N of its total cost to each successor.

     This is the most important metric when optimizing for size, since
     it helps to keep the total number of operations to a minimum.  */
  sreal total = 0;
};
6119 :
6120 : /* Construct costs for a node with weight WEIGHT. A higher weight
6121 : indicates more frequent execution. IS_FOR_SIZE is true if we are
6122 : optimizing for size rather than speed. */
6123 :
6124 1172300 : slpg_layout_cost::slpg_layout_cost (sreal weight, bool is_for_size)
6125 1173168 : : depth (weight), total (is_for_size && weight > 0 ? 1 : weight)
6126 : {
6127 1172300 : }
6128 :
6129 : bool
6130 0 : slpg_layout_cost::operator== (const slpg_layout_cost &other) const
6131 : {
6132 0 : return depth == other.depth && total == other.total;
6133 : }
6134 :
6135 : bool
6136 0 : slpg_layout_cost::operator!= (const slpg_layout_cost &other) const
6137 : {
6138 0 : return !operator== (other);
6139 : }
6140 :
6141 : /* Return true if these costs are better than OTHER. IS_FOR_SIZE is
6142 : true if we are optimizing for size rather than speed. */
6143 :
6144 : bool
6145 292991 : slpg_layout_cost::is_better_than (const slpg_layout_cost &other,
6146 : bool is_for_size) const
6147 : {
6148 292991 : if (is_for_size)
6149 : {
6150 382 : if (total != other.total)
6151 159 : return total < other.total;
6152 223 : return depth < other.depth;
6153 : }
6154 : else
6155 : {
6156 292609 : if (depth != other.depth)
6157 125223 : return depth < other.depth;
6158 167386 : return total < other.total;
6159 : }
6160 : }
6161 :
6162 : /* Increase the costs to account for something with cost INPUT_COST
6163 : happening in parallel with the current costs. */
6164 :
6165 : void
6166 346132 : slpg_layout_cost::add_parallel_cost (const slpg_layout_cost &input_cost)
6167 : {
6168 346132 : depth = std::max (depth, input_cost.depth);
6169 346132 : total += input_cost.total;
6170 346132 : }
6171 :
6172 : /* Increase the costs to account for something with cost INPUT_COST
6173 : happening in series with the current costs. */
6174 :
6175 : void
6176 1412031 : slpg_layout_cost::add_serial_cost (const slpg_layout_cost &other)
6177 : {
6178 1412031 : depth += other.depth;
6179 1412031 : total += other.total;
6180 1412031 : }
6181 :
6182 : /* Split the total cost among TIMES successors or predecessors. */
6183 :
6184 : void
6185 1161954 : slpg_layout_cost::split (unsigned int times)
6186 : {
6187 1161954 : if (times > 1)
6188 483821 : total /= times;
6189 1161954 : }
6190 :
6191 : /* Information about one node in the SLP graph, for use during
6192 : vect_optimize_slp_pass. */
6193 :
struct slpg_vertex
{
  /* Construct a vertex that wraps NODE_; all other fields keep their
     defaults until the pass fills them in.  */
  slpg_vertex (slp_tree node_) : node (node_) {}

  /* The node itself.  */
  slp_tree node;

  /* Which partition the node belongs to, or -1 if none.  Nodes outside of
     partitions are flexible; they can have whichever layout consumers
     want them to have.  */
  int partition = -1;

  /* The number of nodes that directly use the result of this one
     (i.e. the number of nodes that count this one as a child).  */
  unsigned int out_degree = 0;

  /* The execution frequency of the node.  */
  sreal weight = 0;

  /* The total execution frequency of all nodes that directly use the
     result of this one.  */
  sreal out_weight = 0;
};
6217 :
6218 : /* Information about one partition of the SLP graph, for use during
6219 : vect_optimize_slp_pass. */
6220 :
struct slpg_partition_info
{
  /* The nodes in the partition occupy indices [NODE_BEGIN, NODE_END)
     of m_partitioned_nodes.  */
  unsigned int node_begin = 0;
  unsigned int node_end = 0;

  /* Which layout we've chosen to use for this partition, or -1 if
     we haven't picked one yet.  NOTE(review): this appears to be an
     index into vect_optimize_slp_pass::m_perms, where entry 0 means
     "no change" -- confirm against the uses.  */
  int layout = -1;

  /* The number of predecessors and successors in the partition dag.
     The predecessors always have lower partition numbers and the
     successors always have higher partition numbers.

     Note that the directions of these edges are not necessarily the
     same as in the data flow graph.  For example, if an SCC has separate
     partitions for an inner loop and an outer loop, the inner loop's
     partition will have at least two incoming edges from the outer loop's
     partition: one for a live-in value and one for a live-out value.
     In data flow terms, one of these edges would also be from the outer loop
     to the inner loop, but the other would be in the opposite direction.  */
  unsigned int in_degree = 0;
  unsigned int out_degree = 0;
};
6246 :
6247 : /* Information about the costs of using a particular layout for a
6248 : particular partition. It can also say that the combination is
6249 : impossible. */
6250 :
struct slpg_partition_layout_costs
{
  /* Return true if this partition/layout combination is usable.  */
  bool is_possible () const { return internal_cost.is_possible (); }
  /* Record that this partition/layout combination is not usable by
     giving it an impossible internal cost.  */
  void mark_impossible () { internal_cost = slpg_layout_cost::impossible (); }

  /* The costs inherited from predecessor partitions.  */
  slpg_layout_cost in_cost;

  /* The inherent cost of the layout within the node itself.  For example,
     this is nonzero for a load if choosing a particular layout would require
     the load to permute the loaded elements.  It is nonzero for a
     VEC_PERM_EXPR if the permutation cannot be eliminated or converted
     to full-vector moves.  */
  slpg_layout_cost internal_cost;

  /* The costs inherited from successor partitions.  */
  slpg_layout_cost out_cost;
};
6269 :
6270 : /* This class tries to optimize the layout of vectors in order to avoid
6271 : unnecessary shuffling. At the moment, the set of possible layouts are
6272 : restricted to bijective permutations.
6273 :
6274 : The goal of the pass depends on whether we're optimizing for size or
6275 : for speed. When optimizing for size, the goal is to reduce the overall
6276 : number of layout changes (including layout changes implied by things
6277 : like load permutations). When optimizing for speed, the goal is to
6278 : reduce the maximum latency attributable to layout changes on any
6279 : non-cyclical path through the data flow graph.
6280 :
6281 : For example, when optimizing a loop nest for speed, we will prefer
6282 : to make layout changes outside of a loop rather than inside of a loop,
6283 : and will prefer to make layout changes in parallel rather than serially,
6284 : even if that increases the overall number of layout changes.
6285 :
6286 : The high-level procedure is:
6287 :
6288 : (1) Build a graph in which edges go from uses (parents) to definitions
6289 : (children).
6290 :
6291 : (2) Divide the graph into a dag of strongly-connected components (SCCs).
6292 :
6293 : (3) When optimizing for speed, partition the nodes in each SCC based
6294 : on their containing cfg loop. When optimizing for size, treat
6295 : each SCC as a single partition.
6296 :
6297 : This gives us a dag of partitions. The goal is now to assign a
6298 : layout to each partition.
6299 :
6300 : (4) Construct a set of vector layouts that are worth considering.
6301 : Record which nodes must keep their current layout.
6302 :
6303 : (5) Perform a forward walk over the partition dag (from loads to stores)
6304 : accumulating the "forward" cost of using each layout. When visiting
6305 : each partition, assign a tentative choice of layout to the partition
6306 : and use that choice when calculating the cost of using a different
6307 : layout in successor partitions.
6308 :
6309 : (6) Perform a backward walk over the partition dag (from stores to loads),
6310 : accumulating the "backward" cost of using each layout. When visiting
6311 : each partition, make a final choice of layout for that partition based
6312 : on the accumulated forward costs (from (5)) and backward costs
6313 : (from (6)).
6314 :
6315 : (7) Apply the chosen layouts to the SLP graph.
6316 :
6317 : For example, consider the SLP statements:
6318 :
6319 : S1: a_1 = load
6320 : loop:
6321 : S2: a_2 = PHI<a_1, a_3>
6322 : S3: b_1 = load
6323 : S4: a_3 = a_2 + b_1
6324 : exit:
6325 : S5: a_4 = PHI<a_3>
6326 : S6: store a_4
6327 :
6328 : S2 and S4 form an SCC and are part of the same loop. Every other
6329 : statement is in a singleton SCC. In this example there is a one-to-one
   mapping between SCCs and partitions and the partition dag looks like this:
6331 :
6332 : S1 S3
6333 : \ /
6334 : S2+S4
6335 : |
6336 : S5
6337 : |
6338 : S6
6339 :
6340 : S2, S3 and S4 will have a higher execution frequency than the other
6341 : statements, so when optimizing for speed, the goal is to avoid any
6342 : layout changes:
6343 :
6344 : - within S3
6345 : - within S2+S4
6346 : - on the S3->S2+S4 edge
6347 :
6348 : For example, if S3 was originally a reversing load, the goal of the
6349 : pass is to make it an unreversed load and change the layout on the
6350 : S1->S2+S4 and S2+S4->S5 edges to compensate. (Changing the layout
6351 : on S1->S2+S4 and S5->S6 would also be acceptable.)
6352 :
6353 : The difference between SCCs and partitions becomes important if we
6354 : add an outer loop:
6355 :
6356 : S1: a_1 = ...
6357 : loop1:
6358 : S2: a_2 = PHI<a_1, a_6>
6359 : S3: b_1 = load
6360 : S4: a_3 = a_2 + b_1
6361 : loop2:
6362 : S5: a_4 = PHI<a_3, a_5>
6363 : S6: c_1 = load
6364 : S7: a_5 = a_4 + c_1
6365 : exit2:
6366 : S8: a_6 = PHI<a_5>
6367 : S9: store a_6
6368 : exit1:
6369 :
6370 : Here, S2, S4, S5, S7 and S8 form a single SCC. However, when optimizing
6371 : for speed, we usually do not want restrictions in the outer loop to "infect"
6372 : the decision for the inner loop. For example, if an outer-loop node
6373 : in the SCC contains a statement with a fixed layout, that should not
6374 : prevent the inner loop from using a different layout. Conversely,
6375 : the inner loop should not dictate a layout to the outer loop: if the
6376 : outer loop does a lot of computation, then it may not be efficient to
6377 : do all of that computation in the inner loop's preferred layout.
6378 :
6379 : So when optimizing for speed, we partition the SCC into S2+S4+S8 (outer)
6380 : and S5+S7 (inner). We also try to arrange partitions so that:
6381 :
6382 : - the partition for an outer loop comes before the partition for
6383 : an inner loop
6384 :
6385 : - if a sibling loop A dominates a sibling loop B, A's partition
6386 : comes before B's
6387 :
6388 : This gives the following partition dag for the example above:
6389 :
6390 : S1 S3
6391 : \ /
6392 : S2+S4+S8 S6
6393 : | \\ /
6394 : | S5+S7
6395 : |
6396 : S9
6397 :
6398 : There are two edges from S2+S4+S8 to S5+S7: one for the edge S4->S5 and
6399 : one for a reversal of the edge S7->S8.
6400 :
6401 : The backward walk picks a layout for S5+S7 before S2+S4+S8. The choice
6402 : for S2+S4+S8 therefore has to balance the cost of using the outer loop's
6403 : preferred layout against the cost of changing the layout on entry to the
6404 : inner loop (S4->S5) and on exit from the inner loop (S7->S8 reversed).
6405 :
6406 : Although this works well when optimizing for speed, it has the downside
6407 : when optimizing for size that the choice of layout for S5+S7 is completely
6408 : independent of S9, which lessens the chance of reducing the overall number
6409 : of permutations. We therefore do not partition SCCs when optimizing
6410 : for size.
6411 :
6412 : To give a concrete example of the difference between optimizing
6413 : for size and speed, consider:
6414 :
6415 : a[0] = (b[1] << c[3]) - d[1];
6416 : a[1] = (b[0] << c[2]) - d[0];
6417 : a[2] = (b[3] << c[1]) - d[3];
6418 : a[3] = (b[2] << c[0]) - d[2];
6419 :
6420 : There are three different layouts here: one for a, one for b and d,
6421 : and one for c. When optimizing for speed it is better to permute each
6422 : of b, c and d into the order required by a, since those permutations
6423 : happen in parallel. But when optimizing for size, it is better to:
6424 :
6425 : - permute c into the same order as b
6426 : - do the arithmetic
6427 : - permute the result into the order required by a
6428 :
6429 : This gives 2 permutations rather than 3. */
6430 :
class vect_optimize_slp_pass
{
public:
  vect_optimize_slp_pass (vec_info *vinfo) : m_vinfo (vinfo) {}
  void run ();

private:
  /* Graph building.  */
  struct loop *containing_loop (slp_tree);
  bool is_cfg_latch_edge (graph_edge *);
  void build_vertices (hash_set<slp_tree> &, slp_tree);
  void build_vertices ();
  void build_graph ();

  /* Partitioning.  */
  void create_partitions ();
  template<typename T> void for_each_partition_edge (unsigned int, T);

  /* Layout selection.  */
  bool is_compatible_layout (slp_tree, unsigned int);
  bool is_compatible_layout (const slpg_partition_info &, unsigned int);
  int change_layout_cost (slp_tree, unsigned int, unsigned int);
  slpg_partition_layout_costs &partition_layout_costs (unsigned int,
						       unsigned int);
  void change_vec_perm_layout (slp_tree, lane_permutation_t &,
			       int, unsigned int);
  int internal_node_cost (slp_tree, int, unsigned int);
  void start_choosing_layouts ();
  bool legitimize ();

  /* Cost propagation.  */
  slpg_layout_cost edge_layout_cost (graph_edge *, unsigned int,
				     unsigned int, unsigned int);
  slpg_layout_cost total_in_cost (unsigned int);
  slpg_layout_cost forward_cost (graph_edge *, unsigned int, unsigned int);
  slpg_layout_cost backward_cost (graph_edge *, unsigned int, unsigned int);
  void forward_pass ();
  void backward_pass ();

  /* Rematerialization.  */
  slp_tree get_result_with_layout (slp_tree, unsigned int);
  void materialize ();

  /* Clean-up.  */
  void remove_redundant_permutations ();

  /* Masked load lanes discovery.  */
  void decide_masked_load_lanes ();

  void dump ();

  vec_info *m_vinfo;

  /* True if we should optimize the graph for size, false if we should
     optimize it for speed.  (It wouldn't be easy to make this decision
     more locally.)  */
  bool m_optimize_size;

  /* A graph of all SLP nodes, with edges leading from uses to definitions.
     In other words, a node's predecessors are its slp_tree parents and
     a node's successors are its slp_tree children.  */
  graph *m_slpg = nullptr;

  /* The vertices of M_SLPG, indexed by slp_tree::vertex.  */
  auto_vec<slpg_vertex> m_vertices;

  /* The list of all leaves of M_SLPG, such as external definitions, constants,
     and loads.  */
  auto_vec<int> m_leafs;

  /* This array has one entry for every vector layout that we're considering.
     Element 0 is null and indicates "no change".  Other entries describe
     permutations that are inherent in the current graph and that we would
     like to reverse if possible.

     For example, a permutation { 1, 2, 3, 0 } means that something has
     effectively been permuted in that way, such as a load group
     { a[1], a[2], a[3], a[0] } (viewed as a permutation of a[0:3]).
     We'd then like to apply the reverse permutation { 3, 0, 1, 2 }
     in order to put things "back" in order.  */
  auto_vec<vec<unsigned> > m_perms;

  /* A partitioning of the nodes for which a layout must be chosen.
     Each partition represents an <SCC, cfg loop> pair; that is,
     nodes in different SCCs belong to different partitions, and nodes
     within an SCC can be further partitioned according to a containing
     cfg loop.  Partition <SCC1, L1> comes before <SCC2, L2> if:

     - SCC1 != SCC2 and SCC1 is a predecessor of SCC2 in a forward walk
       from leaves (such as loads) to roots (such as stores).

     - SCC1 == SCC2 and L1's header strictly dominates L2's header.  */
  auto_vec<slpg_partition_info> m_partitions;

  /* The list of all nodes for which a layout must be chosen.  Nodes for
     partition P come before the nodes for partition P+1.  Nodes within a
     partition are in reverse postorder.  */
  auto_vec<unsigned int> m_partitioned_nodes;

  /* Index P * num-layouts + L contains the cost of using layout L
     for partition P.  */
  auto_vec<slpg_partition_layout_costs> m_partition_layout_costs;

  /* Index N * num-layouts + L, if nonnull, is a node that provides the
     original output of node N adjusted to have layout L.  */
  auto_vec<slp_tree> m_node_layouts;
};
6538 :
6539 : /* Fill the vertices and leafs vector with all nodes in the SLP graph.
6540 : Also record whether we should optimize anything for speed rather
6541 : than size. */
6542 :
6543 : void
6544 9747330 : vect_optimize_slp_pass::build_vertices (hash_set<slp_tree> &visited,
6545 : slp_tree node)
6546 : {
6547 9747330 : unsigned i;
6548 9747330 : slp_tree child;
6549 :
6550 9747330 : if (visited.add (node))
6551 9747330 : return;
6552 :
6553 9102782 : if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
6554 : {
6555 7062157 : basic_block bb = gimple_bb (vect_orig_stmt (rep)->stmt);
6556 6279703 : if (optimize_bb_for_speed_p (bb))
6557 6162465 : m_optimize_size = false;
6558 : }
6559 :
6560 9102782 : node->vertex = m_vertices.length ();
6561 9102782 : m_vertices.safe_push (slpg_vertex (node));
6562 :
6563 9102782 : bool leaf = true;
6564 9102782 : bool force_leaf = false;
6565 16843072 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6566 7740290 : if (child)
6567 : {
6568 6985324 : leaf = false;
6569 6985324 : build_vertices (visited, child);
6570 : }
6571 : else
6572 : force_leaf = true;
6573 : /* Since SLP discovery works along use-def edges all cycles have an
6574 : entry - but there's the exception of cycles where we do not handle
6575 : the entry explicitly (but with a NULL SLP node), like some reductions
6576 : and inductions. Force those SLP PHIs to act as leafs to make them
6577 : backwards reachable. */
6578 9102782 : if (leaf || force_leaf)
6579 4533242 : m_leafs.safe_push (node->vertex);
6580 : }
6581 :
6582 : /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
6583 :
6584 : void
6585 1249790 : vect_optimize_slp_pass::build_vertices ()
6586 : {
6587 1249790 : hash_set<slp_tree> visited;
6588 1249790 : unsigned i;
6589 1249790 : slp_instance instance;
6590 1249790 : m_vertices.truncate (0);
6591 1249790 : m_leafs.truncate (0);
6592 6511376 : FOR_EACH_VEC_ELT (m_vinfo->slp_instances, i, instance)
6593 2762006 : build_vertices (visited, SLP_INSTANCE_TREE (instance));
6594 1249790 : }
6595 :
/* Apply (reverse) bijective PERM to VEC.  */
6597 :
6598 : template <class T>
6599 : static void
6600 191963 : vect_slp_permute (vec<unsigned> perm,
6601 : vec<T> &vec, bool reverse)
6602 : {
6603 191963 : auto_vec<T, 64> saved;
6604 191963 : saved.create (vec.length ());
6605 626423 : for (unsigned i = 0; i < vec.length (); ++i)
6606 434460 : saved.quick_push (vec[i]);
6607 :
6608 191963 : if (reverse)
6609 : {
6610 1242759 : for (unsigned i = 0; i < vec.length (); ++i)
6611 433248 : vec[perm[i]] = saved[i];
6612 624673 : for (unsigned i = 0; i < vec.length (); ++i)
6613 762549 : gcc_assert (vec[perm[i]] == saved[i]);
6614 : }
6615 : else
6616 : {
6617 3500 : for (unsigned i = 0; i < vec.length (); ++i)
6618 1212 : vec[i] = saved[perm[i]];
6619 193175 : for (unsigned i = 0; i < vec.length (); ++i)
6620 1818 : gcc_assert (vec[i] == saved[perm[i]]);
6621 : }
6622 191963 : }
6623 :
6624 : /* Return the cfg loop that contains NODE. */
6625 :
6626 : struct loop *
6627 3434447 : vect_optimize_slp_pass::containing_loop (slp_tree node)
6628 : {
6629 3434447 : stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
6630 3434447 : if (!rep)
6631 4608 : return ENTRY_BLOCK_PTR_FOR_FN (cfun)->loop_father;
6632 3830219 : return gimple_bb (vect_orig_stmt (rep)->stmt)->loop_father;
6633 : }
6634 :
6635 : /* Return true if UD (an edge from a use to a definition) is associated
6636 : with a loop latch edge in the cfg. */
6637 :
6638 : bool
6639 6985324 : vect_optimize_slp_pass::is_cfg_latch_edge (graph_edge *ud)
6640 : {
6641 6985324 : slp_tree use = m_vertices[ud->src].node;
6642 6985324 : slp_tree def = m_vertices[ud->dest].node;
6643 6985324 : if ((SLP_TREE_DEF_TYPE (use) != vect_internal_def
6644 6985324 : || SLP_TREE_PERMUTE_P (use))
6645 6693736 : || SLP_TREE_DEF_TYPE (def) != vect_internal_def)
6646 : return false;
6647 :
6648 3881618 : stmt_vec_info use_rep = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (use));
6649 3881618 : return (is_a<gphi *> (use_rep->stmt)
6650 319332 : && bb_loop_header_p (gimple_bb (use_rep->stmt))
6651 4038800 : && containing_loop (def) == containing_loop (use));
6652 : }
6653 :
6654 : /* Build the graph. Mark edges that correspond to cfg loop latch edges with
6655 : a nonnull data field. */
6656 :
6657 : void
6658 1249790 : vect_optimize_slp_pass::build_graph ()
6659 : {
6660 1249790 : m_optimize_size = true;
6661 1249790 : build_vertices ();
6662 :
6663 2499580 : m_slpg = new_graph (m_vertices.length ());
6664 12852152 : for (slpg_vertex &v : m_vertices)
6665 26865004 : for (slp_tree child : SLP_TREE_CHILDREN (v.node))
6666 7740290 : if (child)
6667 : {
6668 6985324 : graph_edge *ud = add_edge (m_slpg, v.node->vertex, child->vertex);
6669 6985324 : if (is_cfg_latch_edge (ud))
6670 148710 : ud->data = this;
6671 : }
6672 1249790 : }
6673 :
6674 : /* Return true if E corresponds to a loop latch edge in the cfg. */
6675 :
6676 : static bool
6677 3566748 : skip_cfg_latch_edges (graph_edge *e)
6678 : {
6679 3566748 : return e->data;
6680 : }
6681 :
6682 : /* Create the node partitions. */
6683 :
6684 : void
6685 624895 : vect_optimize_slp_pass::create_partitions ()
6686 : {
6687 : /* Calculate a postorder of the graph, ignoring edges that correspond
6688 : to natural latch edges in the cfg. Reading the vector from the end
6689 : to the beginning gives the reverse postorder. */
6690 624895 : auto_vec<int> initial_rpo;
6691 1249790 : graphds_dfs (m_slpg, &m_leafs[0], m_leafs.length (), &initial_rpo,
6692 : false, NULL, skip_cfg_latch_edges);
6693 1874685 : gcc_assert (initial_rpo.length () == m_vertices.length ());
6694 :
6695 : /* Calculate the strongly connected components of the graph. */
6696 624895 : auto_vec<int> scc_grouping;
6697 624895 : unsigned int num_sccs = graphds_scc (m_slpg, NULL, NULL, &scc_grouping);
6698 :
6699 : /* Create a new index order in which all nodes from the same SCC are
6700 : consecutive. Use scc_pos to record the index of the first node in
6701 : each SCC. */
6702 624895 : auto_vec<unsigned int> scc_pos (num_sccs);
6703 624895 : int last_component = -1;
6704 624895 : unsigned int node_count = 0;
6705 6425809 : for (unsigned int node_i : scc_grouping)
6706 : {
6707 4551124 : if (last_component != m_slpg->vertices[node_i].component)
6708 : {
6709 4459418 : last_component = m_slpg->vertices[node_i].component;
6710 8918836 : gcc_assert (last_component == int (scc_pos.length ()));
6711 4459418 : scc_pos.quick_push (node_count);
6712 : }
6713 4551124 : node_count += 1;
6714 : }
6715 1249790 : gcc_assert (node_count == initial_rpo.length ()
6716 : && last_component + 1 == int (num_sccs));
6717 :
6718 : /* Use m_partitioned_nodes to group nodes into SCC order, with the nodes
6719 : inside each SCC following the RPO we calculated above. The fact that
6720 : we ignored natural latch edges when calculating the RPO should ensure
6721 : that, for natural loop nests:
6722 :
6723 : - the first node that we encounter in a cfg loop is the loop header phi
6724 : - the loop header phis are in dominance order
6725 :
6726 : Arranging for this is an optimization (see below) rather than a
6727 : correctness issue. Unnatural loops with a tangled mess of backedges
6728 : will still work correctly, but might give poorer results.
6729 :
6730 : Also update scc_pos so that it gives 1 + the index of the last node
6731 : in the SCC. */
6732 624895 : m_partitioned_nodes.safe_grow (node_count);
 : /* scc_pos[c] currently holds the start index of SCC c; the post-increment
 : below both allocates the next slot within the SCC and advances the
 : position, leaving scc_pos[c] == one-past-the-end afterwards. */
6733 5800914 : for (unsigned int old_i = initial_rpo.length (); old_i-- > 0;)
6734 : {
6735 4551124 : unsigned int node_i = initial_rpo[old_i];
6736 4551124 : unsigned int new_i = scc_pos[m_slpg->vertices[node_i].component]++;
6737 4551124 : m_partitioned_nodes[new_i] = node_i;
6738 : }
6739 :
6740 : /* When optimizing for speed, partition each SCC based on the containing
6741 : cfg loop. The order we constructed above should ensure that, for natural
6742 : cfg loops, we'll create sub-SCC partitions for outer loops before
6743 : the corresponding sub-SCC partitions for inner loops. Similarly,
6744 : when one sibling loop A dominates another sibling loop B, we should
6745 : create a sub-SCC partition for A before a sub-SCC partition for B.
6746 :
6747 : As above, nothing depends for correctness on whether this achieves
6748 : a natural nesting, but we should get better results when it does. */
6749 1249790 : m_partitions.reserve (m_vertices.length ());
6750 624895 : unsigned int next_partition_i = 0;
6751 624895 : hash_map<struct loop *, int> loop_partitions;
6752 624895 : unsigned int rpo_begin = 0;
6753 624895 : unsigned int num_partitioned_nodes = 0;
6754 6334103 : for (unsigned int rpo_end : scc_pos)
6755 : {
6756 4459418 : loop_partitions.empty ();
6757 : unsigned int partition_i = next_partition_i;
6758 9010542 : for (unsigned int rpo_i = rpo_begin; rpo_i < rpo_end; ++rpo_i)
6759 : {
6760 : /* Handle externals and constants optimistically throughout.
6761 : But treat existing vectors as fixed since we do not handle
6762 : permuting them. */
6763 4551124 : unsigned int node_i = m_partitioned_nodes[rpo_i];
6764 4551124 : auto &vertex = m_vertices[node_i];
 : /* Partition -1 marks nodes that are excluded from partitioning;
 : consumers of vertex.partition test for < 0. */
6765 4551124 : if ((SLP_TREE_DEF_TYPE (vertex.node) == vect_external_def
6766 494625 : && !SLP_TREE_VEC_DEFS (vertex.node).exists ())
6767 4553368 : || SLP_TREE_DEF_TYPE (vertex.node) == vect_constant_def)
6768 1406931 : vertex.partition = -1;
6769 : else
6770 : {
6771 3144193 : bool existed;
6772 3144193 : if (m_optimize_size)
6773 24110 : existed = next_partition_i > partition_i;
6774 : else
6775 : {
6776 3120083 : struct loop *loop = containing_loop (vertex.node);
6777 3120083 : auto &entry = loop_partitions.get_or_insert (loop, &existed);
6778 3120083 : if (!existed)
6779 3029361 : entry = next_partition_i;
6780 3120083 : partition_i = entry;
6781 : }
6782 3144193 : if (!existed)
6783 : {
6784 3053393 : m_partitions.quick_push (slpg_partition_info ());
6785 3053393 : next_partition_i += 1;
6786 : }
6787 3144193 : vertex.partition = partition_i;
6788 3144193 : num_partitioned_nodes += 1;
6789 3144193 : m_partitions[partition_i].node_end += 1;
6790 : }
6791 : }
6792 4459418 : rpo_begin = rpo_end;
6793 : }
6794 :
6795 : /* Assign ranges of consecutive node indices to each partition,
6796 : in partition order. Start with node_end being the same as
6797 : node_begin so that the next loop can use it as a counter. */
6798 624895 : unsigned int node_begin = 0;
6799 4928078 : for (auto &partition : m_partitions)
6800 : {
6801 3053393 : partition.node_begin = node_begin;
6802 3053393 : node_begin += partition.node_end;
6803 3053393 : partition.node_end = partition.node_begin;
6804 : }
6805 624895 : gcc_assert (node_begin == num_partitioned_nodes);
6806 :
6807 : /* Finally build the list of nodes in partition order. */
6808 624895 : m_partitioned_nodes.truncate (num_partitioned_nodes);
6809 5176019 : for (unsigned int node_i = 0; node_i < m_vertices.length (); ++node_i)
6810 : {
6811 4551124 : int partition_i = m_vertices[node_i].partition;
6812 4551124 : if (partition_i >= 0)
6813 : {
6814 3144193 : unsigned int order_i = m_partitions[partition_i].node_end++;
6815 3144193 : m_partitioned_nodes[order_i] = node_i;
6816 : }
6817 : }
6818 624895 : }
6819 :
6820 : /* Look for edges from earlier partitions into node NODE_I and edges from
6821 : node NODE_I into later partitions. Call:
6822 :
6823 : FN (ud, other_node_i)
6824 :
6825 : for each such use-to-def edge ud, where other_node_i is the node at the
6826 : other end of the edge. */
6827 :
6828 : template<typename T>
6829 : void
6830 3536888 : vect_optimize_slp_pass::for_each_partition_edge (unsigned int node_i, T fn)
6831 : {
6832 3536888 : int partition_i = m_vertices[node_i].partition;
 : /* Predecessor edges have NODE_I as the def end; successor edges have it
 : as the use end. Nodes with partition -1 (constants/externals, see
 : create_partitions) are skipped, as are edges inside the partition. */
6833 3536888 : for (graph_edge *pred = m_slpg->vertices[node_i].pred;
6834 6015467 : pred; pred = pred->pred_next)
6835 : {
6836 2478579 : int src_partition_i = m_vertices[pred->src].partition;
6837 2478579 : if (src_partition_i >= 0 && src_partition_i != partition_i)
6838 2252769 : fn (pred, pred->src);
6839 : }
6840 3536888 : for (graph_edge *succ = m_slpg->vertices[node_i].succ;
6841 7570912 : succ; succ = succ->succ_next)
6842 : {
6843 4034024 : int dest_partition_i = m_vertices[succ->dest].partition;
6844 4034024 : if (dest_partition_i >= 0 && dest_partition_i != partition_i)
6845 2274608 : fn (succ, succ->dest);
6846 : }
6847 3536888 : }
6848 :
6849 : /* Return true if layout LAYOUT_I is compatible with the number of SLP lanes
6850 : that NODE would operate on. This test is independent of NODE's actual
6851 : operation. */
6852 :
6853 : bool
6854 1584482 : vect_optimize_slp_pass::is_compatible_layout (slp_tree node,
6855 : unsigned int layout_i)
6856 : {
 : /* Layout 0 is the identity ("no change") layout and fits any node. */
6857 1584482 : if (layout_i == 0)
6858 : return true;
6859 :
 : /* Every other layout is a permutation of m_perms[layout_i].length ()
 : lanes, so the node's lane count must match exactly. */
6860 918810 : if (SLP_TREE_LANES (node) != m_perms[layout_i].length ())
6861 11596 : return false;
6862 :
6863 : return true;
6864 : }
6865 :
6866 : /* Return true if layout LAYOUT_I is compatible with the number of SLP lanes
6867 : that NODE would operate on for each NODE in PARTITION.
6868 : This test is independent of NODE's actual operations. */
6869 :
6870 : bool
6871 17595 : vect_optimize_slp_pass::is_compatible_layout (const slpg_partition_info
6872 : &partition,
6873 : unsigned int layout_i)
6874 : {
 : /* [node_begin, node_end) indexes m_partitioned_nodes; the ranges are
 : assigned by create_partitions. */
6875 35424 : for (unsigned int order_i = partition.node_begin;
6876 35424 : order_i < partition.node_end; ++order_i)
6877 : {
6878 17895 : unsigned int node_i = m_partitioned_nodes[order_i];
6879 17895 : auto &vertex = m_vertices[node_i];
6880 :
6881 : /* The layout is incompatible if it is individually incompatible
6882 : with any node in the partition. */
6883 17895 : if (!is_compatible_layout (vertex.node, layout_i))
6884 : return false;
6885 : }
6886 : return true;
6887 : }
6888 :
6889 : /* Return the cost (in arbtirary units) of going from layout FROM_LAYOUT_I
6890 : to layout TO_LAYOUT_I for a node like NODE. Return -1 if either of the
6891 : layouts is incompatible with NODE or if the change is not possible for
6892 : some other reason.
6893 :
6894 : The properties taken from NODE include the number of lanes and the
6895 : vector type. The actual operation doesn't matter. */
6896 :
6897 : int
6898 678941 : vect_optimize_slp_pass::change_layout_cost (slp_tree node,
6899 : unsigned int from_layout_i,
6900 : unsigned int to_layout_i)
6901 : {
6902 678941 : if (!is_compatible_layout (node, from_layout_i)
6903 678941 : || !is_compatible_layout (node, to_layout_i))
6904 569 : return -1;
6905 :
6906 678372 : if (from_layout_i == to_layout_i)
6907 : return 0;
6908 :
 : /* Model the change as a single-input VEC_PERM_EXPR of NODE into itself:
 : build the source layout's lane order (identity for layout 0), apply
 : the target layout on top, and ask the target for the permute count. */
6909 293386 : auto_vec<slp_tree, 1> children (1);
6910 293386 : children.quick_push (node);
6911 293386 : auto_lane_permutation_t perm (SLP_TREE_LANES (node));
6912 293386 : if (from_layout_i > 0)
6913 830284 : for (unsigned int i : m_perms[from_layout_i])
6914 365491 : perm.quick_push ({ 0, i });
6915 : else
6916 448184 : for (unsigned int i = 0; i < SLP_TREE_LANES (node); ++i)
6917 309729 : perm.quick_push ({ 0, i });
6918 293386 : if (to_layout_i > 0)
6919 138882 : vect_slp_permute (m_perms[to_layout_i], perm, true);
6920 293386 : auto count = vectorizable_slp_permutation_1 (m_vinfo, nullptr, node, perm,
6921 : children, false);
 : /* Clamp to at least 1: even a "free" supported permute is a change. */
6922 293386 : if (count >= 0)
6923 288858 : return MAX (count, 1);
6924 :
6925 : /* ??? In principle we could try changing via layout 0, giving two
6926 : layout changes rather than 1. Doing that would require
6927 : corresponding support in get_result_with_layout. */
6928 : return -1;
6929 293386 : }
6930 :
6931 : /* Return the costs of assigning layout LAYOUT_I to partition PARTITION_I. */
6932 :
6933 : inline slpg_partition_layout_costs &
6934 981419 : vect_optimize_slp_pass::partition_layout_costs (unsigned int partition_i,
6935 : unsigned int layout_i)
6936 : {
 : /* The cost table is a (partition x layout) matrix flattened row-major
 : with m_perms.length () columns. */
6937 1962838 : return m_partition_layout_costs[partition_i * m_perms.length () + layout_i];
6938 : }
6939 :
6940 : /* Change PERM in one of two ways:
6941 :
6942 : - if IN_LAYOUT_I < 0, accept input operand I in the layout that has been
6943 : chosen for child I of NODE.
6944 :
6945 : - if IN_LAYOUT >= 0, accept all inputs operands with that layout.
6946 :
6947 : In both cases, arrange for the output to have layout OUT_LAYOUT_I */
6948 :
6949 : void
6950 27867 : vect_optimize_slp_pass::
6951 : change_vec_perm_layout (slp_tree node, lane_permutation_t &perm,
6952 : int in_layout_i, unsigned int out_layout_i)
6953 : {
6954 163837 : for (auto &entry : perm)
6955 : {
6956 80236 : int this_in_layout_i = in_layout_i;
6957 80236 : if (this_in_layout_i < 0)
6958 : {
 : /* IN_LAYOUT_I < 0: look up the layout chosen for this operand's
 : partition. Partition -1 means the input is a constant or
 : external (see create_partitions), whose lanes need no
 : adjustment here. */
6959 57281 : slp_tree in_node = SLP_TREE_CHILDREN (node)[entry.first];
6960 57281 : unsigned int in_partition_i = m_vertices[in_node->vertex].partition;
6961 57281 : if (in_partition_i == -1u)
6962 329 : continue;
6963 56952 : this_in_layout_i = m_partitions[in_partition_i].layout;
6964 : }
6965 79907 : if (this_in_layout_i > 0)
6966 17441 : entry.second = m_perms[this_in_layout_i][entry.second];
6967 : }
 : /* Finally rewrite the output side so the result has OUT_LAYOUT_I. */
6968 27867 : if (out_layout_i > 0)
6969 6305 : vect_slp_permute (m_perms[out_layout_i], perm, true);
6970 27867 : }
6971 :
6972 : /* Check whether the target allows NODE to be rearranged so that the node's
6973 : output has layout OUT_LAYOUT_I. Return the cost of the change if so,
6974 : in the same arbitrary units as for change_layout_cost. Return -1 otherwise.
6975 :
6976 : If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I < 0, also check whether
6977 : NODE can adapt to the layout changes that have (perhaps provisionally)
6978 : been chosen for NODE's children, so that no extra permutations are
6979 : needed on either the input or the output of NODE.
6980 :
6981 : If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I >= 0, instead assume
6982 : that all inputs will be forced into layout IN_LAYOUT_I beforehand.
6983 :
6984 : IN_LAYOUT_I has no meaning for other types of node.
6985 :
6986 : Keeping the node as-is is always valid. If the target doesn't appear
6987 : to support the node as-is, but might realistically support other layouts,
6988 : then layout 0 instead has the cost of a worst-case permutation. On the
6989 : one hand, this ensures that every node has at least one valid layout,
6990 : avoiding what would otherwise be an awkward special case. On the other,
6991 : it still encourages the pass to change an invalid pre-existing layout
6992 : choice into a valid one. */
6993 :
6994 : int
6995 208670 : vect_optimize_slp_pass::internal_node_cost (slp_tree node, int in_layout_i,
6996 : unsigned int out_layout_i)
6997 : {
6998 208670 : const int fallback_cost = 1;
6999 :
7000 208670 : if (SLP_TREE_PERMUTE_P (node))
7001 : {
7002 23544 : auto_lane_permutation_t tmp_perm;
7003 23544 : tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
7004 :
7005 : /* Check that the child nodes support the chosen layout. Checking
7006 : the first child is enough, since any second child would have the
7007 : same shape. */
7008 23544 : auto first_child = SLP_TREE_CHILDREN (node)[0];
7009 23544 : if (in_layout_i > 0
7010 23544 : && !is_compatible_layout (first_child, in_layout_i))
7011 : return -1;
7012 :
 : /* Rewrite the permutation for the requested input/output layouts and
 : ask the target (cost-only query) whether it is supported. */
7013 22979 : change_vec_perm_layout (node, tmp_perm, in_layout_i, out_layout_i);
7014 45958 : int count = vectorizable_slp_permutation_1 (m_vinfo, nullptr,
7015 : node, tmp_perm,
7016 22979 : SLP_TREE_CHILDREN (node),
7017 : false);
7018 22979 : if (count < 0)
7019 : {
7020 1516 : if (in_layout_i == 0 && out_layout_i == 0)
7021 : {
7022 : /* Use the fallback cost if the node could in principle support
7023 : some nonzero layout for both the inputs and the outputs.
7024 : Otherwise assume that the node will be rejected later
7025 : and rebuilt from scalars. */
7026 369 : if (SLP_TREE_LANES (node) == SLP_TREE_LANES (first_child))
7027 : return fallback_cost;
7028 299 : return 0;
7029 : }
7030 : return -1;
7031 : }
7032 :
7033 : /* We currently have no way of telling whether the new layout is cheaper
7034 : or more expensive than the old one. But at least in principle,
7035 : it should be worth making zero permutations (whole-vector shuffles)
7036 : cheaper than real permutations, in case the pass is able to remove
7037 : the latter. */
7038 21463 : return count == 0 ? 0 : 1;
7039 23544 : }
7040 :
 : /* Permuted loads: apply OUT_LAYOUT_I on top of the existing load
 : permutation and count the vector permutations the result needs. */
7041 185126 : stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
7042 185126 : if (rep
7043 184187 : && STMT_VINFO_DATA_REF (rep)
7044 58905 : && DR_IS_READ (STMT_VINFO_DATA_REF (rep))
7045 226771 : && SLP_TREE_LOAD_PERMUTATION (node).exists ())
7046 : {
7047 35339 : auto_load_permutation_t tmp_perm;
7048 35339 : tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
7049 35339 : if (out_layout_i > 0)
7050 12344 : vect_slp_permute (m_perms[out_layout_i], tmp_perm, true);
7051 :
7052 35339 : poly_uint64 vf = 1;
7053 35339 : if (auto loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo))
7054 7972 : vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7055 35339 : unsigned int n_perms;
7056 35339 : if (!vect_transform_slp_perm_load_1 (m_vinfo, node, tmp_perm, vNULL,
7057 : nullptr, vf, true, false, &n_perms))
7058 : {
7059 1501 : auto rep = SLP_TREE_REPRESENTATIVE (node);
7060 1501 : if (out_layout_i == 0)
7061 : {
7062 : /* Use the fallback cost if the load is an N-to-N permutation.
7063 : Otherwise assume that the node will be rejected later
7064 : and rebuilt from scalars. */
7065 1098 : if (STMT_VINFO_GROUPED_ACCESS (rep)
7066 2196 : && (DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (rep))
7067 1098 : == SLP_TREE_LANES (node)))
7068 602 : return fallback_cost;
7069 : return 0;
7070 : }
7071 : return -1;
7072 : }
7073 :
7074 : /* See the comment above the corresponding VEC_PERM_EXPR handling. */
7075 33838 : return n_perms == 0 ? 0 : 1;
7076 35339 : }
7077 :
 : /* Any other node type can adopt a compatible layout for free. */
7078 : return 0;
7079 : }
7080 :
7081 : /* Decide which element layouts we should consider using. Calculate the
7082 : weights associated with inserting layout changes on partition edges.
7083 : Also mark partitions that cannot change layout, by setting their
7084 : layout to zero. */
7085 :
7086 : void
7087 624895 : vect_optimize_slp_pass::start_choosing_layouts ()
7088 : {
7089 : /* Used to assign unique permutation indices. */
7090 624895 : using perm_hash = unbounded_hashmap_traits<
7091 : vec_free_hash_base<int_hash_base<unsigned>>,
7092 : int_hash<int, -1, -2>
7093 : >;
7094 624895 : hash_map<vec<unsigned>, int, perm_hash> layout_ids;
7095 :
7096 : /* Layout 0 is "no change". */
7097 624895 : m_perms.safe_push (vNULL);
7098 :
 : /* NOTE(review): partition.layout appears to encode: 0 = fixed identity
 : layout, -1 = load whose permute may be materialized, > 0 = index of a
 : candidate layout in m_perms — confirm against the class definition. */
7099 : /* Create layouts from existing permutations. */
7100 624895 : auto_load_permutation_t tmp_perm;
7101 5018878 : for (unsigned int node_i : m_partitioned_nodes)
7102 : {
7103 : /* Leafs also double as entries to the reverse graph. Allow the
7104 : layout of those to be changed. */
7105 3144193 : auto &vertex = m_vertices[node_i];
7106 3144193 : auto &partition = m_partitions[vertex.partition];
7107 3144193 : if (!m_slpg->vertices[node_i].succ)
7108 795747 : partition.layout = 0;
7109 :
7110 : /* Loads and VEC_PERM_EXPRs are the only things generating permutes. */
7111 3144193 : slp_tree node = vertex.node;
7112 3144193 : stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
7113 3144193 : slp_tree child;
7114 3144193 : unsigned HOST_WIDE_INT imin, imax = 0;
7115 3144193 : bool any_permute = false;
7116 3144193 : tmp_perm.truncate (0);
7117 3144193 : if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
7118 : {
7119 : /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the node
7120 : unpermuted, record a layout that reverses this permutation.
7121 :
7122 : We would need more work to cope with loads that are internally
7123 : permuted and also have inputs (such as masks for
7124 : IFN_MASK_LOADs). */
7125 522218 : gcc_assert (partition.layout == 0 && !m_slpg->vertices[node_i].succ);
7126 522218 : if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
7127 : {
7128 357776 : partition.layout = -1;
7129 3128082 : continue;
7130 : }
7131 164442 : dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
 : /* Seed IMIN above any valid group index so the MIN below works. */
7132 164442 : imin = DR_GROUP_SIZE (dr_stmt) + 1;
7133 164442 : tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
7134 : }
7135 5128827 : else if (SLP_TREE_PERMUTE_P (node)
7136 130324 : && SLP_TREE_CHILDREN (node).length () == 1
7137 115123 : && (child = SLP_TREE_CHILDREN (node)[0])
7138 2737098 : && (TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (child))
7139 115123 : .is_constant (&imin)))
7140 : {
7141 : /* If the child has the same vector size as this node,
7142 : reversing the permutation can make the permutation a no-op.
7143 : In other cases it can change a true permutation into a
7144 : full-vector extract. */
7145 115123 : tmp_perm.reserve (SLP_TREE_LANES (node));
7146 307321 : for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
7147 192198 : tmp_perm.quick_push (SLP_TREE_LANE_PERMUTATION (node)[j].second);
7148 : }
7149 : else
7150 2506852 : continue;
7151 :
7152 737559 : for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
7153 : {
7154 457994 : unsigned idx = tmp_perm[j];
7155 457994 : imin = MIN (imin, idx);
7156 457994 : imax = MAX (imax, idx);
7157 457994 : if (idx - tmp_perm[0] != j)
7158 132443 : any_permute = true;
7159 : }
7160 : /* If the span doesn't match we'd disrupt VF computation, avoid
7161 : that for now. */
7162 279565 : if (imax - imin + 1 != SLP_TREE_LANES (node))
7163 79785 : continue;
7164 : /* If there's no permute no need to split one out. In this case
7165 : we can consider turning a load into a permuted load, if that
7166 : turns out to be cheaper than alternatives. */
7167 199780 : if (!any_permute)
7168 : {
7169 183535 : partition.layout = -1;
7170 183535 : continue;
7171 : }
7172 :
7173 : /* For now only handle true permutes, like
7174 : vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
7175 : when permuting constants and invariants keeping the permute
7176 : bijective. */
7177 16245 : auto_sbitmap load_index (SLP_TREE_LANES (node));
7178 16245 : bitmap_clear (load_index);
7179 62835 : for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
7180 46590 : bitmap_set_bit (load_index, tmp_perm[j] - imin);
7181 : unsigned j;
7182 62159 : for (j = 0; j < SLP_TREE_LANES (node); ++j)
7183 46048 : if (!bitmap_bit_p (load_index, j))
7184 : break;
7185 16245 : if (j != SLP_TREE_LANES (node))
7186 134 : continue;
7187 :
 : /* Record the zero-based permutation as a candidate layout, sharing
 : the layout id with any identical permutation seen earlier. */
7188 16111 : vec<unsigned> perm = vNULL;
7189 16111 : perm.safe_grow (SLP_TREE_LANES (node), true);
7190 61924 : for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
7191 45813 : perm[j] = tmp_perm[j] - imin;
7192 :
7193 32222 : if (int (m_perms.length ()) >= param_vect_max_layout_candidates)
7194 : {
7195 : /* Continue to use existing layouts, but don't add any more. */
7196 0 : int *entry = layout_ids.get (perm);
7197 0 : partition.layout = entry ? *entry : 0;
7198 0 : perm.release ();
7199 : }
7200 : else
7201 : {
7202 16111 : bool existed;
7203 16111 : int &layout_i = layout_ids.get_or_insert (perm, &existed);
7204 16111 : if (existed)
7205 5511 : perm.release ();
7206 : else
7207 : {
7208 10600 : layout_i = m_perms.length ();
7209 10600 : m_perms.safe_push (perm);
7210 : }
7211 16111 : partition.layout = layout_i;
7212 : }
7213 16245 : }
7214 :
7215 : /* Initially assume that every layout is possible and has zero cost
7216 : in every partition. */
7217 624895 : m_partition_layout_costs.safe_grow_cleared (m_partitions.length ()
7218 1249790 : * m_perms.length ());
7219 :
7220 : /* We have to mark outgoing permutations facing non-associating-reduction
7221 : graph entries that are not represented as to be materialized.
7222 : slp_inst_kind_bb_reduc currently only covers associatable reductions. */
7223 3255688 : for (slp_instance instance : m_vinfo->slp_instances)
7224 1381003 : if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor)
7225 : {
7226 6248 : unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
7227 6248 : m_partitions[m_vertices[node_i].partition].layout = 0;
7228 : }
7229 1374755 : else if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_reduc_chain)
7230 : {
7231 1399 : stmt_vec_info stmt_info
7232 1399 : = SLP_TREE_REPRESENTATIVE (SLP_INSTANCE_TREE (instance));
7233 1399 : vect_reduc_info reduc_info
7234 1399 : = info_for_reduction (as_a <loop_vec_info> (m_vinfo),
7235 : SLP_INSTANCE_TREE (instance));
7236 1399 : if (needs_fold_left_reduction_p (TREE_TYPE
7237 : (gimple_get_lhs (stmt_info->stmt)),
7238 : VECT_REDUC_INFO_CODE (reduc_info)))
7239 : {
7240 64 : unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
7241 64 : m_partitions[m_vertices[node_i].partition].layout = 0;
7242 : }
7243 : }
7244 :
7245 : /* Check which layouts each node and partition can handle. Calculate the
7246 : weights associated with inserting layout changes on edges. */
7247 5018878 : for (unsigned int node_i : m_partitioned_nodes)
7248 : {
7249 3144193 : auto &vertex = m_vertices[node_i];
7250 3144193 : auto &partition = m_partitions[vertex.partition];
7251 3144193 : slp_tree node = vertex.node;
7252 :
7253 3144193 : if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
7254 : {
7255 3139585 : vertex.weight = vect_slp_node_weight (node);
7256 :
7257 : /* We do not handle stores with a permutation, so all
7258 : incoming permutations must have been materialized.
7259 :
7260 : We also don't handle masked grouped loads, which lack a
7261 : permutation vector. In this case the memory locations
7262 : form an implicit second input to the loads, on top of the
7263 : explicit mask input, and the memory input's layout cannot
7264 : be changed.
7265 :
7266 : On the other hand, we do support permuting gather loads and
7267 : masked gather loads, where each scalar load is independent
7268 : of the others. This can be useful if the address/index input
7269 : benefits from permutation. */
7270 3139585 : if (STMT_VINFO_DATA_REF (rep)
7271 1627231 : && STMT_VINFO_GROUPED_ACCESS (rep)
7272 4208816 : && !SLP_TREE_LOAD_PERMUTATION (node).exists ())
7273 904789 : partition.layout = 0;
7274 :
7275 : /* We cannot change the layout of an operation that is
7276 : not independent on lanes. Note this is an explicit
7277 : negative list since that's much shorter than the respective
7278 : positive one but it's critical to keep maintaining it. */
7279 3139585 : if (is_gimple_call (STMT_VINFO_STMT (rep)))
7280 23350 : switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep)))
7281 : {
7282 1071 : case CFN_COMPLEX_ADD_ROT90:
7283 1071 : case CFN_COMPLEX_ADD_ROT270:
7284 1071 : case CFN_COMPLEX_MUL:
7285 1071 : case CFN_COMPLEX_MUL_CONJ:
7286 1071 : case CFN_VEC_ADDSUB:
7287 1071 : case CFN_VEC_FMADDSUB:
7288 1071 : case CFN_VEC_FMSUBADD:
7289 1071 : partition.layout = 0;
7290 : default:;
7291 : }
7292 : }
7293 :
7294 6965685 : auto process_edge = [&](graph_edge *ud, unsigned int other_node_i)
7295 : {
7296 3821492 : auto &other_vertex = m_vertices[other_node_i];
7297 :
7298 : /* Count the number of edges from earlier partitions and the number
7299 : of edges to later partitions. */
7300 3821492 : if (other_vertex.partition < vertex.partition)
7301 1910746 : partition.in_degree += 1;
7302 : else
7303 1910746 : partition.out_degree += 1;
7304 :
7305 : /* If the current node uses the result of OTHER_NODE_I, accumulate
7306 : the effects of that. */
7307 3821492 : if (ud->src == int (node_i))
7308 : {
7309 1910746 : other_vertex.out_weight += vertex.weight;
7310 1910746 : other_vertex.out_degree += 1;
7311 : }
7312 6965685 : };
7313 3144193 : for_each_partition_edge (node_i, process_edge);
7314 : }
7315 624895 : }
7316 :
7317 : /* Return the incoming costs for node NODE_I, assuming that each input keeps
7318 : its current (provisional) choice of layout. The inputs do not necessarily
7319 : have the same layout as each other. */
7320 :
7321 : slpg_layout_cost
7322 3116 : vect_optimize_slp_pass::total_in_cost (unsigned int node_i)
7323 : {
7324 3116 : auto &vertex = m_vertices[node_i];
7325 3116 : slpg_layout_cost cost;
 : /* Only edges from earlier partitions are inputs here; each producer's
 : cost is divided among its consumers (split by out_degree) and the
 : per-input contributions are combined in parallel. */
7326 11365 : auto add_cost = [&](graph_edge *, unsigned int other_node_i)
7327 : {
7328 8249 : auto &other_vertex = m_vertices[other_node_i];
7329 8249 : if (other_vertex.partition < vertex.partition)
7330 : {
7331 5228 : auto &other_partition = m_partitions[other_vertex.partition];
7332 10456 : auto &other_costs = partition_layout_costs (other_vertex.partition,
7333 5228 : other_partition.layout);
7334 5228 : slpg_layout_cost this_cost = other_costs.in_cost;
7335 5228 : this_cost.add_serial_cost (other_costs.internal_cost);
7336 5228 : this_cost.split (other_partition.out_degree);
7337 5228 : cost.add_parallel_cost (this_cost);
7338 : }
7339 11365 : };
7340 3116 : for_each_partition_edge (node_i, add_cost);
7341 3116 : return cost;
7342 : }
7343 :
7344 : /* Return the cost of switching between layout LAYOUT1_I (at node NODE1_I)
7345 : and layout LAYOUT2_I on cross-partition use-to-def edge UD. Return
7346 : slpg_layout_cost::impossible () if the change isn't possible. */
7347 :
7348 : slpg_layout_cost
7349 678941 : vect_optimize_slp_pass::
7350 : edge_layout_cost (graph_edge *ud, unsigned int node1_i, unsigned int layout1_i,
7351 : unsigned int layout2_i)
7352 : {
 : /* UD always points use->def; map LAYOUT1_I/LAYOUT2_I onto the def and
 : use ends according to which end NODE1_I is. */
7353 678941 : auto &def_vertex = m_vertices[ud->dest];
7354 678941 : auto &use_vertex = m_vertices[ud->src];
7355 678941 : auto def_layout_i = ud->dest == int (node1_i) ? layout1_i : layout2_i;
7356 678941 : auto use_layout_i = ud->dest == int (node1_i) ? layout2_i : layout1_i;
7357 678941 : auto factor = change_layout_cost (def_vertex.node, def_layout_i,
7358 : use_layout_i);
7359 678941 : if (factor < 0)
7360 5097 : return slpg_layout_cost::impossible ();
7361 :
7362 : /* We have a choice of putting the layout change at the site of the
7363 : definition or at the site of the use. Prefer the former when
7364 : optimizing for size or when the execution frequency of the
7365 : definition is no greater than the combined execution frequencies of
7366 : the uses. When putting the layout change at the site of the definition,
7367 : divvy up the cost among all consumers. */
7368 673844 : if (m_optimize_size || def_vertex.weight <= def_vertex.out_weight)
7369 : {
7370 656852 : slpg_layout_cost cost = { def_vertex.weight * factor, m_optimize_size };
7371 656852 : cost.split (def_vertex.out_degree);
7372 656852 : return cost;
7373 : }
7374 16992 : return { use_vertex.weight * factor, m_optimize_size };
7375 : }
7376 :
7377 : /* UD represents a use-def link between FROM_NODE_I and a node in a later
7378 : partition; FROM_NODE_I could be the definition node or the use node.
7379 : The node at the other end of the link wants to use layout TO_LAYOUT_I.
7380 : Return the cost of any necessary fix-ups on edge UD, or return
7381 : slpg_layout_cost::impossible () if the change isn't possible.
7382 :
7383 : At this point, FROM_NODE_I's partition has chosen the cheapest
7384 : layout based on the information available so far, but this choice
7385 : is only provisional. */
7386 :
7387 : slpg_layout_cost
7388 178205 : vect_optimize_slp_pass::forward_cost (graph_edge *ud, unsigned int from_node_i,
7389 : unsigned int to_layout_i)
7390 : {
7391 178205 : auto &from_vertex = m_vertices[from_node_i];
7392 178205 : unsigned int from_partition_i = from_vertex.partition;
7393 178205 : slpg_partition_info &from_partition = m_partitions[from_partition_i];
 : /* The source partition's provisional layout must already be fixed. */
7394 178205 : gcc_assert (from_partition.layout >= 0);
7395 :
7396 : /* First calculate the cost on the assumption that FROM_PARTITION sticks
7397 : with its current layout preference. */
7398 178205 : slpg_layout_cost cost = slpg_layout_cost::impossible ();
7399 178205 : auto edge_cost = edge_layout_cost (ud, from_node_i,
7400 178205 : from_partition.layout, to_layout_i);
7401 178205 : if (edge_cost.is_possible ())
7402 : {
7403 351064 : auto &from_costs = partition_layout_costs (from_partition_i,
7404 175532 : from_partition.layout);
7405 175532 : cost = from_costs.in_cost;
7406 175532 : cost.add_serial_cost (from_costs.internal_cost);
7407 175532 : cost.split (from_partition.out_degree);
7408 175532 : cost.add_serial_cost (edge_cost);
7409 : }
7410 2673 : else if (from_partition.layout == 0)
7411 : /* We must allow the source partition to have layout 0 as a fallback,
7412 : in case all other options turn out to be impossible. */
7413 2673 : return cost;
7414 :
7415 : /* Take the minimum of that cost and the cost that applies if
7416 : FROM_PARTITION instead switches to TO_LAYOUT_I. */
7417 175532 : auto &direct_layout_costs = partition_layout_costs (from_partition_i,
7418 : to_layout_i);
7419 175532 : if (direct_layout_costs.is_possible ())
7420 : {
7421 158970 : slpg_layout_cost direct_cost = direct_layout_costs.in_cost;
7422 158970 : direct_cost.add_serial_cost (direct_layout_costs.internal_cost);
7423 158970 : direct_cost.split (from_partition.out_degree);
7424 158970 : if (!cost.is_possible ()
7425 158970 : || direct_cost.is_better_than (cost, m_optimize_size))
7426 42131 : cost = direct_cost;
7427 : }
7428 :
7429 175532 : return cost;
7430 : }
7431 :
7432 : /* UD represents a use-def link between TO_NODE_I and a node in an earlier
7433 : partition; TO_NODE_I could be the definition node or the use node.
7434 : The node at the other end of the link wants to use layout FROM_LAYOUT_I;
7435 : return the cost of any necessary fix-ups on edge UD, or
7436 : slpg_layout_cost::impossible () if the choice cannot be made.
7437 :
7438 : At this point, TO_NODE_I's partition has a fixed choice of layout. */
7439 :
7440 : slpg_layout_cost
7441 165372 : vect_optimize_slp_pass::backward_cost (graph_edge *ud, unsigned int to_node_i,
7442 : unsigned int from_layout_i)
7443 : {
7444 165372 : auto &to_vertex = m_vertices[to_node_i];
7445 165372 : unsigned int to_partition_i = to_vertex.partition;
7446 165372 : slpg_partition_info &to_partition = m_partitions[to_partition_i];
 : /* The destination partition's layout must already be fixed. */
7447 165372 : gcc_assert (to_partition.layout >= 0);
7448 :
7449 : /* If TO_NODE_I is a VEC_PERM_EXPR consumer, see whether it can be
7450 : adjusted for this input having layout FROM_LAYOUT_I. Assume that
7451 : any other inputs keep their current choice of layout. */
7452 165372 : auto &to_costs = partition_layout_costs (to_partition_i,
7453 : to_partition.layout);
7454 165372 : if (ud->src == int (to_node_i)
7455 165210 : && SLP_TREE_PERMUTE_P (to_vertex.node))
7456 : {
 : /* Temporarily pretend the input partition uses FROM_LAYOUT_I so
 : internal_node_cost sees the hypothetical input layout, then
 : restore the original choice. */
7457 9275 : auto &from_partition = m_partitions[m_vertices[ud->dest].partition];
7458 9275 : auto old_layout = from_partition.layout;
7459 9275 : from_partition.layout = from_layout_i;
7460 18550 : int factor = internal_node_cost (to_vertex.node, -1,
7461 9275 : to_partition.layout);
7462 9275 : from_partition.layout = old_layout;
7463 9275 : if (factor >= 0)
7464 : {
7465 8643 : slpg_layout_cost cost = to_costs.out_cost;
7466 17286 : cost.add_serial_cost ({ to_vertex.weight * factor,
7467 8643 : m_optimize_size });
7468 8643 : cost.split (to_partition.in_degree);
7469 8643 : return cost;
7470 : }
7471 : }
7472 :
7473 : /* Compute the cost if we insert any necessary layout change on edge UD. */
7474 156729 : auto edge_cost = edge_layout_cost (ud, to_node_i,
7475 156729 : to_partition.layout, from_layout_i);
7476 156729 : if (edge_cost.is_possible ())
7477 : {
7478 156729 : slpg_layout_cost cost = to_costs.out_cost;
7479 156729 : cost.add_serial_cost (to_costs.internal_cost);
7480 156729 : cost.split (to_partition.in_degree);
7481 156729 : cost.add_serial_cost (edge_cost);
7482 156729 : return cost;
7483 : }
7484 :
7485 0 : return slpg_layout_cost::impossible ();
7486 : }
7487 :
 7488             : /* Make a forward pass through the partitions, accumulating input costs.
 7489             :    Make a tentative (provisional) choice of layout for each partition,
 7490             :    ensuring that this choice still allows later partitions to keep
 7491             :    their original layout.  */
 7492             : 
 7493             : void
 7494        5313 : vect_optimize_slp_pass::forward_pass ()
 7495             : {
 7496      114876 :   for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
 7497             :        ++partition_i)
 7498             :     {
 7499      109563 :       auto &partition = m_partitions[partition_i];
 7500             : 
 7501             :       /* If the partition consists of a single VEC_PERM_EXPR, precompute
 7502             :          the incoming cost that would apply if every predecessor partition
 7503             :          keeps its current layout.  This is used within the loop below.  */
 7504      109563 :       slpg_layout_cost in_cost;
 7505      109563 :       slp_tree single_node = nullptr;
 7506      109563 :       if (partition.node_end == partition.node_begin + 1)
 7507             :         {
 7508      105721 :           unsigned int node_i = m_partitioned_nodes[partition.node_begin];
 7509      105721 :           single_node = m_vertices[node_i].node;
 7510      105721 :           if (SLP_TREE_PERMUTE_P (single_node))
 7511        3116 :             in_cost = total_in_cost (node_i);
 7512             :         }
 7513             : 
 7514             :       /* Go through the possible layouts.  Decide which ones are valid
 7515             :          for this partition and record which of the valid layouts has
 7516             :          the lowest cost.  */
             :       /* MIN_LAYOUT_COST starts as impossible (), which doubles as
             :          "no valid layout found yet".  */
 7517      109563 :       unsigned int min_layout_i = 0;
 7518      109563 :       slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
 7519      334580 :       for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
 7520             :         {
 7521      225017 :           auto &layout_costs = partition_layout_costs (partition_i, layout_i);
 7522      225017 :           if (!layout_costs.is_possible ())
 7523       50930 :             continue;
 7524             : 
 7525             :           /* If the recorded layout is already 0 then the layout cannot
 7526             :              change.  */
 7527      225017 :           if (partition.layout == 0 && layout_i != 0)
 7528             :             {
 7529       37201 :               layout_costs.mark_impossible ();
 7530       37201 :               continue;
 7531             :             }
 7532             : 
 7533      187816 :           bool is_possible = true;
 7534      380755 :           for (unsigned int order_i = partition.node_begin;
 7535      380755 :                order_i < partition.node_end; ++order_i)
 7536             :             {
 7537      204373 :               unsigned int node_i = m_partitioned_nodes[order_i];
 7538      204373 :               auto &vertex = m_vertices[node_i];
 7539             : 
 7540             :               /* Reject the layout if it is individually incompatible
 7541             :                  with any node in the partition.  */
 7542      204373 :               if (!is_compatible_layout (vertex.node, layout_i))
 7543             :                 {
 7544       10396 :                   is_possible = false;
 7545       11434 :                   break;
 7546             :                 }
 7547             : 
 7548      540786 :               auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
 7549             :                 {
 7550      346809 :                   auto &other_vertex = m_vertices[other_node_i];
 7551      346809 :                   if (other_vertex.partition < vertex.partition)
 7552             :                     {
 7553             :                       /* Accumulate the incoming costs from earlier
 7554             :                          partitions, plus the cost of any layout changes
 7555             :                          on UD itself.  */
 7556      178205 :                       auto cost = forward_cost (ud, other_node_i, layout_i);
 7557      178205 :                       if (!cost.is_possible ())
 7558        2673 :                         is_possible = false;
 7559             :                       else
 7560      175532 :                         layout_costs.in_cost.add_parallel_cost (cost);
 7561             :                     }
 7562             :                   else
 7563             :                     /* Reject the layout if it would make layout 0 impossible
 7564             :                        for later partitions.  This amounts to testing that the
 7565             :                        target supports reversing the layout change on edges
 7566             :                        to later partitions.
 7567             : 
 7568             :                        In principle, it might be possible to push a layout
 7569             :                        change all the way down a graph, so that it never
 7570             :                        needs to be reversed and so that the target doesn't
 7571             :                        need to support the reverse operation.  But it would
 7572             :                        be awkward to bail out if we hit a partition that
 7573             :                        does not support the new layout, especially since
 7574             :                        we are not dealing with a lattice.  */
 7575      168604 :                     is_possible &= edge_layout_cost (ud, other_node_i, 0,
 7576      168604 :                                                      layout_i).is_possible ();
 7577      540786 :                 };
 7578      193977 :               for_each_partition_edge (node_i, add_cost);
 7579             : 
 7580             :               /* Accumulate the cost of using LAYOUT_I within NODE,
 7581             :                  both for the inputs and the outputs.  */
 7582      193977 :               int factor = internal_node_cost (vertex.node, layout_i,
 7583             :                                                layout_i);
 7584      193977 :               if (factor < 0)
 7585             :                 {
 7586        1038 :                   is_possible = false;
 7587        1038 :                   break;
 7588             :                 }
 7589      192939 :               else if (factor)
 7590       31482 :                 layout_costs.internal_cost.add_serial_cost
 7591       31482 :                   ({ vertex.weight * factor, m_optimize_size });
 7592             :             }
 7593      187816 :           if (!is_possible)
 7594             :             {
 7595       13729 :               layout_costs.mark_impossible ();
 7596       13729 :               continue;
 7597             :             }
 7598             : 
 7599             :           /* Combine the incoming and partition-internal costs.  */
 7600      174087 :           slpg_layout_cost combined_cost = layout_costs.in_cost;
 7601      174087 :           combined_cost.add_serial_cost (layout_costs.internal_cost);
 7602             : 
 7603             :           /* If this partition consists of a single VEC_PERM_EXPR, see
 7604             :              if the VEC_PERM_EXPR can be changed to support output layout
 7605             :              LAYOUT_I while keeping all the provisional choices of input
 7606             :              layout.  */
 7607      174087 :           if (single_node && SLP_TREE_PERMUTE_P (single_node))
 7608             :             {
 7609        5418 :               int factor = internal_node_cost (single_node, -1, layout_i);
 7610        5418 :               if (factor >= 0)
 7611             :                 {
 7612        4973 :                   auto weight = m_vertices[single_node->vertex].weight;
 7613        4973 :                   slpg_layout_cost internal_cost
 7614        4973 :                     = { weight * factor, m_optimize_size };
 7615             : 
 7616        4973 :                   slpg_layout_cost alt_cost = in_cost;
 7617        4973 :                   alt_cost.add_serial_cost (internal_cost);
 7618        4973 :                   if (alt_cost.is_better_than (combined_cost, m_optimize_size))
 7619             :                     {
 7620        1577 :                       combined_cost = alt_cost;
 7621        1577 :                       layout_costs.in_cost = in_cost;
 7622        1577 :                       layout_costs.internal_cost = internal_cost;
 7623             :                     }
 7624             :                 }
 7625             :             }
 7626             : 
 7627             :           /* Record the layout with the lowest cost.  Prefer layout 0 in
 7628             :              the event of a tie between it and another layout.  */
 7629      174087 :           if (!min_layout_cost.is_possible ()
 7630       64524 :               || combined_cost.is_better_than (min_layout_cost,
 7631       64524 :                                                m_optimize_size))
 7632             :             {
 7633      123198 :               min_layout_i = layout_i;
 7634      123198 :               min_layout_cost = combined_cost;
 7635             :             }
 7636             :         }
 7637             : 
 7638             :       /* This loop's handling of earlier partitions should ensure that
 7639             :          choosing the original layout for the current partition is no
 7640             :          less valid than it was in the original graph, even with the
 7641             :          provisional layout choices for those earlier partitions.  */
 7642      109563 :       gcc_assert (min_layout_cost.is_possible ());
 7643      109563 :       partition.layout = min_layout_i;
 7644             :     }
 7645        5313 : }
7646 :
 7647             : /* Make a backward pass through the partitions, accumulating output costs.
 7648             :    Make a final choice of layout for each partition.  */
 7649             : 
 7650             : void
 7651        5313 : vect_optimize_slp_pass::backward_pass ()
 7652             : {
             :   /* Visit partitions in reverse order so that every successor partition
             :      has already been given its final layout by the time we cost the
             :      edges to it (backward_cost asserts this).  */
 7653      120189 :   for (unsigned int partition_i = m_partitions.length (); partition_i-- > 0;)
 7654             :     {
 7655      109563 :       auto &partition = m_partitions[partition_i];
 7656             : 
 7657      109563 :       unsigned int min_layout_i = 0;
 7658      109563 :       slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
 7659      334580 :       for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
 7660             :         {
 7661      225017 :           auto &layout_costs = partition_layout_costs (partition_i, layout_i);
 7662      225017 :           if (!layout_costs.is_possible ())
 7663       50930 :             continue;
 7664             : 
 7665             :           /* Accumulate the costs from successor partitions.  */
 7666      174087 :           bool is_possible = true;
 7667      364700 :           for (unsigned int order_i = partition.node_begin;
 7668      364700 :                order_i < partition.node_end; ++order_i)
 7669             :             {
 7670      190613 :               unsigned int node_i = m_partitioned_nodes[order_i];
 7671      190613 :               auto &vertex = m_vertices[node_i];
 7672      531388 :               auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
 7673             :                 {
 7674      340775 :                   auto &other_vertex = m_vertices[other_node_i];
 7675      340775 :                   auto &other_partition = m_partitions[other_vertex.partition];
 7676      340775 :                   if (other_vertex.partition > vertex.partition)
 7677             :                     {
 7678             :                       /* Accumulate the incoming costs from later
 7679             :                          partitions, plus the cost of any layout changes
 7680             :                          on UD itself.  */
 7681      165372 :                       auto cost = backward_cost (ud, other_node_i, layout_i);
 7682      165372 :                       if (!cost.is_possible ())
 7683           0 :                         is_possible = false;
 7684             :                       else
 7685      165372 :                         layout_costs.out_cost.add_parallel_cost (cost);
 7686             :                     }
 7687             :                   else
 7688             :                     /* Make sure that earlier partitions can (if necessary
 7689             :                        or beneficial) keep the layout that they chose in
 7690             :                        the forward pass.  This ensures that there is at
 7691             :                        least one valid choice of layout.  */
 7692      175403 :                     is_possible &= edge_layout_cost (ud, other_node_i,
 7693      175403 :                                                      other_partition.layout,
 7694      175403 :                                                      layout_i).is_possible ();
 7695      531388 :                 };
 7696      190613 :               for_each_partition_edge (node_i, add_cost);
 7697             :             }
 7698      174087 :           if (!is_possible)
 7699             :             {
 7700           0 :               layout_costs.mark_impossible ();
 7701           0 :               continue;
 7702             :             }
 7703             : 
 7704             :           /* Locally combine the costs from the forward and backward passes.
 7705             :              (This combined cost is not passed on, since that would lead
 7706             :              to double counting.)  */
 7707      174087 :           slpg_layout_cost combined_cost = layout_costs.in_cost;
 7708      174087 :           combined_cost.add_serial_cost (layout_costs.internal_cost);
 7709      174087 :           combined_cost.add_serial_cost (layout_costs.out_cost);
 7710             : 
 7711             :           /* Record the layout with the lowest cost.  Prefer layout 0 in
 7712             :              the event of a tie between it and another layout.  */
 7713      174087 :           if (!min_layout_cost.is_possible ()
 7714       64524 :               || combined_cost.is_better_than (min_layout_cost,
 7715       64524 :                                                m_optimize_size))
 7716             :             {
 7717      117726 :               min_layout_i = layout_i;
 7718      117726 :               min_layout_cost = combined_cost;
 7719             :             }
 7720             :         }
 7721             : 
 7722      109563 :       gcc_assert (min_layout_cost.is_possible ());
 7723      109563 :       partition.layout = min_layout_i;
 7724             :     }
 7725        5313 : }
7726 :
 7727             : /* Return a node that applies layout TO_LAYOUT_I to the original form of NODE.
 7728             :    NODE already has the layout that was selected for its partition.  */
 7729             : 
 7730             : slp_tree
 7731      146340 : vect_optimize_slp_pass::get_result_with_layout (slp_tree node,
 7732             :                                                 unsigned int to_layout_i)
 7733             : {
             :   /* Results are memoized per (node, layout) pair in m_node_layouts,
             :      so repeated requests for the same combination return the same
             :      node.  */
 7734      146340 :   unsigned int result_i = node->vertex * m_perms.length () + to_layout_i;
 7735      146340 :   slp_tree result = m_node_layouts[result_i];
 7736      146340 :   if (result)
 7737             :     return result;
 7738             : 
 7739      145874 :   if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
 7740      145874 :       || (SLP_TREE_DEF_TYPE (node) == vect_external_def
 7741             :           /* We can't permute vector defs in place.  */
 7742       20405 :           && SLP_TREE_VEC_DEFS (node).is_empty ()))
 7743             :     {
 7744             :       /* If the vector is uniform or unchanged, there's nothing to do.  */
 7745       37920 :       if (to_layout_i == 0 || vect_slp_tree_uniform_p (node))
 7746             :         result = node;
 7747             :       else
 7748             :         {
 7749        1996 :           auto scalar_ops = SLP_TREE_SCALAR_OPS (node).copy ();
 7750        1996 :           result = vect_create_new_slp_node (scalar_ops);
 7751        1996 :           vect_slp_permute (m_perms[to_layout_i], scalar_ops, true);
 7752             :         }
 7753             :     }
 7754             :   else
 7755             :     {
 7756      107954 :       unsigned int partition_i = m_vertices[node->vertex].partition;
 7757      107954 :       unsigned int from_layout_i = m_partitions[partition_i].layout;
 7758      107954 :       if (from_layout_i == to_layout_i)
 7759      107418 :         return node;
 7760             : 
 7761             :       /* If NODE is itself a VEC_PERM_EXPR, try to create a parallel
 7762             :          permutation instead of a serial one.  Leave the new permutation
 7763             :          in TMP_PERM on success.  */
 7764         536 :       auto_lane_permutation_t tmp_perm;
 7765         536 :       unsigned int num_inputs = 1;
 7766         536 :       if (SLP_TREE_PERMUTE_P (node))
 7767             :         {
 7768           7 :           tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
 7769           7 :           if (from_layout_i != 0)
 7770           7 :             vect_slp_permute (m_perms[from_layout_i], tmp_perm, false);
 7771           7 :           if (to_layout_i != 0)
 7772           4 :             vect_slp_permute (m_perms[to_layout_i], tmp_perm, true);
 7773           7 :           if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
 7774             :                                               tmp_perm,
 7775           7 :                                               SLP_TREE_CHILDREN (node),
 7776             :                                               false) >= 0)
 7777           7 :             num_inputs = SLP_TREE_CHILDREN (node).length ();
 7778             :           else
 7779           0 :             tmp_perm.truncate (0);
 7780             :         }
 7781             : 
 7782         536 :       if (dump_enabled_p ())
 7783             :         {
 7784          68 :           if (tmp_perm.length () > 0)
 7785           6 :             dump_printf_loc (MSG_NOTE, vect_location,
 7786             :                              "duplicating permutation node %p with"
 7787             :                              " layout %d\n",
 7788             :                              (void *) node, to_layout_i);
 7789             :           else
 7790          62 :             dump_printf_loc (MSG_NOTE, vect_location,
 7791             :                              "inserting permutation node in place of %p\n",
 7792             :                              (void *) node);
 7793             :         }
 7794             : 
 7795         536 :       unsigned int num_lanes = SLP_TREE_LANES (node);
 7796         536 :       result = vect_create_new_slp_node (num_inputs, VEC_PERM_EXPR);
 7797         536 :       if (SLP_TREE_SCALAR_STMTS (node).length ())
 7798             :         {
 7799         535 :           auto &stmts = SLP_TREE_SCALAR_STMTS (result);
 7800         535 :           stmts.safe_splice (SLP_TREE_SCALAR_STMTS (node));
 7801         535 :           if (from_layout_i != 0)
 7802         269 :             vect_slp_permute (m_perms[from_layout_i], stmts, false);
 7803         535 :           if (to_layout_i != 0)
 7804         270 :             vect_slp_permute (m_perms[to_layout_i], stmts, true);
 7805             :         }
 7806         536 :       SLP_TREE_REPRESENTATIVE (result) = SLP_TREE_REPRESENTATIVE (node);
 7807         536 :       SLP_TREE_LANES (result) = num_lanes;
 7808         536 :       SLP_TREE_VECTYPE (result) = SLP_TREE_VECTYPE (node);
 7809         536 :       result->vertex = -1;
 7810             : 
 7811         536 :       auto &lane_perm = SLP_TREE_LANE_PERMUTATION (result);
 7812         536 :       if (tmp_perm.length ())
 7813             :         {
 7814           7 :           lane_perm.safe_splice (tmp_perm);
 7815           7 :           SLP_TREE_CHILDREN (result).safe_splice (SLP_TREE_CHILDREN (node));
 7816             :         }
 7817             :       else
 7818             :         {
 7819         529 :           lane_perm.create (num_lanes);
 7820        1651 :           for (unsigned j = 0; j < num_lanes; ++j)
 7821        1122 :             lane_perm.quick_push ({ 0, j });
 7822         529 :           if (from_layout_i != 0)
 7823         262 :             vect_slp_permute (m_perms[from_layout_i], lane_perm, false);
 7824         529 :           if (to_layout_i != 0)
 7825         267 :             vect_slp_permute (m_perms[to_layout_i], lane_perm, true);
 7826         529 :           SLP_TREE_CHILDREN (result).safe_push (node);
 7827             :         }
 7828        2148 :       for (slp_tree child : SLP_TREE_CHILDREN (result))
 7829         540 :         child->refcnt++;
 7830         536 :     }
 7831       38456 :   m_node_layouts[result_i] = result;
 7832       38456 :   return result;
 7833             : }
7834 :
 7835             : /* Apply the chosen vector layouts to the SLP graph.  */
 7836             : 
 7837             : void
 7838       10181 : vect_optimize_slp_pass::materialize ()
 7839             : {
 7840             :   /* We no longer need the costs, so avoid having two O(N * P) arrays
 7841             :      live at the same time.  */
 7842       10181 :   m_partition_layout_costs.release ();
 7843       30543 :   m_node_layouts.safe_grow_cleared (m_vertices.length () * m_perms.length ());
 7844             : 
             :   /* Bits in FULLY_FOLDED mark VEC_PERM nodes whose permutation has
             :      already absorbed the input layouts, so their children need no
             :      further fix-up in the second loop below.  */
 7845       20362 :   auto_sbitmap fully_folded (m_vertices.length ());
 7846       10181 :   bitmap_clear (fully_folded);
 7847      157029 :   for (unsigned int node_i : m_partitioned_nodes)
 7848             :     {
 7849      126486 :       auto &vertex = m_vertices[node_i];
 7850      126486 :       slp_tree node = vertex.node;
 7851      126486 :       int layout_i = m_partitions[vertex.partition].layout;
 7852      126486 :       gcc_assert (layout_i >= 0);
 7853             : 
 7854             :       /* Rearrange the scalar statements to match the chosen layout.  */
 7855      126486 :       if (layout_i > 0)
 7856       15730 :         vect_slp_permute (m_perms[layout_i],
 7857       15730 :                           SLP_TREE_SCALAR_STMTS (node), true);
 7858             : 
 7859             :       /* Update load and lane permutations.  */
 7860      126486 :       if (SLP_TREE_PERMUTE_P (node))
 7861             :         {
 7862             :           /* First try to absorb the input vector layouts.  If that fails,
 7863             :              force the inputs to have layout LAYOUT_I too.  We checked that
 7864             :              that was possible before deciding to use nonzero output layouts.
 7865             :              (Note that at this stage we don't really have any guarantee that
 7866             :              the target supports the original VEC_PERM_EXPR.)  */
 7867        4519 :           auto &perm = SLP_TREE_LANE_PERMUTATION (node);
 7868        4519 :           auto_lane_permutation_t tmp_perm;
 7869        4519 :           tmp_perm.safe_splice (perm);
 7870        4519 :           change_vec_perm_layout (node, tmp_perm, -1, layout_i);
 7871        4519 :           if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
 7872             :                                               tmp_perm,
 7873        4519 :                                               SLP_TREE_CHILDREN (node),
 7874             :                                               false) >= 0)
 7875             :             {
 7876        4150 :               if (dump_enabled_p ()
 7877        5042 :                   && !std::equal (tmp_perm.begin (), tmp_perm.end (),
 7878             :                                   perm.begin ()))
 7879          58 :                 dump_printf_loc (MSG_NOTE, vect_location,
 7880             :                                  "absorbing input layouts into %p\n",
 7881             :                                  (void *) node);
 7882       23827 :               std::copy (tmp_perm.begin (), tmp_perm.end (), perm.begin ());
 7883        4150 :               bitmap_set_bit (fully_folded, node_i);
 7884             :             }
 7885             :           else
 7886             :             {
 7887             :               /* Not MSG_MISSED because it would make no sense to users.  */
 7888         369 :               if (dump_enabled_p ())
 7889          46 :                 dump_printf_loc (MSG_NOTE, vect_location,
 7890             :                                  "failed to absorb input layouts into %p\n",
 7891             :                                  (void *) node);
 7892         369 :               change_vec_perm_layout (nullptr, perm, layout_i, layout_i);
 7893             :             }
 7894        4519 :         }
 7895             :       else
 7896             :         {
 7897      121967 :           gcc_assert (!SLP_TREE_LANE_PERMUTATION (node).exists ());
 7898      121967 :           auto &load_perm = SLP_TREE_LOAD_PERMUTATION (node);
 7899      121967 :           if (layout_i > 0)
 7900             :             /* ???  When we handle non-bijective permutes the idea
 7901             :                is that we can force the load-permutation to be
 7902             :                { min, min + 1, min + 2, ... max }.  But then the
 7903             :                scalar defs might no longer match the lane content
 7904             :                which means wrong-code with live lane vectorization.
 7905             :                So we possibly have to have NULL entries for those.  */
 7906       15627 :             vect_slp_permute (m_perms[layout_i], load_perm, true);
 7907             :         }
 7908             :     }
 7909             : 
 7910             :   /* Do this before any nodes disappear, since it involves a walk
 7911             :      over the leaves.  */
 7912       10181 :   remove_redundant_permutations ();
 7913             : 
 7914             :   /* Replace each child with a correctly laid-out version.  */
 7915      157029 :   for (unsigned int node_i : m_partitioned_nodes)
 7916             :     {
 7917             :       /* Skip nodes that have already been handled above.  */
 7918      126486 :       if (bitmap_bit_p (fully_folded, node_i))
 7919        4150 :         continue;
 7920             : 
 7921      122336 :       auto &vertex = m_vertices[node_i];
 7922      122336 :       int in_layout_i = m_partitions[vertex.partition].layout;
 7923      122336 :       gcc_assert (in_layout_i >= 0);
 7924             : 
 7925             :       unsigned j;
 7926             :       slp_tree child;
 7927      363310 :       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (vertex.node), j, child)
 7928             :         {
 7929      149997 :           if (!child)
 7930        3657 :             continue;
 7931             : 
 7932      146340 :           slp_tree new_child = get_result_with_layout (child, in_layout_i);
 7933      146340 :           if (new_child != child)
 7934             :             {
 7935        2741 :               vect_free_slp_tree (child);
 7936        2741 :               SLP_TREE_CHILDREN (vertex.node)[j] = new_child;
 7937        2741 :               new_child->refcnt += 1;
 7938             :             }
 7939             :         }
 7940             :     }
 7941       10181 : }
7942 :
 7943             : /* Elide load permutations that are not necessary.  Such permutations might
 7944             :    be pre-existing, rather than created by the layout optimizations.  */
 7945             : 
 7946             : void
 7947      624895 : vect_optimize_slp_pass::remove_redundant_permutations ()
 7948             : {
             :   /* Only the graph's leaf nodes are inspected here; nodes without a
             :      load permutation are skipped immediately.  */
 7949     4141306 :   for (unsigned int node_i : m_leafs)
 7950             :     {
 7951     2266621 :       slp_tree node = m_vertices[node_i].node;
 7952     2266621 :       if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
 7953     1744403 :         continue;
 7954             : 
 7955             :       /* In basic block vectorization we allow any subchain of an interleaving
 7956             :          chain.
 7957             :          FORNOW: not in loop SLP because of realignment complications.  */
 7958      522218 :       if (is_a <bb_vec_info> (m_vinfo))
 7959             :         {
 7960      157899 :           bool subchain_p = true;
 7961             :           stmt_vec_info next_load_info = NULL;
 7962             :           stmt_vec_info load_info;
 7963             :           unsigned j;
 7964      157899 :           FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
 7965             :             {
 7966      128540 :               if (j != 0
 7967      128540 :                   && (next_load_info != load_info
 7968       61094 :                       || ! load_info
 7969       61094 :                       || DR_GROUP_GAP (load_info) != 1))
 7970             :                 {
 7971             :                   subchain_p = false;
 7972             :                   break;
 7973             :                 }
 7974      106027 :               next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
 7975             :             }
 7976       51872 :           if (subchain_p)
 7977             :             {
 7978       29359 :               SLP_TREE_LOAD_PERMUTATION (node).release ();
 7979       29359 :               continue;
 7980             :             }
 7981             :         }
 7982             :       else
 7983             :         {
 7984      470346 :           loop_vec_info loop_vinfo = as_a<loop_vec_info> (m_vinfo);
 7985      470346 :           bool this_load_permuted = !vect_load_perm_consecutive_p (node, 0);
 7986             :           /* When this isn't a grouped access we know it's single element
 7987             :              and contiguous.  */
 7988      470346 :           if (!STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (node)[0]))
 7989             :             {
 7990      357776 :               if (!this_load_permuted
 7991      357776 :                   && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
 7992      357162 :                       || SLP_TREE_LANES (node) == 1))
 7993      357151 :                 SLP_TREE_LOAD_PERMUTATION (node).release ();
 7994      357776 :               continue;
 7995             :             }
 7996      112570 :           stmt_vec_info first_stmt_info
 7997      112570 :             = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
 7998      112971 :           if (!this_load_permuted
 7999             :               /* The load requires permutation when unrolling exposes
 8000             :                  a gap either because the group is larger than the SLP
 8001             :                  group-size or because there is a gap between the groups.  */
 8002      112570 :               && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
 8003       95323 :                   || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
 8004         124 :                       && DR_GROUP_GAP (first_stmt_info) == 0)))
 8005             :             {
 8006         401 :               SLP_TREE_LOAD_PERMUTATION (node).release ();
 8007         401 :               continue;
 8008             :             }
 8009             :         }
 8010             :     }
 8011      624895 : }
8012 :
 8013             : /* Print the partition graph and layout information to the dump file.  */
 8014             : 
 8015             : void
 8016         659 : vect_optimize_slp_pass::dump ()
 8017             : {
 8018         659 :   dump_printf_loc (MSG_NOTE, vect_location,
 8019             :                    "SLP optimize permutations:\n");
             :   /* Start at 1: layout 0 stands for the original lane order and has
             :      no permutation vector worth printing.  */
 8020        1331 :   for (unsigned int layout_i = 1; layout_i < m_perms.length (); ++layout_i)
 8021             :     {
 8022         672 :       dump_printf_loc (MSG_NOTE, vect_location, "  %d: { ", layout_i);
 8023         672 :       const char *sep = "";
 8024        5769 :       for (unsigned int idx : m_perms[layout_i])
 8025             :         {
 8026        3753 :           dump_printf (MSG_NOTE, "%s%d", sep, idx);
 8027        3753 :           sep = ", ";
 8028             :         }
 8029         672 :       dump_printf (MSG_NOTE, " }\n");
 8030             :     }
 8031         659 :   dump_printf_loc (MSG_NOTE, vect_location,
 8032             :                    "SLP optimize partitions:\n");
 8033        5420 :   for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
 8034             :        ++partition_i)
 8035             :     {
 8036        4761 :       auto &partition = m_partitions[partition_i];
 8037        4761 :       dump_printf_loc (MSG_NOTE, vect_location,  "  -------------\n");
 8038        4761 :       dump_printf_loc (MSG_NOTE, vect_location,
 8039             :                        "  partition %d (layout %d):\n",
 8040             :                        partition_i, partition.layout);
 8041        4761 :       dump_printf_loc (MSG_NOTE, vect_location, "    nodes:\n");
 8042        9750 :       for (unsigned int order_i = partition.node_begin;
 8043        9750 :            order_i < partition.node_end; ++order_i)
 8044             :         {
 8045        4989 :           auto &vertex = m_vertices[m_partitioned_nodes[order_i]];
 8046        9978 :           dump_printf_loc (MSG_NOTE, vect_location, "      - %p:\n",
 8047        4989 :                            (void *) vertex.node);
 8048        4989 :           dump_printf_loc (MSG_NOTE, vect_location,
 8049             :                            "          weight: %f\n",
 8050             :                            vertex.weight.to_double ());
 8051        4989 :           if (vertex.out_degree)
 8052        3888 :             dump_printf_loc (MSG_NOTE, vect_location,
 8053             :                              "          out weight: %f (degree %d)\n",
 8054             :                              vertex.out_weight.to_double (),
 8055             :                              vertex.out_degree);
 8056        4989 :           if (SLP_TREE_PERMUTE_P (vertex.node))
 8057         492 :             dump_printf_loc (MSG_NOTE, vect_location,
 8058             :                              "          op: VEC_PERM_EXPR\n");
 8059        4497 :           else if (auto rep = SLP_TREE_REPRESENTATIVE (vertex.node))
 8060        4479 :             dump_printf_loc (MSG_NOTE, vect_location,
 8061             :                              "          op template: %G", rep->stmt);
 8062             :         }
 8063        4761 :       dump_printf_loc (MSG_NOTE, vect_location, "    edges:\n");
 8064        9750 :       for (unsigned int order_i = partition.node_begin;
 8065        9750 :            order_i < partition.node_end; ++order_i)
 8066             :         {
 8067        4989 :           unsigned int node_i = m_partitioned_nodes[order_i];
 8068        4989 :           auto &vertex = m_vertices[node_i];
 8069       15041 :           auto print_edge = [&](graph_edge *, unsigned int other_node_i)
 8070             :             {
 8071       10052 :               auto &other_vertex = m_vertices[other_node_i];
 8072       10052 :               if (other_vertex.partition < vertex.partition)
 8073        5026 :                 dump_printf_loc (MSG_NOTE, vect_location,
 8074             :                                  "      - %p [%d] --> %p\n",
 8075        5026 :                                  (void *) other_vertex.node,
 8076             :                                  other_vertex.partition,
 8077        5026 :                                  (void *) vertex.node);
 8078             :               else
 8079        5026 :                 dump_printf_loc (MSG_NOTE, vect_location,
 8080             :                                  "      - %p --> [%d] %p\n",
 8081        5026 :                                  (void *) vertex.node,
 8082             :                                  other_vertex.partition,
 8083        5026 :                                  (void *) other_vertex.node);
 8084       15041 :             };
 8085        4989 :           for_each_partition_edge (node_i, print_edge);
 8086             :         }
 8087             : 
 8088       14482 :       for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
 8089             :         {
 8090        9721 :           auto &layout_costs = partition_layout_costs (partition_i, layout_i);
 8091        9721 :           if (layout_costs.is_possible ())
 8092             :             {
 8093        7976 :               dump_printf_loc (MSG_NOTE, vect_location,
 8094             :                                "    layout %d:%s\n", layout_i,
 8095        7976 :                                partition.layout == int (layout_i)
 8096             :                                ? " (*)" : "");
 8097        7976 :               slpg_layout_cost combined_cost = layout_costs.in_cost;
 8098        7976 :               combined_cost.add_serial_cost (layout_costs.internal_cost);
 8099        7976 :               combined_cost.add_serial_cost (layout_costs.out_cost);
 8100             : #define TEMPLATE "{depth: %f, total: %f}"
 8101        7976 :               dump_printf_loc (MSG_NOTE, vect_location,
 8102             :                                "        " TEMPLATE "\n",
 8103             :                                layout_costs.in_cost.depth.to_double (),
 8104             :                                layout_costs.in_cost.total.to_double ());
 8105        7976 :               dump_printf_loc (MSG_NOTE, vect_location,
 8106             :                                "      + " TEMPLATE "\n",
 8107             :                                layout_costs.internal_cost.depth.to_double (),
 8108             :                                layout_costs.internal_cost.total.to_double ());
 8109        7976 :               dump_printf_loc (MSG_NOTE, vect_location,
 8110             :                                "      + " TEMPLATE "\n",
 8111             :                                layout_costs.out_cost.depth.to_double (),
 8112             :                                layout_costs.out_cost.total.to_double ());
 8113        7976 :               dump_printf_loc (MSG_NOTE, vect_location,
 8114             :                                "      = " TEMPLATE "\n",
 8115             :                                combined_cost.depth.to_double (),
 8116             :                                combined_cost.total.to_double ());
 8117             : #undef TEMPLATE
 8118             :             }
 8119             :           else
 8120        1745 :             dump_printf_loc (MSG_NOTE, vect_location,
 8121             :                              "    layout %d: rejected\n", layout_i);
 8122             :         }
 8123             :     }
 8124         659 : }
8125 :
 8126             : /* Masked load lanes discovery.  */
             : /* Look for grouped IFN_MASK_LOAD nodes with a uniform mask whose
             :    producer and consumer sides both fit the load-lanes pattern, and
             :    mark them (and their consumer permutes) with ldst_lanes.  */
 8127             : 
 8128             : void
 8129      624895 : vect_optimize_slp_pass::decide_masked_load_lanes ()
 8130             : {
 8131     6426343 :   for (auto v : m_vertices)
 8132             :     {
 8133     4551658 :       slp_tree node = v.node;
 8134     4551658 :       if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
 8135     3142474 :           || SLP_TREE_PERMUTE_P (node))
 8136     1540042 :         continue;
 8137     3011616 :       stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
 8138     1511957 :       if (! STMT_VINFO_GROUPED_ACCESS (stmt_info)
 8139             :           /* The mask has to be uniform.  */
 8140      954308 :           || STMT_VINFO_SLP_VECT_ONLY (stmt_info)
 8141      954177 :           || ! is_a <gcall *> (STMT_VINFO_STMT (stmt_info))
 8142     3011701 :           || ! gimple_call_internal_p (STMT_VINFO_STMT (stmt_info),
 8143             :                                        IFN_MASK_LOAD))
 8144     3011583 :         continue;
 8145          33 :       stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
 8146          66 :       if (STMT_VINFO_STRIDED_P (stmt_info)
 8147          33 :           || compare_step_with_zero (m_vinfo, stmt_info) <= 0
 8148          63 :           || vect_load_lanes_supported (SLP_TREE_VECTYPE (node),
 8149          30 :                                         DR_GROUP_SIZE (stmt_info),
 8150             :                                         true) == IFN_LAST)
 8151          33 :         continue;
 8152             : 
 8153             :       /* Uniform masks need to be suitably represented.  */
 8154           0 :       slp_tree mask = SLP_TREE_CHILDREN (node)[0];
 8155           0 :       if (!SLP_TREE_PERMUTE_P (mask)
 8156           0 :           || SLP_TREE_CHILDREN (mask).length () != 1)
 8157           0 :         continue;
 8158           0 :       bool match = true;
             :       /* A splat permute has every entry selecting lane 0 of input 0.  */
 8159           0 :       for (auto perm : SLP_TREE_LANE_PERMUTATION (mask))
 8160           0 :         if (perm.first != 0 || perm.second != 0)
 8161             :           {
 8162             :             match = false;
 8163             :             break;
 8164             :           }
 8165           0 :       if (!match)
 8166           0 :         continue;
 8167             : 
 8168             :       /* Now see if the consumer side matches.  */
 8169           0 :       for (graph_edge *pred = m_slpg->vertices[node->vertex].pred;
 8170           0 :            pred; pred = pred->pred_next)
 8171             :         {
 8172           0 :           slp_tree pred_node = m_vertices[pred->src].node;
 8173             :           /* All consumers should be a permute with a single outgoing lane.  */
 8174           0 :           if (!SLP_TREE_PERMUTE_P (pred_node)
 8175           0 :               || SLP_TREE_LANES (pred_node) != 1)
 8176             :             {
 8177             :               match = false;
 8178             :               break;
 8179             :             }
 8180           0 :           gcc_assert (SLP_TREE_CHILDREN (pred_node).length () == 1);
 8181             :         }
 8182           0 :       if (!match)
 8183           0 :         continue;
 8184             :       /* Now we can mark the nodes as to use load lanes.  */
 8185           0 :       node->ldst_lanes = true;
 8186           0 :       for (graph_edge *pred = m_slpg->vertices[node->vertex].pred;
 8187           0 :            pred; pred = pred->pred_next)
 8188           0 :         m_vertices[pred->src].node->ldst_lanes = true;
 8189             :       /* The catch is we have to massage the mask.  We have arranged
 8190             :          analyzed uniform masks to be represented by a splat VEC_PERM
 8191             :          which we can now simply elide as we cannot easily re-do SLP
 8192             :          discovery here.  */
 8193           0 :       slp_tree new_mask = SLP_TREE_CHILDREN (mask)[0];
 8194           0 :       SLP_TREE_REF_COUNT (new_mask)++;
 8195           0 :       SLP_TREE_CHILDREN (node)[0] = new_mask;
 8196           0 :       vect_free_slp_tree (mask);
 8197             :     }
 8198      624895 : }
8199 :
 8200             : /* Perform legitimizing attempts.  This is intended to improve the
 8201             :    situation when layout 0 is not valid which is a situation the cost
 8202             :    based propagation does not handle well.
 8203             :    Return true if further layout optimization is possible, false if
 8204             :    the layout configuration should be considered final.  */
 8205             : 
 8206             : bool
 8207       10181 : vect_optimize_slp_pass::legitimize ()
 8208             : {
 8209             :   /* Perform a very simple legitimizing attempt by attempting to choose
 8210             :      a single layout for all partitions that will make all permutations
 8211             :      a noop.  That should also be the optimal layout choice in case
 8212             :      layout zero is legitimate.
 8213             :      ??? Disconnected components of the SLP graph could have distinct
 8214             :      single layouts.  */
 8215       10181 :   int single_layout_i = -1;
             :   /* Index of the first partition with a recorded layout preference;
             :      partitions before it are compatibility-checked later, once the
             :      candidate layout is known.  */
 8216       10181 :   unsigned deferred_up_to = -1U;
 8217       30787 :   for (unsigned partition_i = 0; partition_i < m_partitions.length ();
 8218             :        ++partition_i)
 8219             :     {
 8220       25913 :       auto &partition = m_partitions[partition_i];
 8221       25913 :       if (single_layout_i == -1)
 8222             :         {
 8223       13374 :           single_layout_i = partition.layout;
 8224       13374 :           deferred_up_to = partition_i;
 8225             :         }
 8226       12539 :       else if (partition.layout == single_layout_i || partition.layout == -1)
 8227             :         ;
 8228             :       else
 8229             :         single_layout_i = 0;
 8230       22675 :       if (single_layout_i == 0)
 8231             :         return true;
 8232             : 
 8233       20666 :       if (single_layout_i != -1
 8234       20666 :           && !is_compatible_layout (partition, single_layout_i))
 8235             :         return true;
 8236             :     }
 8237             : 
 8238        4874 :   if (single_layout_i <= 0)
 8239             :     return true;
 8240             : 
 8241        4990 :   for (unsigned partition_i = 0; partition_i < deferred_up_to; ++partition_i)
 8242         122 :     if (!is_compatible_layout (m_partitions[partition_i],
 8243             :                                single_layout_i))
 8244             :       return true;
 8245             : 
             :   /* Success: impose the single layout everywhere and report the
             :      configuration as final.  */
 8246       12161 :   for (unsigned partition_i = 0; partition_i < m_partitions.length ();
 8247             :        ++partition_i)
 8248             :     {
 8249        7293 :       auto &partition = m_partitions[partition_i];
 8250        7293 :       partition.layout = single_layout_i;
 8251             :     }
 8252             : 
 8253             :   return false;
 8254             : }
8255 :
8256 : /* Main entry point for the SLP graph optimization pass. */
8257 :
8258 : void
8259 624895 : vect_optimize_slp_pass::run ()
8260 : {
8261 624895 : build_graph ();
8262 624895 : create_partitions ();
8263 624895 : start_choosing_layouts ();
8264 624895 : if (m_perms.length () > 1)
8265 : {
8266 10181 : if (legitimize ())
8267 : {
8268 5313 : forward_pass ();
8269 5313 : backward_pass ();
8270 : }
8271 10181 : if (dump_enabled_p ())
8272 659 : dump ();
8273 10181 : materialize ();
8274 41143 : while (!m_perms.is_empty ())
8275 20781 : m_perms.pop ().release ();
8276 : }
8277 : else
8278 614714 : remove_redundant_permutations ();
8279 624895 : free_graph (m_slpg);
8280 624895 : build_graph ();
8281 624895 : decide_masked_load_lanes ();
8282 624895 : free_graph (m_slpg);
8283 624895 : }
8284 :
8285 : /* Apply CSE to NODE and its children using BST_MAP. */
8286 :
8287 : static void
8288 4870820 : vect_cse_slp_nodes (scalar_stmts_to_slp_tree_map_t *bst_map, slp_tree& node)
8289 : {
8290 4870820 : bool put_p = false;
8291 4870820 : if (SLP_TREE_DEF_TYPE (node) == vect_internal_def
8292 : /* Besides some VEC_PERM_EXPR, two-operator nodes also
8293 : lack scalar stmts and thus CSE doesn't work via bst_map. Ideally
8294 : we'd have sth that works for all internal and external nodes. */
8295 4870820 : && !SLP_TREE_SCALAR_STMTS (node).is_empty ())
8296 : {
8297 3439877 : slp_tree *leader = bst_map->get (SLP_TREE_SCALAR_STMTS (node));
8298 3439877 : if (leader)
8299 : {
8300 : /* We've visited this node already. */
8301 321026 : if (!*leader || *leader == node)
8302 : return;
8303 :
8304 2432 : if (dump_enabled_p ())
8305 887 : dump_printf_loc (MSG_NOTE, vect_location,
8306 : "re-using SLP tree %p for %p\n",
8307 : (void *)*leader, (void *)node);
8308 2432 : vect_free_slp_tree (node);
8309 2432 : (*leader)->refcnt += 1;
8310 2432 : node = *leader;
8311 2432 : return;
8312 : }
8313 :
8314 : /* Avoid creating a cycle by populating the map only after recursion. */
8315 3118851 : bst_map->put (SLP_TREE_SCALAR_STMTS (node).copy (), nullptr);
8316 3118851 : node->refcnt += 1;
8317 3118851 : put_p = true;
8318 : /* And recurse. */
8319 : }
8320 :
8321 13422534 : for (slp_tree &child : SLP_TREE_CHILDREN (node))
8322 3867294 : if (child)
8323 3489817 : vect_cse_slp_nodes (bst_map, child);
8324 :
8325 : /* Now record the node for CSE in other siblings. */
8326 4549794 : if (put_p)
8327 3118851 : *bst_map->get (SLP_TREE_SCALAR_STMTS (node)) = node;
8328 : }
8329 :
8330 : /* Optimize the SLP graph of VINFO. */
8331 :
8332 : void
8333 968348 : vect_optimize_slp (vec_info *vinfo)
8334 : {
8335 968348 : if (vinfo->slp_instances.is_empty ())
8336 : return;
8337 624895 : vect_optimize_slp_pass (vinfo).run ();
8338 :
8339 : /* Apply CSE again to nodes after permute optimization. */
8340 624895 : scalar_stmts_to_slp_tree_map_t *bst_map
8341 624895 : = new scalar_stmts_to_slp_tree_map_t ();
8342 :
8343 3255688 : for (auto inst : vinfo->slp_instances)
8344 1381003 : vect_cse_slp_nodes (bst_map, SLP_INSTANCE_TREE (inst));
8345 :
8346 624895 : release_scalar_stmts_to_slp_tree_map (bst_map);
8347 : }
8348 :
8349 : /* Gather loads reachable from the individual SLP graph entries. */
8350 :
8351 : void
8352 968348 : vect_gather_slp_loads (vec_info *vinfo)
8353 : {
8354 968348 : unsigned i;
8355 968348 : slp_instance instance;
8356 2349351 : FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
8357 : {
8358 1381003 : hash_set<slp_tree> visited;
8359 1381003 : vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
8360 : SLP_INSTANCE_TREE (instance), visited);
8361 1381003 : }
8362 968348 : }
8363 :
8364 : /* For NODE update VF based on the number of lanes and the vector types
8365 : used. */
8366 :
8367 : static void
8368 3583313 : vect_update_slp_vf_for_node (slp_tree node, poly_uint64 &vf,
8369 : hash_set<slp_tree> &visited)
8370 : {
8371 3583313 : if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
8372 1287971 : return;
8373 2569238 : if (visited.add (node))
8374 : return;
8375 :
8376 8667821 : for (slp_tree child : SLP_TREE_CHILDREN (node))
8377 2919087 : vect_update_slp_vf_for_node (child, vf, visited);
8378 :
8379 : /* We do not visit SLP nodes for constants or externals - those neither
8380 : have a vector type set yet (vectorizable_* does this) nor do they
8381 : have max_nunits set. Instead we rely on internal nodes max_nunit
8382 : to cover constant/external operands.
8383 : Note that when we stop using fixed size vectors externs and constants
8384 : shouldn't influence the (minimum) vectorization factor, instead
8385 : vectorizable_* should honor the vectorization factor when trying to
8386 : assign vector types to constants and externals and cause iteration
8387 : to a higher vectorization factor when required. */
8388 2295342 : poly_uint64 node_vf
8389 2295342 : = calculate_unrolling_factor (node->max_nunits, SLP_TREE_LANES (node));
8390 2295342 : vf = force_common_multiple (vf, node_vf);
8391 :
8392 : /* For permute nodes that are fed from externs or constants we have to
8393 : consider their number of lanes as well. Likewise for store-lanes. */
8394 2295342 : if (SLP_TREE_PERMUTE_P (node) || node->ldst_lanes)
8395 645924 : for (slp_tree child : SLP_TREE_CHILDREN (node))
8396 172008 : if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
8397 : {
8398 2858 : poly_uint64 child_vf
8399 2858 : = calculate_unrolling_factor (node->max_nunits,
8400 : SLP_TREE_LANES (child));
8401 2858 : vf = force_common_multiple (vf, child_vf);
8402 : }
8403 : }
8404 :
8405 : /* For each possible SLP instance decide whether to SLP it and calculate overall
8406 : unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
8407 : least one instance. */
8408 :
8409 : bool
8410 405823 : vect_make_slp_decision (loop_vec_info loop_vinfo)
8411 : {
8412 405823 : unsigned int i;
8413 405823 : poly_uint64 unrolling_factor = 1;
8414 405823 : const vec<slp_instance> &slp_instances
8415 : = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
8416 405823 : slp_instance instance;
8417 405823 : int decided_to_slp = 0;
8418 :
8419 405823 : DUMP_VECT_SCOPE ("vect_make_slp_decision");
8420 :
8421 405823 : hash_set<slp_tree> visited;
8422 1070049 : FOR_EACH_VEC_ELT (slp_instances, i, instance)
8423 : {
8424 664226 : slp_tree root = SLP_INSTANCE_TREE (instance);
8425 :
8426 : /* All unroll factors have the form:
8427 :
8428 : GET_MODE_SIZE (vinfo->vector_mode) * X
8429 :
8430 : for some rational X, so they must have a common multiple. */
8431 664226 : vect_update_slp_vf_for_node (root, unrolling_factor, visited);
8432 :
8433 : /* If all instances ended up with vector(1) T roots make sure to
8434 : not vectorize. RVV for example relies on loop vectorization
8435 : when some instances are essentially kept scalar. See PR121048. */
8436 664226 : if (SLP_TREE_VECTYPE (root)
8437 664226 : && known_gt (TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (root)), 1U))
8438 546073 : decided_to_slp++;
8439 : }
8440 :
8441 405823 : LOOP_VINFO_VECT_FACTOR (loop_vinfo) = unrolling_factor;
8442 :
8443 405823 : if (decided_to_slp && dump_enabled_p ())
8444 : {
8445 18419 : dump_printf_loc (MSG_NOTE, vect_location,
8446 : "Decided to SLP %d instances. Unrolling factor ",
8447 : decided_to_slp);
8448 18419 : dump_dec (MSG_NOTE, unrolling_factor);
8449 18419 : dump_printf (MSG_NOTE, "\n");
8450 : }
8451 :
8452 405823 : return (decided_to_slp > 0);
8453 405823 : }
8454 :
8455 : /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
8456 :
8457 2183824 : _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
8458 : : vec_info (vec_info::bb, shared),
8459 2183824 : roots (vNULL)
8460 : {
8461 : /* The region we are operating on. bbs[0] is the entry, excluding
8462 : its PHI nodes. In the future we might want to track an explicit
8463 : entry edge to cover bbs[0] PHI nodes and have a region entry
8464 : insert location. */
8465 2183824 : bbs = _bbs.address ();
8466 2183824 : nbbs = _bbs.length ();
8467 :
8468 17723259 : for (unsigned i = 0; i < nbbs; ++i)
8469 : {
8470 15539435 : if (i != 0)
8471 20291617 : for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
8472 6936006 : gsi_next (&si))
8473 : {
8474 6936006 : gphi *phi = si.phi ();
8475 6936006 : gimple_set_uid (phi, 0);
8476 6936006 : add_stmt (phi);
8477 : }
8478 31078870 : for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
8479 134980207 : !gsi_end_p (gsi); gsi_next (&gsi))
8480 : {
8481 119440772 : gimple *stmt = gsi_stmt (gsi);
8482 119440772 : gimple_set_uid (stmt, 0);
8483 119440772 : if (is_gimple_debug (stmt))
8484 74171204 : continue;
8485 45269568 : add_stmt (stmt);
8486 : }
8487 : }
8488 2183824 : }
8489 :
8490 :
8491 : /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
8492 : stmts in the basic block. */
8493 :
8494 2183824 : _bb_vec_info::~_bb_vec_info ()
8495 : {
8496 : /* Reset region marker. */
8497 17723259 : for (unsigned i = 0; i < nbbs; ++i)
8498 : {
8499 15539435 : if (i != 0)
8500 20307345 : for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
8501 6951734 : gsi_next (&si))
8502 : {
8503 6951734 : gphi *phi = si.phi ();
8504 6951734 : gimple_set_uid (phi, -1);
8505 : }
8506 31078870 : for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
8507 134931004 : !gsi_end_p (gsi); gsi_next (&gsi))
8508 : {
8509 119391569 : gimple *stmt = gsi_stmt (gsi);
8510 119391569 : gimple_set_uid (stmt, -1);
8511 : }
8512 : }
8513 :
8514 3388288 : for (unsigned i = 0; i < roots.length (); ++i)
8515 : {
8516 1204464 : roots[i].stmts.release ();
8517 1204464 : roots[i].roots.release ();
8518 1204464 : roots[i].remain.release ();
8519 : }
8520 2183824 : roots.release ();
8521 2183824 : }
8522 :
8523 : /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
8524 : given then that child nodes have already been processed, and that
8525 : their def types currently match their SLP node's def type. */
8526 :
8527 : static bool
8528 2429037 : vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
8529 : slp_instance node_instance,
8530 : stmt_vector_for_cost *cost_vec)
8531 : {
8532 : /* Handle purely internal nodes. */
8533 2429037 : if (SLP_TREE_PERMUTE_P (node))
8534 : {
8535 99113 : if (!vectorizable_slp_permutation (vinfo, NULL, node, cost_vec))
8536 : return false;
8537 :
8538 : stmt_vec_info slp_stmt_info;
8539 : unsigned int i;
8540 256648 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
8541 : {
8542 158864 : if (slp_stmt_info
8543 153919 : && STMT_VINFO_LIVE_P (slp_stmt_info)
8544 158864 : && !vectorizable_live_operation (vinfo, slp_stmt_info, node,
8545 : node_instance, i,
8546 : false, cost_vec))
8547 : return false;
8548 : }
8549 97784 : SLP_TREE_TYPE (node) = permute_info_type;
8550 97784 : return true;
8551 : }
8552 :
8553 2329924 : return vect_analyze_stmt (vinfo, node, node_instance, cost_vec);
8554 : }
8555 :
8556 : static int
8557 1845264 : sort_ints (const void *a_, const void *b_)
8558 : {
8559 1845264 : int a = *(const int *)a_;
8560 1845264 : int b = *(const int *)b_;
8561 1845264 : return a - b;
8562 : }
8563 :
8564 : /* Verify if we can externalize a set of internal defs. */
8565 :
8566 : static bool
8567 379584 : vect_slp_can_convert_to_external (const vec<stmt_vec_info> &stmts)
8568 : {
8569 : /* Constant generation uses get_later_stmt which can only handle
8570 : defs from the same BB or a set of defs that can be ordered
8571 : with a dominance query. */
8572 379584 : basic_block bb = NULL;
8573 379584 : bool all_same = true;
8574 379584 : auto_vec<int> bbs;
8575 759168 : bbs.reserve_exact (stmts.length ());
8576 2052826 : for (stmt_vec_info stmt : stmts)
8577 : {
8578 914074 : if (!stmt)
8579 : return false;
8580 914074 : else if (!bb)
8581 379584 : bb = gimple_bb (stmt->stmt);
8582 534490 : else if (gimple_bb (stmt->stmt) != bb)
8583 172108 : all_same = false;
8584 914074 : bbs.quick_push (gimple_bb (stmt->stmt)->index);
8585 : }
8586 379584 : if (all_same)
8587 : return true;
8588 :
8589 : /* Produce a vector of unique BB indexes for the defs. */
8590 129040 : bbs.qsort (sort_ints);
8591 : unsigned i, j;
8592 314220 : for (i = 1, j = 1; i < bbs.length (); ++i)
8593 185180 : if (bbs[i] != bbs[j-1])
8594 137844 : bbs[j++] = bbs[i];
8595 129040 : gcc_assert (j >= 2);
8596 129040 : bbs.truncate (j);
8597 :
8598 258080 : if (bbs.length () == 2)
8599 125527 : return (dominated_by_p (CDI_DOMINATORS,
8600 125527 : BASIC_BLOCK_FOR_FN (cfun, bbs[0]),
8601 125527 : BASIC_BLOCK_FOR_FN (cfun, bbs[1]))
8602 244320 : || dominated_by_p (CDI_DOMINATORS,
8603 118793 : BASIC_BLOCK_FOR_FN (cfun, bbs[1]),
8604 118793 : BASIC_BLOCK_FOR_FN (cfun, bbs[0])));
8605 :
8606 : /* ??? For more than two BBs we can sort the vector and verify the
8607 : result is a total order. But we can't use vec::qsort with a
8608 : compare function using a dominance query since there's no way to
8609 : signal failure and any fallback for an unordered pair would
8610 : fail qsort_chk later.
8611 : For now simply hope that ordering after BB index provides the
8612 : best candidate total order. If required we can implement our
8613 : own mergesort or export an entry without checking. */
8614 395361 : for (unsigned i = 1; i < bbs.length (); ++i)
8615 12293 : if (!dominated_by_p (CDI_DOMINATORS,
8616 12293 : BASIC_BLOCK_FOR_FN (cfun, bbs[i]),
8617 12293 : BASIC_BLOCK_FOR_FN (cfun, bbs[i-1])))
8618 : return false;
8619 :
8620 : return true;
8621 379584 : }
8622 :
8623 : /* Try to build NODE from scalars, returning true on success.
8624 : NODE_INSTANCE is the SLP instance that contains NODE. */
8625 :
8626 : static bool
8627 543873 : vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
8628 : slp_instance node_instance)
8629 : {
8630 543873 : stmt_vec_info stmt_info;
8631 543873 : unsigned int i;
8632 :
8633 543873 : if (!is_a <bb_vec_info> (vinfo)
8634 70703 : || node == SLP_INSTANCE_TREE (node_instance)
8635 22344 : || !SLP_TREE_SCALAR_STMTS (node).exists ()
8636 22303 : || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node))
8637 : /* Force the mask use to be built from scalars instead. */
8638 19998 : || VECTOR_BOOLEAN_TYPE_P (SLP_TREE_VECTYPE (node))
8639 563656 : || !vect_slp_can_convert_to_external (SLP_TREE_SCALAR_STMTS (node)))
8640 524090 : return false;
8641 :
8642 19783 : if (dump_enabled_p ())
8643 70 : dump_printf_loc (MSG_NOTE, vect_location,
8644 : "Building vector operands of %p from scalars instead\n",
8645 : (void *) node);
8646 :
8647 : /* Don't remove and free the child nodes here, since they could be
8648 : referenced by other structures. The analysis and scheduling phases
8649 : (need to) ignore child nodes of anything that isn't vect_internal_def. */
8650 19783 : unsigned int group_size = SLP_TREE_LANES (node);
8651 19783 : SLP_TREE_DEF_TYPE (node) = vect_external_def;
8652 : /* Invariants get their vector type from the uses. */
8653 19783 : SLP_TREE_VECTYPE (node) = NULL_TREE;
8654 19783 : SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
8655 19783 : SLP_TREE_LOAD_PERMUTATION (node).release ();
8656 68867 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
8657 : {
8658 49084 : tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
8659 49084 : SLP_TREE_SCALAR_OPS (node)[i] = lhs;
8660 : }
8661 : return true;
8662 : }
8663 :
8664 : /* Return true if all elements of the slice are the same. */
8665 : bool
8666 452321 : vect_scalar_ops_slice::all_same_p () const
8667 : {
8668 499556 : for (unsigned int i = 1; i < length; ++i)
8669 421818 : if (!operand_equal_p (op (0), op (i)))
8670 : return false;
8671 : return true;
8672 : }
8673 :
8674 : hashval_t
8675 392222 : vect_scalar_ops_slice_hash::hash (const value_type &s)
8676 : {
8677 392222 : hashval_t hash = 0;
8678 1516199 : for (unsigned i = 0; i < s.length; ++i)
8679 1123977 : hash = iterative_hash_expr (s.op (i), hash);
8680 392222 : return hash;
8681 : }
8682 :
8683 : bool
8684 213458 : vect_scalar_ops_slice_hash::equal (const value_type &s1,
8685 : const compare_type &s2)
8686 : {
8687 213458 : if (s1.length != s2.length)
8688 : return false;
8689 370451 : for (unsigned i = 0; i < s1.length; ++i)
8690 323861 : if (!operand_equal_p (s1.op (i), s2.op (i)))
8691 : return false;
8692 : return true;
8693 : }
8694 :
8695 : /* Compute the prologue cost for invariant or constant operands represented
8696 : by NODE. */
8697 :
8698 : static void
8699 1035475 : vect_prologue_cost_for_slp (vec_info *vinfo, slp_tree node,
8700 : stmt_vector_for_cost *cost_vec)
8701 : {
8702 : /* There's a special case of an existing vector, that costs nothing. */
8703 1035475 : if (SLP_TREE_SCALAR_OPS (node).length () == 0
8704 1035475 : && !SLP_TREE_VEC_DEFS (node).is_empty ())
8705 1576 : return;
8706 : /* Without looking at the actual initializer a vector of
8707 : constants can be implemented as load from the constant pool.
8708 : When all elements are the same we can use a splat. */
8709 1033899 : tree vectype = SLP_TREE_VECTYPE (node);
8710 1033899 : unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
8711 1033899 : unsigned HOST_WIDE_INT const_nunits;
8712 1033899 : unsigned nelt_limit;
8713 1033899 : unsigned nvectors = vect_get_num_copies (vinfo, node);
8714 1033899 : auto ops = &SLP_TREE_SCALAR_OPS (node);
8715 1033899 : auto_vec<unsigned int> starts (nvectors);
8716 1033899 : if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
8717 1033899 : && ! multiple_p (const_nunits, group_size))
8718 : {
8719 62585 : nelt_limit = const_nunits;
8720 62585 : hash_set<vect_scalar_ops_slice_hash> vector_ops;
8721 258561 : for (unsigned int i = 0; i < nvectors; ++i)
8722 195976 : if (!vector_ops.add ({ ops, i * nelt_limit, nelt_limit }))
8723 149386 : starts.quick_push (i * nelt_limit);
8724 62585 : }
8725 : else
8726 : {
8727 : /* If either the vector has variable length or the vectors
8728 : are composed of repeated whole groups we only need to
8729 : cost construction once. All vectors will be the same. */
8730 971314 : nelt_limit = group_size;
8731 971314 : starts.quick_push (0);
8732 : }
8733 : /* ??? We're just tracking whether vectors in a single node are the same.
8734 : Ideally we'd do something more global. */
8735 1033899 : bool passed = false;
8736 4222397 : for (unsigned int start : starts)
8737 : {
8738 1120700 : vect_cost_for_stmt kind;
8739 1120700 : if (SLP_TREE_DEF_TYPE (node) == vect_constant_def)
8740 : kind = vector_load;
8741 452321 : else if (vect_scalar_ops_slice { ops, start, nelt_limit }.all_same_p ())
8742 : kind = scalar_to_vec;
8743 : else
8744 374583 : kind = vec_construct;
8745 : /* The target cost hook has no idea which part of the SLP node
8746 : we are costing so avoid passing it down more than once. Pass
8747 : it to the first vec_construct or scalar_to_vec part since for those
8748 : the x86 backend tries to account for GPR to XMM register moves. */
8749 1120700 : record_stmt_cost (cost_vec, 1, kind, nullptr,
8750 1120700 : (kind != vector_load && !passed) ? node : nullptr,
8751 : vectype, 0, vect_prologue);
8752 1120700 : if (kind != vector_load)
8753 452321 : passed = true;
8754 : }
8755 1033899 : }
8756 :
8757 : /* Analyze statements contained in SLP tree NODE after recursively analyzing
8758 : the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
8759 :
8760 : Return true if the operations are supported. */
8761 :
8762 : static bool
8763 4541890 : vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
8764 : slp_instance node_instance,
8765 : hash_set<slp_tree> &visited_set,
8766 : vec<slp_tree> &visited_vec,
8767 : stmt_vector_for_cost *cost_vec)
8768 : {
8769 4541890 : int i, j;
8770 4541890 : slp_tree child;
8771 :
8772 : /* Assume we can code-generate all invariants. */
8773 4541890 : if (!node
8774 4220139 : || SLP_TREE_DEF_TYPE (node) == vect_constant_def
8775 3504742 : || SLP_TREE_DEF_TYPE (node) == vect_external_def)
8776 : return true;
8777 :
8778 3000487 : if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
8779 : {
8780 9 : if (dump_enabled_p ())
8781 0 : dump_printf_loc (MSG_NOTE, vect_location,
8782 : "Failed cyclic SLP reference in %p\n", (void *) node);
8783 9 : return false;
8784 : }
8785 3000478 : gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
8786 :
8787 : /* If we already analyzed the exact same set of scalar stmts we're done.
8788 : We share the generated vector stmts for those. */
8789 3000478 : if (visited_set.add (node))
8790 : return true;
8791 2731270 : visited_vec.safe_push (node);
8792 :
8793 2731270 : bool res = true;
8794 2731270 : unsigned visited_rec_start = visited_vec.length ();
8795 2731270 : unsigned cost_vec_rec_start = cost_vec->length ();
8796 2731270 : bool seen_non_constant_child = false;
8797 5746041 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
8798 : {
8799 3316779 : res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
8800 : visited_set, visited_vec,
8801 : cost_vec);
8802 3316779 : if (!res)
8803 : break;
8804 3014771 : if (child && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
8805 3014771 : seen_non_constant_child = true;
8806 : }
8807 : /* We're having difficulties scheduling nodes with just constant
8808 : operands and no scalar stmts since we then cannot compute a stmt
8809 : insertion place. */
8810 2731270 : if (res
8811 2731270 : && !seen_non_constant_child
8812 2731270 : && SLP_TREE_SCALAR_STMTS (node).is_empty ())
8813 : {
8814 225 : if (dump_enabled_p ())
8815 6 : dump_printf_loc (MSG_NOTE, vect_location,
8816 : "Cannot vectorize all-constant op node %p\n",
8817 : (void *) node);
8818 : res = false;
8819 : }
8820 :
8821 2731045 : if (res)
8822 2429037 : res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
8823 : cost_vec);
8824 : /* If analysis failed we have to pop all recursive visited nodes
8825 : plus ourselves. */
8826 2731270 : if (!res)
8827 : {
8828 2687000 : while (visited_vec.length () >= visited_rec_start)
8829 799627 : visited_set.remove (visited_vec.pop ());
8830 543873 : cost_vec->truncate (cost_vec_rec_start);
8831 : }
8832 :
8833 : /* When the node can be vectorized cost invariant nodes it references.
8834 : This is not done in DFS order to allow the referring node
8835 : vectorizable_* calls to nail down the invariant nodes vector type
8836 : and possibly unshare it if it needs a different vector type than
8837 : other referrers. */
8838 2731270 : if (res)
8839 4910795 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
8840 2723398 : if (child
8841 2466297 : && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
8842 2466297 : || SLP_TREE_DEF_TYPE (child) == vect_external_def)
8843 : /* Perform usual caching, note code-generation still
8844 : code-gens these nodes multiple times but we expect
8845 : to CSE them later. */
8846 3824571 : && !visited_set.add (child))
8847 : {
8848 1076524 : visited_vec.safe_push (child);
8849 : /* ??? After auditing more code paths make a "default"
8850 : and push the vector type from NODE to all children
8851 : if it is not already set. */
8852 : /* Compute the number of vectors to be generated. */
8853 1076524 : tree vector_type = SLP_TREE_VECTYPE (child);
8854 1076524 : if (!vector_type)
8855 : {
8856 : /* Masked loads can have an undefined (default SSA definition)
8857 : else operand. We do not need to cost it. */
8858 41049 : vec<tree> ops = SLP_TREE_SCALAR_OPS (child);
8859 42099 : if (SLP_TREE_TYPE (node) == load_vec_info_type
8860 42099 : && ((ops.length ()
8861 1050 : && TREE_CODE (ops[0]) == SSA_NAME
8862 0 : && SSA_NAME_IS_DEFAULT_DEF (ops[0])
8863 0 : && VAR_P (SSA_NAME_VAR (ops[0])))
8864 1050 : || SLP_TREE_DEF_TYPE (child) == vect_constant_def))
8865 1050 : continue;
8866 :
8867 : /* For shifts with a scalar argument we don't need
8868 : to cost or code-generate anything.
8869 : ??? Represent this more explicitly. */
8870 39999 : gcc_assert (SLP_TREE_TYPE (node) == shift_vec_info_type
8871 : && j == 1);
8872 39999 : continue;
8873 39999 : }
8874 :
8875 : /* And cost them. */
8876 1035475 : vect_prologue_cost_for_slp (vinfo, child, cost_vec);
8877 : }
8878 :
8879 : /* If this node or any of its children can't be vectorized, try pruning
8880 : the tree here rather than felling the whole thing. */
8881 543873 : if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
8882 : {
8883 : /* We'll need to revisit this for invariant costing and number
8884 : of vectorized stmt setting. */
8885 : res = true;
8886 : }
8887 :
8888 : return res;
8889 : }
8890 :
8891 : /* Given a definition DEF, analyze if it will have any live scalar use after
8892 : performing SLP vectorization whose information is represented by BB_VINFO,
8893 : and record result into hash map SCALAR_USE_MAP as cache for later fast
8894 : check. If recursion DEPTH exceeds a limit, stop analysis and make a
8895 : conservative assumption. Return 0 if no scalar use, 1 if there is, -1
8896 : means recursion is limited. */
8897 :
8898 : static int
8899 582084 : vec_slp_has_scalar_use (bb_vec_info bb_vinfo, tree def,
8900 : hash_map<tree, int> &scalar_use_map,
8901 : int depth = 0)
8902 : {
8903 582084 : const int depth_limit = 3;
8904 582084 : imm_use_iterator use_iter;
8905 582084 : gimple *use_stmt;
8906 :
8907 582084 : if (int *res = scalar_use_map.get (def))
8908 25386 : return *res;
8909 :
8910 556698 : int scalar_use = 1;
8911 :
8912 1822214 : FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
8913 : {
8914 838250 : if (is_gimple_debug (use_stmt))
8915 183602 : continue;
8916 :
8917 654648 : stmt_vec_info use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
8918 :
8919 654648 : if (!use_stmt_info)
8920 : break;
8921 :
8922 657915 : if (PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
8923 521398 : continue;
8924 :
8925 : /* Do not step forward when encounter PHI statement, since it may
8926 : involve cyclic reference and cause infinite recursive invocation. */
8927 127194 : if (gimple_code (use_stmt) == GIMPLE_PHI)
8928 : break;
8929 :
8930 : /* When pattern recognition is involved, a statement whose definition is
8931 : consumed in some pattern, may not be included in the final replacement
8932 : pattern statements, so would be skipped when building SLP graph.
8933 :
8934 : * Original
8935 : char a_c = *(char *) a;
8936 : char b_c = *(char *) b;
8937 : unsigned short a_s = (unsigned short) a_c;
8938 : int a_i = (int) a_s;
8939 : int b_i = (int) b_c;
8940 : int r_i = a_i - b_i;
8941 :
8942 : * After pattern replacement
8943 : a_s = (unsigned short) a_c;
8944 : a_i = (int) a_s;
8945 :
8946 : patt_b_s = (unsigned short) b_c; // b_i = (int) b_c
8947 : patt_b_i = (int) patt_b_s; // b_i = (int) b_c
8948 :
8949 : patt_r_s = widen_minus(a_c, b_c); // r_i = a_i - b_i
8950 : patt_r_i = (int) patt_r_s; // r_i = a_i - b_i
8951 :
8952 : The definitions of a_i(original statement) and b_i(pattern statement)
8953 : are related to, but actually not part of widen_minus pattern.
8954 : Vectorizing the pattern does not cause these definition statements to
8955 : be marked as PURE_SLP. For this case, we need to recursively check
8956 : whether their uses are all absorbed into vectorized code. But there
8957 : is an exception that some use may participate in an vectorized
8958 : operation via an external SLP node containing that use as an element.
8959 : The parameter "scalar_use_map" tags such kind of SSA as having scalar
8960 : use in advance. */
8961 107944 : tree lhs = gimple_get_lhs (use_stmt);
8962 :
8963 107944 : if (!lhs || TREE_CODE (lhs) != SSA_NAME)
8964 : break;
8965 :
8966 73217 : if (depth_limit && depth >= depth_limit)
8967 8937 : return -1;
8968 :
8969 64280 : if ((scalar_use = vec_slp_has_scalar_use (bb_vinfo, lhs, scalar_use_map,
8970 : depth + 1)))
8971 : break;
8972 8937 : }
8973 :
8974 547761 : if (end_imm_use_stmt_p (&use_iter))
8975 427266 : scalar_use = 0;
8976 :
8977 : /* If recursion is limited, do not cache result for non-root defs. */
8978 547761 : if (!depth || scalar_use >= 0)
8979 : {
8980 529887 : bool added = scalar_use_map.put (def, scalar_use);
8981 529887 : gcc_assert (!added);
8982 : }
8983 :
8984 547761 : return scalar_use;
8985 : }
8986 :
8987 : /* Mark lanes of NODE that are live outside of the basic-block vectorized
8988 : region and that can be vectorized using vectorizable_live_operation
8989 : with STMT_VINFO_LIVE_P. Not handled live operations will cause the
8990 : scalar code computing it to be retained. */
8991 :
8992 : static void
8993 909370 : vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
8994 : slp_instance instance,
8995 : stmt_vector_for_cost *cost_vec,
8996 : hash_map<tree, int> &scalar_use_map,
8997 : hash_set<stmt_vec_info> &svisited,
8998 : hash_set<slp_tree> &visited)
8999 : {
9000 909370 : if (visited.add (node))
9001 41636 : return;
9002 :
9003 867734 : unsigned i;
9004 867734 : stmt_vec_info stmt_info;
9005 867734 : stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
9006 3142192 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
9007 : {
9008 2274458 : if (!stmt_info || svisited.contains (stmt_info))
9009 30788 : continue;
9010 2252531 : stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
9011 2252531 : if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
9012 11959 : && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
9013 : /* Only the pattern root stmt computes the original scalar value. */
9014 8861 : continue;
9015 2243670 : bool mark_visited = true;
9016 2243670 : gimple *orig_stmt = orig_stmt_info->stmt;
9017 2243670 : ssa_op_iter op_iter;
9018 2243670 : def_operand_p def_p;
9019 5005144 : FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
9020 : {
9021 517804 : if (vec_slp_has_scalar_use (bb_vinfo, DEF_FROM_PTR (def_p),
9022 : scalar_use_map))
9023 : {
9024 93938 : STMT_VINFO_LIVE_P (stmt_info) = true;
9025 93938 : if (vectorizable_live_operation (bb_vinfo, stmt_info, node,
9026 : instance, i, false, cost_vec))
9027 : /* ??? So we know we can vectorize the live stmt from one SLP
9028 : node. If we cannot do so from all or none consistently
9029 : we'd have to record which SLP node (and lane) we want to
9030 : use for the live operation. So make sure we can
9031 : code-generate from all nodes. */
9032 : mark_visited = false;
9033 : else
9034 0 : STMT_VINFO_LIVE_P (stmt_info) = false;
9035 : }
9036 :
9037 : /* We have to verify whether we can insert the lane extract
9038 : before all uses. The following is a conservative approximation.
9039 : We cannot put this into vectorizable_live_operation because
9040 : iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
9041 : doesn't work.
9042 : Note that while the fact that we emit code for loads at the
9043 : first load should make this a non-problem leafs we construct
9044 : from scalars are vectorized after the last scalar def.
9045 : ??? If we'd actually compute the insert location during
9046 : analysis we could use sth less conservative than the last
9047 : scalar stmt in the node for the dominance check. */
9048 : /* ??? What remains is "live" uses in vector CTORs in the same
9049 : SLP graph which is where those uses can end up code-generated
9050 : right after their definition instead of close to their original
9051 : use. But that would restrict us to code-generate lane-extracts
9052 : from the latest stmt in a node. So we compensate for this
9053 : during code-generation, simply not replacing uses for those
9054 : hopefully rare cases. */
9055 517804 : imm_use_iterator use_iter;
9056 517804 : gimple *use_stmt;
9057 517804 : stmt_vec_info use_stmt_info;
9058 :
9059 517804 : if (STMT_VINFO_LIVE_P (stmt_info))
9060 626756 : FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
9061 438880 : if (!is_gimple_debug (use_stmt)
9062 330276 : && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
9063 320787 : || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
9064 621082 : && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
9065 : {
9066 17552 : if (dump_enabled_p ())
9067 57 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9068 : "Cannot determine insertion place for "
9069 : "lane extract\n");
9070 17552 : STMT_VINFO_LIVE_P (stmt_info) = false;
9071 17552 : mark_visited = true;
9072 93938 : }
9073 : }
9074 2243670 : if (mark_visited)
9075 2164312 : svisited.add (stmt_info);
9076 : }
9077 :
9078 : slp_tree child;
9079 2506636 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9080 877332 : if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
9081 232880 : vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance, cost_vec,
9082 : scalar_use_map, svisited, visited);
9083 : }
9084 :
9085 : /* Traverse all slp instances of BB_VINFO, and mark lanes of every node that
9086 : are live outside of the basic-block vectorized region and that can be
9087 : vectorized using vectorizable_live_operation with STMT_VINFO_LIVE_P. */
9088 :
static void
vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo)
{
  if (bb_vinfo->slp_instances.is_empty ())
    return;

  hash_set<stmt_vec_info> svisited;
  hash_set<slp_tree> visited;
  hash_map<tree, int> scalar_use_map;
  auto_vec<slp_tree> worklist;

  /* Seed the worklist with all instance roots and record SSA names that
     remain scalar uses of a BB reduction (the "remain" defs) in
     SCALAR_USE_MAP.  hash_set::add returns true when the element was
     already present, so each root is pushed at most once.  */
  for (slp_instance instance : bb_vinfo->slp_instances)
    {
      if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc)
	for (tree op : SLP_INSTANCE_REMAIN_DEFS (instance))
	  if (TREE_CODE (op) == SSA_NAME)
	    scalar_use_map.put (op, 1);
      if (!visited.add (SLP_INSTANCE_TREE (instance)))
	worklist.safe_push (SLP_INSTANCE_TREE (instance));
    }

  /* Walk the whole SLP graph once, collecting into SCALAR_USE_MAP every
     SSA name that appears as a scalar operand of an external node —
     such defs have scalar uses even after vectorization.  */
  do
    {
      slp_tree node = worklist.pop ();

      if (SLP_TREE_DEF_TYPE (node) == vect_external_def)
	{
	  for (tree op : SLP_TREE_SCALAR_OPS (node))
	    if (TREE_CODE (op) == SSA_NAME)
	      scalar_use_map.put (op, 1);
	}
      else
	{
	  for (slp_tree child : SLP_TREE_CHILDREN (node))
	    if (child && !visited.add (child))
	      worklist.safe_push (child);
	}
    }
  while (!worklist.is_empty ());

  /* Re-use VISITED for the marking walk below.  */
  visited.empty ();

  /* Now do the real work: mark live lanes on each instance's tree via
     the recursive worker overload.  */
  for (slp_instance instance : bb_vinfo->slp_instances)
    {
      vect_location = instance->location ();
      vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
				   instance, &instance->cost_vec,
				   scalar_use_map, svisited, visited);
    }
}
9139 :
9140 : /* Determine whether we can vectorize the reduction epilogue for INSTANCE. */
9141 :
static bool
vectorizable_bb_reduc_epilogue (slp_instance instance,
				stmt_vector_for_cost *cost_vec)
{
  /* The root of a bb_reduc instance is the scalar stmt performing the
     final reduction step.  */
  gassign *stmt = as_a <gassign *> (instance->root_stmts[0]->stmt);
  enum tree_code reduc_code = gimple_assign_rhs_code (stmt);
  /* Look up the epilogue for a MINUS chain via PLUS — the lanes are
     combined by addition; presumably the sign is accounted for during
     instance discovery (TODO confirm).  */
  if (reduc_code == MINUS_EXPR)
    reduc_code = PLUS_EXPR;
  internal_fn reduc_fn;
  tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
  /* Require a vector type, a direct target-supported horizontal
     reduction IFN for the scalar code, and an element type compatible
     with the scalar result.  */
  if (!vectype
      || !reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
      || reduc_fn == IFN_LAST
      || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH)
      || !useless_type_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
				     TREE_TYPE (vectype)))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: basic block reduction epilogue "
			 "operation unsupported.\n");
      return false;
    }

  /* There's no way to cost a horizontal vector reduction via REDUC_FN so
     cost log2 vector operations plus shuffles and one extraction.  */
  unsigned steps = floor_log2 (vect_nunits_for_cost (vectype));
  record_stmt_cost (cost_vec, steps, vector_stmt, instance->root_stmts[0],
		    vectype, 0, vect_body);
  record_stmt_cost (cost_vec, steps, vec_perm, instance->root_stmts[0],
		    vectype, 0, vect_body);
  record_stmt_cost (cost_vec, 1, vec_to_scalar, instance->root_stmts[0],
		    vectype, 0, vect_body);

  /* Since we replace all stmts of a possibly longer scalar reduction
     chain account for the extra scalar stmts for that.  */
  if (!instance->remain_defs.is_empty ())
    record_stmt_cost (cost_vec, instance->remain_defs.length (), scalar_stmt,
		      instance->root_stmts[0], 0, vect_body);
  return true;
}
9183 :
9184 : /* Prune from ROOTS all stmts that are computed as part of lanes of NODE
9185 : and recurse to children. */
9186 :
9187 : static void
9188 183720 : vect_slp_prune_covered_roots (slp_tree node, hash_set<stmt_vec_info> &roots,
9189 : hash_set<slp_tree> &visited)
9190 : {
9191 183720 : if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
9192 183720 : || visited.add (node))
9193 81426 : return;
9194 :
9195 : stmt_vec_info stmt;
9196 : unsigned i;
9197 350674 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
9198 248380 : if (stmt)
9199 252600 : roots.remove (vect_orig_stmt (stmt));
9200 :
9201 : slp_tree child;
9202 226731 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9203 124437 : if (child)
9204 123109 : vect_slp_prune_covered_roots (child, roots, visited);
9205 : }
9206 :
9207 : /* Analyze statements in SLP instances of VINFO. Return true if the
9208 : operations are supported. */
9209 :
bool
vect_slp_analyze_operations (vec_info *vinfo)
{
  slp_instance instance;
  int i;

  DUMP_VECT_SCOPE ("vect_slp_analyze_operations");

  hash_set<slp_tree> visited;
  /* Analyze each instance in turn.  I is only advanced when the
     instance is kept; a removed instance shifts the next one into
     slot I.  */
  for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
    {
      auto_vec<slp_tree> visited_vec;
      stmt_vector_for_cost cost_vec;
      cost_vec.create (2);
      if (is_a <bb_vec_info> (vinfo))
	vect_location = instance->location ();
      if (!vect_slp_analyze_node_operations (vinfo,
					     SLP_INSTANCE_TREE (instance),
					     instance, visited, visited_vec,
					     &cost_vec)
	  /* CTOR instances require vectorized defs for the SLP tree root.  */
	  || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor
	      && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
		  != vect_internal_def
		  /* Make sure we vectorized with the expected type.  */
		  || !useless_type_conversion_p
			(TREE_TYPE (TREE_TYPE (gimple_assign_rhs1
					   (instance->root_stmts[0]->stmt))),
			 TREE_TYPE (SLP_TREE_VECTYPE
					 (SLP_INSTANCE_TREE (instance))))))
	  /* Check we can vectorize the reduction.  */
	  || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc
	      && !vectorizable_bb_reduc_epilogue (instance, &cost_vec))
	  /* Check we can vectorize the gcond.  */
	  || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_gcond
	      && !vectorizable_early_exit (as_a <loop_vec_info> (vinfo),
					   SLP_INSTANCE_ROOT_STMTS (instance)[0],
					   NULL,
					   SLP_INSTANCE_TREE (instance),
					   &cost_vec)))
	{
	  /* Analysis failed.  Pick a stmt to report the failure on:
	     prefer the instance root, then the first scalar stmt, then
	     the representative.  */
	  cost_vec.release ();
	  slp_tree node = SLP_INSTANCE_TREE (instance);
	  stmt_vec_info stmt_info;
	  if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
	    stmt_info = SLP_INSTANCE_ROOT_STMTS (instance)[0];
	  else if (!SLP_TREE_SCALAR_STMTS (node).is_empty ()
		   && SLP_TREE_SCALAR_STMTS (node)[0])
	    stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
	  else
	    stmt_info = SLP_TREE_REPRESENTATIVE (node);
	  /* For loop vectorization a single unsupported instance fails
	     the whole analysis.  */
	  if (is_a <loop_vec_info> (vinfo))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "unsupported SLP instance starting from: %G",
				 stmt_info->stmt);
	      return false;
	    }
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "removing SLP instance operations starting from: %G",
			     stmt_info->stmt);
	  /* For BB vectorization just drop this instance: roll back the
	     analyzed state of every node this instance's analysis
	     visited so other instances can re-analyze them.  */
	  while (!visited_vec.is_empty ())
	    {
	      slp_tree node = visited_vec.pop ();
	      SLP_TREE_TYPE (node) = undef_vec_info_type;
	      if (node->data)
		{
		  delete node->data;
		  node->data = nullptr;
		}
	      visited.remove (node);
	    }
	  vect_free_slp_instance (instance);
	  vinfo->slp_instances.ordered_remove (i);
	}
      else
	{
	  i++;
	  if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
	    {
	      add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
	      cost_vec.release ();
	    }
	  else
	    /* For BB vectorization remember the SLP graph entry
	       cost for later.  */
	    instance->cost_vec = cost_vec;
	}
    }

  /* Now look for SLP instances with a root that are covered by other
     instances and remove them.  */
  hash_set<stmt_vec_info> roots;
  for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
    if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
      roots.add (SLP_INSTANCE_ROOT_STMTS (instance)[0]);
  if (!roots.is_empty ())
    {
      visited.empty ();
      /* Remove from ROOTS every root that is computed as a lane of
	 some instance's tree ...  */
      for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
	vect_slp_prune_covered_roots (SLP_INSTANCE_TREE (instance), roots,
				      visited);
      /* ... and free the instances whose root got pruned.  */
      for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
	if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()
	    && !roots.contains (SLP_INSTANCE_ROOT_STMTS (instance)[0]))
	  {
	    stmt_vec_info root = SLP_INSTANCE_ROOT_STMTS (instance)[0];
	    if (dump_enabled_p ())
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "removing SLP instance operations starting "
			       "from: %G", root->stmt);
	    vect_free_slp_instance (instance);
	    vinfo->slp_instances.ordered_remove (i);
	  }
	else
	  ++i;
    }

  /* Compute vectorizable live stmts.  */
  if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
    vect_bb_slp_mark_live_stmts (bb_vinfo);

  return !vinfo->slp_instances.is_empty ();
}
9336 :
9337 : /* Get the SLP instance leader from INSTANCE_LEADER thereby transitively
9338 : closing the eventual chain. */
9339 :
9340 : static slp_instance
9341 742064 : get_ultimate_leader (slp_instance instance,
9342 : hash_map<slp_instance, slp_instance> &instance_leader)
9343 : {
9344 742064 : auto_vec<slp_instance *, 8> chain;
9345 742064 : slp_instance *tem;
9346 819669 : while (*(tem = instance_leader.get (instance)) != instance)
9347 : {
9348 77605 : chain.safe_push (tem);
9349 77605 : instance = *tem;
9350 : }
9351 819669 : while (!chain.is_empty ())
9352 77605 : *chain.pop () = instance;
9353 742064 : return instance;
9354 742064 : }
9355 :
9356 : namespace {
9357 : /* Subroutine of vect_bb_partition_graph_r. Map KEY to INSTANCE in
9358 : KEY_TO_INSTANCE, making INSTANCE the leader of any previous mapping
9359 : for KEY. Return true if KEY was already in KEY_TO_INSTANCE.
9360 :
9361 : INSTANCE_LEADER is as for get_ultimate_leader. */
9362 :
9363 : template<typename T>
9364 : bool
9365 3285934 : vect_map_to_instance (slp_instance instance, T key,
9366 : hash_map<T, slp_instance> &key_to_instance,
9367 : hash_map<slp_instance, slp_instance> &instance_leader)
9368 : {
9369 : bool existed_p;
9370 3285934 : slp_instance &key_instance = key_to_instance.get_or_insert (key, &existed_p);
9371 3285934 : if (!existed_p)
9372 : ;
9373 174715 : else if (key_instance != instance)
9374 : {
9375 : /* If we're running into a previously marked key make us the
9376 : leader of the current ultimate leader. This keeps the
9377 : leader chain acyclic and works even when the current instance
9378 : connects two previously independent graph parts. */
9379 65574 : slp_instance key_leader
9380 65574 : = get_ultimate_leader (key_instance, instance_leader);
9381 65574 : if (key_leader != instance)
9382 19507 : instance_leader.put (key_leader, instance);
9383 : }
9384 3285934 : key_instance = instance;
9385 3285934 : return existed_p;
9386 : }
9387 : }
9388 :
9389 : /* Worker of vect_bb_partition_graph, recurse on NODE. */
9390 :
static void
vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
			   slp_instance instance, slp_tree node,
			   hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
			   hash_map<slp_tree, slp_instance> &node_to_instance,
			   hash_map<slp_instance, slp_instance> &instance_leader)
{
  stmt_vec_info stmt_info;
  unsigned i;

  /* Claim all scalar stmts of NODE for INSTANCE; hitting a stmt already
     claimed by another instance merges the two subgraphs through the
     leader map.  */
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    if (stmt_info)
      vect_map_to_instance (instance, stmt_info, stmt_to_instance,
			    instance_leader);

  /* If NODE itself was mapped before, its children were already
     processed — stop here.  */
  if (vect_map_to_instance (instance, node, node_to_instance,
			    instance_leader))
    return;

  slp_tree child;
  /* Only internal children carry scalar stmts of the region.  */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
      vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
				 node_to_instance, instance_leader);
}
9416 :
9417 : /* Partition the SLP graph into pieces that can be costed independently. */
9418 :
static void
vect_bb_partition_graph (bb_vec_info bb_vinfo)
{
  DUMP_VECT_SCOPE ("vect_bb_partition_graph");

  /* First walk the SLP graph assigning each involved scalar stmt a
     corresponding SLP graph entry and upon visiting a previously
     marked stmt, make the stmts leader the current SLP graph entry.  */
  hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
  hash_map<slp_tree, slp_instance> node_to_instance;
  hash_map<slp_instance, slp_instance> instance_leader;
  slp_instance instance;
  for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
    {
      /* Every instance starts out as its own leader.  */
      instance_leader.put (instance, instance);
      vect_bb_partition_graph_r (bb_vinfo,
				 instance, SLP_INSTANCE_TREE (instance),
				 stmt_to_instance, node_to_instance,
				 instance_leader);
    }

  /* Then collect entries to each independent subgraph.  */
  for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
    {
      slp_instance leader = get_ultimate_leader (instance, instance_leader);
      /* Record INSTANCE as part of its leader's subgraph so each
	 subgraph can be costed independently later.  */
      leader->subgraph_entries.safe_push (instance);
      if (dump_enabled_p ()
	  && leader != instance)
	dump_printf_loc (MSG_NOTE, vect_location,
			 "instance %p is leader of %p\n",
			 (void *) leader, (void *) instance);
    }
}
9452 :
9453 : /* Compute the set of scalar stmts participating in internal and external
9454 : nodes. */
9455 :
9456 : static void
9457 1540098 : vect_slp_gather_vectorized_scalar_stmts (vec_info *vinfo, slp_tree node,
9458 : hash_set<slp_tree> &visited,
9459 : hash_set<stmt_vec_info> &vstmts,
9460 : hash_set<stmt_vec_info> &estmts)
9461 : {
9462 1540098 : int i;
9463 1540098 : stmt_vec_info stmt_info;
9464 1540098 : slp_tree child;
9465 :
9466 1540098 : if (visited.add (node))
9467 41573 : return;
9468 :
9469 1498525 : if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
9470 : {
9471 3081903 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
9472 2223238 : if (stmt_info)
9473 2223238 : vstmts.add (stmt_info);
9474 :
9475 3119980 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9476 867080 : if (child)
9477 867080 : vect_slp_gather_vectorized_scalar_stmts (vinfo, child, visited,
9478 : vstmts, estmts);
9479 : }
9480 : else
9481 3588950 : for (tree def : SLP_TREE_SCALAR_OPS (node))
9482 : {
9483 1670418 : stmt_vec_info def_stmt = vinfo->lookup_def (def);
9484 1670418 : if (def_stmt)
9485 332354 : estmts.add (def_stmt);
9486 : }
9487 : }
9488 :
9489 :
9490 : /* Compute the scalar cost of the SLP node NODE and its children
9491 : and return it. Do not account defs that are marked in LIFE and
9492 : update LIFE according to uses of NODE. */
9493 :
static void
vect_bb_slp_scalar_cost (vec_info *vinfo,
			 slp_tree node, vec<bool, va_heap> *life,
			 stmt_vector_for_cost *cost_vec,
			 hash_set<stmt_vec_info> &vectorized_scalar_stmts,
			 hash_set<stmt_vec_info> &scalar_stmts_in_externs,
			 hash_set<slp_tree> &visited)
{
  unsigned i;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    {
      ssa_op_iter op_iter;
      def_operand_p def_p;

      if (!stmt_info
	  || (*life)[i]
	  /* Defs also used in external nodes are not in the
	     vectorized_scalar_stmts set as they need to be preserved.
	     Honor that.  */
	  || scalar_stmts_in_externs.contains (stmt_info))
	continue;

      stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
      gimple *orig_stmt = orig_stmt_info->stmt;

      /* If there is a non-vectorized use of the defs then the scalar
	 stmt is kept live in which case we do not account it or any
	 required defs in the SLP children in the scalar cost.  This
	 way we make the vectorization more costly when compared to
	 the scalar cost.  */
      if (!STMT_VINFO_LIVE_P (stmt_info))
	{
	  /* Walk all immediate uses of the defs of ORIG_STMT (and,
	     through the worklist, of pattern stmts using them) looking
	     for a use outside the vectorized stmts.  */
	  auto_vec<gimple *, 8> worklist;
	  hash_set<gimple *> *worklist_visited = NULL;
	  worklist.quick_push (orig_stmt);
	  do
	    {
	      gimple *work_stmt = worklist.pop ();
	      FOR_EACH_PHI_OR_STMT_DEF (def_p, work_stmt, op_iter, SSA_OP_DEF)
		{
		  imm_use_iterator use_iter;
		  gimple *use_stmt;
		  FOR_EACH_IMM_USE_STMT (use_stmt, use_iter,
					 DEF_FROM_PTR (def_p))
		    if (!is_gimple_debug (use_stmt))
		      {
			stmt_vec_info use_stmt_info
			  = vinfo->lookup_stmt (use_stmt);
			if (!use_stmt_info
			    || !vectorized_scalar_stmts.contains (use_stmt_info))
			  {
			    if (use_stmt_info
				&& STMT_VINFO_IN_PATTERN_P (use_stmt_info))
			      {
				/* For stmts participating in patterns we have
				   to check its uses recursively.  */
				if (!worklist_visited)
				  worklist_visited = new hash_set<gimple *> ();
				if (!worklist_visited->add (use_stmt))
				  worklist.safe_push (use_stmt);
				continue;
			      }
			    /* Found a non-vectorized use: lane I stays
			       live; stop scanning this lane.  */
			    (*life)[i] = true;
			    goto next_lane;
			  }
		      }
		}
	    }
	  while (!worklist.is_empty ());
next_lane:
	  if (worklist_visited)
	    delete worklist_visited;
	  if ((*life)[i])
	    continue;
	}

      /* Count scalar stmts only once.  */
      if (gimple_visited_p (orig_stmt))
	continue;
      gimple_set_visited (orig_stmt, true);

      vect_cost_for_stmt kind;
      if (STMT_VINFO_DATA_REF (orig_stmt_info))
	{
	  data_reference_p dr = STMT_VINFO_DATA_REF (orig_stmt_info);
	  tree base = get_base_address (DR_REF (dr));
	  /* When the scalar access is to a non-global not address-taken
	     decl that is not BLKmode assume we can access it with a single
	     non-load/store instruction.  */
	  if (DECL_P (base)
	      && !is_global_var (base)
	      && !TREE_ADDRESSABLE (base)
	      && DECL_MODE (base) != BLKmode)
	    kind = scalar_stmt;
	  else if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
	    kind = scalar_load;
	  else
	    kind = scalar_store;
	}
      else if (vect_nop_conversion_p (orig_stmt_info))
	continue;
      /* For single-argument PHIs assume coalescing which means zero cost
	 for the scalar and the vector PHIs.  This avoids artificially
	 favoring the vector path (but may pessimize it in some cases).  */
      else if (is_a <gphi *> (orig_stmt_info->stmt)
	       && gimple_phi_num_args
		    (as_a <gphi *> (orig_stmt_info->stmt)) == 1)
	continue;
      else
	kind = scalar_stmt;
      record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
			SLP_TREE_VECTYPE (node), 0, vect_body);
    }

  auto_vec<bool, 20> subtree_life;
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    {
      if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
	{
	  /* Do not directly pass LIFE to the recursive call, copy it to
	     confine changes in the callee to the current child/subtree.  */
	  if (SLP_TREE_PERMUTE_P (node))
	    {
	      /* For a permute node map the life state through the lane
		 permutation onto the child's lanes.  */
	      subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
	      for (unsigned j = 0;
		   j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
		{
		  auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
		  if (perm.first == i)
		    subtree_life[perm.second] = (*life)[j];
		}
	    }
	  else
	    {
	      gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
	      subtree_life.safe_splice (*life);
	    }
	  vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
				   vectorized_scalar_stmts,
				   scalar_stmts_in_externs, visited);
	  subtree_life.truncate (0);
	}
    }
}
9644 :
9645 : /* Comparator for the loop-index sorted cost vectors. */
9646 :
9647 : static int
9648 17645886 : li_cost_vec_cmp (const void *a_, const void *b_)
9649 : {
9650 17645886 : auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
9651 17645886 : auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
9652 17645886 : if (a->first < b->first)
9653 : return -1;
9654 16884865 : else if (a->first == b->first)
9655 16212843 : return 0;
9656 : return 1;
9657 : }
9658 :
9659 : /* Check if vectorization of the basic block is profitable for the
9660 : subgraph denoted by SLP_INSTANCES. */
9661 :
static bool
vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
				    vec<slp_instance> slp_instances,
				    loop_p orig_loop)
{
  slp_instance instance;
  int i;
  unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
  unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "Costing subgraph: \n");
      hash_set<slp_tree> visited;
      FOR_EACH_VEC_ELT (slp_instances, i, instance)
	vect_print_slp_graph (MSG_NOTE, vect_location,
			      SLP_INSTANCE_TREE (instance), visited);
    }

  /* Compute the set of scalar stmts we know will go away 'locally' when
     vectorizing.  This used to be tracked with just PURE_SLP_STMT but that's
     not accurate for nodes promoted extern late or for scalar stmts that
     are used both in extern defs and in vectorized defs.  */
  hash_set<stmt_vec_info> vectorized_scalar_stmts;
  hash_set<stmt_vec_info> scalar_stmts_in_externs;
  hash_set<slp_tree> visited;
  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      vect_slp_gather_vectorized_scalar_stmts (bb_vinfo,
					       SLP_INSTANCE_TREE (instance),
					       visited,
					       vectorized_scalar_stmts,
					       scalar_stmts_in_externs);
      /* Instance roots are replaced by vector code as well.  */
      for (stmt_vec_info rstmt : SLP_INSTANCE_ROOT_STMTS (instance))
	vectorized_scalar_stmts.add (rstmt);
    }
  /* Scalar stmts used as defs in external nodes need to be preserved, so
     remove them from vectorized_scalar_stmts.  */
  for (stmt_vec_info stmt : scalar_stmts_in_externs)
    vectorized_scalar_stmts.remove (stmt);

  /* Calculate scalar cost and sum the cost for the vector stmts
     previously collected.  */
  stmt_vector_for_cost scalar_costs = vNULL;
  stmt_vector_for_cost vector_costs = vNULL;
  visited.empty ();
  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      auto_vec<bool, 20> life;
      life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
			      true);
      /* Each root stmt counts as one scalar stmt.  */
      if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
	record_stmt_cost (&scalar_costs,
			  SLP_INSTANCE_ROOT_STMTS (instance).length (),
			  scalar_stmt,
			  SLP_INSTANCE_ROOT_STMTS (instance)[0], 0, vect_body);
      vect_bb_slp_scalar_cost (bb_vinfo,
			       SLP_INSTANCE_TREE (instance),
			       &life, &scalar_costs, vectorized_scalar_stmts,
			       scalar_stmts_in_externs, visited);
      vector_costs.safe_splice (instance->cost_vec);
      instance->cost_vec.release ();
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");

  /* When costing non-loop vectorization we need to consider each covered
     loop independently and make sure vectorization is profitable.  For
     now we assume a loop may be not entered or executed an arbitrary
     number of iterations (???  static information can provide more
     precise info here) which means we can simply cost each containing
     loops stmts separately.  */

  /* First produce cost vectors sorted by loop index.  */
  auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
    li_scalar_costs (scalar_costs.length ());
  auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
    li_vector_costs (vector_costs.length ());
  stmt_info_for_cost *cost;
  FOR_EACH_VEC_ELT (scalar_costs, i, cost)
    {
      unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
      li_scalar_costs.quick_push (std::make_pair (l, cost));
    }
  /* Use a random used loop as fallback in case the first vector_costs
     entry does not have a stmt_info associated with it.  */
  unsigned l = li_scalar_costs[0].first;
  FOR_EACH_VEC_ELT (vector_costs, i, cost)
    {
      /* We inherit from the previous COST, invariants, externals and
	 extracts immediately follow the cost for the related stmt.  */
      if (cost->stmt_info)
	l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
      li_vector_costs.quick_push (std::make_pair (l, cost));
    }
  li_scalar_costs.qsort (li_cost_vec_cmp);
  li_vector_costs.qsort (li_cost_vec_cmp);

  /* Now cost the portions individually.  SI/VI walk the two sorted
     vectors in lock-step per loop index.  */
  unsigned vi = 0;
  unsigned si = 0;
  bool profitable = true;
  while (si < li_scalar_costs.length ()
	 && vi < li_vector_costs.length ())
    {
      unsigned sl = li_scalar_costs[si].first;
      unsigned vl = li_vector_costs[vi].first;
      if (sl != vl)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Scalar %d and vector %d loop part do not "
			     "match up, skipping scalar part\n", sl, vl);
	  /* Skip the scalar part, assuming zero cost on the vector side.  */
	  do
	    {
	      si++;
	    }
	  while (si < li_scalar_costs.length ()
		 && li_scalar_costs[si].first == sl);
	  continue;
	}

      /* Sum the scalar cost of this loop part via the target cost
	 model, applying --param vect-scalar-cost-multiplier.  */
      class vector_costs *scalar_target_cost_data = init_cost (bb_vinfo, true);
      do
	{
	  add_stmt_cost (scalar_target_cost_data, li_scalar_costs[si].second);
	  si++;
	}
      while (si < li_scalar_costs.length ()
	     && li_scalar_costs[si].first == sl);
      scalar_target_cost_data->finish_cost (nullptr);
      scalar_cost = (scalar_target_cost_data->body_cost ()
		     * param_vect_scalar_cost_multiplier) / 100;

      /* Complete the target-specific vector cost calculation.  */
      class vector_costs *vect_target_cost_data = init_cost (bb_vinfo, false);
      do
	{
	  add_stmt_cost (vect_target_cost_data, li_vector_costs[vi].second);
	  vi++;
	}
      while (vi < li_vector_costs.length ()
	     && li_vector_costs[vi].first == vl);
      vect_target_cost_data->finish_cost (scalar_target_cost_data);
      vec_prologue_cost = vect_target_cost_data->prologue_cost ();
      vec_inside_cost = vect_target_cost_data->body_cost ();
      vec_epilogue_cost = vect_target_cost_data->epilogue_cost ();
      delete scalar_target_cost_data;
      delete vect_target_cost_data;

      vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;

      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "Cost model analysis for part in loop %d:\n", sl);
	  dump_printf (MSG_NOTE, "  Vector cost: %d\n",
		       vec_inside_cost + vec_outside_cost);
	  dump_printf (MSG_NOTE, "  Scalar cost: %d\n", scalar_cost);
	}

      /* Vectorization is profitable if its cost is more than the cost of scalar
	 version.  Note that we err on the vector side for equal cost because
	 the cost estimate is otherwise quite pessimistic (constant uses are
	 free on the scalar side but cost a load on the vector side for
	 example).  */
      if (vec_outside_cost + vec_inside_cost > scalar_cost)
	{
	  profitable = false;
	  break;
	}
    }
  /* Vector cost entries without a matching scalar part count against
     profitability as well.  */
  if (profitable && vi < li_vector_costs.length ())
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Excess vector cost for part in loop %d:\n",
			 li_vector_costs[vi].first);
      profitable = false;
    }

  /* Unset visited flag.  This is delayed when the subgraph is profitable
     and we process the loop for remaining unvectorized if-converted code.  */
  if (!orig_loop || !profitable)
    FOR_EACH_VEC_ELT (scalar_costs, i, cost)
      gimple_set_visited  (cost->stmt_info->stmt, false);

  scalar_costs.release ();
  vector_costs.release ();

  return profitable;
}
9856 :
9857 : /* qsort comparator for lane defs. */
9858 :
9859 : static int
9860 40 : vld_cmp (const void *a_, const void *b_)
9861 : {
9862 40 : auto *a = (const std::pair<unsigned, tree> *)a_;
9863 40 : auto *b = (const std::pair<unsigned, tree> *)b_;
9864 40 : return a->first - b->first;
9865 : }
9866 :
9867 : /* Return true if USE_STMT is a vector lane insert into VEC and set
9868 : *THIS_LANE to the lane number that is set. */
9869 :
9870 : static bool
9871 248 : vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
9872 : {
9873 248 : gassign *use_ass = dyn_cast <gassign *> (use_stmt);
9874 91 : if (!use_ass
9875 91 : || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
9876 22 : || (vec
9877 22 : ? gimple_assign_rhs1 (use_ass) != vec
9878 24 : : ((vec = gimple_assign_rhs1 (use_ass)), false))
9879 46 : || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
9880 46 : TREE_TYPE (gimple_assign_rhs2 (use_ass)))
9881 46 : || !constant_multiple_p
9882 46 : (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
9883 92 : tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
9884 : this_lane))
9885 202 : return false;
9886 : return true;
9887 : }
9888 :
9889 : /* Find any vectorizable constructors and add them to the grouped_store
9890 : array. */
9891 :
9892 : static void
9893 2183824 : vect_slp_check_for_roots (bb_vec_info bb_vinfo)
9894 : {
9895 17723259 : for (unsigned i = 0; i < bb_vinfo->nbbs; ++i)
9896 31078870 : for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
9897 134980207 : !gsi_end_p (gsi); gsi_next (&gsi))
9898 : {
9899 119440772 : gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
9900 : /* This can be used to start SLP discovery for early breaks for BB early breaks
9901 : when we get that far. */
9902 119440772 : if (!assign)
9903 178835507 : continue;
9904 :
9905 30937072 : tree rhs = gimple_assign_rhs1 (assign);
9906 30937072 : enum tree_code code = gimple_assign_rhs_code (assign);
9907 30937072 : use_operand_p use_p;
9908 30937072 : gimple *use_stmt;
9909 30937072 : if (code == CONSTRUCTOR)
9910 : {
9911 1563735 : if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
9912 63705 : || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
9913 92969 : CONSTRUCTOR_NELTS (rhs))
9914 42794 : || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
9915 1606525 : || uniform_vector_p (rhs))
9916 1550838 : continue;
9917 :
9918 : unsigned j;
9919 : tree val;
9920 64357 : FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
9921 51460 : if (TREE_CODE (val) != SSA_NAME
9922 51460 : || !bb_vinfo->lookup_def (val))
9923 : break;
9924 31628 : if (j != CONSTRUCTOR_NELTS (rhs))
9925 2917 : continue;
9926 :
9927 12897 : vec<stmt_vec_info> roots = vNULL;
9928 12897 : roots.safe_push (bb_vinfo->lookup_stmt (assign));
9929 12897 : vec<stmt_vec_info> stmts;
9930 12897 : stmts.create (CONSTRUCTOR_NELTS (rhs));
9931 72760 : FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
9932 46966 : stmts.quick_push
9933 46966 : (vect_stmt_to_vectorize (bb_vinfo->lookup_def (val)));
9934 12897 : bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
9935 12897 : stmts, roots));
9936 : }
9937 29373337 : else if (code == BIT_INSERT_EXPR
9938 926 : && VECTOR_TYPE_P (TREE_TYPE (rhs))
9939 608 : && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
9940 608 : && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
9941 605 : && integer_zerop (gimple_assign_rhs3 (assign))
9942 341 : && useless_type_conversion_p
9943 341 : (TREE_TYPE (TREE_TYPE (rhs)),
9944 341 : TREE_TYPE (gimple_assign_rhs2 (assign)))
9945 29373959 : && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
9946 : {
9947 : /* We start to match on insert to lane zero but since the
9948 : inserts need not be ordered we'd have to search both
9949 : the def and the use chains. */
9950 215 : tree vectype = TREE_TYPE (rhs);
9951 215 : unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
9952 215 : auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
9953 215 : auto_sbitmap lanes (nlanes);
9954 215 : bitmap_clear (lanes);
9955 215 : bitmap_set_bit (lanes, 0);
9956 215 : tree def = gimple_assign_lhs (assign);
9957 215 : lane_defs.quick_push
9958 215 : (std::make_pair (0, gimple_assign_rhs2 (assign)));
9959 215 : unsigned lanes_found = 1;
9960 : /* Start with the use chains, the last stmt will be the root. */
9961 215 : stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
9962 215 : vec<stmt_vec_info> roots = vNULL;
9963 215 : roots.safe_push (last);
9964 217 : do
9965 : {
9966 217 : use_operand_p use_p;
9967 217 : gimple *use_stmt;
9968 217 : if (!single_imm_use (def, &use_p, &use_stmt))
9969 : break;
9970 211 : unsigned this_lane;
9971 211 : if (!bb_vinfo->lookup_stmt (use_stmt)
9972 211 : || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
9973 233 : || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
9974 : break;
9975 22 : if (bitmap_bit_p (lanes, this_lane))
9976 : break;
9977 2 : lanes_found++;
9978 2 : bitmap_set_bit (lanes, this_lane);
9979 2 : gassign *use_ass = as_a <gassign *> (use_stmt);
9980 2 : lane_defs.quick_push (std::make_pair
9981 2 : (this_lane, gimple_assign_rhs2 (use_ass)));
9982 2 : last = bb_vinfo->lookup_stmt (use_ass);
9983 2 : roots.safe_push (last);
9984 2 : def = gimple_assign_lhs (use_ass);
9985 : }
9986 2 : while (lanes_found < nlanes);
9987 215 : if (roots.length () > 1)
9988 2 : std::swap(roots[0], roots[roots.length () - 1]);
9989 215 : if (lanes_found < nlanes)
9990 : {
9991 : /* Now search the def chain. */
9992 215 : def = gimple_assign_rhs1 (assign);
9993 217 : do
9994 : {
9995 217 : if (TREE_CODE (def) != SSA_NAME
9996 217 : || !has_single_use (def))
9997 : break;
9998 56 : gimple *def_stmt = SSA_NAME_DEF_STMT (def);
9999 56 : unsigned this_lane;
10000 56 : if (!bb_vinfo->lookup_stmt (def_stmt)
10001 37 : || !vect_slp_is_lane_insert (def_stmt,
10002 : NULL_TREE, &this_lane)
10003 80 : || !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
10004 : break;
10005 24 : if (bitmap_bit_p (lanes, this_lane))
10006 : break;
10007 4 : lanes_found++;
10008 4 : bitmap_set_bit (lanes, this_lane);
10009 8 : lane_defs.quick_push (std::make_pair
10010 4 : (this_lane,
10011 4 : gimple_assign_rhs2 (def_stmt)));
10012 4 : roots.safe_push (bb_vinfo->lookup_stmt (def_stmt));
10013 4 : def = gimple_assign_rhs1 (def_stmt);
10014 : }
10015 4 : while (lanes_found < nlanes);
10016 : }
10017 215 : if (lanes_found == nlanes)
10018 : {
10019 : /* Sort lane_defs after the lane index and register the root. */
10020 2 : lane_defs.qsort (vld_cmp);
10021 2 : vec<stmt_vec_info> stmts;
10022 2 : stmts.create (nlanes);
10023 10 : for (unsigned i = 0; i < nlanes; ++i)
10024 8 : stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
10025 2 : bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
10026 2 : stmts, roots));
10027 : }
10028 : else
10029 213 : roots.release ();
10030 215 : }
10031 29373122 : else if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
10032 28387558 : && (associative_tree_code (code) || code == MINUS_EXPR)
10033 : /* ??? This pessimizes a two-element reduction. PR54400.
10034 : ??? In-order reduction could be handled if we only
10035 : traverse one operand chain in vect_slp_linearize_chain. */
10036 33320528 : && !needs_fold_left_reduction_p (TREE_TYPE (rhs), code)
10037 : /* Ops with constants at the tail can be stripped here. */
10038 5811026 : && TREE_CODE (rhs) == SSA_NAME
10039 5748632 : && TREE_CODE (gimple_assign_rhs2 (assign)) == SSA_NAME
10040 : /* Should be the chain end. */
10041 31643800 : && (!single_imm_use (gimple_assign_lhs (assign),
10042 : &use_p, &use_stmt)
10043 1752004 : || !is_gimple_assign (use_stmt)
10044 1191498 : || (gimple_assign_rhs_code (use_stmt) != code
10045 883057 : && ((code != PLUS_EXPR && code != MINUS_EXPR)
10046 500242 : || (gimple_assign_rhs_code (use_stmt)
10047 500242 : != (code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR))))))
10048 : {
10049 : /* We start the match at the end of a possible association
10050 : chain. */
10051 1863620 : auto_vec<chain_op_t> chain;
10052 1863620 : auto_vec<std::pair<tree_code, gimple *> > worklist;
10053 1863620 : auto_vec<gimple *> chain_stmts;
10054 1863620 : gimple *code_stmt = NULL, *alt_code_stmt = NULL;
10055 1863620 : if (code == MINUS_EXPR)
10056 304307 : code = PLUS_EXPR;
10057 1863620 : internal_fn reduc_fn;
10058 2140889 : if (!reduction_fn_for_scalar_code (code, &reduc_fn)
10059 1863620 : || reduc_fn == IFN_LAST)
10060 277269 : continue;
10061 1586351 : vect_slp_linearize_chain (bb_vinfo, worklist, chain, code, assign,
10062 : /* ??? */
10063 : code_stmt, alt_code_stmt, &chain_stmts);
10064 3172702 : if (chain.length () > 1)
10065 : {
10066 : /* Sort the chain according to def_type and operation. */
10067 1586351 : chain.sort (dt_sort_cmp, bb_vinfo);
10068 : /* ??? Now we'd want to strip externals and constants
10069 : but record those to be handled in the epilogue. */
10070 : /* ??? For now do not allow mixing ops or externs/constants. */
10071 1586351 : bool invalid = false;
10072 1586351 : unsigned remain_cnt = 0;
10073 1586351 : unsigned last_idx = 0;
10074 4786594 : for (unsigned i = 0; i < chain.length (); ++i)
10075 : {
10076 3527511 : if (chain[i].code != code)
10077 : {
10078 : invalid = true;
10079 : break;
10080 : }
10081 3200243 : if (chain[i].dt != vect_internal_def
10082 : /* Avoid stmts where the def is not the LHS, like
10083 : ASMs. */
10084 6172153 : || (gimple_get_lhs (bb_vinfo->lookup_def
10085 2971910 : (chain[i].op)->stmt)
10086 2971910 : != chain[i].op))
10087 231277 : remain_cnt++;
10088 : else
10089 : last_idx = i;
10090 : }
10091 : /* Make sure to have an even number of lanes as we later do
10092 : all-or-nothing discovery, not trying to split further. */
10093 1586351 : if ((chain.length () - remain_cnt) & 1)
10094 184779 : remain_cnt++;
10095 1586351 : if (!invalid && chain.length () - remain_cnt > 1)
10096 : {
10097 1191565 : vec<stmt_vec_info> stmts;
10098 1191565 : vec<tree> remain = vNULL;
10099 1191565 : stmts.create (chain.length ());
10100 1191565 : if (remain_cnt > 0)
10101 110112 : remain.create (remain_cnt);
10102 3828602 : for (unsigned i = 0; i < chain.length (); ++i)
10103 : {
10104 2637037 : stmt_vec_info stmt_info;
10105 2637037 : if (chain[i].dt == vect_internal_def
10106 2600203 : && ((stmt_info = bb_vinfo->lookup_def (chain[i].op)),
10107 2600203 : gimple_get_lhs (stmt_info->stmt) == chain[i].op)
10108 5237156 : && (i != last_idx
10109 1191565 : || (stmts.length () & 1)))
10110 2515588 : stmts.quick_push (stmt_info);
10111 : else
10112 121449 : remain.quick_push (chain[i].op);
10113 : }
10114 1191565 : vec<stmt_vec_info> roots;
10115 1191565 : roots.create (chain_stmts.length ());
10116 2637037 : for (unsigned i = 0; i < chain_stmts.length (); ++i)
10117 1445472 : roots.quick_push (bb_vinfo->lookup_stmt (chain_stmts[i]));
10118 1191565 : bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_bb_reduc,
10119 1191565 : stmts, roots, remain));
10120 : }
10121 : }
10122 1863620 : }
10123 : }
10124 2183824 : }
10125 :
10126 : /* Walk the grouped store chains and replace entries with their
10127 : pattern variant if any. */
10128 :
10129 : static void
10130 609089 : vect_fixup_store_groups_with_patterns (vec_info *vinfo)
10131 : {
10132 609089 : stmt_vec_info first_element;
10133 609089 : unsigned i;
10134 :
10135 1493147 : FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
10136 : {
10137 : /* We also have CTORs in this array. */
10138 884058 : if (!STMT_VINFO_GROUPED_ACCESS (first_element))
10139 0 : continue;
10140 884058 : if (STMT_VINFO_IN_PATTERN_P (first_element))
10141 : {
10142 254 : stmt_vec_info orig = first_element;
10143 254 : first_element = STMT_VINFO_RELATED_STMT (first_element);
10144 254 : DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
10145 254 : DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
10146 254 : DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
10147 254 : DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
10148 254 : vinfo->grouped_stores[i] = first_element;
10149 : }
10150 884058 : stmt_vec_info prev = first_element;
10151 2482831 : while (DR_GROUP_NEXT_ELEMENT (prev))
10152 : {
10153 1598773 : stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
10154 1598773 : if (STMT_VINFO_IN_PATTERN_P (elt))
10155 : {
10156 893 : stmt_vec_info orig = elt;
10157 893 : elt = STMT_VINFO_RELATED_STMT (elt);
10158 893 : DR_GROUP_NEXT_ELEMENT (prev) = elt;
10159 893 : DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
10160 893 : DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
10161 : }
10162 1598773 : DR_GROUP_FIRST_ELEMENT (elt) = first_element;
10163 1598773 : prev = elt;
10164 : }
10165 : }
10166 609089 : }
10167 :
10168 : /* Check if the region described by BB_VINFO can be vectorized, returning
10169 : true if so. When returning false, set FATAL to true if the same failure
10170 : would prevent vectorization at other vector sizes, false if it is still
10171 : worth trying other sizes. N_STMTS is the number of statements in the
10172 : region. */
10173 :
10174 : static bool
10175 2183824 : vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
10176 : vec<int> *dataref_groups)
10177 : {
10178 2183824 : DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
10179 :
10180 2183824 : slp_instance instance;
10181 2183824 : int i;
10182 :
10183 : /* The first group of checks is independent of the vector size. */
10184 2183824 : fatal = true;
10185 :
10186 : /* Analyze the data references. */
10187 :
10188 2183824 : if (!vect_analyze_data_refs (bb_vinfo, NULL))
10189 : {
10190 0 : if (dump_enabled_p ())
10191 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10192 : "not vectorized: unhandled data-ref in basic "
10193 : "block.\n");
10194 0 : return false;
10195 : }
10196 :
10197 2183824 : if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
10198 : {
10199 0 : if (dump_enabled_p ())
10200 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10201 : "not vectorized: unhandled data access in "
10202 : "basic block.\n");
10203 0 : return false;
10204 : }
10205 :
10206 2183824 : vect_slp_check_for_roots (bb_vinfo);
10207 :
10208 : /* If there are no grouped stores and no constructors in the region
10209 : there is no need to continue with pattern recog as vect_analyze_slp
10210 : will fail anyway. */
10211 2183824 : if (bb_vinfo->grouped_stores.is_empty ()
10212 1843395 : && bb_vinfo->roots.is_empty ())
10213 : {
10214 1574735 : if (dump_enabled_p ())
10215 1022 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10216 : "not vectorized: no grouped stores in "
10217 : "basic block.\n");
10218 1574735 : return false;
10219 : }
10220 :
10221 : /* While the rest of the analysis below depends on it in some way. */
10222 609089 : fatal = false;
10223 :
10224 609089 : vect_pattern_recog (bb_vinfo);
10225 :
10226 : /* Update store groups from pattern processing. */
10227 609089 : vect_fixup_store_groups_with_patterns (bb_vinfo);
10228 :
10229 : /* Check the SLP opportunities in the basic block, analyze and build SLP
10230 : trees. */
10231 609089 : if (!vect_analyze_slp (bb_vinfo, n_stmts, false))
10232 : {
10233 0 : if (dump_enabled_p ())
10234 : {
10235 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10236 : "Failed to SLP the basic block.\n");
10237 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10238 : "not vectorized: failed to find SLP opportunities "
10239 : "in basic block.\n");
10240 : }
10241 0 : return false;
10242 : }
10243 :
10244 : /* Optimize permutations. */
10245 609089 : vect_optimize_slp (bb_vinfo);
10246 :
10247 : /* Gather the loads reachable from the SLP graph entries. */
10248 609089 : vect_gather_slp_loads (bb_vinfo);
10249 :
10250 609089 : vect_record_base_alignments (bb_vinfo);
10251 :
10252 : /* Analyze and verify the alignment of data references and the
10253 : dependence in the SLP instances. */
10254 1393086 : for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
10255 : {
10256 783997 : vect_location = instance->location ();
10257 783997 : if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
10258 783997 : || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
10259 : {
10260 8319 : slp_tree node = SLP_INSTANCE_TREE (instance);
10261 8319 : stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
10262 8319 : if (dump_enabled_p ())
10263 4 : dump_printf_loc (MSG_NOTE, vect_location,
10264 : "removing SLP instance operations starting from: %G",
10265 : stmt_info->stmt);
10266 8319 : vect_free_slp_instance (instance);
10267 8319 : BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
10268 8319 : continue;
10269 8319 : }
10270 :
10271 : /* Mark all the statements that we want to vectorize as pure SLP and
10272 : relevant. */
10273 775678 : vect_mark_slp_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance));
10274 775678 : vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
10275 775678 : unsigned j;
10276 775678 : stmt_vec_info root;
10277 : /* Likewise consider instance root stmts as vectorized. */
10278 1712751 : FOR_EACH_VEC_ELT (SLP_INSTANCE_ROOT_STMTS (instance), j, root)
10279 161395 : STMT_SLP_TYPE (root) = pure_slp;
10280 :
10281 775678 : i++;
10282 : }
10283 2213485 : if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
10284 : return false;
10285 :
10286 263794 : if (!vect_slp_analyze_operations (bb_vinfo))
10287 : {
10288 29661 : if (dump_enabled_p ())
10289 81 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10290 : "not vectorized: bad operation in basic block.\n");
10291 29661 : return false;
10292 : }
10293 :
10294 234133 : vect_bb_partition_graph (bb_vinfo);
10295 :
10296 234133 : return true;
10297 : }
10298 :
10299 : /* Subroutine of vect_slp_bb. Try to vectorize the statements for all
10300 : basic blocks in BBS, returning true on success.
10301 : The region has N_STMTS statements and has the datarefs given by DATAREFS. */
10302 :
10303 : static bool
10304 1864745 : vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
10305 : vec<int> *dataref_groups, unsigned int n_stmts,
10306 : loop_p orig_loop)
10307 : {
10308 1864745 : bb_vec_info bb_vinfo;
10309 1864745 : auto_vector_modes vector_modes;
10310 :
10311 : /* Autodetect first vector size we try. */
10312 1864745 : machine_mode next_vector_mode = VOIDmode;
10313 1864745 : targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
10314 1864745 : unsigned int mode_i = 0;
10315 :
10316 1864745 : vec_info_shared shared;
10317 :
10318 1864745 : machine_mode autodetected_vector_mode = VOIDmode;
10319 2502903 : while (1)
10320 : {
10321 2183824 : bool vectorized = false;
10322 2183824 : bool fatal = false;
10323 2183824 : bb_vinfo = new _bb_vec_info (bbs, &shared);
10324 :
10325 2183824 : bool first_time_p = shared.datarefs.is_empty ();
10326 2183824 : BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
10327 2183824 : if (first_time_p)
10328 1887021 : bb_vinfo->shared->save_datarefs ();
10329 : else
10330 296803 : bb_vinfo->shared->check_datarefs ();
10331 2183824 : bb_vinfo->vector_mode = next_vector_mode;
10332 :
10333 2183824 : if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
10334 : {
10335 234133 : if (dump_enabled_p ())
10336 : {
10337 1498 : dump_printf_loc (MSG_NOTE, vect_location,
10338 : "***** Analysis succeeded with vector mode"
10339 749 : " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
10340 749 : dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
10341 : }
10342 :
10343 234133 : bb_vinfo->shared->check_datarefs ();
10344 :
10345 234133 : bool force_clear = false;
10346 234133 : auto_vec<slp_instance> profitable_subgraphs;
10347 1378889 : for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo))
10348 : {
10349 676490 : if (instance->subgraph_entries.is_empty ())
10350 216565 : continue;
10351 :
10352 656983 : dump_user_location_t saved_vect_location = vect_location;
10353 656983 : vect_location = instance->location ();
10354 656983 : if (!unlimited_cost_model (NULL)
10355 1310631 : && !vect_bb_vectorization_profitable_p
10356 653648 : (bb_vinfo, instance->subgraph_entries, orig_loop))
10357 : {
10358 177551 : if (dump_enabled_p ())
10359 28 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10360 : "not vectorized: vectorization is not "
10361 : "profitable.\n");
10362 177551 : vect_location = saved_vect_location;
10363 177551 : continue;
10364 : }
10365 :
10366 479432 : vect_location = saved_vect_location;
10367 479432 : if (!dbg_cnt (vect_slp))
10368 : {
10369 0 : force_clear = true;
10370 0 : continue;
10371 : }
10372 :
10373 479432 : profitable_subgraphs.safe_push (instance);
10374 : }
10375 :
10376 : /* When we're vectorizing an if-converted loop body make sure
10377 : we vectorized all if-converted code. */
10378 392538 : if ((!profitable_subgraphs.is_empty () || force_clear) && orig_loop)
10379 : {
10380 97 : gcc_assert (bb_vinfo->nbbs == 1);
10381 194 : for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]);
10382 4084 : !gsi_end_p (gsi); gsi_next (&gsi))
10383 : {
10384 : /* The costing above left us with DCEable vectorized scalar
10385 : stmts having the visited flag set on profitable
10386 : subgraphs. Do the delayed clearing of the flag here. */
10387 3987 : if (gimple_visited_p (gsi_stmt (gsi)))
10388 : {
10389 1172 : gimple_set_visited (gsi_stmt (gsi), false);
10390 1172 : continue;
10391 : }
10392 2815 : if (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED)
10393 813 : continue;
10394 :
10395 5859 : if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi)))
10396 2450 : if (gimple_assign_rhs_code (ass) == COND_EXPR)
10397 : {
10398 51 : if (!profitable_subgraphs.is_empty ()
10399 22 : && dump_enabled_p ())
10400 0 : dump_printf_loc (MSG_NOTE, vect_location,
10401 : "not profitable because of "
10402 : "unprofitable if-converted scalar "
10403 : "code\n");
10404 29 : profitable_subgraphs.truncate (0);
10405 : }
10406 : }
10407 : }
10408 :
10409 : /* Finally schedule the profitable subgraphs. */
10410 1030343 : for (slp_instance instance : profitable_subgraphs)
10411 : {
10412 479400 : if (!vectorized && dump_enabled_p ())
10413 724 : dump_printf_loc (MSG_NOTE, vect_location,
10414 : "Basic block will be vectorized "
10415 : "using SLP\n");
10416 479400 : vectorized = true;
10417 :
10418 : /* Dump before scheduling as store vectorization will remove
10419 : the original stores and mess with the instance tree
10420 : so querying its location will eventually ICE. */
10421 479400 : if (flag_checking)
10422 1928229 : for (slp_instance sub : instance->subgraph_entries)
10423 490029 : gcc_assert (SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub)));
10424 479400 : unsigned HOST_WIDE_INT bytes;
10425 479400 : if (dump_enabled_p ())
10426 3449 : for (slp_instance sub : instance->subgraph_entries)
10427 : {
10428 914 : tree vtype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub));
10429 1828 : if (GET_MODE_SIZE (TYPE_MODE (vtype)).is_constant (&bytes))
10430 914 : dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
10431 914 : sub->location (),
10432 : "basic block part vectorized using %wu "
10433 : "byte vectors\n", bytes);
10434 : else
10435 : dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
10436 : sub->location (),
10437 : "basic block part vectorized using "
10438 : "variable length vectors\n");
10439 : }
10440 :
10441 479400 : dump_user_location_t saved_vect_location = vect_location;
10442 479400 : vect_location = instance->location ();
10443 :
10444 479400 : vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
10445 :
10446 479400 : vect_location = saved_vect_location;
10447 : }
10448 :
10449 :
10450 : /* Generate the invariant statements. */
10451 234133 : if (!gimple_seq_empty_p (bb_vinfo->inv_pattern_def_seq))
10452 : {
10453 23 : if (dump_enabled_p ())
10454 0 : dump_printf_loc (MSG_NOTE, vect_location,
10455 : "------>generating invariant statements\n");
10456 :
10457 23 : bb_vinfo->insert_seq_on_entry (NULL,
10458 : bb_vinfo->inv_pattern_def_seq);
10459 : }
10460 234133 : }
10461 : else
10462 : {
10463 1949691 : if (dump_enabled_p ())
10464 1314 : dump_printf_loc (MSG_NOTE, vect_location,
10465 : "***** Analysis failed with vector mode %s\n",
10466 1314 : GET_MODE_NAME (bb_vinfo->vector_mode));
10467 : }
10468 :
10469 2183824 : if (mode_i == 0)
10470 1864745 : autodetected_vector_mode = bb_vinfo->vector_mode;
10471 :
10472 2183824 : if (!fatal)
10473 3125653 : while (mode_i < vector_modes.length ()
10474 1751984 : && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
10475 : {
10476 332740 : if (dump_enabled_p ())
10477 1650 : dump_printf_loc (MSG_NOTE, vect_location,
10478 : "***** The result for vector mode %s would"
10479 : " be the same\n",
10480 825 : GET_MODE_NAME (vector_modes[mode_i]));
10481 332740 : mode_i += 1;
10482 : }
10483 :
10484 2183824 : delete bb_vinfo;
10485 :
10486 2183824 : if (mode_i < vector_modes.length ()
10487 2007392 : && VECTOR_MODE_P (autodetected_vector_mode)
10488 1988466 : && (related_vector_mode (vector_modes[mode_i],
10489 : GET_MODE_INNER (autodetected_vector_mode))
10490 994233 : == autodetected_vector_mode)
10491 4191216 : && (related_vector_mode (autodetected_vector_mode,
10492 516353 : GET_MODE_INNER (vector_modes[mode_i]))
10493 1032706 : == vector_modes[mode_i]))
10494 : {
10495 516353 : if (dump_enabled_p ())
10496 205 : dump_printf_loc (MSG_NOTE, vect_location,
10497 : "***** Skipping vector mode %s, which would"
10498 : " repeat the analysis for %s\n",
10499 205 : GET_MODE_NAME (vector_modes[mode_i]),
10500 205 : GET_MODE_NAME (autodetected_vector_mode));
10501 516353 : mode_i += 1;
10502 : }
10503 :
10504 2183824 : if (vectorized
10505 2025441 : || mode_i == vector_modes.length ()
10506 1849056 : || autodetected_vector_mode == VOIDmode
10507 : /* If vect_slp_analyze_bb_1 signaled that analysis for all
10508 : vector sizes will fail do not bother iterating. */
10509 3019721 : || fatal)
10510 3729490 : return vectorized;
10511 :
10512 : /* Try the next biggest vector size. */
10513 319079 : next_vector_mode = vector_modes[mode_i++];
10514 319079 : if (dump_enabled_p ())
10515 218 : dump_printf_loc (MSG_NOTE, vect_location,
10516 : "***** Re-trying analysis with vector mode %s\n",
10517 218 : GET_MODE_NAME (next_vector_mode));
10518 319079 : }
10519 1864745 : }
10520 :
10521 :
10522 : /* Main entry for the BB vectorizer. Analyze and transform BBS, returns
10523 : true if anything in the basic-block was vectorized. */
10524 :
10525 : static bool
10526 1864745 : vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop)
10527 : {
10528 1864745 : vec<data_reference_p> datarefs = vNULL;
10529 1864745 : auto_vec<int> dataref_groups;
10530 1864745 : int insns = 0;
10531 1864745 : int current_group = 0;
10532 :
10533 12493816 : for (unsigned i = 0; i < bbs.length (); i++)
10534 : {
10535 10629071 : basic_block bb = bbs[i];
10536 88546594 : for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
10537 77917523 : gsi_next (&gsi))
10538 : {
10539 77917523 : gimple *stmt = gsi_stmt (gsi);
10540 77917523 : if (is_gimple_debug (stmt))
10541 48220995 : continue;
10542 :
10543 29696528 : insns++;
10544 :
10545 29696528 : if (gimple_location (stmt) != UNKNOWN_LOCATION)
10546 26682990 : vect_location = stmt;
10547 :
10548 29696528 : if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
10549 : &dataref_groups, current_group))
10550 5086097 : ++current_group;
10551 : }
10552 : /* New BBs always start a new DR group. */
10553 10629071 : ++current_group;
10554 : }
10555 :
10556 1864745 : return vect_slp_region (bbs, datarefs, &dataref_groups, insns, orig_loop);
10557 1864745 : }
10558 :
10559 : /* Special entry for the BB vectorizer. Analyze and transform a single
10560 : if-converted BB with ORIG_LOOPs body being the not if-converted
10561 : representation. Returns true if anything in the basic-block was
10562 : vectorized. */
10563 :
10564 : bool
10565 19420 : vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop)
10566 : {
10567 19420 : auto_vec<basic_block> bbs;
10568 19420 : bbs.safe_push (bb);
10569 19420 : return vect_slp_bbs (bbs, orig_loop);
10570 19420 : }
10571 :
10572 : /* Main entry for the BB vectorizer. Analyze and transform BB, returns
10573 : true if anything in the basic-block was vectorized. */
10574 :
10575 : bool
10576 905454 : vect_slp_function (function *fun)
10577 : {
10578 905454 : bool r = false;
10579 905454 : int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
10580 905454 : auto_bitmap exit_bbs;
10581 905454 : bitmap_set_bit (exit_bbs, EXIT_BLOCK);
10582 905454 : edge entry = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (fun));
10583 905454 : unsigned n = rev_post_order_and_mark_dfs_back_seme (fun, entry, exit_bbs,
10584 905454 : true, rpo, NULL);
10585 :
10586 : /* For the moment split the function into pieces to avoid making
10587 : the iteration on the vector mode moot. Split at points we know
10588 : to not handle well which is CFG merges (SLP discovery doesn't
10589 : handle non-loop-header PHIs) and loop exits. Since pattern
10590 : recog requires reverse iteration to visit uses before defs
10591 : simply chop RPO into pieces. */
10592 905454 : auto_vec<basic_block> bbs;
10593 11546033 : for (unsigned i = 0; i < n; i++)
10594 : {
10595 10640579 : basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
10596 10640579 : bool split = false;
10597 :
10598 : /* Split when a BB is not dominated by the first block. */
10599 20072008 : if (!bbs.is_empty ()
10600 9431429 : && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
10601 : {
10602 656087 : if (dump_enabled_p ())
10603 146 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10604 : "splitting region at dominance boundary bb%d\n",
10605 : bb->index);
10606 : split = true;
10607 : }
10608 : /* Split when the loop determined by the first block
10609 : is exited. This is because we eventually insert
10610 : invariants at region begin. */
10611 18759834 : else if (!bbs.is_empty ()
10612 8775342 : && bbs[0]->loop_father != bb->loop_father
10613 2280719 : && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
10614 : {
10615 3747 : if (dump_enabled_p ())
10616 6 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10617 : "splitting region at loop %d exit at bb%d\n",
10618 3 : bbs[0]->loop_father->num, bb->index);
10619 : split = true;
10620 : }
10621 9980745 : else if (!bbs.is_empty ()
10622 8771595 : && bb->loop_father->header == bb
10623 472342 : && bb->loop_father->dont_vectorize)
10624 : {
10625 7269 : if (dump_enabled_p ())
10626 72 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10627 : "splitting region at dont-vectorize loop %d "
10628 : "entry at bb%d\n",
10629 : bb->loop_father->num, bb->index);
10630 : split = true;
10631 : }
10632 :
10633 11307682 : if (split && !bbs.is_empty ())
10634 : {
10635 667103 : r |= vect_slp_bbs (bbs, NULL);
10636 667103 : bbs.truncate (0);
10637 : }
10638 :
10639 10640579 : if (bbs.is_empty ())
10640 : {
10641 : /* We need to be able to insert at the head of the region which
10642 : we cannot for region starting with a returns-twice call. */
10643 1876253 : if (gcall *first = safe_dyn_cast <gcall *> (first_stmt (bb)))
10644 396454 : if (gimple_call_flags (first) & ECF_RETURNS_TWICE)
10645 : {
10646 301 : if (dump_enabled_p ())
10647 2 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10648 : "skipping bb%d as start of region as it "
10649 : "starts with returns-twice call\n",
10650 : bb->index);
10651 30928 : continue;
10652 : }
10653 : /* If the loop this BB belongs to is marked as not to be vectorized
10654 : honor that also for BB vectorization. */
10655 1875952 : if (bb->loop_father->dont_vectorize)
10656 30627 : continue;
10657 : }
10658 :
10659 10609651 : bbs.safe_push (bb);
10660 :
10661 : /* When we have a stmt ending this block and defining a
10662 : value we have to insert on edges when inserting after it for
10663 : a vector containing its definition. Avoid this for now. */
10664 21219302 : if (gimple *last = *gsi_last_bb (bb))
10665 8583263 : if (gimple_get_lhs (last)
10666 8583263 : && is_ctrl_altering_stmt (last))
10667 : {
10668 272775 : if (dump_enabled_p ())
10669 2 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10670 : "splitting region at control altering "
10671 : "definition %G", last);
10672 272775 : r |= vect_slp_bbs (bbs, NULL);
10673 272775 : bbs.truncate (0);
10674 : }
10675 : }
10676 :
10677 905454 : if (!bbs.is_empty ())
10678 905447 : r |= vect_slp_bbs (bbs, NULL);
10679 :
10680 905454 : free (rpo);
10681 :
10682 905454 : return r;
10683 905454 : }
10684 :
10685 : /* Build a variable-length vector in which the elements in ELTS are repeated
10686 : to a fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
10687 : RESULTS and add any new instructions to SEQ.
10688 :
10689 : The approach we use is:
10690 :
10691 : (1) Find a vector mode VM with integer elements of mode IM.
10692 :
10693 : (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
10694 : ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
10695 : from small vectors to IM.
10696 :
10697 : (3) Duplicate each ELTS'[I] into a vector of mode VM.
10698 :
10699 : (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
10700 : correct byte contents.
10701 :
10702 : (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
10703 :
10704 : We try to find the largest IM for which this sequence works, in order
10705 : to cut down on the number of interleaves. */
10706 :
void
duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
			  const vec<tree> &elts, unsigned int nresults,
			  vec<tree> &results)
{
  unsigned int nelts = elts.length ();
  tree element_type = TREE_TYPE (vector_type);

  /* (1) Find a vector mode VM with integer elements of mode IM.  */
  unsigned int nvectors = 1;
  tree new_vector_type;
  tree permutes[2];
  /* Callers are expected to have checked can_duplicate_and_interleave_p
     already, hence the hard failure here.  */
  if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
				       &nvectors, &new_vector_type,
				       permutes))
    gcc_unreachable ();

  /* Get a vector type that holds ELTS[0:NELTS/NELTS'].  */
  unsigned int partial_nelts = nelts / nvectors;
  tree partial_vector_type = build_vector_type (element_type, partial_nelts);

  /* PIECES holds the current working set of vectors: slots [0, nvectors)
     and [nvectors, 2 * nvectors) are used as alternating in/out halves
     by step (4) below.  */
  tree_vector_builder partial_elts;
  auto_vec<tree, 32> pieces (nvectors * 2);
  pieces.quick_grow_cleared (nvectors * 2);
  for (unsigned int i = 0; i < nvectors; ++i)
    {
      /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
	     ELTS' has mode IM.  */
      partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
      for (unsigned int j = 0; j < partial_nelts; ++j)
	partial_elts.quick_push (elts[i * partial_nelts + j]);
      tree t = gimple_build_vector (seq, &partial_elts);
      t = gimple_build (seq, VIEW_CONVERT_EXPR,
			TREE_TYPE (new_vector_type), t);

      /* (3) Duplicate each ELTS'[I] into a vector of mode VM.  */
      pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
    }

  /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
     correct byte contents.

     Conceptually, we need to repeat the following operation log2(nvectors)
     times, where hi_start = nvectors / 2:

	out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
	out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);

     However, if each input repeats every N elements and the VF is
     a multiple of N * 2, the HI result is the same as the LO result.
     This will be true for the first N1 iterations of the outer loop,
     followed by N2 iterations for which both the LO and HI results
     are needed.  I.e.:

	N1 + N2 = log2(nvectors)

     Each "N1 iteration" doubles the number of redundant vectors and the
     effect of the process as a whole is to have a sequence of nvectors/2**N1
     vectors that repeats 2**N1 times.  Rather than generate these redundant
     vectors, we halve the number of vectors for each N1 iteration.  */
  unsigned int in_start = 0;
  unsigned int out_start = nvectors;
  unsigned int new_nvectors = nvectors;
  for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
    {
      unsigned int hi_start = new_nvectors / 2;
      unsigned int out_i = 0;
      for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i)
	{
	  /* This is an "N1 iteration": the HI permute would duplicate
	     the LO result, so skip generating it.  */
	  if ((in_i & 1) != 0
	      && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
			     2 * in_repeat))
	    continue;

	  tree output = make_ssa_name (new_vector_type);
	  tree input1 = pieces[in_start + (in_i / 2)];
	  tree input2 = pieces[in_start + (in_i / 2) + hi_start];
	  gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
					       input1, input2,
					       permutes[in_i & 1]);
	  gimple_seq_add_stmt (seq, stmt);
	  pieces[out_start + out_i] = output;
	  out_i += 1;
	}
      /* Swap the in/out halves of PIECES for the next round.  */
      std::swap (in_start, out_start);
      new_nvectors = out_i;
    }

  /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type.
     When fewer than NRESULTS distinct vectors were produced the tail of
     RESULTS repeats the sequence.  */
  results.reserve (nresults);
  for (unsigned int i = 0; i < nresults; ++i)
    if (i < new_nvectors)
      results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
					pieces[in_start + i]));
    else
      results.quick_push (results[i - new_nvectors]);
}
10804 :
10805 :
10806 : /* For constant and loop invariant defs in OP_NODE this function creates
10807 : vector defs that will be used in the vectorized stmts and stores them
10808 : to SLP_TREE_VEC_DEFS of OP_NODE. */
10809 :
static void
vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
{
  unsigned HOST_WIDE_INT nunits;
  tree vec_cst;
  unsigned j, number_of_places_left_in_vector;
  tree vector_type;
  tree vop;
  int group_size = op_node->ops.length ();
  unsigned int vec_num, i;
  unsigned number_of_copies = 1;
  bool constant_p;
  gimple_seq ctor_seq = NULL;
  auto_vec<tree, 16> permute_results;

  /* We always want SLP_TREE_VECTYPE (op_node) here correctly set.  */
  vector_type = SLP_TREE_VECTYPE (op_node);

  unsigned int number_of_vectors = vect_get_num_copies (vinfo, op_node);
  SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
  auto_vec<tree> voprnds (number_of_vectors);

  /* NUMBER_OF_COPIES is the number of times we need to use the same values in
     created vectors.  It is greater than 1 if unrolling is performed.

     For example, we have two scalar operands, s1 and s2 (e.g., group of
     strided accesses of size two), while NUNITS is four (i.e., four scalars
     of this type can be packed in a vector).  The output vector will contain
     two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
     will be 2).

     If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
     containing the operands.

     For example, NUNITS is four as before, and the group size is 8
     (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
     {s5, s6, s7, s8}.  */

  /* When using duplicate_and_interleave, we just need one element for
     each scalar statement.  */
  if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
    nunits = group_size;

  number_of_copies = nunits * number_of_vectors / group_size;

  number_of_places_left_in_vector = nunits;
  constant_p = true;
  /* UNIFORM_ELT tracks whether all lanes filled so far in the current
     vector are equal; if so a cheaper vector-from-val build is used.  */
  tree uniform_elt = NULL_TREE;
  tree_vector_builder elts (vector_type, nunits, 1);
  elts.quick_grow (nunits);
  stmt_vec_info insert_after = NULL;
  for (j = 0; j < number_of_copies; j++)
    {
      tree op;
      /* Iterate the scalar operands in reverse; lanes are filled from the
	 back of ELTS, and VOPRNDS is inverted again below.  */
      for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
	{
	  /* Create 'vect_ = {op0,op1,...,opn}'.  */
	  tree orig_op = op;
	  if (number_of_places_left_in_vector == nunits)
	    uniform_elt = op;
	  else if (uniform_elt && operand_equal_p (uniform_elt, op))
	    /* Reuse the previously converted element for equal lanes.  */
	    op = elts[number_of_places_left_in_vector];
	  else
	    uniform_elt = NULL_TREE;
	  number_of_places_left_in_vector--;
	  if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
	    {
	      if (CONSTANT_CLASS_P (op))
		{
		  if (VECTOR_BOOLEAN_TYPE_P (vector_type))
		    {
		      /* Can't use VIEW_CONVERT_EXPR for booleans because
			 of possibly different sizes of scalar value and
			 vector element.  */
		      if (integer_zerop (op))
			op = build_int_cst (TREE_TYPE (vector_type), 0);
		      else if (integer_onep (op))
			op = build_all_ones_cst (TREE_TYPE (vector_type));
		      else
			gcc_unreachable ();
		    }
		  else
		    op = fold_unary (VIEW_CONVERT_EXPR,
				     TREE_TYPE (vector_type), op);
		  gcc_assert (op && CONSTANT_CLASS_P (op));
		}
	      else
		{
		  /* Non-constant mismatched operand: emit an explicit
		     conversion statement into CTOR_SEQ.  */
		  tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
		  gimple *init_stmt;
		  if (VECTOR_BOOLEAN_TYPE_P (vector_type))
		    {
		      tree true_val
			= build_all_ones_cst (TREE_TYPE (vector_type));
		      tree false_val
			= build_zero_cst (TREE_TYPE (vector_type));
		      gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
		      init_stmt = gimple_build_assign (new_temp, COND_EXPR,
						       op, true_val,
						       false_val);
		    }
		  else
		    {
		      op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
				   op);
		      init_stmt
			= gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
					       op);
		    }
		  gimple_seq_add_stmt (&ctor_seq, init_stmt);
		  op = new_temp;
		}
	    }
	  elts[number_of_places_left_in_vector] = op;
	  if (!CONSTANT_CLASS_P (op))
	    constant_p = false;
	  /* For BB vectorization we have to compute an insert location
	     when a def is inside the analyzed region since we cannot
	     simply insert at the BB start in this case.  */
	  stmt_vec_info opdef;
	  if (TREE_CODE (orig_op) == SSA_NAME
	      && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
	      && is_a <bb_vec_info> (vinfo)
	      && (opdef = vinfo->lookup_def (orig_op)))
	    {
	      if (!insert_after)
		insert_after = opdef;
	      else
		insert_after = get_later_stmt (insert_after, opdef);
	    }

	  if (number_of_places_left_in_vector == 0)
	    {
	      /* The current vector is full; materialize it.  */
	      auto type_nunits = TYPE_VECTOR_SUBPARTS (vector_type);
	      if (uniform_elt)
		vec_cst = gimple_build_vector_from_val (&ctor_seq, vector_type,
							elts[0]);
	      else if (constant_p
		       ? multiple_p (type_nunits, nunits)
		       : known_eq (type_nunits, nunits))
		vec_cst = gimple_build_vector (&ctor_seq, &elts);
	      else
		{
		  /* Variable-length vectors: build all results at once via
		     duplicate_and_interleave and pick them off in reverse.  */
		  if (permute_results.is_empty ())
		    duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
					      elts, number_of_vectors,
					      permute_results);
		  vec_cst = permute_results[number_of_vectors - j - 1];
		}
	      if (!gimple_seq_empty_p (ctor_seq))
		{
		  if (insert_after)
		    {
		      gimple_stmt_iterator gsi;
		      if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
			{
			  gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
			  gsi_insert_seq_before (&gsi, ctor_seq,
						 GSI_CONTINUE_LINKING);
			}
		      else if (!stmt_ends_bb_p (insert_after->stmt))
			{
			  gsi = gsi_for_stmt (insert_after->stmt);
			  gsi_insert_seq_after (&gsi, ctor_seq,
						GSI_CONTINUE_LINKING);
			}
		      else
			{
			  /* When we want to insert after a def where the
			     defining stmt throws then insert on the fallthru
			     edge.  */
			  edge e = find_fallthru_edge
				     (gimple_bb (insert_after->stmt)->succs);
			  basic_block new_bb
			    = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
			  gcc_assert (!new_bb);
			}
		    }
		  else
		    vinfo->insert_seq_on_entry (NULL, ctor_seq);
		  ctor_seq = NULL;
		}
	      /* Reset per-vector state for the next vector.  */
	      voprnds.quick_push (vec_cst);
	      insert_after = NULL;
	      number_of_places_left_in_vector = nunits;
	      constant_p = true;
	      elts.new_vector (vector_type, nunits, 1);
	      elts.quick_grow (nunits);
	    }
	}
    }

  /* Since the vectors are created in the reverse order, we should invert
     them.  */
  vec_num = voprnds.length ();
  for (j = vec_num; j != 0; j--)
    {
      vop = voprnds[j - 1];
      SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
    }

  /* In case that VF is greater than the unrolling factor needed for the SLP
     group of stmts, NUMBER_OF_VECTORS to be created is greater than
     NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
     to replicate the vectors.  */
  while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
    for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
	 i++)
      SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
}
11020 :
11021 : /* Get the scalar definition of the Nth lane from SLP_NODE or NULL_TREE
11022 : if there is no definition for it in the scalar IL or it is not known. */
11023 :
11024 : tree
11025 1909 : vect_get_slp_scalar_def (slp_tree slp_node, unsigned n)
11026 : {
11027 1909 : if (SLP_TREE_DEF_TYPE (slp_node) == vect_internal_def)
11028 : {
11029 1899 : if (!SLP_TREE_SCALAR_STMTS (slp_node).exists ())
11030 : return NULL_TREE;
11031 1899 : stmt_vec_info def = SLP_TREE_SCALAR_STMTS (slp_node)[n];
11032 1899 : if (!def)
11033 : return NULL_TREE;
11034 1899 : return gimple_get_lhs (STMT_VINFO_STMT (def));
11035 : }
11036 : else
11037 10 : return SLP_TREE_SCALAR_OPS (slp_node)[n];
11038 : }
11039 :
11040 : /* Get the Ith vectorized definition from SLP_NODE. */
11041 :
tree
vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
{
  /* Vectorized defs are recorded on the node; simply index them.  */
  return SLP_TREE_VEC_DEFS (slp_node)[i];
}
11047 :
11048 : /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
11049 :
11050 : void
11051 928640 : vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
11052 : {
11053 1857280 : vec_defs->create (SLP_TREE_VEC_DEFS (slp_node).length ());
11054 928640 : vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
11055 928640 : }
11056 :
11057 : /* Get N vectorized definitions for SLP_NODE. */
11058 :
11059 : void
11060 2955 : vect_get_slp_defs (vec_info *,
11061 : slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
11062 : {
11063 2955 : if (n == -1U)
11064 2955 : n = SLP_TREE_CHILDREN (slp_node).length ();
11065 :
11066 10648 : for (unsigned i = 0; i < n; ++i)
11067 : {
11068 7693 : slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
11069 7693 : vec<tree> vec_defs = vNULL;
11070 7693 : vect_get_slp_defs (child, &vec_defs);
11071 7693 : vec_oprnds->quick_push (vec_defs);
11072 : }
11073 2955 : }
11074 :
11075 : /* A subroutine of vect_transform_slp_perm_load with two extra arguments:
11076 : - PERM gives the permutation that the caller wants to use for NODE,
11077 : which might be different from SLP_LOAD_PERMUTATION.
11078 : - DUMP_P controls whether the function dumps information. */
11079 :
static bool
vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node,
				load_permutation_t &perm,
				const vec<tree> &dr_chain,
				gimple_stmt_iterator *gsi, poly_uint64 vf,
				bool analyze_only, bool dump_p,
				unsigned *n_perms, unsigned int *n_loads,
				bool dce_chain)
{
  stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
  int vec_index = 0;
  tree vectype = SLP_TREE_VECTYPE (node);
  unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
  unsigned int mask_element;
  unsigned dr_group_size;
  machine_mode mode;

  if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      /* We have both splats of the same non-grouped load and groups
	 of distinct invariant loads entering here.  Derive the group
	 size from the largest permutation index used.  */
      unsigned max_idx = 0;
      for (auto idx : perm)
	max_idx = idx > max_idx ? idx : max_idx;
      dr_group_size = max_idx + 1;
    }
  else
    {
      stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
      dr_group_size = DR_GROUP_SIZE (stmt_info);
    }

  mode = TYPE_MODE (vectype);
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
  unsigned int nstmts = vect_get_num_copies (vinfo, node);

  /* Initialize the vect stmts of NODE to properly insert the generated
     stmts later.  */
  if (! analyze_only)
    for (unsigned i = SLP_TREE_VEC_DEFS (node).length (); i < nstmts; i++)
      SLP_TREE_VEC_DEFS (node).quick_push (NULL_TREE);

  /* Generate permutation masks for every NODE. Number of masks for each NODE
     is equal to GROUP_SIZE.
     E.g., we have a group of three nodes with three loads from the same
     location in each node, and the vector size is 4. I.e., we have a
     a0b0c0a1b1c1... sequence and we need to create the following vectors:
     for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
     for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
     ...

     The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
     The last mask is illegal since we assume two operands for permute
     operation, and the mask element values can't be outside that range.
     Hence, the last mask must be converted into {2,5,5,5}.
     For the first two permutations we need the first and the second input
     vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
     we need the second and the third vectors: {b1,c1,a2,b2} and
     {c2,a3,b3,c3}.  */

  int vect_stmts_counter = 0;
  unsigned int index = 0;
  int first_vec_index = -1;
  int second_vec_index = -1;
  /* NOOP_P tracks whether the mask built so far is the identity, in which
     case no VEC_PERM_EXPR needs to be emitted for it.  */
  bool noop_p = true;
  *n_perms = 0;

  vec_perm_builder mask;
  unsigned int nelts_to_build;
  unsigned int nvectors_per_build;
  unsigned int in_nlanes;
  bool repeating_p = (group_size == dr_group_size
		      && multiple_p (nunits, group_size));
  if (repeating_p)
    {
      /* A single vector contains a whole number of copies of the node, so:
	 (a) all permutes can use the same mask; and
	 (b) the permutes only need a single vector input.  */
      mask.new_vector (nunits, group_size, 3);
      nelts_to_build = mask.encoded_nelts ();
      /* It's possible to obtain zero nstmts during analyze_only, so make
	 it at least one to ensure the later computation for n_perms
	 proceed.  */
      nvectors_per_build = nstmts > 0 ? nstmts : 1;
      in_nlanes = dr_group_size * 3;
    }
  else
    {
      /* We need to construct a separate mask for each vector statement.  */
      unsigned HOST_WIDE_INT const_nunits, const_vf;
      if (!nunits.is_constant (&const_nunits)
	  || !vf.is_constant (&const_vf))
	return false;
      mask.new_vector (const_nunits, const_nunits, 1);
      nelts_to_build = const_vf * group_size;
      nvectors_per_build = 1;
      in_nlanes = const_vf * dr_group_size;
    }
  auto_sbitmap used_in_lanes (in_nlanes);
  bitmap_clear (used_in_lanes);
  /* USED_DEFS tracks which DR_CHAIN entries feed a generated stmt so the
     unused ones can be removed when DCE_CHAIN.  */
  auto_bitmap used_defs;

  unsigned int count = mask.encoded_nelts ();
  mask.quick_grow (count);
  vec_perm_indices indices;

  for (unsigned int j = 0; j < nelts_to_build; j++)
    {
      unsigned int iter_num = j / group_size;
      unsigned int stmt_num = j % group_size;
      unsigned int i = (iter_num * dr_group_size + perm[stmt_num]);
      bitmap_set_bit (used_in_lanes, i);
      if (repeating_p)
	{
	  first_vec_index = 0;
	  mask_element = i;
	}
      else
	{
	  /* Enforced before the loop when !repeating_p.  */
	  unsigned int const_nunits = nunits.to_constant ();
	  vec_index = i / const_nunits;
	  mask_element = i % const_nunits;
	  if (vec_index == first_vec_index
	      || first_vec_index == -1)
	    {
	      first_vec_index = vec_index;
	    }
	  else if (vec_index == second_vec_index
		   || second_vec_index == -1)
	    {
	      second_vec_index = vec_index;
	      /* Elements from the second input are addressed with indexes
		 offset by the vector length in a VEC_PERM_EXPR.  */
	      mask_element += const_nunits;
	    }
	  else
	    {
	      /* A VEC_PERM_EXPR has only two inputs; bail out when a third
		 distinct vector would be required.  */
	      if (dump_p)
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "permutation requires at "
				 "least three vectors %G",
				 stmt_info->stmt);
	      gcc_assert (analyze_only);
	      return false;
	    }

	  gcc_assert (mask_element < 2 * const_nunits);
	}

      if (mask_element != index)
	noop_p = false;
      mask[index++] = mask_element;

      if (index == count)
	{
	  /* One full mask assembled; validate and possibly emit it.  */
	  if (!noop_p)
	    {
	      indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
	      if (!can_vec_perm_const_p (mode, mode, indices))
		{
		  if (dump_p)
		    {
		      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				       "unsupported vect permute { ");
		      for (i = 0; i < count; ++i)
			{
			  dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
			  dump_printf (MSG_MISSED_OPTIMIZATION, " ");
			}
		      dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
		    }
		  gcc_assert (analyze_only);
		  return false;
		}

	      tree mask_vec = NULL_TREE;
	      if (!analyze_only)
		mask_vec = vect_gen_perm_mask_checked (vectype, indices);

	      if (second_vec_index == -1)
		second_vec_index = first_vec_index;

	      for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
		{
		  ++*n_perms;
		  if (analyze_only)
		    continue;
		  /* Generate the permute statement if necessary.  */
		  tree first_vec = dr_chain[first_vec_index + ri];
		  tree second_vec = dr_chain[second_vec_index + ri];
		  gassign *stmt = as_a<gassign *> (stmt_info->stmt);
		  tree perm_dest
		    = vect_create_destination_var (gimple_assign_lhs (stmt),
						   vectype);
		  perm_dest = make_ssa_name (perm_dest);
		  gimple *perm_stmt
		    = gimple_build_assign (perm_dest, VEC_PERM_EXPR, first_vec,
					   second_vec, mask_vec);
		  vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
					       gsi);
		  if (dce_chain)
		    {
		      bitmap_set_bit (used_defs, first_vec_index + ri);
		      bitmap_set_bit (used_defs, second_vec_index + ri);
		    }

		  /* Store the vector statement in NODE.  */
		  SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = perm_dest;
		}
	    }
	  else if (!analyze_only)
	    {
	      for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
		{
		  tree first_vec = dr_chain[first_vec_index + ri];
		  /* If mask was NULL_TREE generate the requested
		     identity transform.  */
		  if (dce_chain)
		    bitmap_set_bit (used_defs, first_vec_index + ri);

		  /* Store the vector statement in NODE.  */
		  SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = first_vec;
		}
	    }

	  /* Reset state for the next mask.  */
	  index = 0;
	  first_vec_index = -1;
	  second_vec_index = -1;
	  noop_p = true;
	}
    }

  if (n_loads)
    {
      if (repeating_p)
	*n_loads = nstmts;
      else
	{
	  /* Enforced above when !repeating_p.  */
	  unsigned int const_nunits = nunits.to_constant ();
	  *n_loads = 0;
	  /* Count a load for every vector-sized chunk of the input that
	     has at least one used lane.  */
	  bool load_seen = false;
	  for (unsigned i = 0; i < in_nlanes; ++i)
	    {
	      if (i % const_nunits == 0)
		{
		  if (load_seen)
		    *n_loads += 1;
		  load_seen = false;
		}
	      if (bitmap_bit_p (used_in_lanes, i))
		load_seen = true;
	    }
	  if (load_seen)
	    *n_loads += 1;
	}
    }

  if (dce_chain)
    for (unsigned i = 0; i < dr_chain.length (); ++i)
      if (!bitmap_bit_p (used_defs, i))
	{
	  /* Remove the unused def and, transitively, feeding
	     VIEW_CONVERT_EXPR/CONSTRUCTOR stmts.  */
	  tree def = dr_chain[i];
	  do
	    {
	      gimple *stmt = SSA_NAME_DEF_STMT (def);
	      if (is_gimple_assign (stmt)
		  && (gimple_assign_rhs_code (stmt) == VIEW_CONVERT_EXPR
		      || gimple_assign_rhs_code (stmt) == CONSTRUCTOR))
		def = single_ssa_tree_operand (stmt, SSA_OP_USE);
	      else
		def = NULL;
	      gimple_stmt_iterator rgsi = gsi_for_stmt (stmt);
	      gsi_remove (&rgsi, true);
	      release_defs (stmt);
	    }
	  while (def);
	}

  return true;
}
11360 :
11361 : /* Generate vector permute statements from a list of loads in DR_CHAIN.
11362 : If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
11363 : permute statements for the SLP node NODE. Store the number of vector
11364 : permute instructions in *N_PERMS and the number of vector load
11365 : instructions in *N_LOADS. If DCE_CHAIN is true, remove all definitions
11366 : that were not needed. */
11367 :
bool
vect_transform_slp_perm_load (vec_info *vinfo,
			      slp_tree node, const vec<tree> &dr_chain,
			      gimple_stmt_iterator *gsi, poly_uint64 vf,
			      bool analyze_only, unsigned *n_perms,
			      unsigned int *n_loads, bool dce_chain)
{
  /* Forward to the worker, using the load permutation recorded on NODE
     and the current dump setting.  */
  return vect_transform_slp_perm_load_1 (vinfo, node,
					 SLP_TREE_LOAD_PERMUTATION (node),
					 dr_chain, gsi, vf, analyze_only,
					 dump_enabled_p (), n_perms, n_loads,
					 dce_chain);
}
11381 :
11382 : /* Produce the next vector result for SLP permutation NODE by adding a vector
11383 : statement at GSI. If MASK_VEC is nonnull, add:
11384 :
11385 : <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC>
11386 :
11387 : otherwise add:
11388 :
11389 : <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF,
11390 : { N, N+1, N+2, ... }>
11391 :
11392 : where N == IDENTITY_OFFSET which is either zero or equal to the
11393 : number of elements of the result. */
11394 :
static void
vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
			  slp_tree node, tree first_def, tree second_def,
			  tree mask_vec, poly_uint64 identity_offset)
{
  tree vectype = SLP_TREE_VECTYPE (node);

  /* ??? We SLP match existing vector element extracts but
     allow punning which we need to re-instantiate at uses
     but have no good way of explicitly representing.  */
  if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)), TYPE_SIZE (vectype))
      && !types_compatible_p (TREE_TYPE (first_def), vectype))
    {
      /* Same size but incompatible type: pun FIRST_DEF to VECTYPE.  */
      gassign *conv_stmt
	= gimple_build_assign (make_ssa_name (vectype),
			       build1 (VIEW_CONVERT_EXPR, vectype, first_def));
      vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
      first_def = gimple_assign_lhs (conv_stmt);
    }
  gassign *perm_stmt;
  tree perm_dest = make_ssa_name (vectype);
  if (mask_vec)
    {
      /* Pun SECOND_DEF the same way when needed, then emit the permute.  */
      if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)),
			   TYPE_SIZE (vectype))
	  && !types_compatible_p (TREE_TYPE (second_def), vectype))
	{
	  gassign *conv_stmt
	    = gimple_build_assign (make_ssa_name (vectype),
				   build1 (VIEW_CONVERT_EXPR,
					   vectype, second_def));
	  vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
	  second_def = gimple_assign_lhs (conv_stmt);
	}
      perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
				       first_def, second_def,
				       mask_vec);
    }
  else
    {
      /* Identity permute: select the input vector that contains element
	 IDENTITY_OFFSET (even vector number -> first, odd -> second).  */
      auto def_nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (first_def));
      unsigned HOST_WIDE_INT vecno;
      poly_uint64 eltno;
      if (!can_div_trunc_p (poly_uint64 (identity_offset), def_nunits,
			    &vecno, &eltno))
	gcc_unreachable ();
      tree def = vecno & 1 ? second_def : first_def;
      if (!types_compatible_p (TREE_TYPE (def), vectype))
	{
	  /* For identity permutes we still need to handle the case
	     of offsetted extracts or concats.  */
	  unsigned HOST_WIDE_INT c;
	  if (known_le (TYPE_VECTOR_SUBPARTS (vectype), def_nunits))
	    {
	      /* Extract: take the VECTYPE-sized part at ELTNO.  */
	      unsigned HOST_WIDE_INT elsz
		= tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (def))));
	      tree lowpart = build3 (BIT_FIELD_REF, vectype, def,
				     TYPE_SIZE (vectype),
				     bitsize_int (eltno * elsz));
	      perm_stmt = gimple_build_assign (perm_dest, lowpart);
	    }
	  else if (constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype),
					def_nunits, &c) && c == 2)
	    {
	      /* Concat: VECTYPE is exactly two input vectors wide.  */
	      gcc_assert (known_eq (identity_offset, 0U));
	      tree ctor = build_constructor_va (vectype, 2,
						NULL_TREE, first_def,
						NULL_TREE, second_def);
	      perm_stmt = gimple_build_assign (perm_dest, ctor);
	    }
	  else
	    gcc_unreachable ();
	}
      else
	{
	  /* We need a copy here in case the def was external.  */
	  gcc_assert (known_eq (eltno, 0U));
	  perm_stmt = gimple_build_assign (perm_dest, def);
	}
    }
  vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
  /* Store the vector statement in NODE.  */
  node->push_vec_def (perm_stmt);
}
11479 :
11480 : /* Subroutine of vectorizable_slp_permutation. Check whether the target
11481 : can perform permutation PERM on the (1 or 2) input nodes in CHILDREN.
11482 : If GSI is nonnull, emit the permutation there.
11483 :
11484 : When GSI is null, the only purpose of NODE is to give properties
11485 : of the result, such as the vector type and number of SLP lanes.
11486 : The node does not need to be a VEC_PERM_EXPR.
11487 :
11488 : If the target supports the operation, return the number of individual
11489 : VEC_PERM_EXPRs needed, otherwise return -1. Print information to the
11490 : dump file if DUMP_P is true. */
11491 :
static int
vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
				slp_tree node, lane_permutation_t &perm,
				vec<slp_tree> &children, bool dump_p)
{
  tree vectype = SLP_TREE_VECTYPE (node);

  /* ??? We currently only support all same vector input types
     while the SLP IL should really do a concat + select and thus accept
     arbitrary mismatches.  */
  slp_tree child;
  unsigned i;
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
  bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node));
  /* True if we're permuting a single input of 2N vectors down
     to N vectors.  This case doesn't generalize beyond 2 since
     VEC_PERM_EXPR only takes 2 inputs.  */
  bool pack_p = false;
  /* If we're permuting inputs of N vectors each into X*N outputs,
     this is the value of X, otherwise it is 1.  */
  unsigned int unpack_factor = 1;
  /* Pick the vector type of the first child that has one as the common
     operand type; fall back to the node's own vector type.  */
  tree op_vectype = NULL_TREE;
  FOR_EACH_VEC_ELT (children, i, child)
    if (SLP_TREE_VECTYPE (child))
      {
	op_vectype = SLP_TREE_VECTYPE (child);
	break;
      }
  if (!op_vectype)
    op_vectype = vectype;
  /* Verify all children agree on OP_VECTYPE and classify the permute
     as pack, unpack or neither while doing so.  */
  FOR_EACH_VEC_ELT (children, i, child)
    {
      if ((SLP_TREE_DEF_TYPE (child) != vect_internal_def
	   && !vect_maybe_update_slp_op_vectype (child, op_vectype))
	  || !types_compatible_p (SLP_TREE_VECTYPE (child), op_vectype)
	  || !types_compatible_p (TREE_TYPE (vectype), TREE_TYPE (op_vectype)))
	{
	  if (dump_p)
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Unsupported vector types in lane permutation\n");
	  return -1;
	}
      auto op_nunits = TYPE_VECTOR_SUBPARTS (op_vectype);
      unsigned int this_unpack_factor;
      /* Detect permutations of external, pre-existing vectors.  The external
	 node's SLP_TREE_LANES stores the total number of units in the vector,
	 or zero if the vector has variable length.

	 We are expected to keep the original VEC_PERM_EXPR for such cases.
	 There is no repetition to model.  */
      if (SLP_TREE_DEF_TYPE (child) == vect_external_def
	  && SLP_TREE_SCALAR_OPS (child).is_empty ())
	repeating_p = false;
      /* Check whether the input has twice as many lanes per vector.  */
      else if (children.length () == 1
	       && known_eq (SLP_TREE_LANES (child) * nunits,
			    SLP_TREE_LANES (node) * op_nunits * 2))
	pack_p = true;
      /* Check whether the output has N times as many lanes per vector.
	 All children must agree on the same factor.  */
      else if (constant_multiple_p (SLP_TREE_LANES (node) * op_nunits,
				    SLP_TREE_LANES (child) * nunits,
				    &this_unpack_factor)
	       && (i == 0 || unpack_factor == this_unpack_factor))
	unpack_factor = this_unpack_factor;
      else
	repeating_p = false;
    }

  gcc_assert (perm.length () == SLP_TREE_LANES (node));

  /* Load-lanes permute.  This permute only acts as a forwarder to
     select the correct vector def of the load-lanes load which
     has the permuted vectors in its vector defs like
     { v0, w0, r0, v1, w1, r1 ... } for a ld3.  All costs are
     accounted for in the costing for the actual load so we
     return zero here.  */
  if (node->ldst_lanes)
    {
      gcc_assert (children.length () == 1);
      if (!gsi)
	/* This is a trivial op always supported.  */
	return 0;
      slp_tree child = children[0];
      unsigned vec_idx = (SLP_TREE_LANE_PERMUTATION (node)[0].second
			  / SLP_TREE_LANES (node));
      unsigned vec_num = SLP_TREE_LANES (child) / SLP_TREE_LANES (node);
      unsigned nvectors = vect_get_num_copies (vinfo, node);
      for (unsigned i = 0; i < nvectors; ++i)
	{
	  tree def = SLP_TREE_VEC_DEFS (child)[i * vec_num + vec_idx];
	  node->push_vec_def (def);
	}
      return 0;
    }

  /* Set REPEATING_P to true if the permutations are cyclical wrt UNPACK_FACTOR
     and if we can generate the vectors in a vector-length agnostic way.
     This requires UNPACK_STEP == NUNITS / UNPACK_FACTOR to be known at
     compile time.

     The significance of UNPACK_STEP is that, when PACK_P is false,
     output vector I operates on a window of UNPACK_STEP elements from each
     input, starting at lane UNPACK_STEP * (I % UNPACK_FACTOR).  For example,
     when UNPACK_FACTOR is 2, the first output vector operates on lanes
     [0, NUNITS / 2 - 1] of each input vector and the second output vector
     operates on lanes [NUNITS / 2, NUNITS - 1] of each input vector.

     When REPEATING_P is true, NOUTPUTS holds the total number of outputs
     that we actually need to generate.  */
  uint64_t noutputs = 0;
  poly_uint64 unpack_step = 0;
  loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo);
  if (!linfo
      || !multiple_p (nunits, unpack_factor, &unpack_step)
      || !constant_multiple_p (LOOP_VINFO_VECT_FACTOR (linfo)
			       * SLP_TREE_LANES (node), nunits, &noutputs))
    repeating_p = false;

  /* We can handle the conditions described for REPEATING_P above for
     both variable- and constant-length vectors.  The fallback requires
     us to generate every element of every permute vector explicitly,
     which is only possible for constant-length permute vectors.

     Set:

     - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
       mask vectors that we want to build.

     - NCOPIES to the number of copies of PERM that we need in order
       to build the necessary permute mask vectors.  */
  uint64_t npatterns;
  unsigned nelts_per_pattern;
  uint64_t ncopies;
  if (repeating_p)
    {
      /* We need permute mask vectors that have the form:

	   { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }

	 In other words, the original n-element permute in PERM is
	 "unrolled" to fill a full vector.  The stepped vector encoding
	 that we use for permutes requires 3n elements.  */
      npatterns = SLP_TREE_LANES (node);
      nelts_per_pattern = ncopies = 3;
    }
  else
    {
      /* Calculate every element of every permute mask vector explicitly,
	 instead of relying on the pattern described above.  */
      if (!nunits.is_constant (&npatterns)
	  || !TYPE_VECTOR_SUBPARTS (op_vectype).is_constant ())
	{
	  if (dump_p)
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "unsupported permutation %p on variable-length"
			     " vectors\n", (void *) node);
	  return -1;
	}
      nelts_per_pattern = ncopies = 1;
      if (linfo && !LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
	{
	  if (dump_p)
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "unsupported permutation %p for variable VF\n",
			     (void *) node);
	  return -1;
	}
      pack_p = false;
      unpack_factor = 1;
    }
  /* Total number of output lanes to process.  */
  unsigned olanes = unpack_factor * ncopies * SLP_TREE_LANES (node);
  gcc_assert (repeating_p || multiple_p (olanes, nunits));

  /* Compute the { { SLP operand, vector index}, lane } permutation sequence
     from the { SLP operand, scalar lane } permutation as recorded in the
     SLP node as intermediate step.  This part should already work
     with SLP children with arbitrary number of lanes.  */
  auto_vec<std::pair<std::pair<unsigned, unsigned>, poly_uint64>> vperm;
  auto_vec<poly_uint64> active_lane;
  vperm.create (olanes);
  active_lane.safe_grow_cleared (children.length (), true);
  for (unsigned int ui = 0; ui < unpack_factor; ++ui)
    {
      for (unsigned j = 0; j < children.length (); ++j)
	active_lane[j] = ui * unpack_step;
      for (unsigned i = 0; i < ncopies; ++i)
	{
	  for (unsigned pi = 0; pi < perm.length (); ++pi)
	    {
	      std::pair<unsigned, unsigned> p = perm[pi];
	      tree vtype = SLP_TREE_VECTYPE (children[p.first]);
	      if (repeating_p)
		vperm.quick_push ({{p.first, 0},
				   p.second + active_lane[p.first]});
	      else
		{
		  /* We checked above that the vectors are constant-length.  */
		  unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype)
		    .to_constant ();
		  /* Split the scalar lane into a (vector index, lane
		     within vector) pair.  */
		  unsigned lane = active_lane[p.first].to_constant ();
		  unsigned vi = (lane + p.second) / vnunits;
		  unsigned vl = (lane + p.second) % vnunits;
		  vperm.quick_push ({{p.first, vi}, vl});
		}
	    }
	  /* Advance to the next group.  */
	  for (unsigned j = 0; j < children.length (); ++j)
	    active_lane[j] += SLP_TREE_LANES (children[j]);
	}
    }

  if (dump_p)
    {
      dump_printf_loc (MSG_NOTE, vect_location,
		       "vectorizing permutation %p", (void *)node);
      for (unsigned i = 0; i < perm.length (); ++i)
	dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
      if (repeating_p)
	dump_printf (MSG_NOTE, " (repeat %d)", SLP_TREE_LANES (node));
      dump_printf (MSG_NOTE, "\n");
      dump_printf_loc (MSG_NOTE, vect_location, "as");
      for (unsigned i = 0; i < vperm.length (); ++i)
	{
	  if (i != 0
	      && (repeating_p
		  ? multiple_p (i, npatterns)
		  : multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype))))
	    dump_printf (MSG_NOTE, ",");
	  dump_printf (MSG_NOTE, " vops%u[%u][",
		       vperm[i].first.first, vperm[i].first.second);
	  dump_dec (MSG_NOTE, vperm[i].second);
	  dump_printf (MSG_NOTE, "]");
	}
      dump_printf (MSG_NOTE, "\n");
    }

  /* We can only handle two-vector permutes, everything else should
     be lowered on the SLP level.  The following is closely inspired
     by vect_transform_slp_perm_load and is supposed to eventually
     replace it.
     ??? As intermediate step do code-gen in the SLP tree representation
     somehow?  */
  std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
  std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
  unsigned int index = 0;
  poly_uint64 mask_element;
  vec_perm_builder mask;
  mask.new_vector (nunits, npatterns, nelts_per_pattern);
  unsigned int count = mask.encoded_nelts ();
  mask.quick_grow (count);
  vec_perm_indices indices;
  unsigned nperms = 0;
  /* When REPEATING_P is true, we only have UNPACK_FACTOR unique permute
     vectors to check during analysis, but we need to generate NOUTPUTS
     vectors during transformation.  */
  unsigned total_nelts = olanes;
  unsigned process_nelts = olanes;
  if (repeating_p)
    {
      total_nelts = (total_nelts / unpack_factor) * noutputs;
      if (gsi)
	process_nelts = total_nelts;
    }
  unsigned last_ei = (total_nelts - 1) % process_nelts;
  for (unsigned i = 0; i < process_nelts; ++i)
    {
      /* VI is the input vector index when generating code for REPEATING_P.  */
      unsigned vi = i / olanes * (pack_p ? 2 : 1);
      unsigned ei = i % olanes;
      mask_element = vperm[ei].second;
      if (pack_p)
	{
	  /* In this case, we have N outputs and the single child provides 2N
	     inputs.  Output X permutes inputs 2X and 2X+1.

	     The mask indices are taken directly from the SLP permutation node.
	     Index X selects from the first vector if (X / NUNITS) % 2 == 0;
	     X selects from the second vector otherwise.  These conditions
	     are only known at compile time for constant-length vectors.  */
	  first_vec = std::make_pair (0, 0);
	  second_vec = std::make_pair (0, 1);
	}
      else if (first_vec.first == -1U
	       || first_vec == vperm[ei].first)
	first_vec = vperm[ei].first;
      else if (second_vec.first == -1U
	       || second_vec == vperm[ei].first)
	{
	  second_vec = vperm[ei].first;
	  /* Elements taken from the second input get indices
	     biased by NUNITS as VEC_PERM_EXPR requires.  */
	  mask_element += nunits;
	}
      else
	{
	  /* A third distinct input vector within one output vector is
	     not expressible as a single VEC_PERM_EXPR.  */
	  if (dump_p)
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "permutation requires at "
			     "least three vectors\n");
	  gcc_assert (!gsi);
	  return -1;
	}

      mask[index++] = mask_element;

      if (index == count)
	{
	  /* One full permute mask has been accumulated; validate it and,
	     when transforming, emit the VEC_PERM_EXPR.  */
	  indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
			      TYPE_VECTOR_SUBPARTS (op_vectype));
	  /* An identity permute selects consecutive lanes starting at a
	     multiple of NUNITS and so can be done without a VEC_PERM_EXPR.  */
	  bool identity_p = (indices.series_p (0, 1, mask[0], 1)
			     && constant_multiple_p (mask[0], nunits));
	  machine_mode vmode = TYPE_MODE (vectype);
	  machine_mode op_vmode = TYPE_MODE (op_vectype);
	  unsigned HOST_WIDE_INT c;
	  if ((!identity_p
	       && !can_vec_perm_const_p (vmode, op_vmode, indices))
	      || (identity_p
		  && !known_le (nunits,
				TYPE_VECTOR_SUBPARTS (op_vectype))
		  && (!constant_multiple_p (nunits,
					    TYPE_VECTOR_SUBPARTS (op_vectype),
					    &c) || c != 2)))
	    {
	      if (dump_p)
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION,
				   vect_location,
				   "unsupported vect permute { ");
		  for (i = 0; i < count; ++i)
		    {
		      dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
		      dump_printf (MSG_MISSED_OPTIMIZATION, " ");
		    }
		  dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
		}
	      gcc_assert (!gsi);
	      return -1;
	    }

	  if (!identity_p)
	    nperms += CEIL (total_nelts, process_nelts) - (ei > last_ei);
	  if (gsi)
	    {
	      if (second_vec.first == -1U)
		second_vec = first_vec;

	      slp_tree
		first_node = children[first_vec.first],
		second_node = children[second_vec.first];

	      tree mask_vec = NULL_TREE;
	      if (!identity_p)
		mask_vec = vect_gen_perm_mask_checked (vectype, indices);

	      tree first_def
		= vect_get_slp_vect_def (first_node, first_vec.second + vi);
	      tree second_def
		= vect_get_slp_vect_def (second_node, second_vec.second + vi);
	      vect_add_slp_permutation (vinfo, gsi, node, first_def,
					second_def, mask_vec, mask[0]);
	    }

	  /* Reset state for the next output vector.  */
	  index = 0;
	  first_vec = std::make_pair (-1U, -1U);
	  second_vec = std::make_pair (-1U, -1U);
	}
    }

  return nperms;
}
11860 :
11861 : /* Vectorize the SLP permutations in NODE as specified
11862 : in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
11863 : child number and lane number.
11864 : Interleaving of two two-lane two-child SLP subtrees (not supported):
11865 : [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
11866 : A blend of two four-lane two-child SLP subtrees:
11867 : [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
11868 : Highpart of a four-lane one-child SLP subtree (not supported):
11869 : [ { 0, 2 }, { 0, 3 } ]
11870 : Where currently only a subset is supported by code generating below. */
11871 :
11872 : bool
11873 115590 : vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
11874 : slp_tree node, stmt_vector_for_cost *cost_vec)
11875 : {
11876 115590 : tree vectype = SLP_TREE_VECTYPE (node);
11877 115590 : lane_permutation_t &perm = SLP_TREE_LANE_PERMUTATION (node);
11878 115590 : int nperms = vectorizable_slp_permutation_1 (vinfo, gsi, node, perm,
11879 115590 : SLP_TREE_CHILDREN (node),
11880 : dump_enabled_p ());
11881 115590 : if (nperms < 0)
11882 : return false;
11883 :
11884 114261 : if (!gsi && nperms != 0)
11885 92757 : record_stmt_cost (cost_vec, nperms, vec_perm, node, vectype, 0, vect_body);
11886 :
11887 : return true;
11888 : }
11889 :
11890 : /* Vectorize SLP NODE. */
11891 :
static void
vect_schedule_slp_node (vec_info *vinfo,
			slp_tree node, slp_instance instance)
{
  gimple_stmt_iterator si;
  int i;
  slp_tree child;

  /* Vectorize externals and constants.  */
  if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
      || SLP_TREE_DEF_TYPE (node) == vect_external_def)
    {
      /* ??? vectorizable_shift can end up using a scalar operand which is
	 currently denoted as !SLP_TREE_VECTYPE.  No need to vectorize the
	 node in this case.  */
      if (!SLP_TREE_VECTYPE (node))
	return;

      /* There are two reasons vector defs might already exist.  The first
	 is that we are vectorizing an existing vector def.  The second is
	 when performing BB vectorization shared constant/external nodes
	 are not split apart during partitioning so during the code-gen
	 DFS walk we can end up visiting them twice.  */
      if (! SLP_TREE_VEC_DEFS (node).exists ())
	vect_create_constant_vectors (vinfo, node);
      return;
    }

  stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);

  gcc_assert (SLP_TREE_VEC_DEFS (node).is_empty ());
  if (SLP_TREE_VECTYPE (node))
    SLP_TREE_VEC_DEFS (node).create (vect_get_num_copies (vinfo, node));

  /* Decide on the insertion point SI for the vectorized statements.  */
  if (!SLP_TREE_PERMUTE_P (node) && STMT_VINFO_DATA_REF (stmt_info))
    {
      /* Vectorized loads go before the first scalar load to make it
	 ready early, vectorized stores go before the last scalar
	 stmt which is where all uses are ready.  */
      stmt_vec_info last_stmt_info = NULL;
      if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
	last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
      else /* DR_IS_WRITE */
	last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
      si = gsi_for_stmt (last_stmt_info->stmt);
    }
  else if (!SLP_TREE_PERMUTE_P (node)
	   && (SLP_TREE_TYPE (node) == cycle_phi_info_type
	       || SLP_TREE_TYPE (node) == induc_vec_info_type
	       || SLP_TREE_TYPE (node) == phi_info_type))
    {
      /* For PHI node vectorization we do not use the insertion iterator.  */
      si = gsi_none ();
    }
  else
    {
      /* Emit other stmts after the children vectorized defs which is
	 earliest possible.  Compute LAST_STMT as the latest def any
	 child provides, checked via dominance.  */
      gimple *last_stmt = NULL;
      bool seen_vector_def = false;
      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
	if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
	  {
	    /* For fold-left reductions we are retaining the scalar
	       reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
	       set so the representation isn't perfect.  Resort to the
	       last scalar def here.  */
	    if (SLP_TREE_VEC_DEFS (child).is_empty ())
	      {
		gcc_assert (SLP_TREE_TYPE (child) == cycle_phi_info_type);
		gphi *phi = as_a <gphi *>
			      (vect_find_last_scalar_stmt_in_slp (child)->stmt);
		if (!last_stmt)
		  last_stmt = phi;
		else if (vect_stmt_dominates_stmt_p (last_stmt, phi))
		  last_stmt = phi;
		else if (vect_stmt_dominates_stmt_p (phi, last_stmt))
		  ;
		else
		  gcc_unreachable ();
	      }
	    /* We are emitting all vectorized stmts in the same place and
	       the last one is the last.
	       ??? Unless we have a load permutation applied and that
	       figures to re-use an earlier generated load.  */
	    unsigned j;
	    tree vdef;
	    FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
	      {
		gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
		if (!last_stmt)
		  last_stmt = vstmt;
		else if (vect_stmt_dominates_stmt_p (last_stmt, vstmt))
		  last_stmt = vstmt;
		else if (vect_stmt_dominates_stmt_p (vstmt, last_stmt))
		  ;
		else
		  gcc_unreachable ();
	      }
	  }
	else if (!SLP_TREE_VECTYPE (child))
	  {
	    /* For externals we use unvectorized at all scalar defs.  */
	    unsigned j;
	    tree def;
	    FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
	      if (TREE_CODE (def) == SSA_NAME
		  && !SSA_NAME_IS_DEFAULT_DEF (def))
		{
		  gimple *stmt = SSA_NAME_DEF_STMT (def);
		  if (gimple_uid (stmt) == -1u)
		    /* If the stmt is not inside the region do not
		       use it as possible insertion point.  */
		    ;
		  else if (!last_stmt)
		    last_stmt = stmt;
		  else if (vect_stmt_dominates_stmt_p (last_stmt, stmt))
		    last_stmt = stmt;
		  else if (vect_stmt_dominates_stmt_p (stmt, last_stmt))
		    ;
		  else
		    gcc_unreachable ();
		}
	  }
	else
	  {
	    /* For externals we have to look at all defs since their
	       insertion place is decided per vector.  But beware
	       of pre-existing vectors where we need to make sure
	       we do not insert before the region boundary.  */
	    if (SLP_TREE_SCALAR_OPS (child).is_empty ()
		&& !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
	      seen_vector_def = true;
	    else
	      {
		unsigned j;
		tree vdef;
		FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
		  if (TREE_CODE (vdef) == SSA_NAME
		      && !SSA_NAME_IS_DEFAULT_DEF (vdef))
		    {
		      gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
		      if (!last_stmt)
			last_stmt = vstmt;
		      else if (vect_stmt_dominates_stmt_p (last_stmt, vstmt))
			last_stmt = vstmt;
		      else if (vect_stmt_dominates_stmt_p (vstmt, last_stmt))
			;
		      else
			gcc_unreachable ();
		    }
	      }
	  }
      /* This can happen when all children are pre-existing vectors or
	 constants.  */
      if (!last_stmt)
	last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
      if (!last_stmt)
	{
	  gcc_assert (seen_vector_def);
	  si = gsi_after_labels (vinfo->bbs[0]);
	}
      else if (is_ctrl_altering_stmt (last_stmt))
	{
	  /* We split regions to vectorize at control altering stmts
	     with a definition so this must be an external which
	     we can insert at the start of the region.  */
	  si = gsi_after_labels (vinfo->bbs[0]);
	}
      else if (is_a <bb_vec_info> (vinfo)
	       && !SLP_TREE_PERMUTE_P (node)
	       && gimple_bb (last_stmt) != gimple_bb (stmt_info->stmt)
	       && gimple_could_trap_p (stmt_info->stmt))
	{
	  /* We've constrained possibly trapping operations to all come
	     from the same basic-block, if vectorized defs would allow earlier
	     scheduling still force vectorized stmts to the original block.
	     This is only necessary for BB vectorization since for loop vect
	     all operations are in a single BB and scalar stmt based
	     placement doesn't play well with epilogue vectorization.  */
	  gcc_assert (dominated_by_p (CDI_DOMINATORS,
				      gimple_bb (stmt_info->stmt),
				      gimple_bb (last_stmt)));
	  si = gsi_after_labels (gimple_bb (stmt_info->stmt));
	}
      else if (is_a <gphi *> (last_stmt))
	si = gsi_after_labels (gimple_bb (last_stmt));
      else
	{
	  si = gsi_for_stmt (last_stmt);
	  gsi_next (&si);

	  /* Avoid scheduling internal defs outside of the loop when
	     we might have only implicitly tracked loop mask/len defs.  */
	  if (auto loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
	    if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
		|| LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
	      {
		gimple_stmt_iterator si2
		  = gsi_after_labels (LOOP_VINFO_LOOP (loop_vinfo)->header);
		if ((gsi_end_p (si2)
		     && (LOOP_VINFO_LOOP (loop_vinfo)->header
			 != gimple_bb (last_stmt))
		     && dominated_by_p (CDI_DOMINATORS,
					LOOP_VINFO_LOOP (loop_vinfo)->header,
					gimple_bb (last_stmt)))
		    || (!gsi_end_p (si2)
			&& last_stmt != *si2
			&& vect_stmt_dominates_stmt_p (last_stmt, *si2)))
		  si = si2;
	      }
	}
    }

  if (dump_enabled_p ())
    {
      if (stmt_info)
	dump_printf_loc (MSG_NOTE, vect_location,
			 "------>vectorizing SLP node starting from: %G",
			 stmt_info->stmt);
      else
	{
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "------>vectorizing SLP node:\n");
	  vect_print_slp_tree (MSG_NOTE, vect_location, node);
	}
    }
  /* Finally do the actual per-statement code generation at SI.  */
  vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
}
12121 :
12122 : /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
12123 : For loop vectorization this is done in vectorizable_call, but for SLP
12124 : it needs to be deferred until end of vect_schedule_slp, because multiple
12125 : SLP instances may refer to the same scalar stmt. */
12126 :
static void
vect_remove_slp_scalar_calls (vec_info *vinfo,
			      slp_tree node, hash_set<slp_tree> &visited)
{
  gimple *new_stmt;
  gimple_stmt_iterator gsi;
  int i;
  slp_tree child;
  tree lhs;
  stmt_vec_info stmt_info;

  /* Only internal defs carry scalar stmts to rewrite.  */
  if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  /* Shared subtrees may be reached multiple times; process each once.  */
  if (visited.add (node))
    return;

  /* Recurse into children first.  */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_remove_slp_scalar_calls (vinfo, child, visited);

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    {
      if (!stmt_info)
	continue;
      stmt_info = vect_orig_stmt (stmt_info);
      gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
      /* Skip non-calls and calls already removed from the IL.  */
      if (!stmt || gimple_bb (stmt) == NULL)
	continue;
      /* Replace the call with a zero-assignment to its lhs (or a nop
	 if it has none); DCE will clean this up later.  */
      lhs = gimple_call_lhs (stmt);
      if (lhs)
	new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
      else
	new_stmt = gimple_build_nop ();
      unlink_stmt_vdef (stmt_info->stmt);
      gsi = gsi_for_stmt (stmt);
      vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
      if (lhs)
	SSA_NAME_DEF_STMT (lhs) = new_stmt;
    }
}
12167 :
12168 : static void
12169 90049 : vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
12170 : {
12171 90049 : hash_set<slp_tree> visited;
12172 90049 : vect_remove_slp_scalar_calls (vinfo, node, visited);
12173 90049 : }
12174 :
12175 : /* Vectorize the instance root. */
12176 :
void
vectorize_slp_instance_root_stmt (vec_info *vinfo, slp_tree node, slp_instance instance)
{
  gassign *rstmt = NULL;

  if (instance->kind == slp_inst_kind_ctor)
    {
      /* Replace the scalar CONSTRUCTOR root either directly with the
	 single vector def or with a CONSTRUCTOR of all vector defs.  */
      if (SLP_TREE_VEC_DEFS (node).length () == 1)
	{
	  tree vect_lhs = SLP_TREE_VEC_DEFS (node)[0];
	  tree root_lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
	  if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
					  TREE_TYPE (vect_lhs)))
	    vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
			       vect_lhs);
	  rstmt = gimple_build_assign (root_lhs, vect_lhs);
	}
      else
	{
	  gcc_assert (SLP_TREE_VEC_DEFS (node).length () > 1);
	  tree child_def;
	  int j;
	  vec<constructor_elt, va_gc> *v;
	  vec_alloc (v, SLP_TREE_VEC_DEFS (node).length ());

	  /* A CTOR can handle V16HI composition from VNx8HI so we
	     do not need to convert vector elements if the types
	     do not match.  */
	  FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (node), j, child_def)
	    CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, child_def);
	  tree lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
	  tree rtype
	    = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmts[0]->stmt));
	  tree r_constructor = build_constructor (rtype, v);
	  rstmt = gimple_build_assign (lhs, r_constructor);
	}
    }
  else if (instance->kind == slp_inst_kind_bb_reduc)
    {
      /* Largely inspired by reduction chain epilogue handling in
	 vect_create_epilog_for_reduction.  */
      vec<tree> vec_defs = vNULL;
      vect_get_slp_defs (node, &vec_defs);
      enum tree_code reduc_code
	= gimple_assign_rhs_code (instance->root_stmts[0]->stmt);
      /* ??? We actually have to reflect signs somewhere.  */
      if (reduc_code == MINUS_EXPR)
	reduc_code = PLUS_EXPR;
      gimple_seq epilogue = NULL;
      /* We may end up with more than one vector result, reduce them
	 to one vector.  */
      tree vec_def = vec_defs[0];
      tree vectype = TREE_TYPE (vec_def);
      tree compute_vectype = vectype;
      /* Compute in an unsigned type when the reduction could overflow
	 in a type with undefined overflow.  */
      bool pun_for_overflow_p = (ANY_INTEGRAL_TYPE_P (vectype)
				 && TYPE_OVERFLOW_UNDEFINED (vectype)
				 && operation_can_overflow (reduc_code));
      if (pun_for_overflow_p)
	{
	  compute_vectype = unsigned_type_for (vectype);
	  vec_def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
				  compute_vectype, vec_def);
	}
      for (unsigned i = 1; i < vec_defs.length (); ++i)
	{
	  tree def = vec_defs[i];
	  if (pun_for_overflow_p)
	    def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
				compute_vectype, def);
	  vec_def = gimple_build (&epilogue, reduc_code, compute_vectype,
				  vec_def, def);
	}
      vec_defs.release ();
      /* ??? Support other schemes than direct internal fn.  */
      internal_fn reduc_fn;
      if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
	  || reduc_fn == IFN_LAST)
	gcc_unreachable ();
      /* Reduce the single remaining vector to a scalar.  */
      tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
				      TREE_TYPE (compute_vectype), vec_def);
      /* Fold in the scalar defs that were not part of the vectorized
	 reduction.  */
      if (!SLP_INSTANCE_REMAIN_DEFS (instance).is_empty ())
	{
	  tree rem_def = NULL_TREE;
	  for (auto def : SLP_INSTANCE_REMAIN_DEFS (instance))
	    {
	      def = gimple_convert (&epilogue, TREE_TYPE (scalar_def), def);
	      if (!rem_def)
		rem_def = def;
	      else
		rem_def = gimple_build (&epilogue, reduc_code,
					TREE_TYPE (scalar_def),
					rem_def, def);
	    }
	  scalar_def = gimple_build (&epilogue, reduc_code,
				     TREE_TYPE (scalar_def),
				     scalar_def, rem_def);
	}
      /* Convert back from the possibly punned compute type.  */
      scalar_def = gimple_convert (&epilogue,
				   TREE_TYPE (vectype), scalar_def);
      gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
      gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT);
      gimple_assign_set_rhs_from_tree (&rgsi, scalar_def);
      update_stmt (gsi_stmt (rgsi));
      return;
    }
  else if (instance->kind == slp_inst_kind_gcond)
    {
      /* Only support a single root for now as we can't codegen CFG yet and so we
	 can't support lane > 1 at this time.  */
      gcc_assert (instance->root_stmts.length () == 1);
      auto root_stmt_info = instance->root_stmts[0];
      auto last_stmt = STMT_VINFO_STMT (vect_orig_stmt (root_stmt_info));
      gimple_stmt_iterator rgsi = gsi_for_stmt (last_stmt);
      gcc_assert (!SLP_TREE_VEC_DEFS (node).is_empty ());
      bool res = vectorizable_early_exit (as_a <loop_vec_info> (vinfo),
					  root_stmt_info, &rgsi, node, NULL);
      gcc_assert (res);
      return;
    }
  else
    gcc_unreachable ();

  gcc_assert (rstmt);

  /* For the ctor case finally replace the scalar root stmt.  */
  gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
  gsi_replace (&rgsi, rstmt, true);
}
12304 :
      : /* Per-SLP-node bookkeeping for the Tarjan-style strongly-connected
      :    component walk performed by vect_schedule_scc below.  */
12305 : struct slp_scc_info
12306 : {
      :   /* True while the node is still on the DFS work stack, i.e. its
      :      SCC has not yet been popped and scheduled.  */
12307 : bool on_stack;
      :   /* DFS pre-order number assigned on first visit.  */
12308 : int dfs;
      :   /* Smallest DFS number reachable from this node; a node is an
      :      SCC root exactly when lowlink == dfs.  */
12309 : int lowlink;
12310 : };
12311 :
12312 : /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */
12313 :
      : /* Code-generate the SLP graph rooted at NODE for INSTANCE using a
      :    Tarjan-style DFS (dfs/lowlink numbers in SCC_INFO, work list in
      :    STACK, next DFS number in MAXDFS): children are scheduled before
      :    their parents, and cycles (SCCs, arising from PHIs) are scheduled
      :    as a group with PHI backedge arguments filled in afterwards.  */
12314 : static void
12315 1466671 : vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
12316 : hash_map<slp_tree, slp_scc_info> &scc_info,
12317 : int &maxdfs, vec<slp_tree> &stack)
12318 : {
12319 1466671 : bool existed_p;
12320 1466671 : slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
      : /* Each node must be entered exactly once; callers guard with
      : scc_info.get () before recursing.  */
12321 1466671 : gcc_assert (!existed_p);
      : /* Assign the DFS pre-order number; lowlink starts equal to it and
      : is lowered below when back edges are found.  */
12322 1466671 : info->dfs = maxdfs;
12323 1466671 : info->lowlink = maxdfs;
12324 1466671 : maxdfs++;
12325 :
12326 : /* Leaf. */
12327 1466671 : if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
12328 : {
      : /* External/constant defs have no children and cannot be part
      : of a cycle - schedule immediately, never stacked.  */
12329 496810 : info->on_stack = false;
12330 496810 : vect_schedule_slp_node (vinfo, node, instance);
12331 1025280 : return;
12332 : }
12333 :
12334 969861 : info->on_stack = true;
12335 969861 : stack.safe_push (node);
12336 :
12337 969861 : unsigned i;
12338 969861 : slp_tree child;
12339 : /* DFS recurse. */
12340 2001496 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
12341 : {
      : /* NULL children occur for not (yet) filled-in PHI backedge
      : arguments; they are fixed up after the SCC is scheduled.  */
12342 1031635 : if (!child)
12343 55111 : continue;
12344 976524 : slp_scc_info *child_info = scc_info.get (child);
12345 976524 : if (!child_info)
12346 : {
12347 886698 : vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
12348 : /* Recursion might have re-allocated the node. */
12349 886698 : info = scc_info.get (node);
12350 886698 : child_info = scc_info.get (child);
12351 886698 : info->lowlink = MIN (info->lowlink, child_info->lowlink);
12352 : }
      : /* Already-visited child still on the stack: a back/cross edge
      : inside the current SCC - lower our lowlink to its dfs.  */
12353 89826 : else if (child_info->on_stack)
12354 25492 : info->lowlink = MIN (info->lowlink, child_info->dfs);
12355 : }
      : /* Not an SCC root - leave scheduling of this node to the root's
      : processing below.  */
12356 969861 : if (info->lowlink != info->dfs)
12357 : return;
12358 :
12359 938201 : auto_vec<slp_tree, 4> phis_to_fixup;
12360 :
12361 : /* Singleton. */
12362 938201 : if (stack.last () == node)
12363 : {
12364 914364 : stack.pop ();
12365 914364 : info->on_stack = false;
12366 914364 : vect_schedule_slp_node (vinfo, node, instance);
      : /* Non-permute PHI nodes may still need their backedge args
      : filled in - remember them for the fixup loop below.  */
12367 914364 : if (!SLP_TREE_PERMUTE_P (node)
12368 914364 : && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
12369 30268 : phis_to_fixup.quick_push (node);
12370 : }
12371 : else
12372 : {
12373 : /* SCC. */
      : /* Find the stack slice [last_idx, length) holding this SCC.  */
12374 23837 : int last_idx = stack.length () - 1;
12375 55497 : while (stack[last_idx] != node)
12376 31660 : last_idx--;
12377 : /* We can break the cycle at PHIs who have at least one child
12378 : code generated. Then we could re-start the DFS walk until
12379 : all nodes in the SCC are covered (we might have new entries
12380 : for only back-reachable nodes). But it's simpler to just
12381 : iterate and schedule those that are ready. */
12382 23837 : unsigned todo = stack.length () - last_idx;
12383 24164 : do
12384 : {
      : /* Scan the SCC slice for nodes that are "ready": non-PHIs
      : with no unscheduled (still-on-stack) children, or PHIs
      : with at least one child already scheduled or missing.  */
12385 105555 : for (int idx = stack.length () - 1; idx >= last_idx; --idx)
12386 : {
12387 57227 : slp_tree entry = stack[idx];
      : /* Already scheduled in a previous iteration.  */
12388 57227 : if (!entry)
12389 934 : continue;
12390 56293 : bool phi = (!SLP_TREE_PERMUTE_P (entry)
12391 56293 : && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
12392 56293 : bool ready = !phi;
12393 142467 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
12394 111213 : if (!child)
12395 : {
12396 22983 : gcc_assert (phi);
12397 : ready = true;
12398 : break;
12399 : }
12400 88230 : else if (scc_info.get (child)->on_stack)
12401 : {
12402 24027 : if (!phi)
12403 : {
12404 : ready = false;
12405 : break;
12406 : }
12407 : }
12408 : else
12409 : {
12410 64203 : if (phi)
12411 : {
12412 : ready = true;
12413 : break;
12414 : }
12415 : }
12416 33310 : if (ready)
12417 : {
12418 55497 : vect_schedule_slp_node (vinfo, entry, instance);
12419 55497 : scc_info.get (entry)->on_stack = false;
      : /* Mark the slot done but keep indices stable by storing
      : NULL instead of shrinking the stack.  */
12420 55497 : stack[idx] = NULL;
12421 55497 : todo--;
12422 55497 : if (phi)
12423 24273 : phis_to_fixup.safe_push (entry);
12424 : }
12425 : }
12426 : }
12427 24164 : while (todo != 0);
12428 :
12429 : /* Pop the SCC. */
12430 23837 : stack.truncate (last_idx);
12431 : }
12432 :
12433 : /* Now fixup the backedge def of the vectorized PHIs in this SCC. */
12434 : slp_tree phi_node;
12435 1930943 : FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
12436 : {
12437 54541 : gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
12438 54541 : edge_iterator ei;
12439 54541 : edge e;
12440 171923 : FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
12441 : {
      : /* The child at DEST_IDX provides the vector defs for the PHI
      : argument incoming over edge E.  */
12442 117382 : unsigned dest_idx = e->dest_idx;
12443 117382 : child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
12444 117382 : if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
12445 66027 : continue;
12446 51355 : unsigned n = SLP_TREE_VEC_DEFS (phi_node).length ();
12447 : /* Simply fill all args. */
12448 51355 : if (STMT_VINFO_DEF_TYPE (SLP_TREE_REPRESENTATIVE (phi_node))
12449 : != vect_first_order_recurrence)
12450 110347 : for (unsigned i = 0; i < n; ++i)
12451 : {
12452 59032 : tree phidef = SLP_TREE_VEC_DEFS (phi_node)[i];
12453 59032 : gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
12454 59032 : add_phi_arg (phi, vect_get_slp_vect_def (child, i),
12455 : e, gimple_phi_arg_location (phi, dest_idx));
12456 : }
12457 : else
12458 : {
12459 : /* Unless it is a first order recurrence which needs
12460 : args filled in for both the PHI node and the permutes. */
      : /* The vector defs are permute stmts; the underlying PHI is
      : reached through the permute's first operand.  */
12461 40 : gimple *perm
12462 40 : = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[0]);
12463 40 : gimple *rphi = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (perm));
12464 40 : add_phi_arg (as_a <gphi *> (rphi),
12465 : vect_get_slp_vect_def (child, n - 1),
12466 : e, gimple_phi_arg_location (phi, dest_idx));
12467 117 : for (unsigned i = 0; i < n; ++i)
12468 : {
12469 77 : gimple *perm
12470 77 : = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[i]);
      : /* Permute i combines defs i-1 and i of the child.  */
12471 77 : if (i > 0)
12472 37 : gimple_assign_set_rhs1 (perm,
12473 : vect_get_slp_vect_def (child, i - 1));
12474 77 : gimple_assign_set_rhs2 (perm,
12475 : vect_get_slp_vect_def (child, i));
12476 77 : update_stmt (perm);
12477 : }
12478 : }
12479 : }
12480 : }
12481 938201 : }
12482 :
12483 : /* Generate vector code for SLP_INSTANCES in the loop/basic block. */
12484 :
12485 : void
12486 541109 : vect_schedule_slp (vec_info *vinfo, const vec<slp_instance> &slp_instances)
12487 : {
12488 541109 : slp_instance instance;
12489 541109 : unsigned int i;
12490 :
      : /* SCC walk state is shared across all instances so nodes common to
      : multiple instances are scheduled only once.  */
12491 541109 : hash_map<slp_tree, slp_scc_info> scc_info;
12492 541109 : int maxdfs = 0;
      : /* Pass 1: generate vector code for each instance.  */
12493 1121187 : FOR_EACH_VEC_ELT (slp_instances, i, instance)
12494 : {
12495 580078 : slp_tree node = SLP_INSTANCE_TREE (instance);
12496 580078 : if (dump_enabled_p ())
12497 : {
12498 15987 : dump_printf_loc (MSG_NOTE, vect_location,
12499 : "Vectorizing SLP tree:\n");
12500 : /* ??? Dump all? */
12501 15987 : if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
12502 447 : dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
12503 447 : SLP_INSTANCE_ROOT_STMTS (instance)[0]->stmt);
12504 15987 : vect_print_slp_graph (MSG_NOTE, vect_location,
12505 : SLP_INSTANCE_TREE (instance));
12506 : }
12507 : /* Schedule the tree of INSTANCE, scheduling SCCs in a way to
12508 : have a PHI be the node breaking the cycle. */
12509 580078 : auto_vec<slp_tree> stack;
      : /* Root may already have been code-generated via another
      : instance sharing it.  */
12510 580078 : if (!scc_info.get (node))
12511 579973 : vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
12512 :
12513 580078 : if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
12514 10984 : vectorize_slp_instance_root_stmt (vinfo, node, instance);
12515 :
12516 580078 : if (dump_enabled_p ())
12517 15987 : dump_printf_loc (MSG_NOTE, vect_location,
12518 : "vectorizing stmts using SLP.\n");
12519 580078 : }
12520 :
      : /* Pass 2: remove now-dead scalar stmts, after all code generation
      : so cross-instance uses of the scalar defs are gone.  */
12521 1662296 : FOR_EACH_VEC_ELT (slp_instances, i, instance)
12522 : {
12523 580078 : slp_tree root = SLP_INSTANCE_TREE (instance);
12524 580078 : stmt_vec_info store_info;
12525 580078 : unsigned int j;
12526 :
12527 : /* Remove scalar call stmts. Do not do this for basic-block
12528 : vectorization as not all uses may be vectorized.
12529 : ??? Why should this be necessary? DCE should be able to
12530 : remove the stmts itself.
12531 : ??? For BB vectorization we can as well remove scalar
12532 : stmts starting from the SLP tree root if they have no
12533 : uses. */
12534 580078 : if (is_a <loop_vec_info> (vinfo))
12535 90049 : vect_remove_slp_scalar_calls (vinfo, root);
12536 :
12537 : /* Remove vectorized stores original scalar stmts. */
      : /* Stop at the first non-store scalar stmt; only store roots
      : have their scalar counterparts removed here.  */
12538 2586663 : for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
12539 : {
12540 1462898 : if (!store_info
12541 1462884 : || !STMT_VINFO_DATA_REF (store_info)
12542 1435214 : || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
12543 : break;
12544 :
12545 1426507 : store_info = vect_orig_stmt (store_info);
12546 : /* Free the attached stmt_vec_info and remove the stmt. */
12547 1426507 : vinfo->remove_stmt (store_info);
12548 :
12549 : /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it
12550 : to not crash in vect_free_slp_tree later. */
12551 1426507 : if (SLP_TREE_REPRESENTATIVE (root) == store_info)
12552 543388 : SLP_TREE_REPRESENTATIVE (root) = NULL;
12553 : }
12554 : }
12555 541109 : }
|