Line data Source code
1 : /* SLP - Basic Block Vectorization
2 : Copyright (C) 2007-2026 Free Software Foundation, Inc.
3 : Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 : and Ira Rosen <irar@il.ibm.com>
5 :
6 : This file is part of GCC.
7 :
8 : GCC is free software; you can redistribute it and/or modify it under
9 : the terms of the GNU General Public License as published by the Free
10 : Software Foundation; either version 3, or (at your option) any later
11 : version.
12 :
13 : GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 : WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 : FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 : for more details.
17 :
18 : You should have received a copy of the GNU General Public License
19 : along with GCC; see the file COPYING3. If not see
20 : <http://www.gnu.org/licenses/>. */
21 :
22 : #include "config.h"
23 : #define INCLUDE_ALGORITHM
24 : #include "system.h"
25 : #include "coretypes.h"
26 : #include "backend.h"
27 : #include "target.h"
28 : #include "rtl.h"
29 : #include "tree.h"
30 : #include "gimple.h"
31 : #include "tree-pass.h"
32 : #include "ssa.h"
33 : #include "optabs-tree.h"
34 : #include "insn-config.h"
35 : #include "recog.h" /* FIXME: for insn_data */
36 : #include "fold-const.h"
37 : #include "stor-layout.h"
38 : #include "gimple-iterator.h"
39 : #include "cfgloop.h"
40 : #include "tree-vectorizer.h"
41 : #include "langhooks.h"
42 : #include "gimple-walk.h"
43 : #include "dbgcnt.h"
44 : #include "tree-vector-builder.h"
45 : #include "vec-perm-indices.h"
46 : #include "gimple-fold.h"
47 : #include "internal-fn.h"
48 : #include "dump-context.h"
49 : #include "cfganal.h"
50 : #include "tree-eh.h"
51 : #include "tree-cfg.h"
52 : #include "alloc-pool.h"
53 : #include "sreal.h"
54 : #include "predict.h"
55 :
/* Access the first element of the reduction group of S, asserting that
   S is not part of a data-reference group (the two group kinds share
   the first_element field).  */
#define REDUC_GROUP_FIRST_ELEMENT(S) \
  (gcc_checking_assert (!(S)->dr_aux.dr), (S)->first_element)

/* Forward declarations; definitions appear later in this file.  */
static bool vect_transform_slp_perm_load_1 (vec_info *, slp_tree,
					    load_permutation_t &,
					    const vec<tree> &,
					    gimple_stmt_iterator *,
					    poly_uint64, bool, bool,
					    unsigned *,
					    unsigned * = nullptr,
					    bool = false);
static int vectorizable_slp_permutation_1 (vec_info *, gimple_stmt_iterator *,
					   slp_tree, lane_permutation_t &,
					   vec<slp_tree> &, bool);
static void vect_print_slp_tree (dump_flags_t, dump_location_t, slp_tree);
static bool vect_slp_can_convert_to_external (const vec<stmt_vec_info> &);

/* Pool all _slp_tree nodes are allocated from (see the class
   operator new/delete below).  */
static object_allocator<_slp_tree> *slp_tree_pool;
/* Head of the doubly-linked list of all live SLP nodes, maintained by
   the _slp_tree constructor and destructor.  */
static slp_tree slp_first_node;
75 :
void
vect_slp_init (void)
{
  /* Set up the pool that backs all _slp_tree allocations.  */
  slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
}
81 :
void
vect_slp_fini (void)
{
  /* Deleting a node unlinks it from the live list (see ~_slp_tree),
     so this loop drains any leftover nodes one head at a time.  */
  while (slp_first_node)
    delete slp_first_node;
  delete slp_tree_pool;
  slp_tree_pool = NULL;
}
90 :
void *
_slp_tree::operator new (size_t n)
{
  /* The pool hands out fixed-size chunks; only whole-node
     allocations are valid.  */
  gcc_assert (n == sizeof (_slp_tree));
  return slp_tree_pool->allocate_raw ();
}
97 :
void
_slp_tree::operator delete (void *node, size_t n)
{
  /* Return the chunk to the pool it was carved from.  */
  gcc_assert (n == sizeof (_slp_tree));
  slp_tree_pool->remove_raw (node);
}
104 :
105 :
106 : /* Initialize a SLP node. */
107 :
_slp_tree::_slp_tree ()
{
  /* Link the new node at the head of the global list of live SLP
     nodes so vect_slp_fini can reclaim leftovers.  */
  this->prev_node = NULL;
  if (slp_first_node)
    slp_first_node->prev_node = this;
  this->next_node = slp_first_node;
  slp_first_node = this;
  /* All lane/def vectors start out empty.  */
  SLP_TREE_SCALAR_STMTS (this) = vNULL;
  SLP_TREE_SCALAR_OPS (this) = vNULL;
  SLP_TREE_VEC_DEFS (this) = vNULL;
  SLP_TREE_CHILDREN (this) = vNULL;
  SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
  SLP_TREE_LANE_PERMUTATION (this) = vNULL;
  /* No def type, operation code or vector type is known yet.  */
  SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
  SLP_TREE_CODE (this) = ERROR_MARK;
  SLP_TREE_GS_SCALE (this) = 0;
  SLP_TREE_GS_BASE (this) = NULL_TREE;
  this->ldst_lanes = false;
  this->avoid_stlf_fail = false;
  SLP_TREE_VECTYPE (this) = NULL_TREE;
  SLP_TREE_REPRESENTATIVE (this) = NULL;
  /* -1 marks the node as not part of any cycle / reduction.  */
  this->cycle_info.id = -1;
  this->cycle_info.reduc_idx = -1;
  /* The creator holds the single initial reference.  */
  SLP_TREE_REF_COUNT (this) = 1;
  this->failed = NULL;
  this->max_nunits = 1;
  this->lanes = 0;
  SLP_TREE_TYPE (this) = undef_vec_info_type;
  this->data = NULL;
}
138 :
139 : /* Tear down a SLP node. */
140 :
_slp_tree::~_slp_tree ()
{
  /* Unlink this node from the global doubly-linked list of live
     nodes, updating the list head when it is the first element.  */
  if (this->prev_node)
    this->prev_node->next_node = this->next_node;
  else
    slp_first_node = this->next_node;
  if (this->next_node)
    this->next_node->prev_node = this->prev_node;
  SLP_TREE_CHILDREN (this).release ();
  SLP_TREE_SCALAR_STMTS (this).release ();
  SLP_TREE_SCALAR_OPS (this).release ();
  SLP_TREE_VEC_DEFS (this).release ();
  SLP_TREE_LOAD_PERMUTATION (this).release ();
  SLP_TREE_LANE_PERMUTATION (this).release ();
  /* FAILED is freed with free, DATA with delete — they come from
     different allocators.  */
  if (this->failed)
    free (failed);
  if (this->data)
    delete this->data;
}
160 :
161 : /* Push the single SSA definition in DEF to the vector of vector defs. */
162 :
163 : void
164 520393 : _slp_tree::push_vec_def (gimple *def)
165 : {
166 520393 : if (gphi *phi = dyn_cast <gphi *> (def))
167 58619 : vec_defs.quick_push (gimple_phi_result (phi));
168 : else
169 : {
170 461774 : def_operand_p defop = single_ssa_def_operand (def, SSA_OP_ALL_DEFS);
171 461774 : vec_defs.quick_push (get_def_from_ptr (defop));
172 : }
173 520393 : }
174 :
175 : /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
176 :
void
vect_free_slp_tree (slp_tree node)
{
  int i;
  slp_tree child;

  /* Drop one reference; the node stays alive while other parents
     still hold references to it.  */
  if (--SLP_TREE_REF_COUNT (node) != 0)
    return;

  /* Recurse into the children before deleting this node.  */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (child)
      vect_free_slp_tree (child);

  /* If the node defines any SLP only patterns then those patterns are no
     longer valid and should be removed.  */
  stmt_vec_info rep_stmt_info = SLP_TREE_REPRESENTATIVE (node);
  if (rep_stmt_info && STMT_VINFO_SLP_VECT_ONLY_PATTERN (rep_stmt_info))
    {
      stmt_vec_info stmt_info = vect_orig_stmt (rep_stmt_info);
      STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
      STMT_SLP_TYPE (stmt_info) = STMT_SLP_TYPE (rep_stmt_info);
    }

  delete node;
}
202 :
203 : /* Return a location suitable for dumpings related to the SLP instance. */
204 :
205 : dump_user_location_t
206 3362144 : _slp_instance::location () const
207 : {
208 3362144 : if (!root_stmts.is_empty ())
209 313823 : return root_stmts[0]->stmt;
210 : else
211 3048321 : return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
212 : }
213 :
214 :
215 : /* Free the memory allocated for the SLP instance. */
216 :
void
vect_free_slp_instance (slp_instance instance)
{
  /* Drop the instance's reference to its SLP tree ...  */
  vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
  /* ... release the auxiliary vectors ...  */
  SLP_INSTANCE_LOADS (instance).release ();
  SLP_INSTANCE_ROOT_STMTS (instance).release ();
  SLP_INSTANCE_REMAIN_DEFS (instance).release ();
  instance->subgraph_entries.release ();
  instance->cost_vec.release ();
  /* ... and the instance itself, which is malloc-family allocated.  */
  free (instance);
}
228 :
229 :
230 : /* Create an SLP node for SCALAR_STMTS. */
231 :
232 : slp_tree
233 86705 : vect_create_new_slp_node (unsigned nops, tree_code code)
234 : {
235 86705 : slp_tree node = new _slp_tree;
236 86705 : SLP_TREE_SCALAR_STMTS (node) = vNULL;
237 86705 : SLP_TREE_CHILDREN (node).create (nops);
238 86705 : SLP_TREE_DEF_TYPE (node) = vect_internal_def;
239 86705 : SLP_TREE_CODE (node) = code;
240 86705 : return node;
241 : }
242 : /* Create an SLP node for SCALAR_STMTS. */
243 :
244 : static slp_tree
245 3330982 : vect_create_new_slp_node (slp_tree node,
246 : vec<stmt_vec_info> scalar_stmts, unsigned nops)
247 : {
248 3330982 : SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
249 3330982 : SLP_TREE_CHILDREN (node).create (nops);
250 3330982 : SLP_TREE_DEF_TYPE (node) = vect_internal_def;
251 3330982 : SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
252 3330982 : SLP_TREE_LANES (node) = scalar_stmts.length ();
253 3330982 : return node;
254 : }
255 :
256 : /* Create an SLP node for SCALAR_STMTS. */
257 :
258 : static slp_tree
259 6276 : vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
260 : {
261 6276 : return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
262 : }
263 :
264 : /* Create an SLP node for OPS. */
265 :
266 : static slp_tree
267 1750894 : vect_create_new_slp_node (slp_tree node, vec<tree> ops)
268 : {
269 1750894 : SLP_TREE_SCALAR_OPS (node) = ops;
270 1750894 : SLP_TREE_DEF_TYPE (node) = vect_external_def;
271 0 : SLP_TREE_LANES (node) = ops.length ();
272 1750894 : return node;
273 : }
274 :
275 : /* Create an SLP node for OPS. */
276 :
277 : static slp_tree
278 1750894 : vect_create_new_slp_node (vec<tree> ops)
279 : {
280 1750894 : return vect_create_new_slp_node (new _slp_tree, ops);
281 : }
282 :
283 :
284 : /* This structure is used in creation of an SLP tree. Each instance
285 : corresponds to the same operand in a group of scalar stmts in an SLP
286 : node. */
287 : typedef struct _slp_oprnd_info
288 : {
289 : /* Def-stmts for the operands. */
290 : vec<stmt_vec_info> def_stmts;
291 : /* Operands. */
292 : vec<tree> ops;
293 : /* Information about the first statement, its vector def-type, type, the
294 : operand itself in case it's constant, and an indication if it's a pattern
295 : stmt and gather/scatter info. */
296 : tree first_op_type;
297 : enum vect_def_type first_dt;
298 : bool any_pattern;
299 : bool first_gs_p;
300 : gather_scatter_info first_gs_info;
301 : } *slp_oprnd_info;
302 :
303 :
304 : /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
305 : operand. */
306 : static vec<slp_oprnd_info>
307 2974746 : vect_create_oprnd_info (int nops, int group_size)
308 : {
309 2974746 : int i;
310 2974746 : slp_oprnd_info oprnd_info;
311 2974746 : vec<slp_oprnd_info> oprnds_info;
312 :
313 2974746 : oprnds_info.create (nops);
314 10631760 : for (i = 0; i < nops; i++)
315 : {
316 4682268 : oprnd_info = XNEW (struct _slp_oprnd_info);
317 4682268 : oprnd_info->def_stmts.create (group_size);
318 4682268 : oprnd_info->ops.create (group_size);
319 4682268 : oprnd_info->first_dt = vect_uninitialized_def;
320 4682268 : oprnd_info->first_op_type = NULL_TREE;
321 4682268 : oprnd_info->any_pattern = false;
322 4682268 : oprnd_info->first_gs_p = false;
323 4682268 : oprnds_info.quick_push (oprnd_info);
324 : }
325 :
326 2974746 : return oprnds_info;
327 : }
328 :
329 :
330 : /* Free operands info. */
331 :
332 : static void
333 2974746 : vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
334 : {
335 2974746 : int i;
336 2974746 : slp_oprnd_info oprnd_info;
337 :
338 7657014 : FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
339 : {
340 4682268 : oprnd_info->def_stmts.release ();
341 4682268 : oprnd_info->ops.release ();
342 4682268 : XDELETE (oprnd_info);
343 : }
344 :
345 2974746 : oprnds_info.release ();
346 2974746 : }
347 :
348 : /* Return the execution frequency of NODE (so that a higher value indicates
349 : a "more important" node when optimizing for speed). */
350 :
351 : static sreal
352 3130747 : vect_slp_node_weight (slp_tree node)
353 : {
354 3130747 : stmt_vec_info stmt_info = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (node));
355 3130747 : basic_block bb = gimple_bb (stmt_info->stmt);
356 3130747 : return bb->count.to_sreal_scale (ENTRY_BLOCK_PTR_FOR_FN (cfun)->count);
357 : }
358 :
359 : /* Return true if STMTS contains a pattern statement. */
360 :
361 : static bool
362 22141 : vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
363 : {
364 22141 : stmt_vec_info stmt_info;
365 22141 : unsigned int i;
366 71725 : FOR_EACH_VEC_ELT (stmts, i, stmt_info)
367 51784 : if (stmt_info && is_pattern_stmt_p (stmt_info))
368 : return true;
369 : return false;
370 : }
371 :
372 : /* Return true when all lanes in the external or constant NODE have
373 : the same value. */
374 :
375 : static bool
376 589969 : vect_slp_tree_uniform_p (slp_tree node)
377 : {
378 589969 : gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
379 : || SLP_TREE_DEF_TYPE (node) == vect_external_def);
380 :
381 : /* Pre-exsting vectors. */
382 1038884 : if (SLP_TREE_SCALAR_OPS (node).is_empty ())
383 : return false;
384 :
385 : unsigned i;
386 : tree op, first = NULL_TREE;
387 1349884 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
388 1208830 : if (!first)
389 : first = op;
390 618861 : else if (!operand_equal_p (first, op, 0))
391 : return false;
392 :
393 : return true;
394 : }
395 :
396 : /* Find the place of the data-ref in STMT_INFO in the interleaving chain
397 : that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
398 : of the chain. */
399 :
400 : int
401 652513 : vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
402 : stmt_vec_info first_stmt_info)
403 : {
404 652513 : stmt_vec_info next_stmt_info = first_stmt_info;
405 652513 : int result = 0;
406 :
407 652513 : if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
408 : return -1;
409 :
410 1638960 : do
411 : {
412 1638960 : if (next_stmt_info == stmt_info)
413 : return result;
414 986447 : next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
415 986447 : if (next_stmt_info)
416 986447 : result += DR_GROUP_GAP (next_stmt_info);
417 : }
418 986447 : while (next_stmt_info);
419 :
420 : return -1;
421 : }
422 :
423 : /* Check whether it is possible to load COUNT elements of type ELT_TYPE
424 : using the method implemented by duplicate_and_interleave. Return true
425 : if so, returning the number of intermediate vectors in *NVECTORS_OUT
426 : (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
427 : (if nonnull). */
428 :
bool
can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
				tree elt_type, unsigned int *nvectors_out,
				tree *vector_type_out,
				tree *permutes)
{
  tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
  if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
    return false;

  machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
  poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
  unsigned int nvectors = 1;
  /* Each iteration tries to fuse COUNT / NVECTORS elements into one
     integer; ELT_BYTES is halved and NVECTORS doubled until either a
     workable integer mode is found or no further split is possible.  */
  for (;;)
    {
      scalar_int_mode int_mode;
      poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
      if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
	{
	  /* Get the natural vector type for this SLP group size.  */
	  tree int_type = build_nonstandard_integer_type
	    (GET_MODE_BITSIZE (int_mode), 1);
	  tree vector_type
	    = get_vectype_for_scalar_type (vinfo, int_type, count);
	  poly_int64 half_nelts;
	  if (vector_type
	      && VECTOR_MODE_P (TYPE_MODE (vector_type))
	      && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
			   GET_MODE_SIZE (base_vector_mode))
	      && multiple_p (GET_MODE_NUNITS (TYPE_MODE (vector_type)),
			     2, &half_nelts))
	    {
	      /* Try fusing consecutive sequences of COUNT / NVECTORS elements
		 together into elements of type INT_TYPE and using the result
		 to build NVECTORS vectors.  */
	      poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
	      /* Both selectors are encoded with 2 interleaved patterns
		 of 3 elements each.  */
	      vec_perm_builder sel1 (nelts, 2, 3);
	      vec_perm_builder sel2 (nelts, 2, 3);

	      for (unsigned int i = 0; i < 3; ++i)
		{
		  sel1.quick_push (i);
		  sel1.quick_push (i + nelts);
		  sel2.quick_push (half_nelts + i);
		  sel2.quick_push (half_nelts + i + nelts);
		}
	      vec_perm_indices indices1 (sel1, 2, nelts);
	      vec_perm_indices indices2 (sel2, 2, nelts);
	      machine_mode vmode = TYPE_MODE (vector_type);
	      /* Only usable when the target supports both interleaving
		 permutes as constant permutations.  */
	      if (can_vec_perm_const_p (vmode, vmode, indices1)
		  && can_vec_perm_const_p (vmode, vmode, indices2))
		{
		  if (nvectors_out)
		    *nvectors_out = nvectors;
		  if (vector_type_out)
		    *vector_type_out = vector_type;
		  if (permutes)
		    {
		      permutes[0] = vect_gen_perm_mask_checked (vector_type,
								indices1);
		      permutes[1] = vect_gen_perm_mask_checked (vector_type,
								indices2);
		    }
		  return true;
		}
	    }
	}
      if (!multiple_p (elt_bytes, 2, &elt_bytes))
	return false;
      nvectors *= 2;
      /* We need to be able to fuse COUNT / NVECTORS elements together.  */
      if (!multiple_p (count, nvectors))
	return false;
    }
}
504 :
505 : /* Return true if DTA and DTB match. */
506 :
507 : static bool
508 16777999 : vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
509 : {
510 16777999 : return (dta == dtb
511 342132 : || ((dta == vect_external_def || dta == vect_constant_def)
512 214103 : && (dtb == vect_external_def || dtb == vect_constant_def)));
513 : }
514 :
/* Special operand index used by vect_get_operand_map to denote the
   offset of a gather/scatter as analyzed by vect_check_gather_scatter.  */
#define GATHER_SCATTER_OFFSET (-3)

/* Operand maps (see vect_get_operand_map): the first entry is the
   number of SLP children, the remaining entries the gimple argument
   index associated with each child.  */
static const int no_arg_map[] = { 0 };
static const int arg0_map[] = { 1, 0 };
static const int arg2_map[] = { 1, 2 };
static const int arg2_arg3_map[] = { 2, 2, 3 };
static const int arg2_arg4_map[] = { 2, 2, 4 };
static const int arg2_arg5_arg6_map[] = { 3, 2, 5, 6 };
static const int arg2_arg4_arg5_map[] = { 3, 2, 4, 5 };
static const int arg3_arg2_map[] = { 2, 3, 2 };
static const int op1_op0_map[] = { 2, 1, 0 };
static const int off_map[] = { 1, GATHER_SCATTER_OFFSET };
static const int off_op0_map[] = { 2, GATHER_SCATTER_OFFSET, 0 };
static const int off_arg2_arg3_map[] = { 3, GATHER_SCATTER_OFFSET, 2, 3 };
static const int off_arg3_arg2_map[] = { 3, GATHER_SCATTER_OFFSET, 3, 2 };
/* Maps for IFN_MASK_CALL with 2 to 7 call arguments, indexed by
   argument count minus 2.  */
static const int mask_call_maps[6][7] = {
  { 1, 1, },
  { 2, 1, 2, },
  { 3, 1, 2, 3, },
  { 4, 1, 2, 3, 4, },
  { 5, 1, 2, 3, 4, 5, },
  { 6, 1, 2, 3, 4, 5, 6 },
};
538 :
539 : /* For most SLP statements, there is a one-to-one mapping between
540 : gimple arguments and child nodes. If that is not true for STMT,
541 : return an array that contains:
542 :
543 : - the number of child nodes, followed by
544 : - for each child node, the index of the argument associated with that node.
545 : The special index -1 is the first operand of an embedded comparison and
546 : the special index -2 is the second operand of an embedded comparison.
547 : The special indes -3 is the offset of a gather as analyzed by
548 : vect_check_gather_scatter.
549 :
550 : SWAP is as for vect_get_and_check_slp_defs. */
551 :
static const int *
vect_get_operand_map (const gimple *stmt, bool gather_scatter_p = false,
		      unsigned char swap = 0)
{
  if (auto assign = dyn_cast<const gassign *> (stmt))
    {
      /* COND_EXPRs with an embedded comparison are not expected here.  */
      if (gimple_assign_rhs_code (assign) == COND_EXPR
	  && COMPARISON_CLASS_P (gimple_assign_rhs1 (assign)))
	gcc_unreachable ();
      /* Swapping is only supported for comparisons and commutative
	 operations — exchange the first two operands.  */
      if ((TREE_CODE_CLASS (gimple_assign_rhs_code (assign)) == tcc_comparison
	   || commutative_tree_code (gimple_assign_rhs_code (assign)))
	  && swap)
	return op1_op0_map;
      /* For gathers/scatters the offset becomes an extra child; a
	 scatter (non-SSA lhs) additionally has the stored value.  */
      if (gather_scatter_p)
	return (TREE_CODE (gimple_assign_lhs (assign)) != SSA_NAME
		? off_op0_map : off_map);
    }
  gcc_assert (!swap);
  if (auto call = dyn_cast<const gcall *> (stmt))
    {
      if (gimple_call_internal_p (call))
	switch (gimple_call_internal_fn (call))
	  {
	  case IFN_MASK_LOAD:
	    return gather_scatter_p ? off_arg2_arg3_map : arg2_arg3_map;

	  case IFN_GATHER_LOAD:
	    return arg2_map;

	  case IFN_MASK_GATHER_LOAD:
	  case IFN_MASK_LEN_GATHER_LOAD:
	    return arg2_arg5_arg6_map;

	  case IFN_SCATTER_STORE:
	    return arg2_arg4_map;

	  case IFN_MASK_SCATTER_STORE:
	  case IFN_MASK_LEN_SCATTER_STORE:
	    return arg2_arg4_arg5_map;

	  case IFN_MASK_STORE:
	    return gather_scatter_p ? off_arg3_arg2_map : arg3_arg2_map;

	  case IFN_MASK_CALL:
	    {
	      /* mask_call_maps covers 2 to 7 arguments.  */
	      unsigned nargs = gimple_call_num_args (call);
	      if (nargs >= 2 && nargs <= 7)
		return mask_call_maps[nargs-2];
	      else
		return nullptr;
	    }

	  case IFN_CLZ:
	  case IFN_CTZ:
	    return arg0_map;

	  case IFN_GOMP_SIMD_LANE:
	    return no_arg_map;

	  default:
	    break;
	  }
    }
  /* One-to-one mapping between arguments and children.  */
  return nullptr;
}
617 :
618 : /* Return the SLP node child index for operand OP of STMT. */
619 :
620 : int
621 1322360 : vect_slp_child_index_for_operand (const gimple *stmt, int op,
622 : bool gather_scatter_p)
623 : {
624 1322360 : const int *opmap = vect_get_operand_map (stmt, gather_scatter_p);
625 1322360 : if (!opmap)
626 : return op;
627 18015 : for (int i = 1; i < 1 + opmap[0]; ++i)
628 18015 : if (opmap[i] == op)
629 9882 : return i - 1;
630 0 : gcc_unreachable ();
631 : }
632 :
633 : /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
634 : they are of a valid type and that they match the defs of the first stmt of
635 : the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
636 : by swapping operands of STMTS[STMT_NUM] when possible. Non-zero SWAP
637 : indicates swap is required for cond_expr stmts. Specifically, SWAP
638 : is 1 if STMT is cond and operands of comparison need to be swapped;
639 : SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
640 :
641 : If there was a fatal error return -1; if the error could be corrected by
642 : swapping operands of father node of this one, return 1; if everything is
643 : ok return 0. */
644 : static int
645 12201219 : vect_get_and_check_slp_defs (vec_info *vinfo, tree vectype, unsigned char swap,
646 : bool *skip_args,
647 : vec<stmt_vec_info> stmts, unsigned stmt_num,
648 : vec<slp_oprnd_info> *oprnds_info)
649 : {
650 12201219 : stmt_vec_info stmt_info = stmts[stmt_num];
651 12201219 : tree oprnd;
652 12201219 : unsigned int i, number_of_oprnds;
653 12201219 : enum vect_def_type dt = vect_uninitialized_def;
654 12201219 : slp_oprnd_info oprnd_info;
655 12201219 : gather_scatter_info gs_info;
656 12201219 : unsigned int gs_op = -1u;
657 12201219 : unsigned int commutative_op = -1U;
658 12201219 : bool first = stmt_num == 0;
659 :
660 12201219 : if (!stmt_info)
661 : {
662 0 : for (auto oi : *oprnds_info)
663 : {
664 0 : oi->def_stmts.quick_push (NULL);
665 0 : oi->ops.quick_push (NULL_TREE);
666 : }
667 : return 0;
668 : }
669 :
670 12201219 : if (!is_a<gcall *> (stmt_info->stmt)
671 : && !is_a<gassign *> (stmt_info->stmt)
672 : && !is_a<gphi *> (stmt_info->stmt))
673 : return -1;
674 :
675 12201219 : number_of_oprnds = gimple_num_args (stmt_info->stmt);
676 12201219 : const int *map
677 24402438 : = vect_get_operand_map (stmt_info->stmt,
678 12201219 : STMT_VINFO_GATHER_SCATTER_P (stmt_info), swap);
679 12201219 : if (map)
680 69668 : number_of_oprnds = *map++;
681 12201219 : if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
682 : {
683 40072 : if (gimple_call_internal_p (stmt))
684 : {
685 24252 : internal_fn ifn = gimple_call_internal_fn (stmt);
686 24252 : commutative_op = first_commutative_argument (ifn);
687 24252 : if (internal_gather_scatter_fn_p (ifn))
688 : {
689 0 : vect_describe_gather_scatter_call
690 0 : (stmt_info,
691 0 : first ? &(*oprnds_info)[0]->first_gs_info : &gs_info);
692 0 : if (first)
693 0 : (*oprnds_info)[0]->first_gs_p = true;
694 : gs_op = 0;
695 : }
696 : }
697 : }
698 12161147 : else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
699 : {
700 14218248 : if (commutative_tree_code (gimple_assign_rhs_code (stmt)))
701 8176202 : commutative_op = 0;
702 : }
703 :
704 12201219 : bool swapped = (swap != 0);
705 12201219 : bool backedge = false;
706 12201219 : enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
707 33799340 : for (i = 0; i < number_of_oprnds; i++)
708 : {
709 21599236 : oprnd_info = (*oprnds_info)[i];
710 21599236 : int opno = map ? map[i] : int (i);
711 21599236 : if (opno == GATHER_SCATTER_OFFSET)
712 : {
713 22050 : gcc_assert (STMT_VINFO_GATHER_SCATTER_P (stmt_info));
714 22050 : if (!is_a <loop_vec_info> (vinfo)
715 22050 : || !vect_check_gather_scatter (stmt_info, vectype,
716 : as_a <loop_vec_info> (vinfo),
717 : first ? &oprnd_info->first_gs_info
718 : : &gs_info))
719 1115 : return -1;
720 :
721 22050 : if (first)
722 : {
723 21813 : oprnd_info->first_gs_p = true;
724 21813 : oprnd = oprnd_info->first_gs_info.offset;
725 : }
726 : else
727 : {
728 237 : gs_op = i;
729 237 : oprnd = gs_info.offset;
730 : }
731 : }
732 21577186 : else if (opno < 0)
733 0 : oprnd = TREE_OPERAND (gimple_arg (stmt_info->stmt, 0), -1 - opno);
734 : else
735 : {
736 21577186 : oprnd = gimple_arg (stmt_info->stmt, opno);
737 21577186 : if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
738 : {
739 1088608 : edge e = gimple_phi_arg_edge (stmt, opno);
740 2177216 : backedge = (is_a <bb_vec_info> (vinfo)
741 1626956 : ? e->flags & EDGE_DFS_BACK
742 538348 : : dominated_by_p (CDI_DOMINATORS, e->src,
743 538348 : gimple_bb (stmt_info->stmt)));
744 : }
745 : }
746 21599236 : if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
747 2650 : oprnd = TREE_OPERAND (oprnd, 0);
748 :
749 21599236 : stmt_vec_info def_stmt_info;
750 21599236 : if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
751 : {
752 963 : if (dump_enabled_p ())
753 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
754 : "Build SLP failed: can't analyze def for %T\n",
755 : oprnd);
756 :
757 963 : return -1;
758 : }
759 :
760 21598273 : if (skip_args[i])
761 : {
762 444785 : oprnd_info->def_stmts.quick_push (NULL);
763 444785 : oprnd_info->ops.quick_push (NULL_TREE);
764 444785 : oprnd_info->first_dt = vect_uninitialized_def;
765 444785 : continue;
766 : }
767 :
768 21153488 : oprnd_info->def_stmts.quick_push (def_stmt_info);
769 21153488 : oprnd_info->ops.quick_push (oprnd);
770 :
771 21153488 : if (def_stmt_info
772 21153488 : && is_pattern_stmt_p (def_stmt_info))
773 : {
774 344299 : if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
775 : != def_stmt_info)
776 247311 : oprnd_info->any_pattern = true;
777 : else
778 : /* If we promote this to external use the original stmt def. */
779 96988 : oprnd_info->ops.last ()
780 193976 : = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
781 : }
782 :
783 : /* If there's a extern def on a backedge make sure we can
784 : code-generate at the region start.
785 : ??? This is another case that could be fixed by adjusting
786 : how we split the function but at the moment we'd have conflicting
787 : goals there. */
788 21153488 : if (backedge
789 126982 : && dts[i] == vect_external_def
790 173 : && is_a <bb_vec_info> (vinfo)
791 173 : && TREE_CODE (oprnd) == SSA_NAME
792 152 : && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
793 21153640 : && !dominated_by_p (CDI_DOMINATORS, vinfo->bbs[0],
794 152 : gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
795 : {
796 152 : if (dump_enabled_p ())
797 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
798 : "Build SLP failed: extern def %T only defined "
799 : "on backedge\n", oprnd);
800 152 : return -1;
801 : }
802 :
803 21153336 : if (first)
804 : {
805 4261052 : tree type = TREE_TYPE (oprnd);
806 4261052 : dt = dts[i];
807 :
808 : /* For the swapping logic below force vect_reduction_def
809 : for the reduction op in a SLP reduction group. */
810 4261052 : if (!STMT_VINFO_DATA_REF (stmt_info)
811 3175706 : && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
812 3288 : && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
813 4262672 : && def_stmt_info)
814 1620 : dts[i] = dt = vect_reduction_def;
815 :
816 : /* Check the types of the definition. */
817 4261052 : switch (dt)
818 : {
819 4261052 : case vect_external_def:
820 4261052 : case vect_constant_def:
821 4261052 : case vect_internal_def:
822 4261052 : case vect_reduction_def:
823 4261052 : case vect_double_reduction_def:
824 4261052 : case vect_induction_def:
825 4261052 : case vect_nested_cycle:
826 4261052 : case vect_first_order_recurrence:
827 4261052 : break;
828 :
829 0 : default:
830 : /* FORNOW: Not supported. */
831 0 : if (dump_enabled_p ())
832 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
833 : "Build SLP failed: illegal type of def %T\n",
834 : oprnd);
835 0 : return -1;
836 : }
837 :
838 4261052 : oprnd_info->first_dt = dt;
839 4261052 : oprnd_info->first_op_type = type;
840 : }
841 : }
842 12200104 : if (first)
843 : return 0;
844 :
845 : /* Now match the operand definition types to that of the first stmt. */
846 25852480 : for (i = 0; i < number_of_oprnds;)
847 : {
848 16888230 : if (skip_args[i])
849 : {
850 27772 : ++i;
851 27772 : continue;
852 : }
853 :
854 16860458 : oprnd_info = (*oprnds_info)[i];
855 16860458 : dt = dts[i];
856 16860458 : stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
857 16860458 : oprnd = oprnd_info->ops[stmt_num];
858 16860458 : tree type = TREE_TYPE (oprnd);
859 :
860 16860458 : if (!types_compatible_p (oprnd_info->first_op_type, type))
861 : {
862 88803 : if (dump_enabled_p ())
863 107 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
864 : "Build SLP failed: different operand types\n");
865 88803 : return 1;
866 : }
867 :
868 16771655 : if ((gs_op == i) != oprnd_info->first_gs_p)
869 : {
870 0 : if (dump_enabled_p ())
871 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
872 : "Build SLP failed: mixed gather and non-gather\n");
873 0 : return 1;
874 : }
875 16771655 : else if (gs_op == i)
876 : {
877 207 : if (!operand_equal_p (oprnd_info->first_gs_info.base,
878 207 : gs_info.base))
879 : {
880 16 : if (dump_enabled_p ())
881 6 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
882 : "Build SLP failed: different gather base\n");
883 16 : return 1;
884 : }
885 191 : if (oprnd_info->first_gs_info.scale != gs_info.scale)
886 : {
887 8 : if (dump_enabled_p ())
888 2 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
889 : "Build SLP failed: different gather scale\n");
890 8 : return 1;
891 : }
892 : }
893 :
894 : /* Not first stmt of the group, check that the def-stmt/s match
895 : the def-stmt/s of the first stmt. Allow different definition
896 : types for reduction chains: the first stmt must be a
897 : vect_reduction_def (a phi node), and the rest
898 : end in the reduction chain. */
899 16771631 : if ((!vect_def_types_match (oprnd_info->first_dt, dt)
900 284804 : && !(oprnd_info->first_dt == vect_reduction_def
901 2777 : && !STMT_VINFO_DATA_REF (stmt_info)
902 2777 : && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
903 2767 : && def_stmt_info
904 2767 : && !STMT_VINFO_DATA_REF (def_stmt_info)
905 2767 : && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
906 : == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
907 16489594 : || (!STMT_VINFO_DATA_REF (stmt_info)
908 15217699 : && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
909 5814 : && ((!def_stmt_info
910 5652 : || STMT_VINFO_DATA_REF (def_stmt_info)
911 10379 : || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
912 : != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
913 5814 : != (oprnd_info->first_dt != vect_reduction_def))))
914 : {
915 : /* Try swapping operands if we got a mismatch. For BB
916 : vectorization only in case it will clearly improve things. */
917 283968 : if (i == commutative_op && !swapped
918 282037 : && (!is_a <bb_vec_info> (vinfo)
919 5114 : || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
920 5114 : dts[i+1])
921 1108 : && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
922 : || vect_def_types_match
923 146 : ((*oprnds_info)[i+1]->first_dt, dts[i])))))
924 : {
925 1931 : if (dump_enabled_p ())
926 144 : dump_printf_loc (MSG_NOTE, vect_location,
927 : "trying swapped operands\n");
928 1931 : std::swap (dts[i], dts[i+1]);
929 1931 : std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
930 1931 : (*oprnds_info)[i+1]->def_stmts[stmt_num]);
931 1931 : std::swap ((*oprnds_info)[i]->ops[stmt_num],
932 1931 : (*oprnds_info)[i+1]->ops[stmt_num]);
933 : /* After swapping some operands we lost track whether an
934 : operand has any pattern defs so be conservative here. */
935 1931 : if ((*oprnds_info)[i]->any_pattern
936 1931 : || (*oprnds_info)[i+1]->any_pattern)
937 4 : (*oprnds_info)[i]->any_pattern
938 2 : = (*oprnds_info)[i+1]->any_pattern = true;
939 1931 : swapped = true;
940 1931 : continue;
941 : }
942 :
943 280106 : if (is_a <bb_vec_info> (vinfo)
944 269619 : && !oprnd_info->any_pattern
945 549487 : && number_of_oprnds > 1)
946 : {
947 : /* Now for commutative ops we should see whether we can
948 : make the other operand matching. */
949 103988 : if (dump_enabled_p ())
950 149 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
951 : "treating operand as external\n");
952 103988 : oprnd_info->first_dt = dt = vect_external_def;
953 : }
954 : else
955 : {
956 176118 : if (dump_enabled_p ())
957 406 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
958 : "Build SLP failed: different types\n");
959 176118 : return 1;
960 : }
961 : }
962 :
963 : /* Make sure to demote the overall operand to external. */
964 16593582 : if (dt == vect_external_def)
965 330363 : oprnd_info->first_dt = vect_external_def;
966 : /* For a SLP reduction chain we want to duplicate the reduction to
967 : each of the chain members. That gets us a sane SLP graph (still
968 : the stmts are not 100% correct wrt the initial values). */
969 16263219 : else if ((dt == vect_internal_def
970 16263219 : || dt == vect_reduction_def)
971 15359454 : && oprnd_info->first_dt == vect_reduction_def
972 64716 : && !STMT_VINFO_DATA_REF (stmt_info)
973 64716 : && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
974 2767 : && !STMT_VINFO_DATA_REF (def_stmt_info)
975 16265986 : && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
976 : == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
977 : {
978 2767 : oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
979 2767 : oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
980 : }
981 :
982 16593582 : ++i;
983 : }
984 :
985 : /* Swap operands. */
986 8964250 : if (swapped)
987 : {
988 39978 : if (dump_enabled_p ())
989 432 : dump_printf_loc (MSG_NOTE, vect_location,
990 : "swapped operands to match def types in %G",
991 : stmt_info->stmt);
992 : }
993 :
994 : return 0;
995 : }
996 :
997 : /* Return true if call statements CALL1 and CALL2 are similar enough
998 : to be combined into the same SLP group. */
999 :
1000 : bool
1001 20886 : compatible_calls_p (gcall *call1, gcall *call2, bool allow_two_operators)
1002 : {
1003 20886 : unsigned int nargs = gimple_call_num_args (call1);
1004 20886 : if (nargs != gimple_call_num_args (call2))
1005 : return false;
1006 :
1007 18950 : auto cfn1 = gimple_call_combined_fn (call1);
1008 18950 : auto cfn2 = gimple_call_combined_fn (call2);
1009 18950 : if (cfn1 != cfn2
1010 2 : && (!allow_two_operators
1011 2 : || !((cfn1 == CFN_FMA || cfn1 == CFN_FMS)
1012 2 : && (cfn2 == CFN_FMA || cfn2 == CFN_FMS))))
1013 : return false;
1014 :
1015 18950 : if (gimple_call_internal_p (call1))
1016 : {
1017 7084 : if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
1018 7084 : TREE_TYPE (gimple_call_lhs (call2))))
1019 : return false;
1020 14393 : for (unsigned int i = 0; i < nargs; ++i)
1021 7309 : if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
1022 7309 : TREE_TYPE (gimple_call_arg (call2, i))))
1023 : return false;
1024 : }
1025 : else
1026 : {
1027 11866 : if (!operand_equal_p (gimple_call_fn (call1),
1028 11866 : gimple_call_fn (call2), 0))
1029 : return false;
1030 :
1031 25848 : if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
1032 : return false;
1033 : }
1034 :
1035 : /* Check that any unvectorized arguments are equal. */
1036 15700 : if (const int *map = vect_get_operand_map (call1))
1037 : {
1038 15 : unsigned int nkept = *map++;
1039 15 : unsigned int mapi = 0;
1040 57 : for (unsigned int i = 0; i < nargs; ++i)
1041 42 : if (mapi < nkept && map[mapi] == int (i))
1042 27 : mapi += 1;
1043 15 : else if (!operand_equal_p (gimple_call_arg (call1, i),
1044 15 : gimple_call_arg (call2, i)))
1045 : return false;
1046 : }
1047 :
1048 : return true;
1049 : }
1050 :
1051 : /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
1052 : caller's attempt to find the vector type in STMT_INFO with the narrowest
1053 : element type. Return true if VECTYPE is nonnull and if it is valid
1054 : for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
1055 : number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
1056 : vect_build_slp_tree. */
1057 :
1058 : static bool
1059 4954791 : vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
1060 : unsigned int group_size,
1061 : tree vectype, poly_uint64 *max_nunits)
1062 : {
1063 4954791 : if (!vectype)
1064 : {
1065 4404 : if (dump_enabled_p ())
1066 7 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1067 : "Build SLP failed: unsupported data-type in %G\n",
1068 : stmt_info->stmt);
1069 : /* Fatal mismatch. */
1070 4404 : return false;
1071 : }
1072 :
1073 : /* If populating the vector type requires unrolling then fail
1074 : before adjusting *max_nunits for basic-block vectorization. */
1075 4950387 : if (is_a <bb_vec_info> (vinfo)
1076 4950387 : && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
1077 : {
1078 141278 : if (dump_enabled_p ())
1079 34 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1080 : "Build SLP failed: unrolling required "
1081 : "in basic block SLP\n");
1082 : /* Fatal mismatch. */
1083 141278 : return false;
1084 : }
1085 :
1086 : /* In case of multiple types we need to detect the smallest type. */
1087 4809109 : vect_update_max_nunits (max_nunits, vectype);
1088 4809109 : return true;
1089 : }
1090 :
/* Verify if the scalar stmts STMTS are isomorphic, require data
   permutation or are of unsupported types of operation.  Return
   true if they are, otherwise return false and indicate in *MATCHES
   which stmts are not isomorphic to the first one.  If MATCHES[0]
   is false then this indicates the comparison could not be
   carried out or the stmts will never be vectorized by SLP.

   Note COND_EXPR is possibly isomorphic to another one after swapping its
   operands.  Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
   the first stmt by swapping the two operands of comparison; set SWAP[i]
   to 2 if stmt I is isomorphic to the first stmt by inverting the code
   of comparison.  Take A1 >= B1 ? X1 : Y1 as an example, it can be swapped
   to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1.  */

static bool
vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
		       vec<stmt_vec_info> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits, bool *matches,
		       bool *two_operators, tree *node_vectype)
{
  unsigned int i;
  stmt_vec_info first_stmt_info = stmts[0];
  /* Properties of the first statement; every later statement is
     compared against these to decide MATCHES[i].  */
  code_helper first_stmt_code = ERROR_MARK;
  code_helper alt_stmt_code = ERROR_MARK;
  code_helper first_cond_code = ERROR_MARK;
  bool need_same_oprnds = false;
  tree first_lhs = NULL_TREE;
  tree first_op1 = NULL_TREE;
  stmt_vec_info first_load = NULL, prev_first_load = NULL;
  bool first_stmt_ldst_p = false, first_stmt_ldst_masklen_p = false;
  bool first_stmt_phi_p = false;
  int first_reduc_idx = -1;
  /* Set when the group size is not a multiple of the vector units for
     BB vectorization; handled after the main loop so MATCHES[] is still
     produced for group splitting.  */
  bool maybe_soft_fail = false;
  tree soft_fail_nunits_vectype = NULL_TREE;

  tree vectype, nunits_vectype;
  if (!vect_get_vector_types_for_stmt (vinfo, first_stmt_info, &vectype,
				       &nunits_vectype, group_size))
    {
      /* Fatal mismatch.  */
      matches[0] = false;
      return false;
    }
  if (is_a <bb_vec_info> (vinfo)
      && known_le (TYPE_VECTOR_SUBPARTS (vectype), 1U))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "Build SLP failed: not using single lane "
			 "vector type %T\n", vectype);
      matches[0] = false;
      return false;
    }
  /* Record nunits required but continue analysis, producing matches[]
     as if nunits was not an issue.  This allows splitting of groups
     to happen.  */
  if (nunits_vectype
      && !vect_record_max_nunits (vinfo, first_stmt_info, group_size,
				  nunits_vectype, max_nunits))
    {
      gcc_assert (is_a <bb_vec_info> (vinfo));
      maybe_soft_fail = true;
      soft_fail_nunits_vectype = nunits_vectype;
    }

  gcc_assert (vectype || !gimple_get_lhs (first_stmt_info->stmt));
  *node_vectype = vectype;

  /* For every stmt in NODE find its def stmt/s.  */
  stmt_vec_info stmt_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    {
      bool ldst_p = false;
      bool ldst_masklen_p = false;
      bool phi_p = false;
      code_helper rhs_code = ERROR_MARK;

      swap[i] = 0;
      matches[i] = false;
      /* A NULL lane (gap) trivially matches.  */
      if (!stmt_info)
	{
	  matches[i] = true;
	  continue;
	}

      gimple *stmt = stmt_info->stmt;
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);

      /* Fail to vectorize statements marked as unvectorizable, throw
	 or are volatile.  */
      if (!STMT_VINFO_VECTORIZABLE (stmt_info)
	  || stmt_can_throw_internal (cfun, stmt)
	  || gimple_has_volatile_ops (stmt))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: unvectorizable statement %G",
			     stmt);
	  /* ??? For BB vectorization we want to commutate operands in a way
	     to shuffle all unvectorizable defs into one operand and have
	     the other still vectorized.  The following doesn't reliably
	     work for this though but it's the easiest we can do here.  */
	  if (is_a <bb_vec_info> (vinfo) && i != 0)
	    continue;
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      gcall *call_stmt = dyn_cast <gcall *> (stmt);
      tree lhs = gimple_get_lhs (stmt);
      if (lhs == NULL_TREE && !call_stmt)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: not GIMPLE_ASSIGN nor "
			     "GIMPLE_CALL %G", stmt);
	  if (is_a <bb_vec_info> (vinfo) && i != 0)
	    continue;
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      /* Classify the statement: derive RHS_CODE and whether it is a
	 load/store (LDST_P), a mask/len memory IFN (LDST_MASKLEN_P)
	 or a PHI (PHI_P).  */
      if (call_stmt)
	{
	  combined_fn cfn = gimple_call_combined_fn (call_stmt);
	  if (cfn != CFN_LAST && cfn != CFN_MASK_CALL)
	    rhs_code = cfn;
	  else
	    rhs_code = CALL_EXPR;

	  if (cfn == CFN_GATHER_LOAD
	      || cfn == CFN_SCATTER_STORE)
	    ldst_p = true;
	  else if (cfn == CFN_MASK_LOAD
		   || cfn == CFN_MASK_GATHER_LOAD
		   || cfn == CFN_MASK_LEN_GATHER_LOAD
		   || cfn == CFN_MASK_SCATTER_STORE
		   || cfn == CFN_MASK_LEN_SCATTER_STORE)
	    {
	      ldst_p = true;
	      ldst_masklen_p = true;
	    }
	  else if (cfn == CFN_MASK_STORE)
	    {
	      ldst_p = true;
	      ldst_masklen_p = true;
	      rhs_code = CFN_MASK_STORE;
	    }
	  else if (cfn == CFN_GOMP_SIMD_LANE)
	    ;
	  else if ((cfn != CFN_LAST
		    && cfn != CFN_MASK_CALL
		    && internal_fn_p (cfn)
		    && !vectorizable_internal_fn_p (as_internal_fn (cfn)))
		   || gimple_call_tail_p (call_stmt)
		   || gimple_call_noreturn_p (call_stmt)
		   || gimple_call_chain (call_stmt))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: unsupported call type %G",
				 (gimple *) call_stmt);
	      if (is_a <bb_vec_info> (vinfo) && i != 0)
		continue;
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }
	}
      else if (gimple_code (stmt) == GIMPLE_PHI)
	{
	  rhs_code = ERROR_MARK;
	  phi_p = true;
	}
      else
	{
	  rhs_code = gimple_assign_rhs_code (stmt);
	  ldst_p = STMT_VINFO_DATA_REF (stmt_info) != nullptr;
	}

      /* Check the operation.  */
      if (i == 0)
	{
	  /* First statement: record its properties as the reference for
	     all other lanes.  */
	  first_lhs = lhs;
	  first_stmt_code = rhs_code;
	  first_stmt_ldst_p = ldst_p;
	  first_stmt_ldst_masklen_p = ldst_masklen_p;
	  first_stmt_phi_p = phi_p;
	  first_reduc_idx = STMT_VINFO_REDUC_IDX (stmt_info);

	  /* Shift arguments should be equal in all the packed stmts for a
	     vector shift with scalar shift operand.  */
	  if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
	      || rhs_code == LROTATE_EXPR
	      || rhs_code == RROTATE_EXPR)
	    {
	      /* First see if we have a vector/vector shift.  */
	      if (!directly_supported_p (rhs_code, vectype, optab_vector))
		{
		  /* No vector/vector shift, try for a vector/scalar shift.  */
		  if (!directly_supported_p (rhs_code, vectype, optab_scalar))
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
					 "Build SLP failed: "
					 "op not supported by target.\n");
		      if (is_a <bb_vec_info> (vinfo) && i != 0)
			continue;
		      /* Fatal mismatch.  */
		      matches[0] = false;
		      return false;
		    }
		  /* Vector/scalar shift: all lanes must use the same
		     shift amount.  */
		  need_same_oprnds = true;
		  first_op1 = gimple_assign_rhs2 (stmt);
		}
	    }
	  else if (rhs_code == WIDEN_LSHIFT_EXPR)
	    {
	      need_same_oprnds = true;
	      first_op1 = gimple_assign_rhs2 (stmt);
	    }
	  else if (!ldst_p
		   && rhs_code == BIT_FIELD_REF)
	    {
	      tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
	      if (!is_a <bb_vec_info> (vinfo)
		  || TREE_CODE (vec) != SSA_NAME
		  /* When the element types are not compatible we pun the
		     source to the target vectype which requires equal size.  */
		  || ((!VECTOR_TYPE_P (TREE_TYPE (vec))
		       || !types_compatible_p (TREE_TYPE (vectype),
					       TREE_TYPE (TREE_TYPE (vec))))
		      && !operand_equal_p (TYPE_SIZE (vectype),
					   TYPE_SIZE (TREE_TYPE (vec)))))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: "
				     "BIT_FIELD_REF not supported\n");
		  /* Fatal mismatch.  */
		  matches[0] = false;
		  return false;
		}
	    }
	  else if (rhs_code == CFN_DIV_POW2)
	    {
	      need_same_oprnds = true;
	      first_op1 = gimple_call_arg (call_stmt, 1);
	    }
	  else if (rhs_code == CFN_GOMP_SIMD_LANE)
	    {
	      need_same_oprnds = true;
	      first_op1 = gimple_call_arg (call_stmt, 1);
	    }
	}
      else
	{
	  /* Later statement: compare against the recorded properties of
	     the first one; a plain "continue" records a per-lane
	     mismatch (MATCHES[i] stays false).  */
	  if (first_reduc_idx != STMT_VINFO_REDUC_IDX (stmt_info)
	      /* For SLP reduction groups the index isn't necessarily
		 uniform but only that of the first stmt matters.  */
	      && !(first_reduc_idx != -1
		   && STMT_VINFO_REDUC_IDX (stmt_info) != -1
		   && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
	      && !(first_reduc_idx != -1
		   && STMT_VINFO_REDUC_IDX (stmt_info) != -1
		   && rhs_code.is_tree_code ()
		   && commutative_tree_code (tree_code (rhs_code))
		   && first_reduc_idx == 1 - STMT_VINFO_REDUC_IDX (stmt_info)))
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: different reduc_idx "
				   "%d instead of %d in %G",
				   STMT_VINFO_REDUC_IDX (stmt_info),
				   first_reduc_idx, stmt);
		}
	      /* Mismatch.  */
	      continue;
	    }
	  /* Remember the first differing operation code; it may be
	     acceptable as the second operator of a two-operator node.  */
	  if (!ldst_p
	      && first_stmt_code != rhs_code
	      && alt_stmt_code == ERROR_MARK)
	    alt_stmt_code = rhs_code;
	  if ((!ldst_p
	       && first_stmt_code != rhs_code
	       && (first_stmt_code != IMAGPART_EXPR
		   || rhs_code != REALPART_EXPR)
	       && (first_stmt_code != REALPART_EXPR
		   || rhs_code != IMAGPART_EXPR)
	       /* Handle mismatches in plus/minus by computing both
		  and merging the results.  */
	       && !((((first_stmt_code == PLUS_EXPR
		       || first_stmt_code == MINUS_EXPR)
		      && (alt_stmt_code == PLUS_EXPR
			  || alt_stmt_code == MINUS_EXPR))
		     || ((first_stmt_code == CFN_FMA
			  || first_stmt_code == CFN_FMS)
			 && (alt_stmt_code == CFN_FMA
			     || alt_stmt_code == CFN_FMS)))
		    && rhs_code == alt_stmt_code)
	       && !(first_stmt_code.is_tree_code ()
		    && rhs_code.is_tree_code ()
		    && (TREE_CODE_CLASS (tree_code (first_stmt_code))
			== tcc_comparison)
		    && (swap_tree_comparison (tree_code (first_stmt_code))
			== tree_code (rhs_code))
		    && (first_reduc_idx == -1
			|| REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
	      || (ldst_p
		  && (STMT_VINFO_GROUPED_ACCESS (stmt_info)
		      != STMT_VINFO_GROUPED_ACCESS (first_stmt_info)))
	      || (ldst_p
		  && (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
		      != STMT_VINFO_GATHER_SCATTER_P (first_stmt_info)))
	      || first_stmt_ldst_p != ldst_p
	      || (ldst_p && first_stmt_ldst_masklen_p != ldst_masklen_p)
	      || first_stmt_phi_p != phi_p)
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: different operation "
				   "in stmt %G", stmt);
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "original stmt %G", first_stmt_info->stmt);
		}
	      /* Mismatch.  */
	      continue;
	    }

	  /* All BIT_FIELD_REFs in the node must extract from the same
	     source vector.  */
	  if (!ldst_p
	      && first_stmt_code == BIT_FIELD_REF
	      && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
		  != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different BIT_FIELD_REF "
				 "arguments in %G", stmt);
	      /* Mismatch.  */
	      continue;
	    }

	  if (call_stmt
	      && first_stmt_code != CFN_MASK_LOAD
	      && first_stmt_code != CFN_MASK_STORE)
	    {
	      if (!is_a <gcall *> (stmts[0]->stmt)
		  || !compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
					  call_stmt, true))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different calls in %G",
				     stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }

	  if ((phi_p || gimple_could_trap_p (stmt_info->stmt))
	      && (gimple_bb (first_stmt_info->stmt)
		  != gimple_bb (stmt_info->stmt)))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different BB for PHI "
				 "or possibly trapping operation in %G", stmt);
	      /* Mismatch.  */
	      continue;
	    }

	  if (need_same_oprnds)
	    {
	      tree other_op1 = gimple_arg (stmt, 1);
	      if (!operand_equal_p (first_op1, other_op1, 0))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different shift "
				     "arguments in %G", stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }

	  if (first_lhs
	      && lhs
	      && !types_compatible_p (TREE_TYPE (lhs), TREE_TYPE (first_lhs)))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different vector type "
				 "in %G", stmt);
	      /* Mismatch.  */
	      continue;
	    }
	}

      /* Grouped store or load.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
	{
	  gcc_assert (ldst_p);
	  if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
	    {
	      /* Store.  */
	      gcc_assert (rhs_code == CFN_MASK_STORE
			  || REFERENCE_CLASS_P (lhs)
			  || DECL_P (lhs));
	    }
	  else
	    {
	      /* Load.  */
	      first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
	      if (prev_first_load)
		{
		  /* Check that there are no loads from different interleaving
		     chains in the same node.  */
		  if (prev_first_load != first_load)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION,
					 vect_location,
					 "Build SLP failed: different "
					 "interleaving chains in one node %G",
					 stmt);
		      /* Mismatch.  */
		      continue;
		    }
		}
	      else
		prev_first_load = first_load;
	    }
	}
      /* Non-grouped store or load.  */
      else if (ldst_p)
	{
	  if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))
	      && rhs_code != CFN_GATHER_LOAD
	      && rhs_code != CFN_MASK_GATHER_LOAD
	      && rhs_code != CFN_MASK_LEN_GATHER_LOAD
	      && rhs_code != CFN_SCATTER_STORE
	      && rhs_code != CFN_MASK_SCATTER_STORE
	      && rhs_code != CFN_MASK_LEN_SCATTER_STORE
	      && !STMT_VINFO_GATHER_SCATTER_P (stmt_info)
	      /* Not grouped loads are handled as externals for BB
		 vectorization.  For loop vectorization we can handle
		 splats the same we handle single element interleaving.
		 Likewise we can handle a collection of invariant refs.  */
	      && (is_a <bb_vec_info> (vinfo)
		  || (stmt_info != first_stmt_info
		      && !(integer_zerop (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)))
			   && integer_zerop (DR_STEP (STMT_VINFO_DATA_REF
							(first_stmt_info)))))))
	    {
	      /* Not grouped load.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: not grouped load %G", stmt);

	      if (i != 0)
		continue;
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }
	}
      /* Not memory operation.  */
      else
	{
	  if (!phi_p
	      && rhs_code.is_tree_code ()
	      && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_binary
	      && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_unary
	      && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_expression
	      && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_comparison
	      && rhs_code != VIEW_CONVERT_EXPR
	      && rhs_code != CALL_EXPR
	      && rhs_code != BIT_FIELD_REF
	      && rhs_code != SSA_NAME)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: operation unsupported %G",
				 stmt);
	      if (is_a <bb_vec_info> (vinfo) && i != 0)
		continue;
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }

	  if (rhs_code == COND_EXPR)
	    {
	      tree cond_expr = gimple_assign_rhs1 (stmt);
	      enum tree_code cond_code = TREE_CODE (cond_expr);
	      enum tree_code swap_code = ERROR_MARK;
	      enum tree_code invert_code = ERROR_MARK;

	      if (i == 0)
		first_cond_code = TREE_CODE (cond_expr);
	      else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
		{
		  bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
		  swap_code = swap_tree_comparison (cond_code);
		  invert_code = invert_tree_comparison (cond_code, honor_nans);
		}

	      if (first_cond_code == cond_code)
		;
	      /* Isomorphic can be achieved by swapping.  */
	      else if (first_cond_code == swap_code)
		swap[i] = 1;
	      /* Isomorphic can be achieved by inverting.  */
	      else if (first_cond_code == invert_code)
		swap[i] = 2;
	      else
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different"
				     " operation %G", stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }

	  /* A comparison matching the first one after operand swapping.  */
	  if (i != 0
	      && first_stmt_code != rhs_code
	      && first_stmt_code.is_tree_code ()
	      && rhs_code.is_tree_code ()
	      && TREE_CODE_CLASS ((tree_code)first_stmt_code) == tcc_comparison
	      && (swap_tree_comparison ((tree_code)first_stmt_code)
		  == (tree_code)rhs_code))
	    swap[i] = 1;

	  /* A commutative operation whose reduction operand sits on the
	     other side than in the first stmt.  */
	  if (i != 0
	      && first_reduc_idx != STMT_VINFO_REDUC_IDX (stmt_info)
	      && first_reduc_idx != -1
	      && STMT_VINFO_REDUC_IDX (stmt_info) != -1
	      && rhs_code.is_tree_code ()
	      && commutative_tree_code (tree_code (rhs_code))
	      && first_reduc_idx == 1 - STMT_VINFO_REDUC_IDX (stmt_info))
	    swap[i] = 1;
	}

      /* This lane is isomorphic to the first one.  */
      matches[i] = true;
    }

  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      return false;

  /* If we allowed a two-operation SLP node verify the target can cope
     with the permute we are going to use.  */
  if (alt_stmt_code != ERROR_MARK
      && (!alt_stmt_code.is_tree_code ()
	  || (TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_reference
	      && TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_comparison)))
    {
      *two_operators = true;
    }

  if (maybe_soft_fail)
    {
      unsigned HOST_WIDE_INT const_nunits;
      if (!TYPE_VECTOR_SUBPARTS
	    (soft_fail_nunits_vectype).is_constant (&const_nunits)
	  || const_nunits > group_size)
	matches[0] = false;
      else
	{
	  /* With constant vector elements simulate a mismatch at the
	     point we need to split.  */
	  unsigned tail = group_size & (const_nunits - 1);
	  memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
	}
      return false;
    }

  return true;
}
1677 :
/* Traits for the hash_set to record failed SLP builds for a stmt set.
   Note we never remove apart from at destruction time so we do not
   need a special value for deleted that differs from empty.  */
struct bst_traits
{
  typedef vec <stmt_vec_info> value_type;
  typedef vec <stmt_vec_info> compare_type;
  static inline hashval_t hash (value_type);
  static inline bool equal (value_type existing, value_type candidate);
  /* A vec whose storage was never created (or was released) marks an
     unused slot; empty and deleted are intentionally the same test.  */
  static inline bool is_empty (value_type x) { return !x.exists (); }
  static inline bool is_deleted (value_type x) { return !x.exists (); }
  static const bool empty_zero_p = true;
  /* Marking a slot empty/deleted releases the vec's storage.  */
  static inline void mark_empty (value_type &x) { x.release (); }
  static inline void mark_deleted (value_type &x) { x.release (); }
  static inline void remove (value_type &x) { x.release (); }
};
1694 : inline hashval_t
1695 83776600 : bst_traits::hash (value_type x)
1696 : {
1697 83776600 : inchash::hash h;
1698 398055869 : for (unsigned i = 0; i < x.length (); ++i)
1699 314279269 : h.add_int (x[i] ? gimple_uid (x[i]->stmt) : -1);
1700 83776600 : return h.end ();
1701 : }
1702 : inline bool
1703 73157694 : bst_traits::equal (value_type existing, value_type candidate)
1704 : {
1705 219473082 : if (existing.length () != candidate.length ())
1706 : return false;
1707 74981630 : for (unsigned i = 0; i < existing.length (); ++i)
1708 71159687 : if (existing[i] != candidate[i])
1709 : return false;
1710 : return true;
1711 : }
1712 :
/* Map from a set of scalar stmts to the SLP node built for them, using
   BST_TRAITS for hashing/equality of the stmt set.  */
typedef hash_map <vec <stmt_vec_info>, slp_tree,
		  simple_hashmap_traits <bst_traits, slp_tree> >
  scalar_stmts_to_slp_tree_map_t;
1716 :
1717 : /* Release BST_MAP. */
1718 :
1719 : static void
1720 1638392 : release_scalar_stmts_to_slp_tree_map (scalar_stmts_to_slp_tree_map_t *bst_map)
1721 : {
1722 : /* The map keeps a reference on SLP nodes built, release that. */
1723 9940070 : for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
1724 18241748 : it != bst_map->end (); ++it)
1725 8301678 : if ((*it).second)
1726 8301678 : vect_free_slp_tree ((*it).second);
1727 1638392 : delete bst_map;
1728 1638392 : }
1729 :
/* One element of a linearized associatable chain: an operand together
   with the operation applied to it and its definition type.
   ??? This was std::pair<std::pair<tree_code, vect_def_type>, tree>
   but then vec::insert does memmove and that's not compatible with
   std::pair.  */
struct chain_op_t
{
  chain_op_t (tree_code code_, vect_def_type dt_, tree op_)
    : code (code_), dt (dt_), op (op_) {}
  /* Operation applied to OP within the chain.  */
  tree_code code;
  /* Definition type of OP.  */
  vect_def_type dt;
  /* The operand itself.  */
  tree op;
};
1741 :
1742 : /* Comparator for sorting associatable chains. */
1743 :
1744 : static int
1745 8449842 : dt_sort_cmp (const void *op1_, const void *op2_, void *)
1746 : {
1747 8449842 : auto *op1 = (const chain_op_t *) op1_;
1748 8449842 : auto *op2 = (const chain_op_t *) op2_;
1749 8449842 : if (op1->dt != op2->dt)
1750 1038710 : return (int)op1->dt - (int)op2->dt;
1751 7411132 : return (int)op1->code - (int)op2->code;
1752 : }
1753 :
1754 : /* Linearize the associatable expression chain at START with the
1755 : associatable operation CODE (where PLUS_EXPR also allows MINUS_EXPR),
1756 : filling CHAIN with the result and using WORKLIST as intermediate storage.
1757 : CODE_STMT and ALT_CODE_STMT are filled with the first stmt using CODE
1758 : or MINUS_EXPR. *CHAIN_STMTS if not NULL is filled with all computation
1759 : stmts, starting with START. */
1760 :
static void
vect_slp_linearize_chain (vec_info *vinfo,
			  vec<std::pair<tree_code, gimple *> > &worklist,
			  vec<chain_op_t> &chain,
			  enum tree_code code, gimple *start,
			  gimple *&code_stmt, gimple *&alt_code_stmt,
			  vec<gimple *> *chain_stmts)
{
  /* For each lane linearize the addition/subtraction (or other
     uniform associatable operation) expression tree.  */
  worklist.safe_push (std::make_pair (code, start));
  while (!worklist.is_empty ())
    {
      auto entry = worklist.pop ();
      gassign *stmt = as_a <gassign *> (entry.second);
      /* IN_CODE is the effective operation the result of STMT enters the
	 overall chain with (PLUS_EXPR or MINUS_EXPR), THIS_CODE the
	 operation STMT itself performs.  */
      enum tree_code in_code = entry.first;
      enum tree_code this_code = gimple_assign_rhs_code (stmt);
      /* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE.  */
      if (!code_stmt
	  && gimple_assign_rhs_code (stmt) == code)
	code_stmt = stmt;
      else if (!alt_code_stmt
	       && gimple_assign_rhs_code (stmt) == MINUS_EXPR)
	alt_code_stmt = stmt;
      if (chain_stmts)
	chain_stmts->safe_push (stmt);
      /* Process both operands of the binary operation.  */
      for (unsigned opnum = 1; opnum <= 2; ++opnum)
	{
	  tree op = gimple_op (stmt, opnum);
	  vect_def_type dt;
	  stmt_vec_info def_stmt_info;
	  bool res = vect_is_simple_use (op, vinfo, &dt, &def_stmt_info);
	  gcc_assert (res);
	  /* For defs by a pattern stmt refer to the pattern def seq
	     replacement via its LHS.  */
	  if (dt == vect_internal_def
	      && is_pattern_stmt_p (def_stmt_info))
	    op = gimple_get_lhs (def_stmt_info->stmt);
	  gimple *use_stmt;
	  use_operand_p use_p;
	  /* Continue linearizing through single-use defs performing the
	     same operation (MINUS_EXPR also continues a PLUS_EXPR
	     chain), ...  */
	  if (dt == vect_internal_def
	      && single_imm_use (op, &use_p, &use_stmt)
	      && is_gimple_assign (def_stmt_info->stmt)
	      && (gimple_assign_rhs_code (def_stmt_info->stmt) == code
		  || (code == PLUS_EXPR
		      && (gimple_assign_rhs_code (def_stmt_info->stmt)
			  == MINUS_EXPR))))
	    {
	      /* Compute the operation the def enters the chain with:
		 only the second operand of a MINUS_EXPR is negated and
		 being reached on a negated path flips the operation
		 once more.  */
	      tree_code op_def_code = this_code;
	      if (op_def_code == MINUS_EXPR && opnum == 1)
		op_def_code = PLUS_EXPR;
	      if (in_code == MINUS_EXPR)
		op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
	      worklist.safe_push (std::make_pair (op_def_code,
						  def_stmt_info->stmt));
	    }
	  else
	    {
	      /* ... otherwise record OP as leaf of the chain, with the
		 same sign adjustment as above.  */
	      tree_code op_def_code = this_code;
	      if (op_def_code == MINUS_EXPR && opnum == 1)
		op_def_code = PLUS_EXPR;
	      if (in_code == MINUS_EXPR)
		op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
	      chain.safe_push (chain_op_t (op_def_code, dt, op));
	    }
	}
    }
}
1827 :
/* Forward declaration; vect_build_slp_tree and vect_build_slp_tree_2
   are mutually recursive, the definition follows below.  */

static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
		       vec<stmt_vec_info> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits,
		       bool *matches, unsigned *limit, unsigned *tree_size,
		       scalar_stmts_to_slp_tree_map_t *bst_map);
1834 :
/* Cache-aware entry point for SLP discovery for the lanes in STMTS.
   Re-use a previously recorded (successful or failed) discovery result
   from BST_MAP if present, otherwise perform discovery via
   vect_build_slp_tree_2 and record the outcome in BST_MAP.  Returns the
   SLP node or NULL on failure, in which case MATCHES is filled with the
   per-lane match state.  */

static slp_tree
vect_build_slp_tree (vec_info *vinfo,
		     vec<stmt_vec_info> stmts, unsigned int group_size,
		     poly_uint64 *max_nunits,
		     bool *matches, unsigned *limit, unsigned *tree_size,
		     scalar_stmts_to_slp_tree_map_t *bst_map)
{
  if (slp_tree *leader = bst_map->get (stmts))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
			 !(*leader)->failed ? "" : "failed ",
			 (void *) *leader);
      if (!(*leader)->failed)
	{
	  SLP_TREE_REF_COUNT (*leader)++;
	  SLP_TREE_REF_COUNT (*leader)--, SLP_TREE_REF_COUNT (*leader)++;
	  vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
	  /* The cached node already provides the scalar stmts, release
	     our copy of the vector.  */
	  stmts.release ();
	  return *leader;
	}
      /* Replay the recorded per-lane failure state.  */
      memcpy (matches, (*leader)->failed, sizeof (bool) * group_size);
      return NULL;
    }

  /* Single-lane SLP doesn't have the chance of run-away, do not account
     it to the limit.  */
  if (stmts.length () > 1)
    {
      if (*limit == 0)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "SLP discovery limit exceeded\n");
	  memset (matches, 0, sizeof (bool) * group_size);
	  return NULL;
	}
      --*limit;
    }

  /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
     so we can pick up backedge destinations during discovery.  */
  slp_tree res = new _slp_tree;
  SLP_TREE_DEF_TYPE (res) = vect_internal_def;
  SLP_TREE_SCALAR_STMTS (res) = stmts;
  bst_map->put (stmts.copy (), res);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "starting SLP discovery for node %p\n", (void *) res);

  poly_uint64 this_max_nunits = 1;
  slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
					 &this_max_nunits,
					 matches, limit, tree_size, bst_map);
  if (!res_)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "SLP discovery for node %p failed\n", (void *) res);
      /* Mark the node invalid so we can detect those when still in use
	 as backedge destinations.  */
      SLP_TREE_SCALAR_STMTS (res) = vNULL;
      SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
      res->failed = XNEWVEC (bool, group_size);
      if (flag_checking)
	{
	  /* A failure must be flagged in at least one lane.  */
	  unsigned i;
	  for (i = 0; i < group_size; ++i)
	    if (!matches[i])
	      break;
	  gcc_assert (i < group_size);
	}
      memcpy (res->failed, matches, sizeof (bool) * group_size);
    }
  else
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "SLP discovery for node %p succeeded\n",
			 (void *) res);
      gcc_assert (res_ == res);
      res->max_nunits = this_max_nunits;
      vect_update_max_nunits (max_nunits, this_max_nunits);
      /* Keep a reference for the bst_map use.  */
      SLP_TREE_REF_COUNT (res)++;
    }
  return res_;
}
1923 :
1924 : /* Helper for building an associated SLP node chain. */
1925 :
1926 : static void
1927 122 : vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype,
1928 : slp_tree op0, slp_tree op1,
1929 : stmt_vec_info oper1, stmt_vec_info oper2,
1930 : vec<std::pair<unsigned, unsigned> > lperm)
1931 : {
1932 122 : unsigned group_size = SLP_TREE_LANES (op1);
1933 :
1934 122 : slp_tree child1 = new _slp_tree;
1935 122 : SLP_TREE_DEF_TYPE (child1) = vect_internal_def;
1936 122 : SLP_TREE_VECTYPE (child1) = vectype;
1937 122 : SLP_TREE_LANES (child1) = group_size;
1938 122 : SLP_TREE_CHILDREN (child1).create (2);
1939 122 : SLP_TREE_CHILDREN (child1).quick_push (op0);
1940 122 : SLP_TREE_CHILDREN (child1).quick_push (op1);
1941 122 : SLP_TREE_REPRESENTATIVE (child1) = oper1;
1942 :
1943 122 : slp_tree child2 = new _slp_tree;
1944 122 : SLP_TREE_DEF_TYPE (child2) = vect_internal_def;
1945 122 : SLP_TREE_VECTYPE (child2) = vectype;
1946 122 : SLP_TREE_LANES (child2) = group_size;
1947 122 : SLP_TREE_CHILDREN (child2).create (2);
1948 122 : SLP_TREE_CHILDREN (child2).quick_push (op0);
1949 122 : SLP_TREE_REF_COUNT (op0)++;
1950 122 : SLP_TREE_CHILDREN (child2).quick_push (op1);
1951 122 : SLP_TREE_REF_COUNT (op1)++;
1952 122 : SLP_TREE_REPRESENTATIVE (child2) = oper2;
1953 :
1954 122 : SLP_TREE_DEF_TYPE (perm) = vect_internal_def;
1955 122 : SLP_TREE_CODE (perm) = VEC_PERM_EXPR;
1956 122 : SLP_TREE_VECTYPE (perm) = vectype;
1957 122 : SLP_TREE_LANES (perm) = group_size;
1958 : /* ??? We should set this NULL but that's not expected. */
1959 122 : SLP_TREE_REPRESENTATIVE (perm) = oper1;
1960 122 : SLP_TREE_LANE_PERMUTATION (perm) = lperm;
1961 122 : SLP_TREE_CHILDREN (perm).quick_push (child1);
1962 122 : SLP_TREE_CHILDREN (perm).quick_push (child2);
1963 122 : }
1964 :
1965 : /* Recursively build an SLP tree starting from NODE.
1966 : Fail (and return a value not equal to zero) if def-stmts are not
1967 : isomorphic, require data permutation or are of unsupported types of
1968 : operation. Otherwise, return 0.
1969 : The value returned is the depth in the SLP tree where a mismatch
1970 : was found. */
1971 :
1972 : static slp_tree
1973 5244288 : vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1974 : vec<stmt_vec_info> stmts, unsigned int group_size,
1975 : poly_uint64 *max_nunits,
1976 : bool *matches, unsigned *limit, unsigned *tree_size,
1977 : scalar_stmts_to_slp_tree_map_t *bst_map)
1978 : {
1979 5244288 : unsigned nops, i, this_tree_size = 0;
1980 5244288 : poly_uint64 this_max_nunits = *max_nunits;
1981 :
1982 5244288 : matches[0] = false;
1983 :
1984 5244288 : stmt_vec_info stmt_info = stmts[0];
1985 5244288 : if (!is_a<gcall *> (stmt_info->stmt)
1986 : && !is_a<gassign *> (stmt_info->stmt)
1987 : && !is_a<gphi *> (stmt_info->stmt))
1988 : return NULL;
1989 :
1990 5244217 : nops = gimple_num_args (stmt_info->stmt);
1991 5244217 : if (const int *map = vect_get_operand_map (stmt_info->stmt,
1992 5244217 : STMT_VINFO_GATHER_SCATTER_P
1993 : (stmt_info)))
1994 28783 : nops = map[0];
1995 :
1996 : /* If the SLP node is a PHI (induction or reduction), terminate
1997 : the recursion. */
1998 5244217 : bool *skip_args = XALLOCAVEC (bool, nops);
1999 5244217 : memset (skip_args, 0, sizeof (bool) * nops);
2000 5244217 : if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2001 2325194 : if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
2002 : {
2003 252219 : tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
2004 252219 : tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
2005 : group_size);
2006 252219 : if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
2007 : max_nunits))
2008 : return NULL;
2009 :
2010 247815 : vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
2011 247815 : if (def_type == vect_induction_def)
2012 : {
2013 : /* Induction PHIs are not cycles but walk the initial
2014 : value. Only for inner loops through, for outer loops
2015 : we need to pick up the value from the actual PHIs
2016 : to more easily support peeling and epilogue vectorization. */
2017 172531 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2018 172531 : if (!nested_in_vect_loop_p (loop, stmt_info))
2019 171788 : skip_args[loop_preheader_edge (loop)->dest_idx] = true;
2020 : else
2021 : loop = loop->inner;
2022 172531 : skip_args[loop_latch_edge (loop)->dest_idx] = true;
2023 : }
2024 75284 : else if (def_type == vect_reduction_def
2025 : || def_type == vect_double_reduction_def
2026 : || def_type == vect_nested_cycle
2027 75284 : || def_type == vect_first_order_recurrence)
2028 : {
2029 : /* Else def types have to match. */
2030 : stmt_vec_info other_info;
2031 : bool all_same = true;
2032 166626 : FOR_EACH_VEC_ELT (stmts, i, other_info)
2033 : {
2034 92476 : if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
2035 1706459 : return NULL;
2036 92474 : if (other_info != stmt_info)
2037 15673 : all_same = false;
2038 : }
2039 74150 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2040 : /* Reduction initial values are not explicitly represented. */
2041 74150 : if (def_type != vect_first_order_recurrence
2042 74150 : && gimple_bb (stmt_info->stmt) == loop->header)
2043 71280 : skip_args[loop_preheader_edge (loop)->dest_idx] = true;
2044 : /* Reduction chain backedge defs are filled manually.
2045 : ??? Need a better way to identify a SLP reduction chain PHI.
2046 : Or a better overall way to SLP match those. */
2047 74150 : if (stmts.length () > 1
2048 74150 : && all_same && def_type == vect_reduction_def)
2049 1414 : skip_args[loop_latch_edge (loop)->dest_idx] = true;
2050 : }
2051 1132 : else if (def_type != vect_internal_def)
2052 : return NULL;
2053 : }
2054 :
2055 :
2056 5239811 : bool two_operators = false;
2057 5239811 : unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
2058 5239811 : tree vectype = NULL_TREE;
2059 5239811 : if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
2060 : &this_max_nunits, matches, &two_operators,
2061 : &vectype))
2062 : return NULL;
2063 :
2064 : /* If the SLP node is a load, terminate the recursion unless masked. */
2065 3748887 : if (STMT_VINFO_DATA_REF (stmt_info)
2066 1849599 : && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2067 : {
2068 794504 : if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2069 : gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
2070 : else
2071 : {
2072 776407 : *max_nunits = this_max_nunits;
2073 776407 : (*tree_size)++;
2074 776407 : node = vect_create_new_slp_node (node, stmts, 0);
2075 776407 : SLP_TREE_VECTYPE (node) = vectype;
2076 : /* And compute the load permutation. Whether it is actually
2077 : a permutation depends on the unrolling factor which is
2078 : decided later. */
2079 776407 : vec<unsigned> load_permutation;
2080 776407 : int j;
2081 776407 : stmt_vec_info load_info;
2082 776407 : load_permutation.create (group_size);
2083 776407 : stmt_vec_info first_stmt_info
2084 776407 : = STMT_VINFO_GROUPED_ACCESS (stmt_info)
2085 776407 : ? DR_GROUP_FIRST_ELEMENT (stmt_info) : stmt_info;
2086 776407 : bool any_permute = false;
2087 1883748 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
2088 : {
2089 1107341 : int load_place;
2090 1107341 : if (! load_info)
2091 : {
2092 39409 : if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2093 : load_place = j;
2094 : else
2095 : load_place = 0;
2096 : }
2097 1067932 : else if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2098 652513 : load_place = vect_get_place_in_interleaving_chain
2099 652513 : (load_info, first_stmt_info);
2100 : else
2101 : /* Recognize the splat case as { 0, 0, ... } but make
2102 : sure to use the appropriate refs for collections
2103 : of invariant refs. */
2104 415419 : load_place = (load_info == stmt_info) ? 0 : j;
2105 692079 : gcc_assert (load_place != -1);
2106 1107341 : any_permute |= load_place != j;
2107 1107341 : load_permutation.quick_push (load_place);
2108 : }
2109 :
2110 776407 : if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
2111 : {
2112 2350 : gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD));
2113 2350 : bool has_gaps = false;
2114 2350 : if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2115 209 : for (stmt_vec_info si = DR_GROUP_NEXT_ELEMENT (first_stmt_info);
2116 1346 : si; si = DR_GROUP_NEXT_ELEMENT (si))
2117 1137 : if (DR_GROUP_GAP (si) != 1)
2118 160 : has_gaps = true;
2119 : /* We cannot handle permuted masked loads directly, see
2120 : PR114375. We cannot handle strided masked loads or masked
2121 : loads with gaps unless the mask is uniform. */
2122 2350 : if ((STMT_VINFO_GROUPED_ACCESS (stmt_info)
2123 209 : && (DR_GROUP_GAP (first_stmt_info) != 0
2124 149 : || (has_gaps
2125 55 : && STMT_VINFO_SLP_VECT_ONLY (first_stmt_info))))
2126 4605 : || STMT_VINFO_STRIDED_P (stmt_info))
2127 : {
2128 108 : load_permutation.release ();
2129 108 : matches[0] = false;
2130 774209 : return NULL;
2131 : }
2132 :
2133 : /* For permuted masked loads do an unpermuted masked load of
2134 : the whole group followed by a SLP permute node. */
2135 2242 : if (any_permute
2136 2242 : || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2137 84 : && DR_GROUP_SIZE (first_stmt_info) != group_size))
2138 : {
2139 : /* Discover the whole unpermuted load. */
2140 44 : vec<stmt_vec_info> stmts2;
2141 44 : unsigned dr_group_size = STMT_VINFO_GROUPED_ACCESS (stmt_info)
2142 78 : ? DR_GROUP_SIZE (first_stmt_info) : 1;
2143 44 : stmts2.create (dr_group_size);
2144 44 : stmts2.quick_grow_cleared (dr_group_size);
2145 44 : unsigned i = 0;
2146 44 : for (stmt_vec_info si = first_stmt_info;
2147 594 : si; si = DR_GROUP_NEXT_ELEMENT (si))
2148 : {
2149 550 : if (si != first_stmt_info)
2150 2106 : for (unsigned k = 1; k < DR_GROUP_GAP (si); ++k)
2151 1600 : stmts2[i++] = NULL;
2152 550 : stmts2[i++] = si;
2153 : }
2154 44 : bool *matches2 = XALLOCAVEC (bool, dr_group_size);
2155 44 : slp_tree unperm_load
2156 44 : = vect_build_slp_tree (vinfo, stmts2, dr_group_size,
2157 : &this_max_nunits, matches2, limit,
2158 44 : &this_tree_size, bst_map);
2159 : /* When we are able to do the full masked load emit that
2160 : followed by 'node' being the desired final permutation. */
2161 44 : if (unperm_load)
2162 : {
2163 16 : gcc_assert
2164 : (!SLP_TREE_LOAD_PERMUTATION (unperm_load).exists ());
2165 16 : lane_permutation_t lperm;
2166 16 : lperm.create (group_size);
2167 56 : for (unsigned j = 0; j < load_permutation.length (); ++j)
2168 40 : lperm.quick_push
2169 40 : (std::make_pair (0, load_permutation[j]));
2170 16 : SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2171 16 : SLP_TREE_CHILDREN (node).safe_push (unperm_load);
2172 16 : SLP_TREE_LANE_PERMUTATION (node) = lperm;
2173 16 : load_permutation.release ();
2174 16 : return node;
2175 : }
2176 28 : stmts2.release ();
2177 28 : load_permutation.release ();
2178 28 : matches[0] = false;
2179 28 : return NULL;
2180 : }
2181 2198 : load_permutation.release ();
2182 : }
2183 : else
2184 : {
2185 774057 : if (!any_permute
2186 674054 : && STMT_VINFO_GROUPED_ACCESS (stmt_info)
2187 1049289 : && group_size == DR_GROUP_SIZE (first_stmt_info))
2188 118578 : load_permutation.release ();
2189 774057 : SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
2190 774057 : return node;
2191 : }
2192 : }
2193 : }
2194 2954383 : else if (gimple_assign_single_p (stmt_info->stmt)
2195 2116602 : && !gimple_vuse (stmt_info->stmt)
2196 2962086 : && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
2197 : {
2198 : /* vect_build_slp_tree_2 determined all BIT_FIELD_REFs reference
2199 : the same SSA name vector of a compatible type to vectype. */
2200 2385 : vec<std::pair<unsigned, unsigned> > lperm = vNULL;
2201 2385 : tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
2202 2385 : stmt_vec_info estmt_info;
2203 7513 : FOR_EACH_VEC_ELT (stmts, i, estmt_info)
2204 : {
2205 5275 : gassign *estmt = as_a <gassign *> (estmt_info->stmt);
2206 5275 : tree bfref = gimple_assign_rhs1 (estmt);
2207 5275 : HOST_WIDE_INT lane;
2208 5275 : if (!known_eq (bit_field_size (bfref),
2209 : tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
2210 10403 : || !constant_multiple_p (bit_field_offset (bfref),
2211 5128 : bit_field_size (bfref), &lane))
2212 : {
2213 147 : lperm.release ();
2214 147 : matches[0] = false;
2215 147 : return NULL;
2216 : }
2217 5128 : lperm.safe_push (std::make_pair (0, (unsigned)lane));
2218 : }
2219 2238 : slp_tree vnode = vect_create_new_slp_node (vNULL);
2220 2238 : if (operand_equal_p (TYPE_SIZE (vectype), TYPE_SIZE (TREE_TYPE (vec))))
2221 : /* ??? We record vectype here but we hide eventually necessary
2222 : punning and instead rely on code generation to materialize
2223 : VIEW_CONVERT_EXPRs as necessary. We instead should make
2224 : this explicit somehow. */
2225 704 : SLP_TREE_VECTYPE (vnode) = vectype;
2226 : else
2227 : {
2228 : /* For different size but compatible elements we can still
2229 : use VEC_PERM_EXPR without punning. */
2230 1534 : gcc_assert (VECTOR_TYPE_P (TREE_TYPE (vec))
2231 : && types_compatible_p (TREE_TYPE (vectype),
2232 : TREE_TYPE (TREE_TYPE (vec))));
2233 1534 : SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
2234 : }
2235 2238 : auto nunits = TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (vnode));
2236 2238 : unsigned HOST_WIDE_INT const_nunits;
2237 2238 : if (nunits.is_constant (&const_nunits))
2238 2238 : SLP_TREE_LANES (vnode) = const_nunits;
2239 2238 : SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
2240 : /* We are always building a permutation node even if it is an identity
2241 : permute to shield the rest of the vectorizer from the odd node
2242 : representing an actual vector without any scalar ops.
2243 : ??? We could hide it completely with making the permute node
2244 : external? */
2245 2238 : node = vect_create_new_slp_node (node, stmts, 1);
2246 2238 : SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2247 2238 : SLP_TREE_LANE_PERMUTATION (node) = lperm;
2248 2238 : SLP_TREE_VECTYPE (node) = vectype;
2249 2238 : SLP_TREE_CHILDREN (node).quick_push (vnode);
2250 2238 : return node;
2251 : }
2252 : /* When discovery reaches an associatable operation see whether we can
2253 : improve that to match up lanes in a way superior to the operand
2254 : swapping code which at most looks at two defs.
2255 : ??? For BB vectorization we cannot do the brute-force search
2256 : for matching as we can succeed by means of builds from scalars
2257 : and have no good way to "cost" one build against another. */
2258 2951998 : else if (is_a <loop_vec_info> (vinfo)
2259 : /* Do not bother for single-lane SLP. */
2260 1623393 : && group_size > 1
2261 : /* ??? We don't handle !vect_internal_def defs below. */
2262 80168 : && STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2263 : /* ??? Do not associate a reduction, this will wreck REDUC_IDX
2264 : mapping as long as that exists on the stmt_info level. */
2265 63608 : && STMT_VINFO_REDUC_IDX (stmt_info) == -1
2266 58465 : && is_gimple_assign (stmt_info->stmt)
2267 58197 : && (associative_tree_code (gimple_assign_rhs_code (stmt_info->stmt))
2268 40629 : || gimple_assign_rhs_code (stmt_info->stmt) == MINUS_EXPR)
2269 2971155 : && ((FLOAT_TYPE_P (vectype) && flag_associative_math)
2270 11656 : || (INTEGRAL_TYPE_P (TREE_TYPE (vectype))
2271 9713 : && TYPE_OVERFLOW_WRAPS (TREE_TYPE (vectype)))))
2272 : {
2273 : /* See if we have a chain of (mixed) adds or subtracts or other
2274 : associatable ops. */
2275 13653 : enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
2276 13653 : if (code == MINUS_EXPR)
2277 686 : code = PLUS_EXPR;
2278 13653 : stmt_vec_info other_op_stmt_info = NULL;
2279 13653 : stmt_vec_info op_stmt_info = NULL;
2280 13653 : unsigned chain_len = 0;
2281 13653 : auto_vec<chain_op_t> chain;
2282 13653 : auto_vec<std::pair<tree_code, gimple *> > worklist;
2283 13653 : auto_vec<vec<chain_op_t> > chains (group_size);
2284 13653 : auto_vec<slp_tree, 4> children;
2285 13653 : bool hard_fail = true;
2286 14538 : for (unsigned lane = 0; lane < group_size; ++lane)
2287 : {
2288 14269 : if (!stmts[lane])
2289 : {
2290 : /* ??? Below we require lane zero is present. */
2291 0 : if (lane == 0)
2292 : {
2293 : hard_fail = false;
2294 13384 : break;
2295 : }
2296 0 : chains.quick_push (vNULL);
2297 0 : continue;
2298 : }
2299 : /* For each lane linearize the addition/subtraction (or other
2300 : uniform associatable operation) expression tree. */
2301 14269 : gimple *op_stmt = NULL, *other_op_stmt = NULL;
2302 14269 : vect_slp_linearize_chain (vinfo, worklist, chain, code,
2303 14269 : stmts[lane]->stmt, op_stmt, other_op_stmt,
2304 : NULL);
2305 14269 : if (!op_stmt_info && op_stmt)
2306 13123 : op_stmt_info = vinfo->lookup_stmt (op_stmt);
2307 14269 : if (!other_op_stmt_info && other_op_stmt)
2308 722 : other_op_stmt_info = vinfo->lookup_stmt (other_op_stmt);
2309 14269 : if (chain.length () == 2)
2310 : {
2311 : /* In a chain of just two elements resort to the regular
2312 : operand swapping scheme. Likewise if we run into a
2313 : length mismatch process regularly as well as we did not
2314 : process the other lanes we cannot report a good hint what
2315 : lanes to try swapping in the parent. */
2316 : hard_fail = false;
2317 : break;
2318 : }
2319 888 : else if (chain_len == 0)
2320 309 : chain_len = chain.length ();
2321 1158 : else if (chain.length () != chain_len)
2322 : {
2323 : /* ??? Here we could slip in magic to compensate with
2324 : neutral operands. */
2325 3 : matches[lane] = false;
2326 3 : if (lane != group_size - 1)
2327 3 : matches[0] = false;
2328 : break;
2329 : }
2330 885 : chains.quick_push (chain.copy ());
2331 885 : chain.truncate (0);
2332 : }
2333 27306 : if (chains.length () == group_size)
2334 : {
2335 : /* We cannot yet use SLP_TREE_CODE to communicate the operation. */
2336 269 : if (!op_stmt_info)
2337 : {
2338 2 : hard_fail = false;
2339 2 : goto out;
2340 : }
2341 : /* Now we have a set of chains with the same length. */
2342 : /* 1. pre-sort according to def_type and operation. */
2343 1042 : for (unsigned lane = 0; lane < group_size; ++lane)
2344 1550 : chains[lane].stablesort (dt_sort_cmp, vinfo);
2345 267 : if (dump_enabled_p ())
2346 : {
2347 145 : dump_printf_loc (MSG_NOTE, vect_location,
2348 : "pre-sorted chains of %s\n",
2349 : get_tree_code_name (code));
2350 649 : for (unsigned lane = 0; lane < group_size; ++lane)
2351 : {
2352 504 : if (!stmts[lane])
2353 0 : dump_printf (MSG_NOTE, "--");
2354 : else
2355 2326 : for (unsigned opnum = 0; opnum < chain_len; ++opnum)
2356 3644 : dump_printf (MSG_NOTE, "%s %T ",
2357 1822 : get_tree_code_name (chains[lane][opnum].code),
2358 1822 : chains[lane][opnum].op);
2359 504 : dump_printf (MSG_NOTE, "\n");
2360 : }
2361 : }
2362 : /* 2. try to build children nodes, associating as necessary. */
2363 : /* 2a. prepare and perform early checks to avoid eating into
2364 : discovery limit unnecessarily. */
2365 267 : vect_def_type *dts = XALLOCAVEC (vect_def_type, chain_len);
2366 1135 : for (unsigned n = 0; n < chain_len; ++n)
2367 : {
2368 868 : vect_def_type dt = chains[0][n].dt;
2369 868 : unsigned lane;
2370 3535 : for (lane = 0; lane < group_size; ++lane)
2371 5334 : if (stmts[lane] && chains[lane][n].dt != dt)
2372 : {
2373 0 : if (dt == vect_constant_def
2374 0 : && chains[lane][n].dt == vect_external_def)
2375 : dt = vect_external_def;
2376 0 : else if (dt == vect_external_def
2377 0 : && chains[lane][n].dt == vect_constant_def)
2378 : ;
2379 : else
2380 : break;
2381 : }
2382 868 : if (lane != group_size)
2383 : {
2384 0 : if (dump_enabled_p ())
2385 0 : dump_printf_loc (MSG_NOTE, vect_location,
2386 : "giving up on chain due to mismatched "
2387 : "def types\n");
2388 0 : matches[lane] = false;
2389 0 : if (lane != group_size - 1)
2390 0 : matches[0] = false;
2391 0 : goto out;
2392 : }
2393 868 : dts[n] = dt;
2394 868 : if (dt == vect_constant_def
2395 868 : || dt == vect_external_def)
2396 : {
2397 : /* Check whether we can build the invariant. If we can't
2398 : we never will be able to. */
2399 77 : tree type = TREE_TYPE (chains[0][n].op);
2400 868 : if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
2401 : && (TREE_CODE (type) == BOOLEAN_TYPE
2402 : || !can_duplicate_and_interleave_p (vinfo, group_size,
2403 : type)))
2404 : {
2405 : matches[0] = false;
2406 : goto out;
2407 : }
2408 : }
2409 791 : else if (dt != vect_internal_def)
2410 : {
2411 : /* Not sure, we might need sth special.
2412 : gcc.dg/vect/pr96854.c,
2413 : gfortran.dg/vect/fast-math-pr37021.f90
2414 : and gfortran.dg/vect/pr61171.f trigger. */
2415 : /* Soft-fail for now. */
2416 0 : hard_fail = false;
2417 0 : goto out;
2418 : }
2419 : }
2420 : /* 2b. do the actual build. */
2421 1081 : for (unsigned n = 0; n < chain_len; ++n)
2422 : {
2423 833 : vect_def_type dt = dts[n];
2424 833 : unsigned lane;
2425 833 : if (dt == vect_constant_def
2426 833 : || dt == vect_external_def)
2427 : {
2428 77 : vec<tree> ops;
2429 77 : ops.create (group_size);
2430 397 : for (lane = 0; lane < group_size; ++lane)
2431 243 : if (stmts[lane])
2432 243 : ops.quick_push (chains[lane][n].op);
2433 : else
2434 0 : ops.quick_push (NULL_TREE);
2435 77 : slp_tree child = vect_create_new_slp_node (ops);
2436 77 : SLP_TREE_DEF_TYPE (child) = dt;
2437 77 : children.safe_push (child);
2438 : }
2439 : else
2440 : {
2441 756 : vec<stmt_vec_info> op_stmts;
2442 756 : op_stmts.create (group_size);
2443 756 : slp_tree child = NULL;
2444 : /* Brute-force our way. We have to consider a lane
2445 : failing after fixing an earlier fail up in the
2446 : SLP discovery recursion. So track the current
2447 : permute per lane. */
2448 756 : unsigned *perms = XALLOCAVEC (unsigned, group_size);
2449 756 : memset (perms, 0, sizeof (unsigned) * group_size);
2450 835 : do
2451 : {
2452 835 : op_stmts.truncate (0);
2453 4248 : for (lane = 0; lane < group_size; ++lane)
2454 2578 : if (stmts[lane])
2455 2578 : op_stmts.quick_push
2456 2578 : (vinfo->lookup_def (chains[lane][n].op));
2457 : else
2458 0 : op_stmts.quick_push (NULL);
2459 835 : child = vect_build_slp_tree (vinfo, op_stmts,
2460 : group_size, &this_max_nunits,
2461 : matches, limit,
2462 : &this_tree_size, bst_map);
2463 : /* ??? We're likely getting too many fatal mismatches
2464 : here so maybe we want to ignore them (but then we
2465 : have no idea which lanes fatally mismatched). */
2466 835 : if (child || !matches[0])
2467 : break;
2468 : /* Swap another lane we have not yet matched up into
2469 : lanes that did not match. If we run out of
2470 : permute possibilities for a lane terminate the
2471 : search. */
2472 257 : bool term = false;
2473 257 : for (lane = 1; lane < group_size; ++lane)
2474 178 : if (!matches[lane])
2475 : {
2476 150 : if (n + perms[lane] + 1 == chain_len)
2477 : {
2478 : term = true;
2479 : break;
2480 : }
2481 131 : if (dump_enabled_p ())
2482 113 : dump_printf_loc (MSG_NOTE, vect_location,
2483 : "swapping operand %d and %d "
2484 : "of lane %d\n",
2485 : n, n + perms[lane] + 1, lane);
2486 262 : std::swap (chains[lane][n],
2487 131 : chains[lane][n + perms[lane] + 1]);
2488 131 : perms[lane]++;
2489 : }
2490 98 : if (term)
2491 : break;
2492 : }
2493 : while (1);
2494 756 : if (!child)
2495 : {
2496 19 : if (dump_enabled_p ())
2497 18 : dump_printf_loc (MSG_NOTE, vect_location,
2498 : "failed to match up op %d\n", n);
2499 19 : op_stmts.release ();
2500 19 : if (lane != group_size - 1)
2501 9 : matches[0] = false;
2502 : else
2503 10 : matches[lane] = false;
2504 19 : goto out;
2505 : }
2506 737 : if (dump_enabled_p ())
2507 : {
2508 397 : dump_printf_loc (MSG_NOTE, vect_location,
2509 : "matched up op %d to\n", n);
2510 397 : vect_print_slp_tree (MSG_NOTE, vect_location, child);
2511 : }
2512 737 : children.safe_push (child);
2513 : }
2514 : }
2515 : /* 3. build SLP nodes to combine the chain. */
2516 950 : for (unsigned lane = 0; lane < group_size; ++lane)
2517 1416 : if (stmts[lane] && chains[lane][0].code != code)
2518 : {
2519 : /* See if there's any alternate all-PLUS entry. */
2520 : unsigned n;
2521 6 : for (n = 1; n < chain_len; ++n)
2522 : {
2523 30 : for (lane = 0; lane < group_size; ++lane)
2524 48 : if (stmts[lane] && chains[lane][n].code != code)
2525 : break;
2526 6 : if (lane == group_size)
2527 : break;
2528 : }
2529 6 : if (n != chain_len)
2530 : {
2531 : /* Swap that in at first position. */
2532 6 : std::swap (children[0], children[n]);
2533 30 : for (lane = 0; lane < group_size; ++lane)
2534 24 : if (stmts[lane])
2535 24 : std::swap (chains[lane][0], chains[lane][n]);
2536 : }
2537 : else
2538 : {
2539 : /* ??? When this triggers and we end up with two
2540 : vect_constant/external_def up-front things break (ICE)
2541 : spectacularly finding an insertion place for the
2542 : all-constant op. We should have a fully
2543 : vect_internal_def operand though(?) so we can swap
2544 : that into first place and then prepend the all-zero
2545 : constant. */
2546 0 : if (dump_enabled_p ())
2547 0 : dump_printf_loc (MSG_NOTE, vect_location,
2548 : "inserting constant zero to compensate "
2549 : "for (partially) negated first "
2550 : "operand\n");
2551 0 : chain_len++;
2552 0 : for (lane = 0; lane < group_size; ++lane)
2553 0 : if (stmts[lane])
2554 0 : chains[lane].safe_insert
2555 0 : (0, chain_op_t (code, vect_constant_def, NULL_TREE));
2556 0 : vec<tree> zero_ops;
2557 0 : zero_ops.create (group_size);
2558 0 : zero_ops.quick_push (build_zero_cst (TREE_TYPE (vectype)));
2559 0 : for (lane = 1; lane < group_size; ++lane)
2560 0 : if (stmts[lane])
2561 0 : zero_ops.quick_push (zero_ops[0]);
2562 : else
2563 0 : zero_ops.quick_push (NULL_TREE);
2564 0 : slp_tree zero = vect_create_new_slp_node (zero_ops);
2565 0 : SLP_TREE_DEF_TYPE (zero) = vect_constant_def;
2566 0 : children.safe_insert (0, zero);
2567 : }
2568 : break;
2569 : }
2570 809 : for (unsigned i = 1; i < children.length (); ++i)
2571 : {
2572 561 : slp_tree op0 = children[i - 1];
2573 561 : slp_tree op1 = children[i];
2574 561 : bool this_two_op = false;
2575 2169 : for (unsigned lane = 0; lane < group_size; ++lane)
2576 3460 : if (stmts[lane] && chains[lane][i].code != chains[0][i].code)
2577 : {
2578 : this_two_op = true;
2579 : break;
2580 : }
2581 561 : slp_tree child;
2582 561 : if (i == children.length () - 1)
2583 248 : child = vect_create_new_slp_node (node, stmts, 2);
2584 : else
2585 313 : child = vect_create_new_slp_node (2, ERROR_MARK);
2586 561 : if (this_two_op)
2587 : {
2588 122 : vec<std::pair<unsigned, unsigned> > lperm;
2589 122 : lperm.create (group_size);
2590 462 : for (unsigned lane = 0; lane < group_size; ++lane)
2591 680 : lperm.quick_push (std::make_pair
2592 340 : (chains[lane][i].code != chains[0][i].code, lane));
2593 244 : vect_slp_build_two_operator_nodes (child, vectype, op0, op1,
2594 122 : (chains[0][i].code == code
2595 : ? op_stmt_info
2596 : : other_op_stmt_info),
2597 122 : (chains[0][i].code == code
2598 : ? other_op_stmt_info
2599 : : op_stmt_info),
2600 : lperm);
2601 : }
2602 : else
2603 : {
2604 439 : SLP_TREE_DEF_TYPE (child) = vect_internal_def;
2605 439 : SLP_TREE_VECTYPE (child) = vectype;
2606 439 : SLP_TREE_LANES (child) = group_size;
2607 439 : SLP_TREE_CHILDREN (child).quick_push (op0);
2608 439 : SLP_TREE_CHILDREN (child).quick_push (op1);
2609 439 : SLP_TREE_REPRESENTATIVE (child)
2610 878 : = (chains[0][i].code == code
2611 439 : ? op_stmt_info : other_op_stmt_info);
2612 : }
2613 561 : children[i] = child;
2614 : }
2615 248 : *tree_size += this_tree_size + 1;
2616 248 : *max_nunits = this_max_nunits;
2617 1244 : while (!chains.is_empty ())
2618 726 : chains.pop ().release ();
2619 : return node;
2620 : }
2621 13384 : out:
2622 13405 : if (dump_enabled_p ())
2623 2775 : dump_printf_loc (MSG_NOTE, vect_location,
2624 : "failed to line up SLP graph by re-associating "
2625 : "operations in lanes%s\n",
2626 : !hard_fail ? " trying regular discovery" : "");
2627 13410 : while (!children.is_empty ())
2628 5 : vect_free_slp_tree (children.pop ());
2629 13564 : while (!chains.is_empty ())
2630 159 : chains.pop ().release ();
2631 : /* Hard-fail, otherwise we might run into quadratic processing of the
2632 : chains starting one stmt into the chain again. */
2633 13405 : if (hard_fail)
2634 : return NULL;
2635 : /* Fall thru to normal processing. */
2636 13653 : }
2637 :
2638 : /* Get at the operands, verifying they are compatible. */
2639 2972023 : vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
2640 2972023 : slp_oprnd_info oprnd_info;
2641 15172127 : FOR_EACH_VEC_ELT (stmts, i, stmt_info)
2642 : {
2643 24402438 : int res = vect_get_and_check_slp_defs (vinfo, vectype,
2644 12201219 : swap[i], skip_args,
2645 : stmts, i, &oprnds_info);
2646 12201219 : if (res != 0)
2647 531005 : matches[(res == -1) ? 0 : i] = false;
2648 12201219 : if (!matches[0])
2649 : break;
2650 : }
2651 14871622 : for (i = 0; i < group_size; ++i)
2652 12110657 : if (!matches[i])
2653 : {
2654 211058 : vect_free_oprnd_info (oprnds_info);
2655 211058 : return NULL;
2656 : }
2657 8282895 : swap = NULL;
2658 :
2659 8282895 : bool has_two_operators_perm = false;
2660 16565790 : auto_vec<unsigned> two_op_perm_indices[2];
2661 2760965 : vec<stmt_vec_info> two_op_scalar_stmts[2] = {vNULL, vNULL};
2662 :
2663 2773070 : if (two_operators && oprnds_info.length () == 2 && group_size > 2)
2664 : {
2665 2723 : unsigned idx = 0;
2666 2723 : hash_map<gimple *, unsigned> seen;
2667 2723 : vec<slp_oprnd_info> new_oprnds_info
2668 2723 : = vect_create_oprnd_info (1, group_size);
2669 2723 : bool success = true;
2670 :
2671 2723 : enum tree_code code = ERROR_MARK;
2672 2723 : if (oprnds_info[0]->def_stmts[0]
2673 2723 : && is_a<gassign *> (oprnds_info[0]->def_stmts[0]->stmt))
2674 2665 : code = gimple_assign_rhs_code (oprnds_info[0]->def_stmts[0]->stmt);
2675 2723 : basic_block bb = nullptr;
2676 :
2677 5992 : for (unsigned j = 0; j < group_size; ++j)
2678 : {
2679 14323 : FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2680 : {
2681 11054 : stmt_vec_info stmt_info = oprnd_info->def_stmts[j];
2682 11054 : if (!stmt_info
2683 10843 : || !is_a<gassign *> (stmt_info->stmt)
2684 10840 : || gimple_assign_rhs_code (stmt_info->stmt) != code
2685 19783 : || skip_args[i])
2686 : {
2687 : success = false;
2688 2329 : break;
2689 : }
2690 : /* Avoid mixing lanes with defs in different basic-blocks. */
2691 8729 : if (!bb)
2692 2821 : bb = gimple_bb (vect_orig_stmt (stmt_info)->stmt);
2693 7428 : else if (gimple_bb (vect_orig_stmt (stmt_info)->stmt) != bb)
2694 : {
2695 : success = false;
2696 : break;
2697 : }
2698 :
2699 8725 : bool exists;
2700 8725 : unsigned &stmt_idx
2701 8725 : = seen.get_or_insert (stmt_info->stmt, &exists);
2702 :
2703 8725 : if (!exists)
2704 : {
2705 7676 : new_oprnds_info[0]->def_stmts.safe_push (stmt_info);
2706 7676 : new_oprnds_info[0]->ops.safe_push (oprnd_info->ops[j]);
2707 7676 : stmt_idx = idx;
2708 7676 : idx++;
2709 : }
2710 :
2711 8725 : two_op_perm_indices[i].safe_push (stmt_idx);
2712 : }
2713 :
2714 5598 : if (!success)
2715 : break;
2716 : }
2717 :
2718 2723 : if (success && idx == group_size)
2719 : {
2720 56 : if (dump_enabled_p ())
2721 : {
2722 0 : dump_printf_loc (MSG_NOTE, vect_location,
2723 : "Replace two_operators operands:\n");
2724 :
2725 0 : FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2726 : {
2727 0 : dump_printf_loc (MSG_NOTE, vect_location,
2728 : "Operand %u:\n", i);
2729 0 : for (unsigned j = 0; j < group_size; j++)
2730 0 : dump_printf_loc (MSG_NOTE, vect_location, "\tstmt %u %G",
2731 0 : j, oprnd_info->def_stmts[j]->stmt);
2732 : }
2733 :
2734 0 : dump_printf_loc (MSG_NOTE, vect_location,
2735 : "With a single operand:\n");
2736 0 : for (unsigned j = 0; j < group_size; j++)
2737 0 : dump_printf_loc (MSG_NOTE, vect_location, "\tstmt %u %G",
2738 0 : j, new_oprnds_info[0]->def_stmts[j]->stmt);
2739 : }
2740 :
2741 56 : two_op_scalar_stmts[0].safe_splice (oprnds_info[0]->def_stmts);
2742 56 : two_op_scalar_stmts[1].safe_splice (oprnds_info[1]->def_stmts);
2743 :
2744 56 : new_oprnds_info[0]->first_op_type = oprnds_info[0]->first_op_type;
2745 56 : new_oprnds_info[0]->first_dt = oprnds_info[0]->first_dt;
2746 56 : new_oprnds_info[0]->any_pattern = oprnds_info[0]->any_pattern;
2747 56 : new_oprnds_info[0]->first_gs_p = oprnds_info[0]->first_gs_p;
2748 56 : new_oprnds_info[0]->first_gs_info = oprnds_info[0]->first_gs_info;
2749 :
2750 56 : vect_free_oprnd_info (oprnds_info);
2751 56 : oprnds_info = new_oprnds_info;
2752 56 : nops = 1;
2753 56 : has_two_operators_perm = true;
2754 : }
2755 : else
2756 2667 : vect_free_oprnd_info (new_oprnds_info);
2757 2723 : }
2758 :
2759 5521930 : auto_vec<slp_tree, 4> children;
2760 :
2761 2760965 : stmt_info = stmts[0];
2762 :
2763 2760965 : int reduc_idx = -1;
2764 2760965 : int gs_scale = 0;
2765 2760965 : tree gs_base = NULL_TREE;
2766 :
2767 : /* Create SLP_TREE nodes for the definition node/s. */
2768 7059074 : FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2769 : {
2770 4385171 : slp_tree child = nullptr;
2771 4385171 : unsigned int j;
2772 :
2773 : /* We're skipping certain operands from processing, for example
2774 : outer loop reduction initial defs. */
2775 4385171 : if (skip_args[i])
2776 : {
2777 417013 : children.safe_push (NULL);
2778 4715122 : continue;
2779 : }
2780 :
2781 3968158 : if (oprnd_info->first_dt == vect_uninitialized_def)
2782 : {
2783 : /* COND_EXPR have one too many eventually if the condition
2784 : is a SSA name. */
2785 0 : gcc_assert (i == 3 && nops == 4);
2786 0 : continue;
2787 : }
2788 :
2789 3968158 : if (oprnd_info->first_gs_p)
2790 : {
2791 21765 : gs_scale = oprnd_info->first_gs_info.scale;
2792 21765 : gs_base = oprnd_info->first_gs_info.base;
2793 : }
2794 :
2795 3968158 : if (is_a <bb_vec_info> (vinfo)
2796 1560561 : && oprnd_info->first_dt == vect_internal_def
2797 4775279 : && !oprnd_info->any_pattern)
2798 : {
2799 : /* For BB vectorization, if all defs are the same do not
2800 : bother to continue the build along the single-lane
2801 : graph but use a splat of the scalar value. */
2802 764295 : stmt_vec_info first_def = oprnd_info->def_stmts[0];
2803 820050 : for (j = 1; j < group_size; ++j)
2804 779908 : if (oprnd_info->def_stmts[j] != first_def)
2805 : break;
2806 764295 : if (j == group_size
2807 : /* But avoid doing this for loads where we may be
2808 : able to CSE things, unless the stmt is not
2809 : vectorizable. */
2810 764295 : && (!STMT_VINFO_VECTORIZABLE (first_def)
2811 49400 : || !gimple_vuse (first_def->stmt)))
2812 : {
2813 30856 : if (dump_enabled_p ())
2814 93 : dump_printf_loc (MSG_NOTE, vect_location,
2815 : "Using a splat of the uniform operand %G",
2816 : first_def->stmt);
2817 30856 : oprnd_info->first_dt = vect_external_def;
2818 : }
2819 : }
2820 :
2821 3968158 : if (oprnd_info->first_dt == vect_external_def
2822 3968158 : || oprnd_info->first_dt == vect_constant_def)
2823 : {
2824 1388155 : if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ())
2825 : {
2826 : tree op0;
2827 : tree uniform_val = op0 = oprnd_info->ops[0];
2828 : for (j = 1; j < oprnd_info->ops.length (); ++j)
2829 : if (oprnd_info->ops[j]
2830 : && !operand_equal_p (uniform_val, oprnd_info->ops[j]))
2831 : {
2832 : uniform_val = NULL_TREE;
2833 : break;
2834 : }
2835 : if (!uniform_val
2836 : && !can_duplicate_and_interleave_p (vinfo,
2837 : oprnd_info->ops.length (),
2838 : TREE_TYPE (op0)))
2839 : {
2840 : matches[j] = false;
2841 : if (dump_enabled_p ())
2842 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2843 : "Build SLP failed: invalid type of def "
2844 : "for variable-length SLP %T\n", op0);
2845 : goto fail;
2846 : }
2847 : }
2848 1388155 : slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
2849 1388155 : SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
2850 1388155 : oprnd_info->ops = vNULL;
2851 1388155 : children.safe_push (invnode);
2852 1388155 : continue;
2853 1388155 : }
2854 :
2855 : /* See which SLP operand a reduction chain continues on. We want
2856 : to chain even PHIs but not backedges. */
2857 2580003 : if (STMT_VINFO_REDUC_DEF (oprnd_info->def_stmts[0])
2858 2580003 : || STMT_VINFO_REDUC_IDX (oprnd_info->def_stmts[0]) != -1)
2859 : {
2860 160448 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
2861 : {
2862 638 : if (oprnd_info->first_dt == vect_double_reduction_def)
2863 319 : reduc_idx = i;
2864 : }
2865 159810 : else if (is_a <gphi *> (stmt_info->stmt)
2866 159810 : && gimple_phi_num_args
2867 70190 : (as_a <gphi *> (stmt_info->stmt)) != 1)
2868 : ;
2869 89944 : else if (STMT_VINFO_REDUC_IDX (stmt_info) == -1
2870 324 : && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2871 : ;
2872 89944 : else if (reduc_idx == -1)
2873 85628 : reduc_idx = i;
2874 : else
2875 : /* For .COND_* reduction operations the else value can be the
2876 : same as one of the operation operands. The other def
2877 : stmts have been moved, so we can't check easily. Check
2878 : it's a call at least. */
2879 4316 : gcc_assert (is_a <gcall *> (stmt_info->stmt));
2880 : }
2881 :
2882 : /* When we have a masked load with uniform mask discover this
2883 : as a single-lane mask with a splat permute. This way we can
2884 : recognize this as a masked load-lane by stripping the splat. */
2885 2580003 : if (is_a <gcall *> (STMT_VINFO_STMT (stmt_info))
2886 34757 : && gimple_call_internal_p (STMT_VINFO_STMT (stmt_info),
2887 : IFN_MASK_LOAD)
2888 4737 : && STMT_VINFO_GROUPED_ACCESS (stmt_info)
2889 2580080 : && ! STMT_VINFO_SLP_VECT_ONLY (DR_GROUP_FIRST_ELEMENT (stmt_info)))
2890 : {
2891 35 : vec<stmt_vec_info> def_stmts2;
2892 35 : def_stmts2.create (1);
2893 35 : def_stmts2.quick_push (oprnd_info->def_stmts[0]);
2894 35 : child = vect_build_slp_tree (vinfo, def_stmts2, 1,
2895 : &this_max_nunits,
2896 : matches, limit,
2897 : &this_tree_size, bst_map);
2898 35 : if (child)
2899 : {
2900 35 : slp_tree pnode = vect_create_new_slp_node (1, VEC_PERM_EXPR);
2901 35 : SLP_TREE_VECTYPE (pnode) = SLP_TREE_VECTYPE (child);
2902 35 : SLP_TREE_LANES (pnode) = group_size;
2903 35 : SLP_TREE_SCALAR_STMTS (pnode).create (group_size);
2904 35 : SLP_TREE_LANE_PERMUTATION (pnode).create (group_size);
2905 210 : for (unsigned k = 0; k < group_size; ++k)
2906 : {
2907 175 : SLP_TREE_SCALAR_STMTS (pnode)
2908 175 : .quick_push (oprnd_info->def_stmts[0]);
2909 175 : SLP_TREE_LANE_PERMUTATION (pnode)
2910 175 : .quick_push (std::make_pair (0u, 0u));
2911 : }
2912 35 : SLP_TREE_CHILDREN (pnode).quick_push (child);
2913 35 : pnode->max_nunits = child->max_nunits;
2914 35 : children.safe_push (pnode);
2915 35 : oprnd_info->def_stmts = vNULL;
2916 35 : continue;
2917 35 : }
2918 : else
2919 0 : def_stmts2.release ();
2920 : }
2921 :
2922 2579968 : if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2923 : group_size, &this_max_nunits,
2924 : matches, limit,
2925 : &this_tree_size, bst_map)) != NULL)
2926 : {
2927 2128815 : oprnd_info->def_stmts = vNULL;
2928 2128815 : children.safe_push (child);
2929 2128815 : continue;
2930 : }
2931 :
2932 : /* If the SLP build for operand zero failed and operand zero
2933 : and one can be commutated try that for the scalar stmts
2934 : that failed the match. */
2935 451153 : if (i == 0
2936 : /* A first scalar stmt mismatch signals a fatal mismatch. */
2937 354897 : && matches[0]
2938 : /* ??? For COND_EXPRs we can swap the comparison operands
2939 : as well as the arms under some constraints. */
2940 168383 : && (nops == 2 || nops == 3)
2941 101205 : && oprnds_info[1]->first_dt == vect_internal_def
2942 55229 : && (is_gimple_assign (stmt_info->stmt)
2943 11374 : || is_gimple_call (stmt_info->stmt))
2944 : /* Swapping operands for reductions breaks assumptions later on. */
2945 495021 : && STMT_VINFO_REDUC_IDX (stmt_info) == -1)
2946 : {
2947 : /* See whether we can swap the matching or the non-matching
2948 : stmt operands. */
2949 : bool swap_not_matching = true;
2950 49309 : do
2951 : {
2952 7033912 : for (j = 0; j < group_size; ++j)
2953 : {
2954 6998376 : if (matches[j] != !swap_not_matching)
2955 64099 : continue;
2956 6934277 : stmt_vec_info stmt_info = stmts[j];
2957 : /* Verify if we can swap operands of this stmt. */
2958 6934277 : if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
2959 : {
2960 6934251 : tree_code code = gimple_assign_rhs_code (stmt);
2961 6934251 : if (! commutative_tree_code (code)
2962 6934251 : && ! commutative_ternary_tree_code (code))
2963 : {
2964 13749 : if (!swap_not_matching)
2965 6339 : goto fail;
2966 : swap_not_matching = false;
2967 : break;
2968 : }
2969 : }
2970 6984629 : else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
2971 : {
2972 26 : internal_fn fn = (gimple_call_internal_p (call)
2973 26 : ? gimple_call_internal_fn (call)
2974 : : IFN_LAST);
2975 26 : if ((! commutative_binary_fn_p (fn)
2976 26 : && ! commutative_ternary_fn_p (fn))
2977 28 : || first_commutative_argument (fn) != 0)
2978 : {
2979 24 : if (!swap_not_matching)
2980 12 : goto fail;
2981 : swap_not_matching = false;
2982 : break;
2983 : }
2984 : }
2985 : }
2986 : }
2987 42958 : while (j != group_size);
2988 :
2989 : /* Swap mismatched definition stmts. */
2990 35536 : if (dump_enabled_p ())
2991 345 : dump_printf_loc (MSG_NOTE, vect_location,
2992 : "Re-trying with swapped operands of stmts ");
2993 7011912 : for (j = 0; j < group_size; ++j)
2994 6976376 : if (matches[j] == !swap_not_matching)
2995 : {
2996 13840696 : std::swap (oprnds_info[0]->def_stmts[j],
2997 6920348 : oprnds_info[1]->def_stmts[j]);
2998 13840696 : std::swap (oprnds_info[0]->ops[j],
2999 6920348 : oprnds_info[1]->ops[j]);
3000 6920348 : if (dump_enabled_p ())
3001 938 : dump_printf (MSG_NOTE, "%d ", j);
3002 : }
3003 35536 : if (dump_enabled_p ())
3004 345 : dump_printf (MSG_NOTE, "\n");
3005 : /* After swapping some operands we lost track whether an
3006 : operand has any pattern defs so be conservative here. */
3007 67843 : if (oprnds_info[0]->any_pattern || oprnds_info[1]->any_pattern)
3008 3272 : oprnds_info[0]->any_pattern = oprnds_info[1]->any_pattern = true;
3009 : /* And try again with scratch 'matches' ... */
3010 35536 : bool *tem = XALLOCAVEC (bool, group_size);
3011 35536 : if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
3012 : group_size, &this_max_nunits,
3013 : tem, limit,
3014 : &this_tree_size, bst_map)) != NULL)
3015 : {
3016 5623 : oprnd_info->def_stmts = vNULL;
3017 5623 : children.safe_push (child);
3018 5623 : continue;
3019 : }
3020 : }
3021 445530 : fail:
3022 :
3023 : /* If the SLP build failed and we analyze a basic-block
3024 : simply treat nodes we fail to build as externally defined
3025 : (and thus build vectors from the scalar defs).
3026 : The cost model will reject outright expensive cases.
3027 : ??? This doesn't treat cases where permutation ultimatively
3028 : fails (or we don't try permutation below). Ideally we'd
3029 : even compute a permutation that will end up with the maximum
3030 : SLP tree size... */
3031 445530 : if (is_a <bb_vec_info> (vinfo)
3032 : /* ??? Rejecting patterns this way doesn't work. We'd have to
3033 : do extra work to cancel the pattern so the uses see the
3034 : scalar version. */
3035 393030 : && !is_pattern_stmt_p (stmt_info)
3036 814557 : && !oprnd_info->any_pattern)
3037 : {
3038 : /* But if there's a leading vector sized set of matching stmts
3039 : fail here so we can split the group. This matches the condition
3040 : vect_analyze_slp_instance uses. */
3041 : /* ??? We might want to split here and combine the results to support
3042 : multiple vector sizes better. */
3043 578979 : for (j = 0; j < group_size; ++j)
3044 578979 : if (!matches[j])
3045 : break;
3046 368766 : if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype))
3047 368737 : && vect_slp_can_convert_to_external (oprnd_info->def_stmts))
3048 : {
3049 358468 : if (dump_enabled_p ())
3050 501 : dump_printf_loc (MSG_NOTE, vect_location,
3051 : "Building vector operands from scalars\n");
3052 358468 : this_tree_size++;
3053 358468 : child = vect_create_new_slp_node (oprnd_info->ops);
3054 358468 : children.safe_push (child);
3055 358468 : oprnd_info->ops = vNULL;
3056 358468 : continue;
3057 : }
3058 : }
3059 :
3060 87062 : gcc_assert (child == NULL);
3061 97955 : FOR_EACH_VEC_ELT (children, j, child)
3062 10893 : if (child)
3063 10893 : vect_free_slp_tree (child);
3064 87062 : vect_free_oprnd_info (oprnds_info);
3065 87062 : return NULL;
3066 : }
3067 :
3068 2673903 : vect_free_oprnd_info (oprnds_info);
3069 :
3070 : /* If we have all children of a child built up from uniform scalars
3071 : or does more than one possibly expensive vector construction then
3072 : just throw that away, causing it built up from scalars.
3073 : The exception is the SLP node for the vector store. */
3074 2673903 : if (is_a <bb_vec_info> (vinfo)
3075 1087131 : && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
3076 : /* ??? Rejecting patterns this way doesn't work. We'd have to
3077 : do extra work to cancel the pattern so the uses see the
3078 : scalar version. */
3079 3106291 : && !is_pattern_stmt_p (stmt_info))
3080 : {
3081 : slp_tree child;
3082 : unsigned j;
3083 : bool all_uniform_p = true;
3084 : unsigned n_vector_builds = 0;
3085 1229078 : FOR_EACH_VEC_ELT (children, j, child)
3086 : {
3087 821750 : if (!child)
3088 : ;
3089 821750 : else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3090 : all_uniform_p = false;
3091 586721 : else if (!vect_slp_tree_uniform_p (child))
3092 : {
3093 446959 : all_uniform_p = false;
3094 446959 : if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
3095 412845 : n_vector_builds++;
3096 : }
3097 : }
3098 407328 : if (all_uniform_p
3099 407328 : || n_vector_builds > 1
3100 691365 : || (n_vector_builds == children.length ()
3101 30174 : && is_a <gphi *> (stmt_info->stmt)))
3102 : {
3103 : /* Roll back. */
3104 128090 : matches[0] = false;
3105 407072 : FOR_EACH_VEC_ELT (children, j, child)
3106 278982 : if (child)
3107 278982 : vect_free_slp_tree (child);
3108 :
3109 128090 : if (dump_enabled_p ())
3110 129 : dump_printf_loc (MSG_NOTE, vect_location,
3111 : "Building parent vector operands from "
3112 : "scalars instead\n");
3113 128090 : return NULL;
3114 : }
3115 : }
3116 :
3117 2545813 : *tree_size += this_tree_size + 1;
3118 2545813 : *max_nunits = this_max_nunits;
3119 :
3120 2545813 : if (two_operators)
3121 : {
3122 : /* ??? We'd likely want to either cache in bst_map sth like
3123 : { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
3124 : the true { a+b, a+b, a+b, a+b } ... but there we don't have
3125 : explicit stmts to put in so the keying on 'stmts' doesn't
3126 : work (but we have the same issue with nodes that use 'ops'). */
3127 :
3128 5908 : if (has_two_operators_perm)
3129 : {
3130 22 : slp_tree child = children[0];
3131 22 : children.truncate (0);
3132 66 : for (i = 0; i < 2; i++)
3133 : {
3134 44 : slp_tree pnode
3135 44 : = vect_create_new_slp_node (two_op_scalar_stmts[i], 2);
3136 44 : SLP_TREE_CODE (pnode) = VEC_PERM_EXPR;
3137 44 : SLP_TREE_VECTYPE (pnode) = vectype;
3138 44 : SLP_TREE_CHILDREN (pnode).quick_push (child);
3139 44 : SLP_TREE_CHILDREN (pnode).quick_push (child);
3140 44 : lane_permutation_t& perm = SLP_TREE_LANE_PERMUTATION (pnode);
3141 44 : children.safe_push (pnode);
3142 :
3143 476 : for (unsigned j = 0; j < stmts.length (); j++)
3144 432 : perm.safe_push (std::make_pair (0, two_op_perm_indices[i][j]));
3145 : }
3146 :
3147 22 : SLP_TREE_REF_COUNT (child) += 4;
3148 : }
3149 :
3150 5908 : slp_tree one = new _slp_tree;
3151 5908 : slp_tree two = new _slp_tree;
3152 5908 : SLP_TREE_DEF_TYPE (one) = vect_internal_def;
3153 5908 : SLP_TREE_DEF_TYPE (two) = vect_internal_def;
3154 5908 : SLP_TREE_VECTYPE (one) = vectype;
3155 5908 : SLP_TREE_VECTYPE (two) = vectype;
3156 5908 : SLP_TREE_CHILDREN (one).safe_splice (children);
3157 5908 : SLP_TREE_CHILDREN (two).safe_splice (children);
3158 5908 : slp_tree child;
3159 23634 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
3160 11818 : SLP_TREE_REF_COUNT (child)++;
3161 :
3162 : /* Here we record the original defs since this
3163 : node represents the final lane configuration. */
3164 5908 : node = vect_create_new_slp_node (node, stmts, 2);
3165 5908 : SLP_TREE_VECTYPE (node) = vectype;
3166 5908 : SLP_TREE_CODE (node) = VEC_PERM_EXPR;
3167 5908 : SLP_TREE_CHILDREN (node).quick_push (one);
3168 5908 : SLP_TREE_CHILDREN (node).quick_push (two);
3169 5908 : enum tree_code code0 = ERROR_MARK;
3170 5908 : enum tree_code ocode = ERROR_MARK;
3171 5908 : if (gassign *stmt = dyn_cast <gassign *> (stmts[0]->stmt))
3172 5906 : code0 = gimple_assign_rhs_code (stmt);
3173 5908 : stmt_vec_info ostmt_info;
3174 5908 : unsigned j = 0;
3175 22009 : FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
3176 : {
3177 16101 : int op = 0;
3178 16101 : if (gassign *ostmt = dyn_cast <gassign *> (ostmt_info->stmt))
3179 : {
3180 16097 : if (gimple_assign_rhs_code (ostmt) != code0)
3181 : {
3182 8083 : ocode = gimple_assign_rhs_code (ostmt);
3183 : op = 1;
3184 : j = i;
3185 : }
3186 : }
3187 : else
3188 : {
3189 8 : if (gimple_call_combined_fn (stmts[0]->stmt)
3190 4 : != gimple_call_combined_fn (ostmt_info->stmt))
3191 : {
3192 2 : op = 1;
3193 2 : j = i;
3194 : }
3195 : }
3196 16101 : SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (op, i));
3197 : }
3198 5908 : SLP_TREE_CODE (one) = code0;
3199 5908 : SLP_TREE_CODE (two) = ocode;
3200 5908 : SLP_TREE_LANES (one) = stmts.length ();
3201 5908 : SLP_TREE_LANES (two) = stmts.length ();
3202 5908 : SLP_TREE_REPRESENTATIVE (one) = stmts[0];
3203 5908 : SLP_TREE_REPRESENTATIVE (two) = stmts[j];
3204 :
3205 5908 : return node;
3206 : }
3207 :
3208 2539905 : node = vect_create_new_slp_node (node, stmts, nops);
3209 2539905 : SLP_TREE_VECTYPE (node) = vectype;
3210 2539905 : SLP_TREE_CHILDREN (node).splice (children);
3211 2539905 : SLP_TREE_GS_SCALE (node) = gs_scale;
3212 2539905 : SLP_TREE_GS_BASE (node) = gs_base;
3213 2539905 : if (reduc_idx != -1)
3214 : {
3215 80797 : gcc_assert (STMT_VINFO_REDUC_IDX (stmt_info) != -1
3216 : || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
3217 : || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def);
3218 80797 : SLP_TREE_REDUC_IDX (node) = reduc_idx;
3219 80797 : node->cycle_info.id = SLP_TREE_CHILDREN (node)[reduc_idx]->cycle_info.id;
3220 : }
3221 : /* When reaching the reduction PHI, create a vect_reduc_info. */
3222 2459108 : else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
3223 2459108 : || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
3224 2459108 : && is_a <gphi *> (STMT_VINFO_STMT (stmt_info)))
3225 : {
3226 71280 : loop_vec_info loop_vinfo = as_a <loop_vec_info> (vinfo);
3227 71280 : gcc_assert (STMT_VINFO_REDUC_IDX (stmt_info) == -1);
3228 71280 : node->cycle_info.id = loop_vinfo->reduc_infos.length ();
3229 71280 : vect_reduc_info reduc_info = new vect_reduc_info_s ();
3230 71280 : loop_vinfo->reduc_infos.safe_push (reduc_info);
3231 71280 : stmt_vec_info reduc_phi = stmt_info;
3232 : /* ??? For double reductions vect_is_simple_reduction stores the
3233 : reduction type and code on the inner loop header PHI. */
3234 71280 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
3235 : {
3236 319 : use_operand_p use_p;
3237 319 : gimple *use_stmt;
3238 319 : bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
3239 : &use_p, &use_stmt);
3240 319 : gcc_assert (res);
3241 319 : reduc_phi = loop_vinfo->lookup_stmt (use_stmt);
3242 : }
3243 71280 : VECT_REDUC_INFO_DEF_TYPE (reduc_info) = STMT_VINFO_DEF_TYPE (stmt_info);
3244 71280 : VECT_REDUC_INFO_TYPE (reduc_info) = STMT_VINFO_REDUC_TYPE (reduc_phi);
3245 71280 : VECT_REDUC_INFO_CODE (reduc_info) = STMT_VINFO_REDUC_CODE (reduc_phi);
3246 71280 : VECT_REDUC_INFO_FN (reduc_info) = IFN_LAST;
3247 : }
3248 : return node;
3249 8282895 : }
3250 :
3251 : /* Dump a single SLP tree NODE. */
3252 :
3253 : static void
3254 437743 : vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
3255 : slp_tree node)
3256 : {
3257 437743 : unsigned i, j;
3258 437743 : slp_tree child;
3259 437743 : stmt_vec_info stmt_info;
3260 437743 : tree op;
3261 :
3262 437743 : dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
3263 437743 : dump_user_location_t user_loc = loc.get_user_location ();
3264 437743 : dump_printf_loc (metadata, user_loc,
3265 : "node%s %p (max_nunits=" HOST_WIDE_INT_PRINT_UNSIGNED
3266 : ", refcnt=%u)",
3267 437743 : SLP_TREE_DEF_TYPE (node) == vect_external_def
3268 : ? " (external)"
3269 : : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
3270 422504 : ? " (constant)"
3271 : : ""), (void *) node,
3272 437743 : estimated_poly_value (node->max_nunits),
3273 : SLP_TREE_REF_COUNT (node));
3274 437743 : if (SLP_TREE_VECTYPE (node))
3275 371419 : dump_printf (metadata, " %T", SLP_TREE_VECTYPE (node));
3276 437743 : dump_printf (metadata, "%s",
3277 437743 : node->avoid_stlf_fail ? " (avoid-stlf-fail)" : "");
3278 437743 : if (node->cycle_info.id != -1 || node->cycle_info.reduc_idx != -1)
3279 23089 : dump_printf (metadata, " cycle %d, link %d", node->cycle_info.id,
3280 : node->cycle_info.reduc_idx);
3281 437743 : dump_printf (metadata, "\n");
3282 437743 : if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
3283 : {
3284 356053 : if (SLP_TREE_PERMUTE_P (node))
3285 13548 : dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
3286 : else
3287 342505 : dump_printf_loc (metadata, user_loc, "op template: %G",
3288 342505 : SLP_TREE_REPRESENTATIVE (node)->stmt);
3289 : }
3290 437743 : if (SLP_TREE_SCALAR_STMTS (node).exists ())
3291 853136 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3292 505124 : if (stmt_info)
3293 499843 : dump_printf_loc (metadata, user_loc, "\t%sstmt %u %G",
3294 499843 : STMT_VINFO_LIVE_P (stmt_info) ? "[l] " : "",
3295 : i, stmt_info->stmt);
3296 : else
3297 5281 : dump_printf_loc (metadata, user_loc, "\tstmt %u ---\n", i);
3298 : else
3299 : {
3300 89731 : dump_printf_loc (metadata, user_loc, "\t{ ");
3301 287370 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
3302 107908 : dump_printf (metadata, "%T%s ", op,
3303 107908 : i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
3304 89731 : dump_printf (metadata, "}\n");
3305 : }
3306 437743 : if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
3307 : {
3308 62198 : dump_printf_loc (metadata, user_loc, "\tload permutation {");
3309 204260 : FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
3310 79864 : dump_printf (dump_kind, " %u", j);
3311 62198 : dump_printf (dump_kind, " }\n");
3312 : }
3313 437743 : if (SLP_TREE_LANE_PERMUTATION (node).exists ())
3314 : {
3315 13556 : dump_printf_loc (metadata, user_loc, "\tlane permutation {");
3316 64464 : for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
3317 37352 : dump_printf (dump_kind, " %u[%u]",
3318 37352 : SLP_TREE_LANE_PERMUTATION (node)[i].first,
3319 37352 : SLP_TREE_LANE_PERMUTATION (node)[i].second);
3320 13556 : dump_printf (dump_kind, " }%s\n",
3321 13556 : node->ldst_lanes ? " (load-lanes)" : "");
3322 : }
3323 437743 : if (SLP_TREE_CHILDREN (node).is_empty ())
3324 166051 : return;
3325 271692 : dump_printf_loc (metadata, user_loc, "\tchildren");
3326 988783 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3327 445399 : dump_printf (dump_kind, " %p", (void *)child);
3328 271692 : dump_printf (dump_kind, "%s\n",
3329 271692 : node->ldst_lanes && !SLP_TREE_LANE_PERMUTATION (node).exists ()
3330 : ? " (store-lanes)" : "");
3331 : }
3332 :
3333 : DEBUG_FUNCTION void
3334 0 : debug (slp_tree node)
3335 : {
3336 0 : debug_dump_context ctx;
3337 0 : vect_print_slp_tree (MSG_NOTE,
3338 0 : dump_location_t::from_location_t (UNKNOWN_LOCATION),
3339 : node);
3340 0 : }
3341 :
3342 : /* Recursive helper for the dot producer below. */
3343 :
3344 : static void
3345 0 : dot_slp_tree (FILE *f, slp_tree node, hash_set<slp_tree> &visited)
3346 : {
3347 0 : if (visited.add (node))
3348 : return;
3349 :
3350 0 : fprintf (f, "\"%p\" [label=\"", (void *)node);
3351 0 : vect_print_slp_tree (MSG_NOTE,
3352 0 : dump_location_t::from_location_t (UNKNOWN_LOCATION),
3353 : node);
3354 0 : fprintf (f, "\"];\n");
3355 :
3356 :
3357 0 : for (slp_tree child : SLP_TREE_CHILDREN (node))
3358 0 : fprintf (f, "\"%p\" -> \"%p\";", (void *)node, (void *)child);
3359 :
3360 0 : for (slp_tree child : SLP_TREE_CHILDREN (node))
3361 0 : if (child)
3362 0 : dot_slp_tree (f, child, visited);
3363 : }
3364 :
3365 : DEBUG_FUNCTION void
3366 0 : dot_slp_tree (const char *fname, slp_tree node)
3367 : {
3368 0 : FILE *f = fopen (fname, "w");
3369 0 : fprintf (f, "digraph {\n");
3370 0 : fflush (f);
3371 0 : {
3372 0 : debug_dump_context ctx (f);
3373 0 : hash_set<slp_tree> visited;
3374 0 : dot_slp_tree (f, node, visited);
3375 0 : }
3376 0 : fflush (f);
3377 0 : fprintf (f, "}\n");
3378 0 : fclose (f);
3379 0 : }
3380 :
3381 : DEBUG_FUNCTION void
3382 0 : dot_slp_tree (const char *fname, const vec<slp_instance> &slp_instances)
3383 : {
3384 0 : FILE *f = fopen (fname, "w");
3385 0 : fprintf (f, "digraph {\n");
3386 0 : fflush (f);
3387 0 : {
3388 0 : debug_dump_context ctx (f);
3389 0 : hash_set<slp_tree> visited;
3390 0 : for (auto inst : slp_instances)
3391 0 : dot_slp_tree (f, SLP_INSTANCE_TREE (inst), visited);
3392 0 : }
3393 0 : fflush (f);
3394 0 : fprintf (f, "}\n");
3395 0 : fclose (f);
3396 0 : }
3397 :
3398 : /* Dump a slp tree NODE using flags specified in DUMP_KIND. */
3399 :
3400 : static void
3401 476877 : vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
3402 : slp_tree node, hash_set<slp_tree> &visited)
3403 : {
3404 476877 : unsigned i;
3405 476877 : slp_tree child;
3406 :
3407 476877 : if (visited.add (node))
3408 476877 : return;
3409 :
3410 437293 : vect_print_slp_tree (dump_kind, loc, node);
3411 :
3412 1319471 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3413 444885 : if (child)
3414 403026 : vect_print_slp_graph (dump_kind, loc, child, visited);
3415 : }
3416 :
/* Dump the whole SLP graph rooted at ENTRY with DUMP_KIND, setting up
   a fresh visited set for the recursive worker above.  */

static void
vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
		      slp_tree entry)
{
  hash_set<slp_tree> visited;
  vect_print_slp_graph (dump_kind, loc, entry, visited);
}
3424 :
/* Debugger entry point: print the SLP graph of INSTANCE to stderr
   via the debug dump context.  */

DEBUG_FUNCTION void
debug (slp_instance instance)
{
  debug_dump_context ctx;
  vect_print_slp_graph (MSG_NOTE,
			dump_location_t::from_location_t (UNKNOWN_LOCATION),
			SLP_INSTANCE_TREE (instance));
}
3433 :
/* Mark the tree rooted at NODE with PURE_SLP, i.e. flag every scalar
   stmt represented in the graph as being vectorized purely as part of
   SLP.  VISITED avoids re-walking shared subtrees; external and
   constant defs are leaves and carry no stmts to mark.  */

static void
vect_mark_slp_stmts (vec_info *vinfo, slp_tree node,
		     hash_set<slp_tree> &visited)
{
  int i;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    if (stmt_info)
      {
	STMT_SLP_TYPE (stmt_info) = pure_slp;
	/* ??? For .MASK_LOAD and .MASK_STORE detected as load/store-lanes
	   when there is the mask_conversion pattern applied we have lost the
	   alternate lanes of the uniform mask which nevertheless
	   have separate pattern defs.  To not confuse hybrid
	   analysis we mark those as covered as well here.  */
	if (node->ldst_lanes)
	  if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
	    if (gimple_call_internal_p (call, IFN_MASK_LOAD)
		|| gimple_call_internal_p (call, IFN_MASK_STORE))
	      {
		/* Locate the mask operand and mark its (pattern) def
		   stmt as pure SLP, too.  */
		tree mask = gimple_call_arg (call,
			       internal_fn_mask_index
				 (gimple_call_internal_fn (call)));
		if (TREE_CODE (mask) == SSA_NAME)
		  if (stmt_vec_info mask_info = vinfo->lookup_def (mask))
		    {
		      mask_info = vect_stmt_to_vectorize (mask_info);
		      STMT_SLP_TYPE (mask_info) = pure_slp;
		    }
	      }
      }

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (child)
      vect_mark_slp_stmts (vinfo, child, visited);
}
3480 :
/* Public entry: mark the whole tree rooted at NODE with PURE_SLP,
   setting up a fresh visited set for the recursive worker above.  */

static void
vect_mark_slp_stmts (vec_info *vinfo, slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_mark_slp_stmts (vinfo, node, visited);
}
3487 :
3488 : /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
3489 :
3490 : static void
3491 2319255 : vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
3492 : {
3493 2319255 : int i;
3494 2319255 : stmt_vec_info stmt_info;
3495 2319255 : slp_tree child;
3496 :
3497 2319255 : if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
3498 : return;
3499 :
3500 1362522 : if (visited.add (node))
3501 : return;
3502 :
3503 4239704 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3504 2983340 : if (stmt_info)
3505 : {
3506 2983340 : gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
3507 : || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
3508 2983340 : STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
3509 : }
3510 :
3511 2802391 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3512 1546027 : if (child)
3513 1546027 : vect_mark_slp_stmts_relevant (child, visited);
3514 : }
3515 :
/* Public entry: mark all stmts in the tree rooted at NODE as relevant,
   setting up a fresh visited set for the recursive worker above.  */

static void
vect_mark_slp_stmts_relevant (slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_mark_slp_stmts_relevant (node, visited);
}
3522 :
3523 :
/* Gather load nodes in the SLP graph rooted at NODE and push them onto
   LOADS.  A load node is a non-permute internal node whose
   representative stmt has a data reference that is a read.  VISITED
   prevents duplicates for shared subtrees.  */

static void
vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
		       hash_set<slp_tree> &visited)
{
  if (!node || visited.add (node))
    return;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  if (!SLP_TREE_PERMUTE_P (node))
    {
      stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
      if (STMT_VINFO_DATA_REF (stmt_info)
	  && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
	loads.safe_push (node);
    }

  unsigned i;
  slp_tree child;
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_gather_slp_loads (loads, child, visited);
}
3549 :
3550 :
3551 : /* Find the last store in SLP INSTANCE. */
3552 :
3553 : stmt_vec_info
3554 2706035 : vect_find_last_scalar_stmt_in_slp (slp_tree node)
3555 : {
3556 2706035 : stmt_vec_info last = NULL;
3557 2706035 : stmt_vec_info stmt_vinfo;
3558 :
3559 9853121 : for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
3560 7147086 : if (stmt_vinfo)
3561 : {
3562 7147086 : stmt_vinfo = vect_orig_stmt (stmt_vinfo);
3563 7147086 : last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
3564 : }
3565 :
3566 2706035 : return last;
3567 : }
3568 :
3569 : /* Find the first stmt in NODE. */
3570 :
3571 : stmt_vec_info
3572 520860 : vect_find_first_scalar_stmt_in_slp (slp_tree node)
3573 : {
3574 520860 : stmt_vec_info first = NULL;
3575 520860 : stmt_vec_info stmt_vinfo;
3576 :
3577 1754885 : for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
3578 1234025 : if (stmt_vinfo)
3579 : {
3580 1231331 : stmt_vinfo = vect_orig_stmt (stmt_vinfo);
3581 1231331 : if (!first
3582 1231331 : || get_later_stmt (stmt_vinfo, first) == first)
3583 : first = stmt_vinfo;
3584 : }
3585 :
3586 520860 : return first;
3587 : }
3588 :
/* Splits a group of stores, currently beginning at FIRST_VINFO, into
   two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
   (also containing the first GROUP1_SIZE stmts, since stores are
   consecutive), the second containing the remainder.
   Return the first stmt in the second group.  */

static stmt_vec_info
vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
{
  /* Only valid on the head of a group, and only if a non-empty second
     group remains.  */
  gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
  gcc_assert (group1_size > 0);
  int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
  gcc_assert (group2_size > 0);
  DR_GROUP_SIZE (first_vinfo) = group1_size;

  /* Walk to the last element of the first group; store groups are
     gap-free (each element one apart).  */
  stmt_vec_info stmt_info = first_vinfo;
  for (unsigned i = group1_size; i > 1; i--)
    {
      stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
      gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
    }
  /* STMT is now the last element of the first group.  */
  stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
  /* Terminate the first group's chain.  */
  DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;

  /* Re-head all elements of the second group onto GROUP2.  */
  DR_GROUP_SIZE (group2) = group2_size;
  for (stmt_info = group2; stmt_info;
       stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
    {
      DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
      gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
    }

  /* For the second group, the DR_GROUP_GAP is that before the original group,
     plus skipping over the first vector.  */
  DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;

  /* DR_GROUP_GAP of the first group now has to skip over the second group too.  */
  DR_GROUP_GAP (first_vinfo) += group2_size;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
		     group1_size, group2_size);

  return group2;
}
3635 :
3636 : /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
3637 : statements and a vector of NUNITS elements. */
3638 :
3639 : static poly_uint64
3640 3666989 : calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
3641 : {
3642 3666989 : return exact_div (common_multiple (nunits, group_size), group_size);
3643 : }
3644 :
3645 : /* Helper that checks to see if a node is a load node. */
3646 :
3647 : static inline bool
3648 54 : vect_is_slp_load_node (slp_tree root)
3649 : {
3650 54 : return (!SLP_TREE_PERMUTE_P (root)
3651 54 : && SLP_TREE_DEF_TYPE (root) == vect_internal_def
3652 48 : && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
3653 94 : && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root))));
3654 : }
3655 :
3656 :
/* Helper function of optimize_load_redistribution that performs the operation
   recursively.  Returns a replacement load node when ROOT is a permute
   that blends only load lanes, otherwise NULL.  LOAD_MAP caches the
   result per node (NULL meaning "no replacement") so shared subtrees
   are processed once.  */

static slp_tree
optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
				vec_info *vinfo, unsigned int group_size,
				hash_map<slp_tree, slp_tree> *load_map,
				slp_tree root)
{
  /* Already processed (possibly with a NULL "no replacement" answer).  */
  if (slp_tree *leader = load_map->get (root))
    return *leader;

  slp_tree node;
  unsigned i;

  /* For now, we don't know anything about externals so do not do anything.  */
  if (!root || SLP_TREE_DEF_TYPE (root) != vect_internal_def)
    return NULL;
  else if (SLP_TREE_PERMUTE_P (root))
    {
      /* First convert this node into a load node and add it to the leaves
	 list and flatten the permute from a lane to a load one.  If it's
	 unneeded it will be elided later.  */
      vec<stmt_vec_info> stmts;
      stmts.create (SLP_TREE_LANES (root));
      lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
      for (unsigned j = 0; j < lane_perm.length (); j++)
	{
	  std::pair<unsigned, unsigned> perm = lane_perm[j];
	  node = SLP_TREE_CHILDREN (root)[perm.first];

	  /* Every permuted-in lane must come from a leaf load node;
	     otherwise fall through to plain recursion below.  */
	  if (!vect_is_slp_load_node (node)
	      || SLP_TREE_CHILDREN (node).exists ())
	    {
	      stmts.release ();
	      goto next;
	    }

	  stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
	}

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "converting stmts on permute node %p\n",
			 (void *) root);

      /* Rebuild the gathered lanes as a (possibly permuted) load via
	 regular SLP discovery, CSEd through BST_MAP.  */
      bool *matches = XALLOCAVEC (bool, group_size);
      poly_uint64 max_nunits = 1;
      unsigned tree_size = 0, limit = 1;
      node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
				  matches, &limit, &tree_size, bst_map);
      if (!node)
	stmts.release ();

      load_map->put (root, node);
      return node;
    }

 next:
  /* Cache the "no replacement" answer before recursing into children.  */
  load_map->put (root, NULL);

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
    {
      slp_tree value
	= optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
					  node);
      if (value)
	{
	  /* Splice the replacement in and release the old child.  */
	  SLP_TREE_REF_COUNT (value)++;
	  SLP_TREE_CHILDREN (root)[i] = value;
	  /* ??? We know the original leafs of the replaced nodes will
	     be referenced by bst_map, only the permutes created by
	     pattern matching are not.  */
	  if (SLP_TREE_REF_COUNT (node) == 1)
	    load_map->remove (node);
	  vect_free_slp_tree (node);
	}
    }

  return NULL;
}
3738 :
/* Temporary workaround for loads not being CSEd during SLP build.  This
   function will traverse the SLP tree rooted in ROOT for INSTANCE and find
   VEC_PERM nodes that blend vectors from multiple nodes that all read from the
   same DR such that the final operation is equal to a permuted load.  Such
   NODES are then directly converted into LOADS themselves.  The nodes are
   CSEd using BST_MAP.  */

static void
optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
			      vec_info *vinfo, unsigned int group_size,
			      hash_map<slp_tree, slp_tree> *load_map,
			      slp_tree root)
{
  slp_tree node;
  unsigned i;

  /* ROOT itself is never replaced; process each child and splice in
     replacements, mirroring the child loop of the recursive worker.  */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
    {
      slp_tree value
	= optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
					  node);
      if (value)
	{
	  SLP_TREE_REF_COUNT (value)++;
	  SLP_TREE_CHILDREN (root)[i] = value;
	  /* ??? We know the original leafs of the replaced nodes will
	     be referenced by bst_map, only the permutes created by
	     pattern matching are not.  */
	  if (SLP_TREE_REF_COUNT (node) == 1)
	    load_map->remove (node);
	  vect_free_slp_tree (node);
	}
    }
}
3773 :
/* Helper function of vect_match_slp_patterns.

   Attempts to match patterns against the slp tree rooted in REF_NODE using
   VINFO.  Patterns are matched in post-order traversal.

   If matching is successful the value in REF_NODE is updated and returned, if
   not then it is returned unchanged.  Returns true iff at least one
   pattern matched anywhere in the (sub)tree.  */

static bool
vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
			   slp_tree_to_load_perm_map_t *perm_cache,
			   slp_compat_nodes_map_t *compat_cache,
			   hash_set<slp_tree> *visited)
{
  unsigned i;
  slp_tree node = *ref_node;
  bool found_p = false;
  if (!node || visited->add (node))
    return false;

  /* Post-order: children first, so matched children are in place
     before patterns inspect this node.  */
  slp_tree child;
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
					  vinfo, perm_cache, compat_cache,
					  visited);

  /* Try each registered pattern in order; multiple may match and
     rewrite *REF_NODE in turn.  */
  for (unsigned x = 0; x < num__slp_patterns; x++)
    {
      vect_pattern *pattern
	= slp_patterns[x] (perm_cache, compat_cache, ref_node);
      if (pattern)
	{
	  pattern->build (vinfo);
	  delete pattern;
	  found_p = true;
	}
    }

  return found_p;
}
3814 :
/* Applies pattern matching to the given SLP tree rooted in REF_NODE using
   vec_info VINFO.

   The modified tree is returned.  Patterns are tried in order and multiple
   patterns may match.  Returns true iff any pattern matched; VISITED,
   PERM_CACHE and COMPAT_CACHE are shared across instances so work is
   not repeated on common subtrees.  */

static bool
vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
			 hash_set<slp_tree> *visited,
			 slp_tree_to_load_perm_map_t *perm_cache,
			 slp_compat_nodes_map_t *compat_cache)
{
  DUMP_VECT_SCOPE ("vect_match_slp_patterns");
  slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "Analyzing SLP tree %p for patterns\n",
		     (void *) SLP_INSTANCE_TREE (instance));

  return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, compat_cache,
				    visited);
}
3838 :
/* STMT_INFO is a store group of size GROUP_SIZE that we are considering
   splitting to NEW_GROUP_SIZE and vectorizing with VECTYPE that might be
   NULL.  MASKED_P indicates whether the stores are masked.
   Return true if we could use IFN_STORE_LANES instead and if that appears
   to be the better approach.  */

static bool
vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
			       tree vectype, bool masked_p,
			       unsigned int group_size,
			       unsigned int new_group_size)
{
  /* Derive the vector type from the stored scalar when not given.  */
  if (!vectype)
    {
      tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
      vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
    }
  if (!vectype)
    return false;
  /* Allow the split if one of the two new groups would operate on full
     vectors *within* rather than across one scalar loop iteration.
     This is purely a heuristic, but it should work well for group
     sizes of 3 and 4, where the possible splits are:

       3->2+1:  OK if the vector has exactly two elements
       4->2+2:  Likewise
       4->3+1:  Less clear-cut.  */
  if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
      || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
    return false;
  /* Otherwise prefer store-lanes when the target supports it.  */
  return vect_store_lanes_supported (vectype, group_size, masked_p) != IFN_LAST;
}
3871 :
3872 : /* Analyze an SLP instance starting from a group of grouped stores. Call
3873 : vect_build_slp_tree to build a tree of packed stmts if possible.
3874 : Return FALSE if it's impossible to SLP any stmt in the loop. */
3875 :
3876 : static bool
3877 : vect_analyze_slp_instance (vec_info *vinfo,
3878 : scalar_stmts_to_slp_tree_map_t *bst_map,
3879 : stmt_vec_info stmt_info, slp_instance_kind kind,
3880 : unsigned max_tree_size, unsigned *limit,
3881 : bool force_single_lane);
3882 :
/* Build an interleaving scheme for the store sources RHS_NODES from
   SCALAR_STMTS.  Creates the store node over one VEC_PERM per child
   slot, each permute blending the corresponding children of all
   RHS_NODES lane-by-lane, then reduces each permute to at most two
   inputs.  MAX_NUNITS is propagated to all created nodes.  Returns
   the new store node.  */

static slp_tree
vect_build_slp_store_interleaving (vec<slp_tree> &rhs_nodes,
				   vec<stmt_vec_info> &scalar_stmts,
				   poly_uint64 max_nunits)
{
  unsigned int group_size = scalar_stmts.length ();
  slp_tree node = vect_create_new_slp_node (scalar_stmts,
					    SLP_TREE_CHILDREN
					      (rhs_nodes[0]).length ());
  SLP_TREE_VECTYPE (node) = SLP_TREE_VECTYPE (rhs_nodes[0]);
  node->max_nunits = max_nunits;
  for (unsigned l = 0;
       l < SLP_TREE_CHILDREN (rhs_nodes[0]).length (); ++l)
    {
      /* And a permute merging all RHS SLP trees.  */
      slp_tree perm = vect_create_new_slp_node (rhs_nodes.length (),
						VEC_PERM_EXPR);
      SLP_TREE_CHILDREN (node).quick_push (perm);
      SLP_TREE_LANE_PERMUTATION (perm).create (group_size);
      SLP_TREE_VECTYPE (perm) = SLP_TREE_VECTYPE (node);
      perm->max_nunits = max_nunits;
      SLP_TREE_LANES (perm) = group_size;
      /* ??? We should set this NULL but that's not expected.  */
      SLP_TREE_REPRESENTATIVE (perm)
	= SLP_TREE_REPRESENTATIVE (SLP_TREE_CHILDREN (rhs_nodes[0])[l]);
      /* Gather the l-th child of every RHS node as permute input and
	 record the identity lane mapping (input j, lane k).  */
      for (unsigned j = 0; j < rhs_nodes.length (); ++j)
	{
	  SLP_TREE_CHILDREN (perm)
	    .quick_push (SLP_TREE_CHILDREN (rhs_nodes[j])[l]);
	  SLP_TREE_CHILDREN (rhs_nodes[j])[l]->refcnt++;
	  for (unsigned k = 0;
	       k < SLP_TREE_SCALAR_STMTS (rhs_nodes[j]).length (); ++k)
	    {
	      /* ??? We should populate SLP_TREE_SCALAR_STMTS
		 or SLP_TREE_SCALAR_OPS but then we might have
		 a mix of both in our children.  */
	      SLP_TREE_LANE_PERMUTATION (perm)
		.quick_push (std::make_pair (j, k));
	    }
	}

      /* Now we have a single permute node but we cannot code-generate
	 the case with more than two inputs.
	 Perform pairwise reduction, reducing the two inputs
	 with the least number of lanes to one and then repeat until
	 we end up with two inputs.  That scheme makes sure we end
	 up with permutes satisfying the restriction of requiring at
	 most two vector inputs to produce a single vector output
	 when the number of lanes is even.  */
      while (SLP_TREE_CHILDREN (perm).length () > 2)
	{
	  /* When we have three equal sized groups left the pairwise
	     reduction does not result in a scheme that avoids using
	     three vectors.  Instead merge the first two groups
	     to the final size with do-not-care elements (chosen
	     from the first group) and then merge with the third.
	       { A0, B0, x, A1, B1, x, ... }
	       -> { A0, B0, C0, A1, B1, C1, ... }
	     This handles group size of three (and at least
	     power-of-two multiples of that).  */
	  if (SLP_TREE_CHILDREN (perm).length () == 3
	      && (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[0])
		  == SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[1]))
	      && (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[0])
		  == SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[2])))
	    {
	      int ai = 0;
	      int bi = 1;
	      slp_tree a = SLP_TREE_CHILDREN (perm)[ai];
	      slp_tree b = SLP_TREE_CHILDREN (perm)[bi];
	      unsigned n = SLP_TREE_LANES (perm);

	      slp_tree permab = vect_create_new_slp_node (2, VEC_PERM_EXPR);
	      SLP_TREE_LANES (permab) = n;
	      SLP_TREE_LANE_PERMUTATION (permab).create (n);
	      SLP_TREE_VECTYPE (permab) = SLP_TREE_VECTYPE (perm);
	      permab->max_nunits = max_nunits;
	      /* ??? Should be NULL but that's not expected.  */
	      SLP_TREE_REPRESENTATIVE (permab) = SLP_TREE_REPRESENTATIVE (perm);
	      SLP_TREE_CHILDREN (permab).quick_push (a);
	      for (unsigned k = 0; k < SLP_TREE_LANES (a); ++k)
		SLP_TREE_LANE_PERMUTATION (permab)
		  .quick_push (std::make_pair (0, k));
	      SLP_TREE_CHILDREN (permab).quick_push (b);
	      for (unsigned k = 0; k < SLP_TREE_LANES (b); ++k)
		SLP_TREE_LANE_PERMUTATION (permab)
		  .quick_push (std::make_pair (1, k));
	      /* Push the do-not-care lanes.  */
	      for (unsigned k = 0; k < SLP_TREE_LANES (a); ++k)
		SLP_TREE_LANE_PERMUTATION (permab)
		  .quick_push (std::make_pair (0, k));

	      /* Put the merged node into 'perm', in place of a.  */
	      SLP_TREE_CHILDREN (perm)[ai] = permab;
	      /* Adjust the references to b in the permutation
		 of perm and to the later children which we'll
		 remove.  */
	      for (unsigned k = 0; k < SLP_TREE_LANES (perm); ++k)
		{
		  std::pair<unsigned, unsigned> &p
		    = SLP_TREE_LANE_PERMUTATION (perm)[k];
		  if (p.first == (unsigned) bi)
		    {
		      p.first = ai;
		      p.second += SLP_TREE_LANES (a);
		    }
		  else if (p.first > (unsigned) bi)
		    p.first--;
		}
	      SLP_TREE_CHILDREN (perm).ordered_remove (bi);
	      break;
	    }

	  /* Pick the two nodes with the least number of lanes,
	     prefer the earliest candidate and maintain ai < bi.  */
	  int ai = -1;
	  int bi = -1;
	  for (unsigned ci = 0; ci < SLP_TREE_CHILDREN (perm).length (); ++ci)
	    {
	      if (ai == -1)
		ai = ci;
	      else if (bi == -1)
		bi = ci;
	      else if ((SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ci])
			< SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ai]))
		       || (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ci])
			   < SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[bi])))
		{
		  if (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ai])
		      <= SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[bi]))
		    bi = ci;
		  else
		    {
		      ai = bi;
		      bi = ci;
		    }
		}
	    }

	  /* Produce a merge of nodes ai and bi.  */
	  slp_tree a = SLP_TREE_CHILDREN (perm)[ai];
	  slp_tree b = SLP_TREE_CHILDREN (perm)[bi];
	  unsigned n = SLP_TREE_LANES (a) + SLP_TREE_LANES (b);
	  slp_tree permab = vect_create_new_slp_node (2, VEC_PERM_EXPR);
	  SLP_TREE_LANES (permab) = n;
	  SLP_TREE_LANE_PERMUTATION (permab).create (n);
	  SLP_TREE_VECTYPE (permab) = SLP_TREE_VECTYPE (perm);
	  permab->max_nunits = max_nunits;
	  /* ??? Should be NULL but that's not expected.  */
	  SLP_TREE_REPRESENTATIVE (permab) = SLP_TREE_REPRESENTATIVE (perm);
	  SLP_TREE_CHILDREN (permab).quick_push (a);
	  for (unsigned k = 0; k < SLP_TREE_LANES (a); ++k)
	    SLP_TREE_LANE_PERMUTATION (permab)
	      .quick_push (std::make_pair (0, k));
	  SLP_TREE_CHILDREN (permab).quick_push (b);
	  for (unsigned k = 0; k < SLP_TREE_LANES (b); ++k)
	    SLP_TREE_LANE_PERMUTATION (permab)
	      .quick_push (std::make_pair (1, k));

	  /* Put the merged node into 'perm', in place of a.  */
	  SLP_TREE_CHILDREN (perm)[ai] = permab;
	  /* Adjust the references to b in the permutation
	     of perm and to the later children which we'll
	     remove.  */
	  for (unsigned k = 0; k < SLP_TREE_LANES (perm); ++k)
	    {
	      std::pair<unsigned, unsigned> &p
		= SLP_TREE_LANE_PERMUTATION (perm)[k];
	      if (p.first == (unsigned) bi)
		{
		  p.first = ai;
		  p.second += SLP_TREE_LANES (a);
		}
	      else if (p.first > (unsigned) bi)
		p.first--;
	    }
	  SLP_TREE_CHILDREN (perm).ordered_remove (bi);
	}
    }

  return node;
}
4068 :
/* Analyze an SLP instance starting from SCALAR_STMTS which are a group
   of KIND.  Return true if successful.  SCALAR_STMTS is owned by this
   function, REMAIN and ROOT_STMT_INFOS ownership is transferred back to
   the caller upon failure (on success they are stored in the new
   instance).  *LIMIT is the remaining discovery budget and is
   decremented by the build; FORCE_SINGLE_LANE makes multi-lane
   discovery fail immediately.  */

static bool
vect_build_slp_instance (vec_info *vinfo,
			 slp_instance_kind kind,
			 vec<stmt_vec_info> &scalar_stmts,
			 vec<stmt_vec_info> &root_stmt_infos,
			 vec<tree> &remain,
			 unsigned max_tree_size, unsigned *limit,
			 scalar_stmts_to_slp_tree_map_t *bst_map,
			 bool force_single_lane)
{
  /* If there's no budget left bail out early.  */
  if (*limit == 0)
    {
      scalar_stmts.release ();
      return false;
    }

  if (kind == slp_inst_kind_ctor)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Analyzing vectorizable constructor: %G\n",
			 root_stmt_infos[0]->stmt);
    }
  else if (kind == slp_inst_kind_gcond)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Analyzing vectorizable control flow: %G",
			 root_stmt_infos[0]->stmt);
    }

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
		       "Starting SLP discovery for\n");
      for (unsigned i = 0; i < scalar_stmts.length (); ++i)
	dump_printf_loc (MSG_NOTE, vect_location,
			 "  %G", scalar_stmts[i]->stmt);
    }

  /* Build the tree for the SLP instance.  */
  unsigned int group_size = scalar_stmts.length ();
  bool *matches = XALLOCAVEC (bool, group_size);
  poly_uint64 max_nunits = 1;
  unsigned tree_size = 0;

  slp_tree node = NULL;
  /* When forcing single-lane, fake a mismatch at lane 1 instead of
     doing any discovery.  */
  if (group_size > 1 && force_single_lane)
    {
      matches[0] = true;
      matches[1] = false;
    }
  else
    node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
				&max_nunits, matches, limit,
				&tree_size, bst_map);
  if (node != NULL)
    {
      /* Calculate the unrolling factor based on the smallest type.  */
      poly_uint64 unrolling_factor
	= calculate_unrolling_factor (max_nunits, group_size);

      /* BB vectorization cannot unroll; such a tree needs splitting.  */
      if (maybe_ne (unrolling_factor, 1U)
	  && is_a <bb_vec_info> (vinfo))
	{
	  unsigned HOST_WIDE_INT const_max_nunits;
	  if (!max_nunits.is_constant (&const_max_nunits)
	      || const_max_nunits > group_size)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: store group "
				 "size not a multiple of the vector size "
				 "in basic block SLP\n");
	      vect_free_slp_tree (node);
	      return false;
	    }
	  /* Fatal mismatch.  */
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "SLP discovery succeeded but node needs "
			     "splitting\n");
	  /* Communicate the split point to the caller via MATCHES.  */
	  memset (matches, true, group_size);
	  matches[group_size / const_max_nunits * const_max_nunits] = false;
	  vect_free_slp_tree (node);
	}
      else
	{
	  /* Create a new SLP instance.  */
	  slp_instance new_instance = XNEW (class _slp_instance);
	  SLP_INSTANCE_TREE (new_instance) = node;
	  SLP_INSTANCE_LOADS (new_instance) = vNULL;
	  SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
	  SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
	  SLP_INSTANCE_KIND (new_instance) = kind;
	  new_instance->reduc_phis = NULL;
	  new_instance->cost_vec = vNULL;
	  new_instance->subgraph_entries = vNULL;

	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "SLP size %u vs. limit %u.\n",
			     tree_size, max_tree_size);

	  vinfo->slp_instances.safe_push (new_instance);

	  /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
	     the number of scalar stmts in the root in a few places.
	     Verify that assumption holds.  */
	  gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
			.length () == group_size);

	  if (dump_enabled_p ())
	    {
	      if (kind == slp_inst_kind_reduc_group)
		dump_printf_loc (MSG_NOTE, vect_location,
				 "SLP discovery of size %d reduction group "
				 "succeeded\n", group_size);
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "Final SLP tree for instance %p:\n",
			       (void *) new_instance);
	      vect_print_slp_graph (MSG_NOTE, vect_location,
				    SLP_INSTANCE_TREE (new_instance));
	    }

	  return true;
	}
    }
  /* Failed to SLP.  */

  /* While we arrive here even with slp_inst_kind_store we should only
     for group_size == 1.  The code to split store groups is only in
     vect_analyze_slp_instance now.  */
  gcc_assert (kind != slp_inst_kind_store || group_size == 1);

  /* Free the allocated memory.  */
  scalar_stmts.release ();

  /* Failed to SLP.  */
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
  return false;
}
4218 :
4219 : /* Analyze an SLP instance starting from a the start of a reduction chain.
4220 : Call vect_build_slp_tree to build a tree of packed stmts if possible.
4221 : Return FALSE if SLP build fails. */
4222 :
4223 : static bool
4224 42762 : vect_analyze_slp_reduc_chain (loop_vec_info vinfo,
4225 : scalar_stmts_to_slp_tree_map_t *bst_map,
4226 : stmt_vec_info scalar_stmt,
4227 : unsigned max_tree_size, unsigned *limit)
4228 : {
4229 42762 : vec<stmt_vec_info> scalar_stmts = vNULL;
4230 :
4231 42762 : bool fail = false;
4232 : /* ??? We could leave operation code checking to SLP discovery. */
4233 42762 : code_helper code = STMT_VINFO_REDUC_CODE (STMT_VINFO_REDUC_DEF
4234 : (vect_orig_stmt (scalar_stmt)));
4235 42762 : bool first = true;
4236 42762 : stmt_vec_info next_stmt = scalar_stmt;
4237 47906 : do
4238 : {
4239 47906 : stmt_vec_info stmt = next_stmt;
4240 47906 : gimple_match_op op;
4241 47906 : if (!gimple_extract_op (STMT_VINFO_STMT (stmt), &op))
4242 0 : gcc_unreachable ();
4243 95812 : tree reduc_def = gimple_arg (STMT_VINFO_STMT (stmt),
4244 47906 : STMT_VINFO_REDUC_IDX (stmt));
4245 47906 : next_stmt = vect_stmt_to_vectorize (vinfo->lookup_def (reduc_def));
4246 47906 : gcc_assert (is_a <gphi *> (STMT_VINFO_STMT (next_stmt))
4247 : || STMT_VINFO_REDUC_IDX (next_stmt) != -1);
4248 51310 : if (!gimple_extract_op (STMT_VINFO_STMT (vect_orig_stmt (stmt)), &op))
4249 0 : gcc_unreachable ();
4250 47906 : if (CONVERT_EXPR_CODE_P (op.code)
4251 2149 : && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0]))
4252 50043 : && (first
4253 1058 : || is_a <gphi *> (STMT_VINFO_STMT (next_stmt))))
4254 : ;
4255 45771 : else if (code != op.code)
4256 : {
4257 1718 : fail = true;
4258 1718 : break;
4259 : }
4260 : else
4261 44053 : scalar_stmts.safe_push (stmt);
4262 46188 : first = false;
4263 : }
4264 46188 : while (!is_a <gphi *> (STMT_VINFO_STMT (next_stmt)));
4265 42762 : if (fail)
4266 1718 : return false;
4267 :
4268 : /* Remember a stmt with the actual reduction operation. */
4269 41044 : stmt_vec_info reduc_scalar_stmt = scalar_stmts[0];
4270 :
4271 : /* When the SSA def chain through reduc-idx does not form a natural
4272 : reduction chain try to linearize an associative operation manually. */
4273 41044 : if (scalar_stmts.length () == 1
4274 39389 : && code.is_tree_code ()
4275 36003 : && associative_tree_code ((tree_code)code)
4276 : /* We may not associate if a fold-left reduction is required. */
4277 76176 : && !needs_fold_left_reduction_p (TREE_TYPE (gimple_get_lhs
4278 : (reduc_scalar_stmt->stmt)),
4279 : code))
4280 : {
4281 33308 : auto_vec<chain_op_t> chain;
4282 33308 : auto_vec<std::pair<tree_code, gimple *> > worklist;
4283 33308 : gimple *op_stmt = NULL, *other_op_stmt = NULL;
4284 33308 : vect_slp_linearize_chain (vinfo, worklist, chain, (tree_code)code,
4285 33308 : scalar_stmts[0]->stmt, op_stmt, other_op_stmt,
4286 : NULL);
4287 :
4288 33308 : scalar_stmts.truncate (0);
4289 33308 : stmt_vec_info tail = NULL;
4290 165781 : for (auto el : chain)
4291 : {
4292 66539 : if (el.dt == vect_external_def
4293 66539 : || el.dt == vect_constant_def
4294 66539 : || el.code != (tree_code) code)
4295 : {
4296 682 : scalar_stmts.release ();
4297 682 : return false;
4298 : }
4299 65857 : stmt_vec_info stmt = vinfo->lookup_def (el.op);
4300 65857 : if (STMT_VINFO_REDUC_IDX (stmt) != -1
4301 64890 : || STMT_VINFO_REDUC_DEF (stmt))
4302 : {
4303 32802 : gcc_assert (tail == NULL);
4304 32802 : tail = stmt;
4305 32802 : continue;
4306 : }
4307 33055 : scalar_stmts.safe_push (stmt);
4308 : }
4309 32626 : gcc_assert (tail);
4310 :
4311 : /* When this linearization didn't produce a chain see if stripping
4312 : a wrapping sign conversion produces one. */
4313 32626 : if (scalar_stmts.length () == 1
4314 32626 : && (code == PLUS_EXPR || code == MULT_EXPR || code == BIT_IOR_EXPR
4315 : || code == BIT_AND_EXPR || code == BIT_XOR_EXPR))
4316 : {
4317 31336 : gimple *stmt = scalar_stmts[0]->stmt;
4318 31336 : if (!is_gimple_assign (stmt)
4319 30296 : || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt))
4320 3917 : || TREE_CODE (gimple_assign_rhs1 (stmt)) != SSA_NAME
4321 35253 : || !tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
4322 3917 : TREE_TYPE (gimple_assign_rhs1 (stmt))))
4323 : {
4324 29856 : scalar_stmts.release ();
4325 29856 : return false;
4326 : }
4327 1480 : stmt = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (stmt));
4328 1480 : if (!is_gimple_assign (stmt)
4329 1480 : || gimple_assign_rhs_code (stmt) != (tree_code)code)
4330 : {
4331 1462 : scalar_stmts.release ();
4332 1462 : return false;
4333 : }
4334 18 : chain.truncate (0);
4335 18 : vect_slp_linearize_chain (vinfo, worklist, chain, (tree_code)code,
4336 : stmt, op_stmt, other_op_stmt, NULL);
4337 :
4338 18 : scalar_stmts.truncate (0);
4339 18 : tail = NULL;
4340 88 : for (auto el : chain)
4341 : {
4342 42 : if (el.dt == vect_external_def
4343 42 : || el.dt == vect_constant_def
4344 42 : || el.code != (tree_code) code)
4345 : {
4346 8 : scalar_stmts.release ();
4347 8 : return false;
4348 : }
4349 34 : stmt_vec_info stmt = vinfo->lookup_def (el.op);
4350 34 : if (STMT_VINFO_REDUC_IDX (stmt) != -1
4351 34 : || STMT_VINFO_REDUC_DEF (stmt))
4352 : {
4353 0 : gcc_assert (tail == NULL);
4354 0 : tail = stmt;
4355 0 : continue;
4356 : }
4357 34 : scalar_stmts.safe_push (stmt);
4358 : }
4359 : /* Unlike the above this does not include the reduction SSA
4360 : cycle. */
4361 10 : gcc_assert (!tail);
4362 : }
4363 :
4364 1300 : if (scalar_stmts.length () < 2)
4365 : {
4366 1207 : scalar_stmts.release ();
4367 1207 : return false;
4368 : }
4369 :
4370 93 : if (dump_enabled_p ())
4371 : {
4372 34 : dump_printf_loc (MSG_NOTE, vect_location,
4373 : "Starting SLP discovery of reduction chain for\n");
4374 140 : for (unsigned i = 0; i < scalar_stmts.length (); ++i)
4375 212 : dump_printf_loc (MSG_NOTE, vect_location,
4376 106 : " %G", scalar_stmts[i]->stmt);
4377 : }
4378 :
4379 93 : unsigned int group_size = scalar_stmts.length ();
4380 93 : bool *matches = XALLOCAVEC (bool, group_size);
4381 93 : poly_uint64 max_nunits = 1;
4382 93 : unsigned tree_size = 0;
4383 93 : slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
4384 : &max_nunits, matches, limit,
4385 93 : &tree_size, bst_map);
4386 93 : if (!node)
4387 : {
4388 37 : scalar_stmts.release ();
4389 37 : return false;
4390 : }
4391 :
4392 56 : unsigned cycle_id = vinfo->reduc_infos.length ();
4393 56 : vect_reduc_info reduc_info = new vect_reduc_info_s ();
4394 56 : vinfo->reduc_infos.safe_push (reduc_info);
4395 56 : VECT_REDUC_INFO_DEF_TYPE (reduc_info) = STMT_VINFO_DEF_TYPE (next_stmt);
4396 56 : VECT_REDUC_INFO_TYPE (reduc_info) = STMT_VINFO_REDUC_TYPE (next_stmt);
4397 56 : VECT_REDUC_INFO_CODE (reduc_info) = STMT_VINFO_REDUC_CODE (next_stmt);
4398 56 : VECT_REDUC_INFO_FN (reduc_info) = IFN_LAST;
4399 56 : reduc_info->is_reduc_chain = true;
4400 :
4401 : /* Build the node for the PHI and possibly the conversions. */
4402 56 : slp_tree phis = vect_create_new_slp_node (2, ERROR_MARK);
4403 56 : SLP_TREE_REPRESENTATIVE (phis) = next_stmt;
4404 56 : phis->cycle_info.id = cycle_id;
4405 56 : SLP_TREE_LANES (phis) = group_size;
4406 56 : if (reduc_scalar_stmt == scalar_stmt)
4407 52 : SLP_TREE_VECTYPE (phis) = SLP_TREE_VECTYPE (node);
4408 : else
4409 4 : SLP_TREE_VECTYPE (phis)
4410 4 : = signed_or_unsigned_type_for (TYPE_UNSIGNED
4411 : (TREE_TYPE (gimple_get_lhs
4412 : (scalar_stmt->stmt))),
4413 : SLP_TREE_VECTYPE (node));
4414 : /* ??? vect_cse_slp_nodes cannot cope with cycles without any
4415 : SLP_TREE_SCALAR_STMTS. */
4416 56 : SLP_TREE_SCALAR_STMTS (phis).create (group_size);
4417 235 : for (unsigned i = 0; i < group_size; ++i)
4418 179 : SLP_TREE_SCALAR_STMTS (phis).quick_push (next_stmt);
4419 :
4420 56 : slp_tree op_input = phis;
4421 56 : if (reduc_scalar_stmt != scalar_stmt)
4422 : {
4423 4 : slp_tree conv = vect_create_new_slp_node (1, ERROR_MARK);
4424 4 : SLP_TREE_REPRESENTATIVE (conv)
4425 4 : = vinfo->lookup_def (gimple_arg (reduc_scalar_stmt->stmt,
4426 4 : STMT_VINFO_REDUC_IDX
4427 : (reduc_scalar_stmt)));
4428 4 : SLP_TREE_CHILDREN (conv).quick_push (phis);
4429 4 : conv->cycle_info.id = cycle_id;
4430 4 : SLP_TREE_REDUC_IDX (conv) = 0;
4431 4 : SLP_TREE_LANES (conv) = group_size;
4432 4 : SLP_TREE_VECTYPE (conv) = SLP_TREE_VECTYPE (node);
4433 4 : SLP_TREE_SCALAR_STMTS (conv) = vNULL;
4434 4 : op_input = conv;
4435 : }
4436 :
4437 56 : slp_tree reduc = vect_create_new_slp_node (2, ERROR_MARK);
4438 56 : SLP_TREE_REPRESENTATIVE (reduc) = reduc_scalar_stmt;
4439 56 : SLP_TREE_CHILDREN (reduc).quick_push (op_input);
4440 56 : SLP_TREE_CHILDREN (reduc).quick_push (node);
4441 56 : reduc->cycle_info.id = cycle_id;
4442 56 : SLP_TREE_REDUC_IDX (reduc) = 0;
4443 56 : SLP_TREE_LANES (reduc) = group_size;
4444 56 : SLP_TREE_VECTYPE (reduc) = SLP_TREE_VECTYPE (node);
4445 : /* ??? For the reduction epilogue we need a live lane. */
4446 56 : SLP_TREE_SCALAR_STMTS (reduc).create (group_size);
4447 56 : SLP_TREE_SCALAR_STMTS (reduc).quick_push (reduc_scalar_stmt);
4448 179 : for (unsigned i = 1; i < group_size; ++i)
4449 123 : SLP_TREE_SCALAR_STMTS (reduc).quick_push (NULL);
4450 :
4451 56 : if (reduc_scalar_stmt != scalar_stmt)
4452 : {
4453 4 : slp_tree conv = vect_create_new_slp_node (1, ERROR_MARK);
4454 4 : SLP_TREE_REPRESENTATIVE (conv) = scalar_stmt;
4455 4 : SLP_TREE_CHILDREN (conv).quick_push (reduc);
4456 4 : conv->cycle_info.id = cycle_id;
4457 4 : SLP_TREE_REDUC_IDX (conv) = 0;
4458 4 : SLP_TREE_LANES (conv) = group_size;
4459 4 : SLP_TREE_VECTYPE (conv) = SLP_TREE_VECTYPE (phis);
4460 : /* ??? For the reduction epilogue we need a live lane. */
4461 4 : SLP_TREE_SCALAR_STMTS (conv).create (group_size);
4462 4 : SLP_TREE_SCALAR_STMTS (conv).quick_push (scalar_stmt);
4463 8 : for (unsigned i = 1; i < group_size; ++i)
4464 4 : SLP_TREE_SCALAR_STMTS (conv).quick_push (NULL);
4465 4 : reduc = conv;
4466 : }
4467 :
4468 56 : edge le = loop_latch_edge (LOOP_VINFO_LOOP (vinfo));
4469 56 : SLP_TREE_CHILDREN (phis).quick_push (NULL);
4470 56 : SLP_TREE_CHILDREN (phis).quick_push (NULL);
4471 56 : SLP_TREE_CHILDREN (phis)[le->dest_idx] = reduc;
4472 56 : SLP_TREE_REF_COUNT (reduc)++;
4473 :
4474 : /* Create a new SLP instance. */
4475 56 : slp_instance new_instance = XNEW (class _slp_instance);
4476 56 : SLP_INSTANCE_TREE (new_instance) = reduc;
4477 56 : SLP_INSTANCE_LOADS (new_instance) = vNULL;
4478 56 : SLP_INSTANCE_ROOT_STMTS (new_instance) = vNULL;
4479 56 : SLP_INSTANCE_REMAIN_DEFS (new_instance) = vNULL;
4480 56 : SLP_INSTANCE_KIND (new_instance) = slp_inst_kind_reduc_chain;
4481 56 : new_instance->reduc_phis = NULL;
4482 56 : new_instance->cost_vec = vNULL;
4483 56 : new_instance->subgraph_entries = vNULL;
4484 :
4485 56 : vinfo->slp_instances.safe_push (new_instance);
4486 :
4487 56 : if (dump_enabled_p ())
4488 : {
4489 24 : dump_printf_loc (MSG_NOTE, vect_location,
4490 : "Final SLP tree for instance %p:\n",
4491 : (void *) new_instance);
4492 24 : vect_print_slp_graph (MSG_NOTE, vect_location,
4493 : SLP_INSTANCE_TREE (new_instance));
4494 : }
4495 :
4496 56 : return true;
4497 33308 : }
4498 :
4499 7736 : if (scalar_stmts.length () <= 1)
4500 : {
4501 6081 : scalar_stmts.release ();
4502 6081 : return false;
4503 : }
4504 :
4505 1655 : scalar_stmts.reverse ();
4506 1655 : stmt_vec_info reduc_phi_info = next_stmt;
4507 :
4508 : /* Build the tree for the SLP instance. */
4509 1655 : vec<stmt_vec_info> root_stmt_infos = vNULL;
4510 1655 : vec<tree> remain = vNULL;
4511 :
4512 1655 : if (dump_enabled_p ())
4513 : {
4514 180 : dump_printf_loc (MSG_NOTE, vect_location,
4515 : "Starting SLP discovery of reduction chain for\n");
4516 966 : for (unsigned i = 0; i < scalar_stmts.length (); ++i)
4517 1572 : dump_printf_loc (MSG_NOTE, vect_location,
4518 786 : " %G", scalar_stmts[i]->stmt);
4519 : }
4520 :
4521 : /* Build the tree for the SLP instance. */
4522 1655 : unsigned int group_size = scalar_stmts.length ();
4523 1655 : bool *matches = XALLOCAVEC (bool, group_size);
4524 1655 : poly_uint64 max_nunits = 1;
4525 1655 : unsigned tree_size = 0;
4526 :
4527 : /* ??? We need this only for SLP discovery. */
4528 6315 : for (unsigned i = 0; i < scalar_stmts.length (); ++i)
4529 4660 : REDUC_GROUP_FIRST_ELEMENT (scalar_stmts[i]) = scalar_stmts[0];
4530 :
4531 1655 : slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
4532 : &max_nunits, matches, limit,
4533 1655 : &tree_size, bst_map);
4534 :
4535 6315 : for (unsigned i = 0; i < scalar_stmts.length (); ++i)
4536 4660 : REDUC_GROUP_FIRST_ELEMENT (scalar_stmts[i]) = NULL;
4537 :
4538 1655 : if (node != NULL)
4539 : {
4540 : /* Create a new SLP instance. */
4541 1395 : slp_instance new_instance = XNEW (class _slp_instance);
4542 1395 : SLP_INSTANCE_TREE (new_instance) = node;
4543 1395 : SLP_INSTANCE_LOADS (new_instance) = vNULL;
4544 1395 : SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
4545 1395 : SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
4546 1395 : SLP_INSTANCE_KIND (new_instance) = slp_inst_kind_reduc_chain;
4547 1395 : new_instance->reduc_phis = NULL;
4548 1395 : new_instance->cost_vec = vNULL;
4549 1395 : new_instance->subgraph_entries = vNULL;
4550 :
4551 1395 : vect_reduc_info reduc_info = info_for_reduction (vinfo, node);
4552 1395 : reduc_info->is_reduc_chain = true;
4553 :
4554 1395 : if (dump_enabled_p ())
4555 135 : dump_printf_loc (MSG_NOTE, vect_location,
4556 : "SLP size %u vs. limit %u.\n",
4557 : tree_size, max_tree_size);
4558 :
4559 : /* Fixup SLP reduction chains. If this is a reduction chain with
4560 : a conversion in front amend the SLP tree with a node for that. */
4561 1395 : gimple *scalar_def = STMT_VINFO_REDUC_DEF (reduc_phi_info)->stmt;
4562 1395 : if (is_gimple_assign (scalar_def)
4563 1395 : && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (scalar_def)))
4564 : {
4565 28 : stmt_vec_info conv_info = vect_stmt_to_vectorize
4566 28 : (STMT_VINFO_REDUC_DEF (reduc_phi_info));
4567 28 : scalar_stmts = vNULL;
4568 28 : scalar_stmts.create (group_size);
4569 90 : for (unsigned i = 0; i < group_size; ++i)
4570 62 : scalar_stmts.quick_push (conv_info);
4571 28 : slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
4572 28 : SLP_TREE_VECTYPE (conv)
4573 28 : = get_vectype_for_scalar_type (vinfo,
4574 28 : TREE_TYPE
4575 : (gimple_assign_lhs (scalar_def)),
4576 : group_size);
4577 28 : SLP_TREE_REDUC_IDX (conv) = 0;
4578 28 : conv->cycle_info.id = node->cycle_info.id;
4579 28 : SLP_TREE_CHILDREN (conv).quick_push (node);
4580 28 : SLP_INSTANCE_TREE (new_instance) = conv;
4581 : }
4582 : /* Fill the backedge child of the PHI SLP node. The
4583 : general matching code cannot find it because the
4584 : scalar code does not reflect how we vectorize the
4585 : reduction. */
4586 1395 : use_operand_p use_p;
4587 1395 : imm_use_iterator imm_iter;
4588 1395 : class loop *loop = LOOP_VINFO_LOOP (vinfo);
4589 6670 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
4590 : gimple_get_lhs (scalar_def))
4591 : /* There are exactly two non-debug uses, the reduction
4592 : PHI and the loop-closed PHI node. */
4593 3880 : if (!is_gimple_debug (USE_STMT (use_p))
4594 3880 : && gimple_bb (USE_STMT (use_p)) == loop->header)
4595 : {
4596 1395 : auto_vec<stmt_vec_info, 64> phis (group_size);
4597 1395 : stmt_vec_info phi_info = vinfo->lookup_stmt (USE_STMT (use_p));
4598 5386 : for (unsigned i = 0; i < group_size; ++i)
4599 3991 : phis.quick_push (phi_info);
4600 1395 : slp_tree *phi_node = bst_map->get (phis);
4601 1395 : unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
4602 2790 : SLP_TREE_CHILDREN (*phi_node)[dest_idx]
4603 1395 : = SLP_INSTANCE_TREE (new_instance);
4604 1395 : SLP_INSTANCE_TREE (new_instance)->refcnt++;
4605 1395 : }
4606 :
4607 1395 : vinfo->slp_instances.safe_push (new_instance);
4608 :
4609 : /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
4610 : the number of scalar stmts in the root in a few places.
4611 : Verify that assumption holds. */
4612 2790 : gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
4613 : .length () == group_size);
4614 :
4615 1395 : if (dump_enabled_p ())
4616 : {
4617 135 : dump_printf_loc (MSG_NOTE, vect_location,
4618 : "Final SLP tree for instance %p:\n",
4619 : (void *) new_instance);
4620 135 : vect_print_slp_graph (MSG_NOTE, vect_location,
4621 : SLP_INSTANCE_TREE (new_instance));
4622 : }
4623 :
4624 1395 : return true;
4625 : }
4626 :
4627 : /* Failed to SLP. */
4628 260 : scalar_stmts.release ();
4629 260 : if (dump_enabled_p ())
4630 45 : dump_printf_loc (MSG_NOTE, vect_location,
4631 : "SLP discovery of reduction chain failed\n");
4632 : return false;
4633 : }
4634 :
/* Analyze the reduction starting from SCALAR_STMT, trying reduction-chain
   discovery first and falling back to a single-lane SLP instance.
   Return true if successful.  */
4637 :
4638 : static bool
4639 63542 : vect_analyze_slp_reduction (loop_vec_info vinfo,
4640 : stmt_vec_info scalar_stmt,
4641 : unsigned max_tree_size, unsigned *limit,
4642 : scalar_stmts_to_slp_tree_map_t *bst_map,
4643 : bool force_single_lane)
4644 : {
4645 63542 : slp_instance_kind kind = slp_inst_kind_reduc_group;
4646 :
4647 : /* If there's no budget left bail out early. */
4648 63542 : if (*limit == 0)
4649 : return false;
4650 :
4651 : /* Try to gather a reduction chain. */
4652 63542 : if (! force_single_lane
4653 42979 : && STMT_VINFO_DEF_TYPE (scalar_stmt) == vect_reduction_def
4654 106304 : && vect_analyze_slp_reduc_chain (vinfo, bst_map, scalar_stmt,
4655 : max_tree_size, limit))
4656 : return true;
4657 :
4658 62091 : vec<stmt_vec_info> scalar_stmts;
4659 62091 : scalar_stmts.create (1);
4660 62091 : scalar_stmts.quick_push (scalar_stmt);
4661 :
4662 62091 : if (dump_enabled_p ())
4663 : {
4664 3338 : dump_printf_loc (MSG_NOTE, vect_location,
4665 : "Starting SLP discovery for\n");
4666 6676 : for (unsigned i = 0; i < scalar_stmts.length (); ++i)
4667 6676 : dump_printf_loc (MSG_NOTE, vect_location,
4668 3338 : " %G", scalar_stmts[i]->stmt);
4669 : }
4670 :
4671 : /* Build the tree for the SLP instance. */
4672 62091 : unsigned int group_size = scalar_stmts.length ();
4673 62091 : bool *matches = XALLOCAVEC (bool, group_size);
4674 62091 : poly_uint64 max_nunits = 1;
4675 62091 : unsigned tree_size = 0;
4676 :
4677 62091 : slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
4678 : &max_nunits, matches, limit,
4679 : &tree_size, bst_map);
4680 62091 : if (node != NULL)
4681 : {
4682 : /* Create a new SLP instance. */
4683 59506 : slp_instance new_instance = XNEW (class _slp_instance);
4684 59506 : SLP_INSTANCE_TREE (new_instance) = node;
4685 59506 : SLP_INSTANCE_LOADS (new_instance) = vNULL;
4686 59506 : SLP_INSTANCE_ROOT_STMTS (new_instance) = vNULL;
4687 59506 : SLP_INSTANCE_REMAIN_DEFS (new_instance) = vNULL;
4688 59506 : SLP_INSTANCE_KIND (new_instance) = kind;
4689 59506 : new_instance->reduc_phis = NULL;
4690 59506 : new_instance->cost_vec = vNULL;
4691 59506 : new_instance->subgraph_entries = vNULL;
4692 :
4693 59506 : if (dump_enabled_p ())
4694 3222 : dump_printf_loc (MSG_NOTE, vect_location,
4695 : "SLP size %u vs. limit %u.\n",
4696 : tree_size, max_tree_size);
4697 :
4698 59506 : vinfo->slp_instances.safe_push (new_instance);
4699 :
4700 : /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
4701 : the number of scalar stmts in the root in a few places.
4702 : Verify that assumption holds. */
4703 119012 : gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
4704 : .length () == group_size);
4705 :
4706 59506 : if (dump_enabled_p ())
4707 : {
4708 3222 : dump_printf_loc (MSG_NOTE, vect_location,
4709 : "Final SLP tree for instance %p:\n",
4710 : (void *) new_instance);
4711 3222 : vect_print_slp_graph (MSG_NOTE, vect_location,
4712 : SLP_INSTANCE_TREE (new_instance));
4713 : }
4714 :
4715 59506 : return true;
4716 : }
4717 : /* Failed to SLP. */
4718 :
4719 : /* Free the allocated memory. */
4720 2585 : scalar_stmts.release ();
4721 :
4722 : /* Failed to SLP. */
4723 2585 : if (dump_enabled_p ())
4724 116 : dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
4725 : return false;
4726 : }
4727 :
4728 : /* Analyze a single SLP reduction group. If successful add a SLP instance
4729 : for it and return true, otherwise return false and have *MATCHES
4730 : populated. */
4731 :
4732 : static bool
4733 18143 : vect_analyze_slp_reduction_group (loop_vec_info loop_vinfo,
4734 : vec<stmt_vec_info> scalar_stmts,
4735 : scalar_stmts_to_slp_tree_map_t *bst_map,
4736 : unsigned max_tree_size, unsigned *limit,
4737 : bool *matches)
4738 : {
4739 : /* Try to form a reduction group. */
4740 18143 : unsigned int group_size = scalar_stmts.length ();
4741 18143 : if (!matches)
4742 7417 : matches = XALLOCAVEC (bool, group_size);
4743 18143 : poly_uint64 max_nunits = 1;
4744 18143 : unsigned tree_size = 0;
4745 18143 : slp_tree node = vect_build_slp_tree (loop_vinfo, scalar_stmts,
4746 : group_size,
4747 : &max_nunits, matches, limit,
4748 : &tree_size, bst_map);
4749 18143 : if (!node)
4750 : return false;
4751 :
4752 : /* Create a new SLP instance. */
4753 8601 : slp_instance new_instance = XNEW (class _slp_instance);
4754 8601 : SLP_INSTANCE_TREE (new_instance) = node;
4755 8601 : SLP_INSTANCE_LOADS (new_instance) = vNULL;
4756 8601 : SLP_INSTANCE_ROOT_STMTS (new_instance) = vNULL;
4757 8601 : SLP_INSTANCE_REMAIN_DEFS (new_instance) = vNULL;
4758 8601 : SLP_INSTANCE_KIND (new_instance) = slp_inst_kind_reduc_group;
4759 8601 : new_instance->reduc_phis = NULL;
4760 8601 : new_instance->cost_vec = vNULL;
4761 8601 : new_instance->subgraph_entries = vNULL;
4762 :
4763 8601 : if (dump_enabled_p ())
4764 544 : dump_printf_loc (MSG_NOTE, vect_location,
4765 : "SLP size %u vs. limit %u.\n",
4766 : tree_size, max_tree_size);
4767 :
4768 8601 : loop_vinfo->slp_instances.safe_push (new_instance);
4769 :
4770 : /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
4771 : the number of scalar stmts in the root in a few places.
4772 : Verify that assumption holds. */
4773 17202 : gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
4774 : .length () == group_size);
4775 :
4776 8601 : if (dump_enabled_p ())
4777 : {
4778 544 : dump_printf_loc (MSG_NOTE, vect_location,
4779 : "SLP discovery of size %d reduction group "
4780 : "succeeded\n", group_size);
4781 544 : dump_printf_loc (MSG_NOTE, vect_location,
4782 : "Final SLP tree for instance %p:\n",
4783 : (void *) new_instance);
4784 544 : vect_print_slp_graph (MSG_NOTE, vect_location,
4785 : SLP_INSTANCE_TREE (new_instance));
4786 : }
4787 :
4788 : return true;
4789 : }
4790 :
4791 : /* Analyze reductions in LOOP_VINFO and populate SLP instances
4792 : accordingly. Returns false if something fails. */
4793 :
4794 : static bool
4795 422764 : vect_analyze_slp_reductions (loop_vec_info loop_vinfo,
4796 : unsigned max_tree_size, unsigned *limit,
4797 : scalar_stmts_to_slp_tree_map_t *bst_map,
4798 : bool force_single_lane)
4799 : {
4800 470067 : if (loop_vinfo->reductions.is_empty ())
4801 : return true;
4802 :
4803 : /* Collect reduction statements we can combine into
4804 : a SLP reduction. */
4805 53093 : vec<stmt_vec_info> scalar_stmts;
4806 53093 : scalar_stmts.create (loop_vinfo->reductions.length ());
4807 234085 : for (auto next_info : loop_vinfo->reductions)
4808 : {
4809 74806 : next_info = vect_stmt_to_vectorize (next_info);
4810 74806 : if ((STMT_VINFO_RELEVANT_P (next_info)
4811 14 : || STMT_VINFO_LIVE_P (next_info))
4812 : /* ??? Make sure we didn't skip a conversion around a
4813 : reduction path. In that case we'd have to reverse
4814 : engineer that conversion stmt following the chain using
4815 : reduc_idx and from the PHI using reduc_def. */
4816 74792 : && (STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def
4817 74792 : || (STMT_VINFO_DEF_TYPE (next_info)
4818 : == vect_double_reduction_def)))
4819 : {
4820 : /* Do not discover SLP reductions combining lane-reducing
4821 : ops, that will fail later. */
4822 74792 : if (!force_single_lane
4823 74792 : && !lane_reducing_stmt_p (STMT_VINFO_STMT (next_info)))
4824 53794 : scalar_stmts.quick_push (next_info);
4825 : /* Do SLP discovery for single-lane reductions. */
4826 20998 : else if (! vect_analyze_slp_reduction (loop_vinfo, next_info,
4827 : max_tree_size, limit,
4828 : bst_map,
4829 : force_single_lane))
4830 : {
4831 0 : scalar_stmts.release ();
4832 0 : return false;
4833 : }
4834 : }
4835 : }
4836 :
4837 53093 : if (scalar_stmts.length () > 1)
4838 : {
4839 : /* Try to form a reduction group. */
4840 3331 : unsigned int group_size = scalar_stmts.length ();
4841 3331 : bool *matches = XALLOCAVEC (bool, group_size);
4842 3331 : if (vect_analyze_slp_reduction_group (loop_vinfo, scalar_stmts, bst_map,
4843 : max_tree_size, limit, matches))
4844 3227 : return true;
4845 :
4846 : /* When analysis as a single SLP reduction group failed try to
4847 : form sub-groups by collecting matching lanes. Do not recurse
4848 : that on failure (to limit compile-time costs), but recurse
4849 : for the initial non-matching parts. Everything not covered
4850 : by a sub-group gets single-reduction treatment. */
4851 2418 : vec<stmt_vec_info> cands = vNULL;
4852 7521 : while (matches[0])
4853 : {
4854 7417 : cands.truncate (0);
4855 7417 : cands.reserve (group_size, true);
4856 58074 : for (unsigned i = 0; i < group_size; ++i)
4857 50657 : if (matches[i])
4858 12395 : cands.quick_push (scalar_stmts[i]);
4859 :
4860 : /* Try to form a reduction group. */
4861 7417 : if (vect_analyze_slp_reduction_group (loop_vinfo, cands, bst_map,
4862 : max_tree_size, limit, NULL))
4863 5396 : cands = vNULL;
4864 : else
4865 : {
4866 : /* Do SLP discovery for single-lane reductions. */
4867 12272 : for (auto stmt_info : cands)
4868 6231 : if (! vect_analyze_slp_reduction (loop_vinfo,
4869 : vect_stmt_to_vectorize
4870 : (stmt_info),
4871 : max_tree_size, limit,
4872 : bst_map, force_single_lane))
4873 : {
4874 22 : scalar_stmts.release ();
4875 22 : cands.release ();
4876 22 : return false;
4877 : }
4878 : }
4879 : /* Remove the handled stmts from scalar_stmts and try again,
4880 : possibly repeating the above with updated matches[]. */
4881 : unsigned j = 0;
4882 57990 : for (unsigned i = 0; i < group_size; ++i)
4883 50595 : if (!matches[i])
4884 : {
4885 38235 : scalar_stmts[j] = scalar_stmts[i];
4886 38235 : ++j;
4887 : }
4888 7395 : scalar_stmts.truncate (j);
4889 7395 : group_size = scalar_stmts.length ();
4890 7395 : if (vect_analyze_slp_reduction_group (loop_vinfo, scalar_stmts,
4891 : bst_map, max_tree_size, limit,
4892 : matches))
4893 : return true;
4894 : }
4895 : }
4896 : /* Do SLP discovery for single-lane reductions. */
4897 183348 : for (auto stmt_info : scalar_stmts)
4898 36313 : if (! vect_analyze_slp_reduction (loop_vinfo,
4899 : vect_stmt_to_vectorize (stmt_info),
4900 : max_tree_size, limit,
4901 : bst_map, force_single_lane))
4902 : {
4903 2563 : scalar_stmts.release ();
4904 2563 : return false;
4905 : }
4906 :
4907 47303 : scalar_stmts.release ();
4908 47303 : return true;
4909 : }
4910 :
4911 : /* Analyze an SLP instance starting from a group of grouped stores. Call
4912 : vect_build_slp_tree to build a tree of packed stmts if possible.
4913 : Return FALSE if it's impossible to SLP any stmt in the group. */
4914 :
4915 : static bool
4916 1082258 : vect_analyze_slp_instance (vec_info *vinfo,
4917 : scalar_stmts_to_slp_tree_map_t *bst_map,
4918 : stmt_vec_info stmt_info,
4919 : slp_instance_kind kind,
4920 : unsigned max_tree_size, unsigned *limit,
4921 : bool force_single_lane)
4922 : {
4923 1082258 : vec<stmt_vec_info> scalar_stmts;
4924 :
4925 1082258 : if (is_a <bb_vec_info> (vinfo))
4926 1058542 : vect_location = stmt_info->stmt;
4927 :
4928 1082258 : gcc_assert (kind == slp_inst_kind_store);
4929 :
4930 : /* Collect the stores and store them in scalar_stmts. */
4931 1082258 : scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
4932 1082258 : stmt_vec_info next_info = stmt_info;
4933 5371221 : while (next_info)
4934 : {
4935 3206705 : scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
4936 3206705 : next_info = DR_GROUP_NEXT_ELEMENT (next_info);
4937 : }
4938 :
4939 1082258 : vec<stmt_vec_info> root_stmt_infos = vNULL;
4940 1082258 : vec<tree> remain = vNULL;
4941 :
4942 : /* Build the tree for the SLP instance. */
4943 :
4944 : /* If there's no budget left bail out early. */
4945 1082258 : if (*limit == 0)
4946 : return false;
4947 :
4948 1082235 : if (dump_enabled_p ())
4949 : {
4950 4111 : dump_printf_loc (MSG_NOTE, vect_location,
4951 : "Starting SLP discovery for\n");
4952 23684 : for (unsigned i = 0; i < scalar_stmts.length (); ++i)
4953 39146 : dump_printf_loc (MSG_NOTE, vect_location,
4954 19573 : " %G", scalar_stmts[i]->stmt);
4955 : }
4956 :
4957 : /* Build the tree for the SLP instance. */
4958 1082235 : unsigned int group_size = scalar_stmts.length ();
4959 1082235 : bool *matches = XALLOCAVEC (bool, group_size);
4960 1082235 : poly_uint64 max_nunits = 1;
4961 1082235 : unsigned tree_size = 0;
4962 1082235 : unsigned i;
4963 :
4964 1082235 : slp_tree node = NULL;
4965 1082235 : if (group_size > 1 && force_single_lane)
4966 : {
4967 1498 : matches[0] = true;
4968 1498 : matches[1] = false;
4969 : }
4970 : else
4971 1080737 : node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
4972 : &max_nunits, matches, limit,
4973 : &tree_size, bst_map);
4974 1082235 : if (node != NULL)
4975 : {
4976 : /* Calculate the unrolling factor based on the smallest type. */
4977 672014 : poly_uint64 unrolling_factor
4978 672014 : = calculate_unrolling_factor (max_nunits, group_size);
4979 :
4980 672014 : if (maybe_ne (unrolling_factor, 1U)
4981 672014 : && is_a <bb_vec_info> (vinfo))
4982 : {
4983 0 : unsigned HOST_WIDE_INT const_max_nunits;
4984 0 : if (!max_nunits.is_constant (&const_max_nunits)
4985 0 : || const_max_nunits > group_size)
4986 : {
4987 0 : if (dump_enabled_p ())
4988 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4989 : "Build SLP failed: store group "
4990 : "size not a multiple of the vector size "
4991 : "in basic block SLP\n");
4992 0 : vect_free_slp_tree (node);
4993 0 : return false;
4994 : }
4995 : /* Fatal mismatch. */
4996 0 : if (dump_enabled_p ())
4997 0 : dump_printf_loc (MSG_NOTE, vect_location,
4998 : "SLP discovery succeeded but node needs "
4999 : "splitting\n");
5000 0 : memset (matches, true, group_size);
5001 0 : matches[group_size / const_max_nunits * const_max_nunits] = false;
5002 0 : vect_free_slp_tree (node);
5003 : }
5004 : else
5005 : {
5006 : /* Create a new SLP instance. */
5007 672014 : slp_instance new_instance = XNEW (class _slp_instance);
5008 672014 : SLP_INSTANCE_TREE (new_instance) = node;
5009 672014 : SLP_INSTANCE_LOADS (new_instance) = vNULL;
5010 672014 : SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
5011 672014 : SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
5012 672014 : SLP_INSTANCE_KIND (new_instance) = kind;
5013 672014 : new_instance->reduc_phis = NULL;
5014 672014 : new_instance->cost_vec = vNULL;
5015 672014 : new_instance->subgraph_entries = vNULL;
5016 :
5017 672014 : if (dump_enabled_p ())
5018 3128 : dump_printf_loc (MSG_NOTE, vect_location,
5019 : "SLP size %u vs. limit %u.\n",
5020 : tree_size, max_tree_size);
5021 :
5022 672014 : vinfo->slp_instances.safe_push (new_instance);
5023 :
5024 : /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
5025 : the number of scalar stmts in the root in a few places.
5026 : Verify that assumption holds. */
5027 1344028 : gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
5028 : .length () == group_size);
5029 :
5030 672014 : if (dump_enabled_p ())
5031 : {
5032 3128 : dump_printf_loc (MSG_NOTE, vect_location,
5033 : "Final SLP tree for instance %p:\n",
5034 : (void *) new_instance);
5035 3128 : vect_print_slp_graph (MSG_NOTE, vect_location,
5036 : SLP_INSTANCE_TREE (new_instance));
5037 : }
5038 :
5039 672014 : return true;
5040 : }
5041 : }
5042 : /* Failed to SLP. */
5043 :
5044 : /* Try to break the group up into pieces. */
5045 410221 : if (*limit > 0 && kind == slp_inst_kind_store)
5046 : {
5047 : /* ??? We could delay all the actual splitting of store-groups
5048 : until after SLP discovery of the original group completed.
5049 : Then we can recurse to vect_build_slp_instance directly. */
5050 1073934 : for (i = 0; i < group_size; i++)
5051 1073934 : if (!matches[i])
5052 : break;
5053 :
5054 : /* For basic block SLP, try to break the group up into multiples of
5055 : a vector size. */
5056 410220 : if (is_a <bb_vec_info> (vinfo)
5057 410220 : && (i > 1 && i < group_size))
5058 : {
5059 : /* Free the allocated memory. */
5060 154084 : scalar_stmts.release ();
5061 :
5062 154084 : tree scalar_type
5063 154084 : = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
5064 308168 : tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
5065 154084 : 1 << floor_log2 (i));
5066 154084 : unsigned HOST_WIDE_INT const_nunits;
5067 154084 : if (vectype
5068 154084 : && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
5069 : {
5070 : /* Split into two groups at the first vector boundary. */
5071 154084 : gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
5072 154084 : unsigned group1_size = i & ~(const_nunits - 1);
5073 :
5074 154084 : if (dump_enabled_p ())
5075 59 : dump_printf_loc (MSG_NOTE, vect_location,
5076 : "Splitting SLP group at stmt %u\n", i);
5077 154084 : stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
5078 : group1_size);
5079 154084 : bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
5080 : kind, max_tree_size,
5081 : limit, false);
5082 : /* Split the rest at the failure point and possibly
5083 : re-analyze the remaining matching part if it has
5084 : at least two lanes. */
5085 154084 : if (group1_size < i
5086 5272 : && (i + 1 < group_size
5087 2902 : || i - group1_size > 1))
5088 : {
5089 2402 : stmt_vec_info rest2 = rest;
5090 2402 : rest = vect_split_slp_store_group (rest, i - group1_size);
5091 2402 : if (i - group1_size > 1)
5092 61 : res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
5093 : kind, max_tree_size,
5094 : limit, false);
5095 : }
5096 : /* Re-analyze the non-matching tail if it has at least
5097 : two lanes. */
5098 154084 : if (i + 1 < group_size)
5099 21780 : res |= vect_analyze_slp_instance (vinfo, bst_map,
5100 : rest, kind, max_tree_size,
5101 : limit, false);
5102 154084 : return res;
5103 : }
5104 : }
5105 :
5106 : /* For loop vectorization split the RHS into arbitrary pieces of
5107 : size >= 1. */
5108 256136 : else if (is_a <loop_vec_info> (vinfo)
5109 256136 : && (group_size != 1 && i < group_size))
5110 : {
5111 6434 : gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
5112 28 : bool masked_p = call
5113 28 : && gimple_call_internal_p (call)
5114 28 : && internal_fn_mask_index (gimple_call_internal_fn (call)) != -1;
5115 : /* There are targets that cannot do even/odd interleaving schemes
5116 : so they absolutely need to use load/store-lanes. For now
5117 : force single-lane SLP for them - they would be happy with
5118 : uniform power-of-two lanes (but depending on element size),
5119 : but even if we can use 'i' as indicator we would need to
5120 : backtrack when later lanes fail to discover with the same
5121 : granularity. We cannot turn any of strided or scatter store
5122 : into store-lanes. */
5123 : /* ??? If this is not in sync with what get_load_store_type
5124 : later decides the SLP representation is not good for other
5125 : store vectorization methods. */
5126 6434 : bool want_store_lanes
5127 6434 : = (! STMT_VINFO_GATHER_SCATTER_P (stmt_info)
5128 6434 : && ! STMT_VINFO_STRIDED_P (stmt_info)
5129 4893 : && ! STMT_VINFO_SLP_VECT_ONLY (stmt_info)
5130 4889 : && compare_step_with_zero (vinfo, stmt_info) > 0
5131 11300 : && vect_slp_prefer_store_lanes_p (vinfo, stmt_info, NULL_TREE,
5132 12868 : masked_p, group_size, i));
5133 6434 : if (want_store_lanes || force_single_lane)
5134 : i = 1;
5135 :
5136 : /* A fatal discovery fail doesn't always mean single-lane SLP
5137 : isn't a possibility, so try. */
5138 4936 : if (i == 0)
5139 : i = 1;
5140 :
5141 6434 : if (dump_enabled_p ())
5142 882 : dump_printf_loc (MSG_NOTE, vect_location,
5143 : "Splitting SLP group at stmt %u\n", i);
5144 :
5145 : /* Analyze the stored values and pinch them together with
5146 : a permute node so we can preserve the whole store group. */
5147 6434 : auto_vec<slp_tree> rhs_nodes;
5148 6434 : poly_uint64 max_nunits = 1;
5149 :
5150 6434 : unsigned int rhs_common_nlanes = 0;
5151 6434 : unsigned int start = 0, end = i;
5152 29167 : while (start < group_size)
5153 : {
5154 22963 : gcc_assert (end - start >= 1);
5155 22963 : vec<stmt_vec_info> substmts;
5156 22963 : substmts.create (end - start);
5157 69463 : for (unsigned j = start; j < end; ++j)
5158 46500 : substmts.quick_push (scalar_stmts[j]);
5159 22963 : max_nunits = 1;
5160 22963 : node = vect_build_slp_tree (vinfo, substmts, end - start,
5161 : &max_nunits,
5162 : matches, limit, &tree_size, bst_map);
5163 22963 : if (node)
5164 : {
5165 18270 : rhs_nodes.safe_push (node);
5166 18270 : vect_update_max_nunits (&max_nunits, node->max_nunits);
5167 18270 : if (start == 0)
5168 6208 : rhs_common_nlanes = SLP_TREE_LANES (node);
5169 12062 : else if (rhs_common_nlanes != SLP_TREE_LANES (node))
5170 1267 : rhs_common_nlanes = 0;
5171 18270 : start = end;
5172 18270 : if (want_store_lanes || force_single_lane)
5173 4532 : end = start + 1;
5174 : else
5175 : end = group_size;
5176 : }
5177 : else
5178 : {
5179 4693 : substmts.release ();
5180 4693 : if (end - start == 1)
5181 : {
5182 : /* Single-lane discovery failed. Free ressources. */
5183 244 : for (auto node : rhs_nodes)
5184 6 : vect_free_slp_tree (node);
5185 230 : scalar_stmts.release ();
5186 230 : if (dump_enabled_p ())
5187 38 : dump_printf_loc (MSG_NOTE, vect_location,
5188 : "SLP discovery failed\n");
5189 230 : return false;
5190 : }
5191 :
5192 : /* ??? It really happens that we soft-fail SLP
5193 : build at a mismatch but the matching part hard-fails
5194 : later. As we know we arrived here with a group
5195 : larger than one try a group of size one! */
5196 4463 : if (!matches[0])
5197 42 : end = start + 1;
5198 : else
5199 9934 : for (unsigned j = start; j < end; j++)
5200 9934 : if (!matches[j - start])
5201 : {
5202 : end = j;
5203 : break;
5204 : }
5205 : }
5206 : }
5207 :
5208 : /* Now re-assess whether we want store lanes in case the
5209 : discovery ended up producing all single-lane RHSs. */
5210 6204 : if (! want_store_lanes
5211 6204 : && rhs_common_nlanes == 1
5212 5339 : && ! STMT_VINFO_GATHER_SCATTER_P (stmt_info)
5213 5339 : && ! STMT_VINFO_STRIDED_P (stmt_info)
5214 4052 : && ! STMT_VINFO_SLP_VECT_ONLY (stmt_info)
5215 4049 : && compare_step_with_zero (vinfo, stmt_info) > 0
5216 10242 : && (vect_store_lanes_supported (SLP_TREE_VECTYPE (rhs_nodes[0]),
5217 : group_size, masked_p)
5218 : != IFN_LAST))
5219 : want_store_lanes = true;
5220 :
5221 : /* Now we assume we can build the root SLP node from all stores. */
5222 6204 : if (want_store_lanes)
5223 : {
5224 : /* For store-lanes feed the store node with all RHS nodes
5225 : in order. */
5226 0 : node = vect_create_new_slp_node (scalar_stmts,
5227 0 : SLP_TREE_CHILDREN
5228 : (rhs_nodes[0]).length ());
5229 0 : SLP_TREE_VECTYPE (node) = SLP_TREE_VECTYPE (rhs_nodes[0]);
5230 0 : node->max_nunits = max_nunits;
5231 0 : node->ldst_lanes = true;
5232 0 : SLP_TREE_CHILDREN (node)
5233 0 : .reserve_exact (SLP_TREE_CHILDREN (rhs_nodes[0]).length ()
5234 0 : + rhs_nodes.length () - 1);
5235 : /* First store value and possibly mask. */
5236 0 : SLP_TREE_CHILDREN (node)
5237 0 : .splice (SLP_TREE_CHILDREN (rhs_nodes[0]));
5238 : /* Rest of the store values. All mask nodes are the same,
5239 : this should be guaranteed by dataref group discovery. */
5240 0 : for (unsigned j = 1; j < rhs_nodes.length (); ++j)
5241 0 : SLP_TREE_CHILDREN (node)
5242 0 : .quick_push (SLP_TREE_CHILDREN (rhs_nodes[j])[0]);
5243 0 : for (slp_tree child : SLP_TREE_CHILDREN (node))
5244 0 : child->refcnt++;
5245 : }
5246 : else
5247 6204 : node = vect_build_slp_store_interleaving (rhs_nodes, scalar_stmts,
5248 : max_nunits);
5249 :
5250 24468 : while (!rhs_nodes.is_empty ())
5251 18264 : vect_free_slp_tree (rhs_nodes.pop ());
5252 :
5253 : /* Create a new SLP instance. */
5254 6204 : slp_instance new_instance = XNEW (class _slp_instance);
5255 6204 : SLP_INSTANCE_TREE (new_instance) = node;
5256 6204 : SLP_INSTANCE_LOADS (new_instance) = vNULL;
5257 6204 : SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
5258 6204 : SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
5259 6204 : SLP_INSTANCE_KIND (new_instance) = kind;
5260 6204 : new_instance->reduc_phis = NULL;
5261 6204 : new_instance->cost_vec = vNULL;
5262 6204 : new_instance->subgraph_entries = vNULL;
5263 :
5264 6204 : if (dump_enabled_p ())
5265 844 : dump_printf_loc (MSG_NOTE, vect_location,
5266 : "SLP size %u vs. limit %u.\n",
5267 : tree_size, max_tree_size);
5268 :
5269 6204 : vinfo->slp_instances.safe_push (new_instance);
5270 :
5271 : /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
5272 : the number of scalar stmts in the root in a few places.
5273 : Verify that assumption holds. */
5274 12408 : gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
5275 : .length () == group_size);
5276 :
5277 6204 : if (dump_enabled_p ())
5278 : {
5279 844 : dump_printf_loc (MSG_NOTE, vect_location,
5280 : "Final SLP tree for instance %p:\n",
5281 : (void *) new_instance);
5282 844 : vect_print_slp_graph (MSG_NOTE, vect_location,
5283 : SLP_INSTANCE_TREE (new_instance));
5284 : }
5285 6204 : return true;
5286 6434 : }
5287 : else
5288 : /* Free the allocated memory. */
5289 249702 : scalar_stmts.release ();
5290 :
5291 : /* Even though the first vector did not all match, we might be able to SLP
5292 : (some) of the remainder. FORNOW ignore this possibility. */
5293 : }
5294 : else
5295 : /* Free the allocated memory. */
5296 1 : scalar_stmts.release ();
5297 :
5298 : /* Failed to SLP. */
5299 249703 : if (dump_enabled_p ())
5300 42 : dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
5301 : return false;
5302 : }
5303 :
5304 : /* qsort comparator ordering SLP load nodes. */
5305 :
5306 : static int
5307 2235261 : vllp_cmp (const void *a_, const void *b_)
5308 : {
5309 2235261 : const slp_tree a = *(const slp_tree *)a_;
5310 2235261 : const slp_tree b = *(const slp_tree *)b_;
5311 2235261 : stmt_vec_info a0 = SLP_TREE_SCALAR_STMTS (a)[0];
5312 2235261 : stmt_vec_info b0 = SLP_TREE_SCALAR_STMTS (b)[0];
5313 2235261 : if (STMT_VINFO_GROUPED_ACCESS (a0)
5314 1366277 : && STMT_VINFO_GROUPED_ACCESS (b0)
5315 3541172 : && DR_GROUP_FIRST_ELEMENT (a0) == DR_GROUP_FIRST_ELEMENT (b0))
5316 : {
5317 : /* Same group, order after lanes used. */
5318 296429 : if (SLP_TREE_LANES (a) < SLP_TREE_LANES (b))
5319 : return 1;
5320 290398 : else if (SLP_TREE_LANES (a) > SLP_TREE_LANES (b))
5321 : return -1;
5322 : else
5323 : {
5324 : /* Try to order loads using the same lanes together, breaking
5325 : the tie with the lane number that first differs. */
5326 283740 : if (!SLP_TREE_LOAD_PERMUTATION (a).exists ()
5327 283740 : && !SLP_TREE_LOAD_PERMUTATION (b).exists ())
5328 : return 0;
5329 283740 : else if (SLP_TREE_LOAD_PERMUTATION (a).exists ()
5330 283740 : && !SLP_TREE_LOAD_PERMUTATION (b).exists ())
5331 : return 1;
5332 281146 : else if (!SLP_TREE_LOAD_PERMUTATION (a).exists ()
5333 281146 : && SLP_TREE_LOAD_PERMUTATION (b).exists ())
5334 : return -1;
5335 : else
5336 : {
5337 276366 : for (unsigned i = 0; i < SLP_TREE_LANES (a); ++i)
5338 276366 : if (SLP_TREE_LOAD_PERMUTATION (a)[i]
5339 276366 : != SLP_TREE_LOAD_PERMUTATION (b)[i])
5340 : {
5341 : /* In-order lane first, that's what the above case for
5342 : no permutation does. */
5343 275534 : if (SLP_TREE_LOAD_PERMUTATION (a)[i] == i)
5344 : return -1;
5345 167925 : else if (SLP_TREE_LOAD_PERMUTATION (b)[i] == i)
5346 : return 1;
5347 88830 : else if (SLP_TREE_LOAD_PERMUTATION (a)[i]
5348 88830 : < SLP_TREE_LOAD_PERMUTATION (b)[i])
5349 : return -1;
5350 : else
5351 : return 1;
5352 : }
5353 : return 0;
5354 : }
5355 : }
5356 : }
5357 : else /* Different groups or non-groups. */
5358 : {
5359 : /* Order groups as their first element to keep them together. */
5360 1938832 : if (STMT_VINFO_GROUPED_ACCESS (a0))
5361 1938832 : a0 = DR_GROUP_FIRST_ELEMENT (a0);
5362 1938832 : if (STMT_VINFO_GROUPED_ACCESS (b0))
5363 1938832 : b0 = DR_GROUP_FIRST_ELEMENT (b0);
5364 1938832 : if (a0 == b0)
5365 : return 0;
5366 : /* Tie using UID. */
5367 1938712 : else if (gimple_uid (STMT_VINFO_STMT (a0))
5368 1938712 : < gimple_uid (STMT_VINFO_STMT (b0)))
5369 : return -1;
5370 : else
5371 : {
5372 853181 : gcc_assert (gimple_uid (STMT_VINFO_STMT (a0))
5373 : != gimple_uid (STMT_VINFO_STMT (b0)));
5374 : return 1;
5375 : }
5376 : }
5377 : }
5378 :
5379 : /* Return whether if the load permutation of NODE is consecutive starting
5380 : with value START_VAL in the first element. If START_VAL is not given
5381 : the first element's value is used. */
5382 :
5383 : bool
5384 544112 : vect_load_perm_consecutive_p (slp_tree node, unsigned start_val)
5385 : {
5386 544112 : load_permutation_t perm = SLP_TREE_LOAD_PERMUTATION (node);
5387 :
5388 544112 : if (!perm.exists () || !perm.length ())
5389 : return false;
5390 :
5391 544112 : if (start_val == UINT_MAX)
5392 73984 : start_val = perm[0];
5393 :
5394 1075901 : for (unsigned int i = 0; i < perm.length (); i++)
5395 549905 : if (perm[i] != start_val + (unsigned int) i)
5396 : return false;
5397 :
5398 : return true;
5399 : }
5400 :
5401 : /* Process the set of LOADS that are all from the same dataref group. */
5402 :
static void
vect_lower_load_permutations (loop_vec_info loop_vinfo,
			      scalar_stmts_to_slp_tree_map_t *bst_map,
			      const array_slice<slp_tree> &loads,
			      bool force_single_lane)
{
  /* We at this point want to lower without a fixed VF or vector
     size in mind which means we cannot actually compute whether we
     need three or more vectors for a load permutation yet.  So always
     lower.  */
  stmt_vec_info first
    = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (loads[0])[0]);
  unsigned group_lanes = DR_GROUP_SIZE (first);

  /* Verify if all load permutations can be implemented with a suitably
     large element load-lanes operation.  A non-zero LD_LANES_LANES after
     this block means load-lanes lowering will be used; zero means the
     interleaving-permute fallback below applies.  */
  unsigned ld_lanes_lanes = SLP_TREE_LANES (loads[0]);
  if (STMT_VINFO_STRIDED_P (first)
      || compare_step_with_zero (loop_vinfo, first) <= 0
      || exact_log2 (ld_lanes_lanes) == -1
      /* ??? For now only support the single-lane case as there is
	 missing support on the store-lane side and code generation
	 isn't up to the task yet.  */
      || ld_lanes_lanes != 1
      || vect_load_lanes_supported (SLP_TREE_VECTYPE (loads[0]),
				    group_lanes / ld_lanes_lanes,
				    false) == IFN_LAST)
    ld_lanes_lanes = 0;
  else
    /* Verify the loads access the same number of lanes aligned to
       ld_lanes_lanes.  */
    for (slp_tree load : loads)
      {
	if (SLP_TREE_LANES (load) != ld_lanes_lanes)
	  {
	    ld_lanes_lanes = 0;
	    break;
	  }
	/* NOTE: this FIRST intentionally shadows the outer stmt_vec_info
	   FIRST; it is the first lane index accessed by this load.  */
	unsigned first = SLP_TREE_LOAD_PERMUTATION (load)[0];
	if (first % ld_lanes_lanes != 0)
	  {
	    ld_lanes_lanes = 0;
	    break;
	  }
	if (!vect_load_perm_consecutive_p (load))
	  {
	    ld_lanes_lanes = 0;
	    break;
	  }
      }

  /* Only a power-of-two number of lanes matches interleaving with N levels.
     ??? An even number of lanes could be reduced to 1<<ceil_log2(N)-1 lanes
     at each step.
     The group_lanes == 3 case is handled by the repeat-last-element
     fallback in the halving loop below.  */
  if (ld_lanes_lanes == 0 && exact_log2 (group_lanes) == -1 && group_lanes != 3)
    return;

  for (slp_tree load : loads)
    {
      /* Leave masked or gather loads alone for now.  */
      if (!SLP_TREE_CHILDREN (load).is_empty ())
	continue;

      /* For single-element interleaving spanning multiple vectors avoid
	 lowering, we want to use VMAT_ELEMENTWISE later.  */
      if (ld_lanes_lanes == 0
	  && SLP_TREE_LANES (load) == 1
	  && !DR_GROUP_NEXT_ELEMENT (first)
	  && maybe_gt (group_lanes,
		       TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (load))))
	return;

      /* We want to pattern-match special cases here and keep those
	 alone.  Candidates are splats and load-lane.  */

      /* We need to lower only loads of less than half of the groups
	 lanes, including duplicate lanes.  Note this leaves nodes
	 with a non-1:1 load permutation around instead of canonicalizing
	 those into a load and a permute node.  Removing this early
	 check would do such canonicalization.  */
      if (SLP_TREE_LANES (load) >= (group_lanes + 1) / 2
	  && ld_lanes_lanes == 0)
	continue;

      /* Build the permute to get the original load permutation order.
	 FINAL_PERM maps each of this load's lanes to a lane of the
	 (progressively reduced) unpermuted group node; it is rewritten
	 after each halving step below.  */
      bool contiguous = vect_load_perm_consecutive_p (load);
      lane_permutation_t final_perm;
      final_perm.create (SLP_TREE_LANES (load));
      for (unsigned i = 0; i < SLP_TREE_LANES (load); ++i)
	final_perm.quick_push (
	  std::make_pair (0, SLP_TREE_LOAD_PERMUTATION (load)[i]));

      /* When the load permutation accesses a contiguous unpermuted,
	 power-of-two aligned and sized chunk leave the load alone.
	 We can likely (re-)load it more efficiently rather than
	 extracting it from the larger load.
	 ??? Long-term some of the lowering should move to where
	 the vector types involved are fixed.  */
      if (!force_single_lane
	  && ld_lanes_lanes == 0
	  && contiguous
	  && (SLP_TREE_LANES (load) > 1 || loads.size () == 1)
	  && pow2p_hwi (SLP_TREE_LANES (load))
	  && pow2p_hwi (group_lanes)
	  && SLP_TREE_LOAD_PERMUTATION (load)[0] % SLP_TREE_LANES (load) == 0
	  && group_lanes % SLP_TREE_LANES (load) == 0)
	{
	  final_perm.release ();
	  continue;
	}

      /* First build (and possibly re-use via bst_map) a load node for the
	 unpermuted group.  Gaps in the middle and on the end are
	 represented with NULL stmts.  */
      vec<stmt_vec_info> stmts;
      stmts.create (group_lanes);
      for (stmt_vec_info s = first; s; s = DR_GROUP_NEXT_ELEMENT (s))
	{
	  if (s != first)
	    for (unsigned i = 1; i < DR_GROUP_GAP (s); ++i)
	      stmts.quick_push (NULL);
	  stmts.quick_push (s);
	}
      for (unsigned i = 0; i < DR_GROUP_GAP (first); ++i)
	stmts.quick_push (NULL);
      poly_uint64 max_nunits = 1;
      bool *matches = XALLOCAVEC (bool, group_lanes);
      unsigned limit = 1;
      unsigned tree_size = 0;
      slp_tree l0 = vect_build_slp_tree (loop_vinfo, stmts,
					 group_lanes,
					 &max_nunits, matches, &limit,
					 &tree_size, bst_map);
      /* The unpermuted whole-group load must not carry a permutation.  */
      gcc_assert (!SLP_TREE_LOAD_PERMUTATION (l0).exists ());

      if (ld_lanes_lanes != 0)
	{
	  /* ??? If this is not in sync with what get_load_store_type
	     later decides the SLP representation is not good for other
	     store vectorization methods.  */
	  l0->ldst_lanes = true;
	  load->ldst_lanes = true;
	}

      /* Iteratively halve the group node L0 with even/odd-style extraction
	 permutes until this load uses at least half of the remaining
	 lanes (or load-lanes lowering was chosen, which needs no
	 reduction).  */
      while (1)
	{
	  /* Intentionally shadows the outer GROUP_LANES: the lane count
	     of the current (already reduced) group node.  */
	  unsigned group_lanes = SLP_TREE_LANES (l0);
	  if (ld_lanes_lanes != 0
	      || SLP_TREE_LANES (load) >= (group_lanes + 1) / 2)
	    break;

	  /* Try to lower by reducing the group to half its size using an
	     interleaving scheme.  For this try to compute whether all
	     elements needed for this load are in even or odd elements of
	     an even/odd decomposition with N consecutive elements.
	     Thus { e, e, o, o, e, e, o, o } would be an even/odd decomposition
	     with N == 2.  */
	  /* ??? Only an even number of lanes can be handed this way, but the
	     fallback below could work for any number.  We have to make sure
	     to round up in that case.  */
	  gcc_assert ((group_lanes & 1) == 0 || group_lanes == 3);
	  /* EVEN/ODD are bitmasks of candidate N values (as bits); a bit
	     survives the loop if every needed lane is even (resp. odd)
	     with respect to that granularity.  */
	  unsigned even = 0, odd = 0;
	  if ((group_lanes & 1) == 0)
	    {
	      even = (1 << ceil_log2 (group_lanes)) - 1;
	      odd = even;
	      for (auto l : final_perm)
		{
		  even &= ~l.second;
		  odd &= l.second;
		}
	    }

	  /* Now build an even or odd extraction from the unpermuted load.  */
	  lane_permutation_t perm;
	  perm.create ((group_lanes + 1) / 2);
	  /* The smallest usable granularity N for each decomposition.  */
	  unsigned even_level = even ? 1 << ctz_hwi (even) : 0;
	  unsigned odd_level = odd ? 1 << ctz_hwi (odd) : 0;
	  if (even_level
	      && group_lanes % (2 * even_level) == 0
	      /* ??? When code generating permutes we do not try to pun
		 to larger component modes so level != 1 isn't a natural
		 even/odd extract.  Prefer one if possible.  */
	      && (even_level == 1 || !odd_level || odd_level != 1))
	    {
	      /* { 0, 1, ... 4, 5 ..., } */
	      for (unsigned i = 0; i < group_lanes / 2 / even_level; ++i)
		for (unsigned j = 0; j < even_level; ++j)
		  perm.quick_push (std::make_pair (0, 2 * i * even_level + j));
	    }
	  else if (odd_level)
	    {
	      /* { ..., 2, 3, ... 6, 7 } */
	      gcc_assert (group_lanes % (2 * odd_level) == 0);
	      for (unsigned i = 0; i < group_lanes / 2 / odd_level; ++i)
		for (unsigned j = 0; j < odd_level; ++j)
		  perm.quick_push
		    (std::make_pair (0, (2 * i + 1) * odd_level + j));
	    }
	  else
	    {
	      /* As fallback extract all used lanes and fill to half the
		 group size by repeating the last element.
		 ??? This is quite a bad strategy for re-use - we could
		 brute force our way to find more optimal filling lanes to
		 maximize re-use when looking at all loads from the group.  */
	      auto_bitmap l;
	      for (auto p : final_perm)
		bitmap_set_bit (l, p.second);
	      unsigned i = 0;
	      bitmap_iterator bi;
	      EXECUTE_IF_SET_IN_BITMAP (l, 0, i, bi)
		perm.quick_push (std::make_pair (0, i));
	      while (perm.length () < (group_lanes + 1) / 2)
		perm.quick_push (perm.last ());
	    }

	  /* Update final_perm with the intermediate permute: each needed
	     lane must appear in PERM by construction, so remap it to its
	     position in the reduced node.  */
	  for (unsigned i = 0; i < final_perm.length (); ++i)
	    {
	      unsigned l = final_perm[i].second;
	      unsigned j;
	      for (j = 0; j < perm.length (); ++j)
		if (perm[j].second == l)
		  {
		    final_perm[i].second = j;
		    break;
		  }
	      gcc_assert (j < perm.length ());
	    }

	  /* And create scalar stmts.  */
	  vec<stmt_vec_info> perm_stmts;
	  perm_stmts.create (perm.length ());
	  for (unsigned i = 0; i < perm.length (); ++i)
	    perm_stmts.quick_push (SLP_TREE_SCALAR_STMTS (l0)[perm[i].second]);

	  slp_tree p = vect_create_new_slp_node (1, VEC_PERM_EXPR);
	  SLP_TREE_CHILDREN (p).quick_push (l0);
	  SLP_TREE_LANE_PERMUTATION (p) = perm;
	  SLP_TREE_VECTYPE (p) = SLP_TREE_VECTYPE (load);
	  SLP_TREE_LANES (p) = perm.length ();
	  SLP_TREE_REPRESENTATIVE (p) = SLP_TREE_REPRESENTATIVE (load);
	  /* ??? As we have scalar stmts for this intermediate permute we
	     could CSE it via bst_map but we do not want to pick up
	     another SLP node with a load permutation.  We instead should
	     have a "local" CSE map here.  */
	  SLP_TREE_SCALAR_STMTS (p) = perm_stmts;

	  /* We now have a node for (group_lanes + 1) / 2 lanes.  */
	  l0 = p;
	}

      /* And finally from the ordered reduction node create the
	 permute to shuffle the lanes into the original load-permutation
	 order.  We replace the original load node with this (mutating
	 LOAD in place so existing users see the lowered form).  */
      SLP_TREE_CODE (load) = VEC_PERM_EXPR;
      SLP_TREE_LOAD_PERMUTATION (load).release ();
      SLP_TREE_LANE_PERMUTATION (load) = final_perm;
      SLP_TREE_CHILDREN (load).create (1);
      SLP_TREE_CHILDREN (load).quick_push (l0);
    }
}
5666 :
5667 : /* Transform SLP loads in the SLP graph created by SLP discovery to
5668 : group loads from the same group and lower load permutations that
5669 : are unlikely to be supported into a series of permutes.
5670 : In the degenerate case of having only single-lane SLP instances
5671 : this should result in a series of permute nodes emulating an
5672 : interleaving scheme. */
5673 :
5674 : static void
5675 405174 : vect_lower_load_permutations (loop_vec_info loop_vinfo,
5676 : scalar_stmts_to_slp_tree_map_t *bst_map,
5677 : bool force_single_lane)
5678 : {
5679 : /* Gather and sort loads across all instances. */
5680 405174 : hash_set<slp_tree> visited;
5681 405174 : auto_vec<slp_tree> loads;
5682 1877416 : for (auto inst : loop_vinfo->slp_instances)
5683 663804 : vect_gather_slp_loads (loads, SLP_INSTANCE_TREE (inst), visited);
5684 405174 : if (loads.is_empty ())
5685 70841 : return;
5686 334333 : loads.qsort (vllp_cmp);
5687 :
5688 : /* Now process each dataref group separately. */
5689 334333 : unsigned firsti = 0;
5690 621571 : for (unsigned i = 1; i < loads.length (); ++i)
5691 : {
5692 287238 : slp_tree first = loads[firsti];
5693 287238 : slp_tree next = loads[i];
5694 287238 : stmt_vec_info a0 = SLP_TREE_SCALAR_STMTS (first)[0];
5695 287238 : stmt_vec_info b0 = SLP_TREE_SCALAR_STMTS (next)[0];
5696 287238 : if (STMT_VINFO_GROUPED_ACCESS (a0)
5697 144831 : && STMT_VINFO_GROUPED_ACCESS (b0)
5698 419114 : && DR_GROUP_FIRST_ELEMENT (a0) == DR_GROUP_FIRST_ELEMENT (b0))
5699 54256 : continue;
5700 : /* Now we have one or multiple SLP loads of the same group from
5701 : firsti to i - 1. */
5702 232982 : if (STMT_VINFO_GROUPED_ACCESS (a0))
5703 90575 : vect_lower_load_permutations (loop_vinfo, bst_map,
5704 90575 : make_array_slice (&loads[firsti],
5705 : i - firsti),
5706 : force_single_lane);
5707 : firsti = i;
5708 : }
5709 668666 : if (firsti < loads.length ()
5710 668666 : && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (loads[firsti])[0]))
5711 60373 : vect_lower_load_permutations (loop_vinfo, bst_map,
5712 60373 : make_array_slice (&loads[firsti],
5713 60373 : loads.length () - firsti),
5714 : force_single_lane);
5715 405174 : }
5716 :
5717 : /* Check if there are stmts in the loop can be vectorized using SLP. Build SLP
5718 : trees of packed scalar stmts if SLP is possible. */
5719 :
5720 : opt_result
5721 1035590 : vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size,
5722 : bool force_single_lane)
5723 : {
5724 1035590 : loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5725 1035590 : unsigned int i;
5726 1035590 : stmt_vec_info first_element;
5727 1035590 : slp_instance instance;
5728 :
5729 1035590 : DUMP_VECT_SCOPE ("vect_analyze_slp");
5730 :
5731 1035590 : unsigned limit = max_tree_size;
5732 :
5733 1035590 : scalar_stmts_to_slp_tree_map_t *bst_map
5734 1035590 : = new scalar_stmts_to_slp_tree_map_t ();
5735 :
5736 : /* Find SLP sequences starting from groups of grouped stores. */
5737 2977275 : FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
5738 906333 : if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
5739 : slp_inst_kind_store, max_tree_size, &limit,
5740 : force_single_lane)
5741 906333 : && loop_vinfo)
5742 238 : return opt_result::failure_at (vect_location, "SLP build failed.\n");
5743 :
5744 : /* For loops also start SLP discovery from non-grouped stores. */
5745 1035352 : if (loop_vinfo)
5746 : {
5747 : data_reference_p dr;
5748 1369971 : FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr)
5749 947207 : if (DR_IS_WRITE (dr))
5750 : {
5751 286926 : stmt_vec_info stmt_info = vinfo->lookup_dr (dr)->stmt;
5752 : /* Grouped stores are already handled above. */
5753 286926 : if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
5754 76070 : continue;
5755 210856 : vec<stmt_vec_info> stmts;
5756 210856 : vec<stmt_vec_info> roots = vNULL;
5757 210856 : vec<tree> remain = vNULL;
5758 210856 : stmts.create (1);
5759 210856 : stmts.quick_push (stmt_info);
5760 210856 : if (! vect_build_slp_instance (vinfo, slp_inst_kind_store,
5761 : stmts, roots, remain, max_tree_size,
5762 : &limit, bst_map, force_single_lane))
5763 3585 : return opt_result::failure_at (vect_location,
5764 : "SLP build failed.\n");
5765 : }
5766 :
5767 : stmt_vec_info stmt_info;
5768 422804 : FOR_EACH_VEC_ELT (LOOP_VINFO_ALTERNATE_DEFS (loop_vinfo), i, stmt_info)
5769 : {
5770 20 : vec<stmt_vec_info> stmts;
5771 20 : vec<stmt_vec_info> roots = vNULL;
5772 20 : vec<tree> remain = vNULL;
5773 20 : stmts.create (1);
5774 20 : stmts.quick_push (stmt_info);
5775 20 : if (! vect_build_slp_instance (vinfo, slp_inst_kind_store,
5776 : stmts, roots, remain, max_tree_size,
5777 : &limit, bst_map, force_single_lane))
5778 0 : return opt_result::failure_at (vect_location,
5779 : "SLP build failed.\n");
5780 : }
5781 : }
5782 :
5783 1031767 : if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
5784 : {
5785 1809092 : for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
5786 : {
5787 1200089 : vect_location = bb_vinfo->roots[i].roots[0]->stmt;
5788 : /* Apply patterns. */
5789 3752061 : for (unsigned j = 0; j < bb_vinfo->roots[i].stmts.length (); ++j)
5790 5103944 : bb_vinfo->roots[i].stmts[j]
5791 2625048 : = vect_stmt_to_vectorize (bb_vinfo->roots[i].stmts[j]);
5792 1200089 : if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
5793 1200089 : bb_vinfo->roots[i].stmts,
5794 1200089 : bb_vinfo->roots[i].roots,
5795 1200089 : bb_vinfo->roots[i].remain,
5796 : max_tree_size, &limit, bst_map, false))
5797 : {
5798 126893 : bb_vinfo->roots[i].roots = vNULL;
5799 126893 : bb_vinfo->roots[i].remain = vNULL;
5800 : }
5801 1200089 : bb_vinfo->roots[i].stmts = vNULL;
5802 : }
5803 : }
5804 :
5805 1031767 : if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
5806 : {
5807 : /* Find SLP sequences starting from groups of reductions. */
5808 422764 : if (!vect_analyze_slp_reductions (loop_vinfo, max_tree_size, &limit,
5809 : bst_map, force_single_lane))
5810 2585 : return opt_result::failure_at (vect_location, "SLP build failed.\n");
5811 :
5812 : /* Make sure to vectorize only-live stmts, usually inductions. */
5813 1923343 : for (edge e : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
5814 1270335 : for (auto gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi);
5815 597939 : gsi_next (&gsi))
5816 : {
5817 607529 : gphi *lc_phi = *gsi;
5818 607529 : tree def = gimple_phi_arg_def_from_edge (lc_phi, e);
5819 607529 : stmt_vec_info stmt_info;
5820 607529 : if (TREE_CODE (def) == SSA_NAME
5821 496475 : && !virtual_operand_p (def)
5822 268934 : && (stmt_info = loop_vinfo->lookup_def (def))
5823 238256 : && ((stmt_info = vect_stmt_to_vectorize (stmt_info)), true)
5824 238256 : && STMT_VINFO_RELEVANT (stmt_info) == vect_used_only_live
5825 178573 : && STMT_VINFO_LIVE_P (stmt_info)
5826 178573 : && !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info))
5827 714396 : && STMT_VINFO_REDUC_IDX (stmt_info) == -1)
5828 : {
5829 106788 : vec<stmt_vec_info> stmts;
5830 106788 : vec<stmt_vec_info> roots = vNULL;
5831 106788 : vec<tree> remain = vNULL;
5832 106788 : stmts.create (1);
5833 106788 : stmts.quick_push (vect_stmt_to_vectorize (stmt_info));
5834 106788 : if (! vect_build_slp_instance (vinfo,
5835 : slp_inst_kind_reduc_group,
5836 : stmts, roots, remain,
5837 : max_tree_size, &limit,
5838 : bst_map, force_single_lane))
5839 9590 : return opt_result::failure_at (vect_location,
5840 : "SLP build failed.\n");
5841 : }
5842 9590 : }
5843 :
5844 : /* Find SLP sequences starting from gconds. */
5845 1108487 : for (auto cond : LOOP_VINFO_LOOP_CONDS (loop_vinfo))
5846 : {
5847 274535 : auto cond_info = loop_vinfo->lookup_stmt (cond);
5848 :
5849 274535 : cond_info = vect_stmt_to_vectorize (cond_info);
5850 274535 : vec<stmt_vec_info> roots = vNULL;
5851 274535 : roots.safe_push (cond_info);
5852 274535 : gimple *stmt = STMT_VINFO_STMT (cond_info);
5853 274535 : tree args0 = gimple_cond_lhs (stmt);
5854 274535 : tree args1 = gimple_cond_rhs (stmt);
5855 :
5856 : /* These should be enforced by cond lowering, but if it failed
5857 : bail. */
5858 274535 : if (gimple_cond_code (stmt) != NE_EXPR
5859 273457 : || TREE_TYPE (args0) != boolean_type_node
5860 547345 : || !integer_zerop (args1))
5861 : {
5862 1725 : roots.release ();
5863 1725 : return opt_result::failure_at (vect_location,
5864 : "SLP build failed.\n");
5865 : }
5866 :
5867 : /* An argument without a loop def will be codegened from vectorizing the
5868 : root gcond itself. As such we don't need to try to build an SLP tree
5869 : from them. It's highly likely that the resulting SLP tree here if both
5870 : arguments have a def will be incompatible, but we rely on it being split
5871 : later on. */
5872 272810 : auto varg = loop_vinfo->lookup_def (args0);
5873 272810 : vec<stmt_vec_info> stmts;
5874 272810 : vec<tree> remain = vNULL;
5875 272810 : stmts.create (1);
5876 272810 : stmts.quick_push (vect_stmt_to_vectorize (varg));
5877 :
5878 272810 : if (! vect_build_slp_instance (vinfo, slp_inst_kind_gcond,
5879 : stmts, roots, remain,
5880 : max_tree_size, &limit,
5881 : bst_map, force_single_lane))
5882 : {
5883 3690 : roots.release ();
5884 3690 : return opt_result::failure_at (vect_location,
5885 : "SLP build failed.\n");
5886 : }
5887 : }
5888 : }
5889 :
5890 1014177 : hash_set<slp_tree> visited_patterns;
5891 1014177 : slp_tree_to_load_perm_map_t perm_cache;
5892 1014177 : slp_compat_nodes_map_t compat_cache;
5893 :
5894 : /* See if any patterns can be found in the SLP tree. */
5895 1014177 : bool pattern_found = false;
5896 3473791 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
5897 1445437 : pattern_found |= vect_match_slp_patterns (instance, vinfo,
5898 : &visited_patterns, &perm_cache,
5899 : &compat_cache);
5900 :
5901 : /* If any were found optimize permutations of loads. */
5902 1014177 : if (pattern_found)
5903 : {
5904 202 : hash_map<slp_tree, slp_tree> load_map;
5905 3239 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
5906 : {
5907 2835 : slp_tree root = SLP_INSTANCE_TREE (instance);
5908 2835 : optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
5909 : &load_map, root);
5910 : }
5911 202 : }
5912 :
5913 : /* Check whether we should force some SLP instances to use load/store-lanes
5914 : and do so by forcing SLP re-discovery with single lanes. We used
5915 : to cancel SLP when this applied to all instances in a loop but now
5916 : we decide this per SLP instance. It's important to do this only
5917 : after SLP pattern recognition. */
5918 1014177 : if (is_a <loop_vec_info> (vinfo))
5919 1068978 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
5920 663804 : if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
5921 229031 : && !SLP_INSTANCE_TREE (instance)->ldst_lanes)
5922 : {
5923 229031 : slp_tree slp_root = SLP_INSTANCE_TREE (instance);
5924 229031 : unsigned int group_size = SLP_TREE_LANES (slp_root);
5925 229031 : tree vectype = SLP_TREE_VECTYPE (slp_root);
5926 :
5927 229031 : stmt_vec_info rep_info = SLP_TREE_REPRESENTATIVE (slp_root);
5928 229031 : gimple *rep = STMT_VINFO_STMT (rep_info);
5929 229031 : bool masked = (is_gimple_call (rep)
5930 1366 : && gimple_call_internal_p (rep)
5931 230377 : && internal_fn_mask_index
5932 1346 : (gimple_call_internal_fn (rep)) != -1);
5933 229011 : if (!STMT_VINFO_GROUPED_ACCESS (rep_info)
5934 23454 : || slp_root->ldst_lanes
5935 252485 : || (vect_store_lanes_supported (vectype, group_size, masked)
5936 : == IFN_LAST))
5937 229031 : continue;
5938 :
5939 0 : auto_vec<slp_tree> loads;
5940 0 : hash_set<slp_tree> visited;
5941 0 : vect_gather_slp_loads (loads, slp_root, visited);
5942 :
5943 : /* Check whether any load in the SLP instance is possibly
5944 : permuted. */
5945 0 : bool loads_permuted = false;
5946 0 : slp_tree load_node;
5947 0 : unsigned j;
5948 0 : FOR_EACH_VEC_ELT (loads, j, load_node)
5949 : {
5950 0 : if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
5951 0 : continue;
5952 : unsigned k;
5953 : stmt_vec_info load_info;
5954 0 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), k, load_info)
5955 0 : if (SLP_TREE_LOAD_PERMUTATION (load_node)[k] != k)
5956 : {
5957 : loads_permuted = true;
5958 : break;
5959 : }
5960 : }
5961 :
5962 : /* If the loads and stores can use load/store-lanes force re-discovery
5963 : with single lanes. */
5964 0 : if (loads_permuted)
5965 : {
5966 0 : bool can_use_lanes = true;
5967 : bool prefer_load_lanes = false;
5968 0 : FOR_EACH_VEC_ELT (loads, j, load_node)
5969 0 : if (STMT_VINFO_GROUPED_ACCESS
5970 : (SLP_TREE_REPRESENTATIVE (load_node)))
5971 : {
5972 0 : stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
5973 : (SLP_TREE_REPRESENTATIVE (load_node));
5974 0 : rep = STMT_VINFO_STMT (stmt_vinfo);
5975 0 : masked = (is_gimple_call (rep)
5976 0 : && gimple_call_internal_p (rep)
5977 0 : && internal_fn_mask_index
5978 0 : (gimple_call_internal_fn (rep)));
5979 : /* Use SLP for strided accesses (or if we can't
5980 : load-lanes). */
5981 0 : if (STMT_VINFO_STRIDED_P (stmt_vinfo)
5982 0 : || compare_step_with_zero (vinfo, stmt_vinfo) <= 0
5983 0 : || vect_load_lanes_supported
5984 0 : (SLP_TREE_VECTYPE (load_node),
5985 0 : DR_GROUP_SIZE (stmt_vinfo), masked) == IFN_LAST
5986 : /* ??? During SLP re-discovery with a single lane
5987 : a masked grouped load will appear permuted and
5988 : discovery will fail. We have to rework this
5989 : on the discovery side - for now avoid ICEing. */
5990 0 : || masked)
5991 : {
5992 : can_use_lanes = false;
5993 : break;
5994 : }
5995 : /* Make sure that the target would prefer store-lanes
5996 : for at least one of the loads.
5997 :
5998 : ??? Perhaps we should instead require this for
5999 : all loads? */
6000 0 : prefer_load_lanes
6001 : = (prefer_load_lanes
6002 0 : || SLP_TREE_LANES (load_node) == group_size
6003 0 : || (vect_slp_prefer_store_lanes_p
6004 0 : (vinfo, stmt_vinfo,
6005 : SLP_TREE_VECTYPE (load_node), masked,
6006 : group_size, SLP_TREE_LANES (load_node))));
6007 : }
6008 :
6009 0 : if (can_use_lanes && prefer_load_lanes)
6010 : {
6011 0 : if (dump_enabled_p ())
6012 0 : dump_printf_loc (MSG_NOTE, vect_location,
6013 : "SLP instance %p can use load/store-lanes,"
6014 : " re-discovering with single-lanes\n",
6015 : (void *) instance);
6016 :
6017 0 : stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (slp_root);
6018 :
6019 0 : vect_free_slp_instance (instance);
6020 0 : limit = max_tree_size;
6021 0 : bool res = vect_analyze_slp_instance (vinfo, bst_map,
6022 : stmt_info,
6023 : slp_inst_kind_store,
6024 : max_tree_size, &limit,
6025 : true);
6026 0 : gcc_assert (res);
6027 0 : auto new_inst = LOOP_VINFO_SLP_INSTANCES (vinfo).pop ();
6028 0 : LOOP_VINFO_SLP_INSTANCES (vinfo)[i] = new_inst;
6029 : }
6030 : }
6031 0 : }
6032 :
6033 : /* When we end up with load permutations that we cannot possibly handle,
6034 : like those requiring three vector inputs, lower them using interleaving
6035 : like schemes. */
6036 1014177 : if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
6037 : {
6038 405174 : vect_lower_load_permutations (loop_vinfo, bst_map, force_single_lane);
6039 405174 : if (dump_enabled_p ())
6040 : {
6041 19216 : dump_printf_loc (MSG_NOTE, vect_location,
6042 : "SLP graph after lowering permutations:\n");
6043 19216 : hash_set<slp_tree> visited;
6044 85782 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
6045 28159 : vect_print_slp_graph (MSG_NOTE, vect_location,
6046 : SLP_INSTANCE_TREE (instance), visited);
6047 19216 : }
6048 : }
6049 :
6050 1014177 : release_scalar_stmts_to_slp_tree_map (bst_map);
6051 :
6052 1014177 : if (pattern_found && dump_enabled_p ())
6053 : {
6054 14 : dump_printf_loc (MSG_NOTE, vect_location,
6055 : "Pattern matched SLP tree\n");
6056 14 : hash_set<slp_tree> visited;
6057 74 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
6058 32 : vect_print_slp_graph (MSG_NOTE, vect_location,
6059 : SLP_INSTANCE_TREE (instance), visited);
6060 14 : }
6061 :
6062 1014177 : return opt_result::success ();
6063 1014177 : }
6064 :
6065 : /* Estimates the cost of inserting layout changes into the SLP graph.
6066 : It can also say that the insertion is impossible. */
6067 :
struct slpg_layout_cost
{
  slpg_layout_cost () = default;
  slpg_layout_cost (sreal, bool);

  /* A sentinel cost indicating that the layout choice is not possible.
     Encoded as a maximal DEPTH, which is what is_possible tests for.  */
  static slpg_layout_cost impossible () { return { sreal::max (), 0 }; }
  /* Return true unless this cost is the impossible () sentinel.  */
  bool is_possible () const { return depth != sreal::max (); }

  bool operator== (const slpg_layout_cost &) const;
  bool operator!= (const slpg_layout_cost &) const;

  /* Compare two costs; the bool selects size (true) vs. speed (false)
     optimization, which determines which field dominates.  */
  bool is_better_than (const slpg_layout_cost &, bool) const;

  void add_parallel_cost (const slpg_layout_cost &);
  void add_serial_cost (const slpg_layout_cost &);
  void split (unsigned int);

  /* The longest sequence of layout changes needed during any traversal
     of the partition dag, weighted by execution frequency.

     This is the most important metric when optimizing for speed, since
     it helps to ensure that we keep the number of operations on
     critical paths to a minimum.  */
  sreal depth = 0;

  /* An estimate of the total number of operations needed.  It is weighted by
     execution frequency when optimizing for speed but not when optimizing for
     size.  In order to avoid double-counting, a node with a fanout of N will
     distribute 1/N of its total cost to each successor.

     This is the most important metric when optimizing for size, since
     it helps to keep the total number of operations to a minimum.  */
  sreal total = 0;
};
6102 :
6103 : /* Construct costs for a node with weight WEIGHT. A higher weight
6104 : indicates more frequent execution. IS_FOR_SIZE is true if we are
6105 : optimizing for size rather than speed. */
6106 :
6107 1163084 : slpg_layout_cost::slpg_layout_cost (sreal weight, bool is_for_size)
6108 1163952 : : depth (weight), total (is_for_size && weight > 0 ? 1 : weight)
6109 : {
6110 1163084 : }
6111 :
6112 : bool
6113 0 : slpg_layout_cost::operator== (const slpg_layout_cost &other) const
6114 : {
6115 0 : return depth == other.depth && total == other.total;
6116 : }
6117 :
6118 : bool
6119 0 : slpg_layout_cost::operator!= (const slpg_layout_cost &other) const
6120 : {
6121 0 : return !operator== (other);
6122 : }
6123 :
6124 : /* Return true if these costs are better than OTHER. IS_FOR_SIZE is
6125 : true if we are optimizing for size rather than speed. */
6126 :
6127 : bool
6128 291381 : slpg_layout_cost::is_better_than (const slpg_layout_cost &other,
6129 : bool is_for_size) const
6130 : {
6131 291381 : if (is_for_size)
6132 : {
6133 382 : if (total != other.total)
6134 159 : return total < other.total;
6135 223 : return depth < other.depth;
6136 : }
6137 : else
6138 : {
6139 290999 : if (depth != other.depth)
6140 124583 : return depth < other.depth;
6141 166416 : return total < other.total;
6142 : }
6143 : }
6144 :
6145 : /* Increase the costs to account for something with cost INPUT_COST
6146 : happening in parallel with the current costs. */
6147 :
6148 : void
6149 344137 : slpg_layout_cost::add_parallel_cost (const slpg_layout_cost &input_cost)
6150 : {
6151 344137 : depth = std::max (depth, input_cost.depth);
6152 344137 : total += input_cost.total;
6153 344137 : }
6154 :
6155 : /* Increase the costs to account for something with cost INPUT_COST
6156 : happening in series with the current costs. */
6157 :
6158 : void
6159 1401731 : slpg_layout_cost::add_serial_cost (const slpg_layout_cost &other)
6160 : {
6161 1401731 : depth += other.depth;
6162 1401731 : total += other.total;
6163 1401731 : }
6164 :
6165 : /* Split the total cost among TIMES successors or predecessors. */
6166 :
6167 : void
6168 1155497 : slpg_layout_cost::split (unsigned int times)
6169 : {
6170 1155497 : if (times > 1)
6171 483023 : total /= times;
6172 1155497 : }
6173 :
6174 : /* Information about one node in the SLP graph, for use during
6175 : vect_optimize_slp_pass. */
6176 :
struct slpg_vertex
{
  /* Construct a vertex wrapping SLP node NODE_; all other fields keep
     their defaults until the pass fills them in.  */
  slpg_vertex (slp_tree node_) : node (node_) {}

  /* The node itself.  */
  slp_tree node;

  /* Which partition the node belongs to, or -1 if none.  Nodes outside of
     partitions are flexible; they can have whichever layout consumers
     want them to have.  */
  int partition = -1;

  /* The number of nodes that directly use the result of this one
     (i.e. the number of nodes that count this one as a child).  */
  unsigned int out_degree = 0;

  /* The execution frequency of the node.  */
  sreal weight = 0;

  /* The total execution frequency of all nodes that directly use the
     result of this one.  */
  sreal out_weight = 0;
};
6200 :
6201 : /* Information about one partition of the SLP graph, for use during
6202 : vect_optimize_slp_pass. */
6203 :
struct slpg_partition_info
{
  /* The nodes in the partition occupy indices [NODE_BEGIN, NODE_END)
     of m_partitioned_nodes.  */
  unsigned int node_begin = 0;
  unsigned int node_end = 0;

  /* Which layout we've chosen to use for this partition, or -1 if
     we haven't picked one yet.  */
  int layout = -1;

  /* The number of predecessors and successors in the partition dag.
     The predecessors always have lower partition numbers and the
     successors always have higher partition numbers.

     Note that the directions of these edges are not necessarily the
     same as in the data flow graph.  For example, if an SCC has separate
     partitions for an inner loop and an outer loop, the inner loop's
     partition will have at least two incoming edges from the outer loop's
     partition: one for a live-in value and one for a live-out value.
     In data flow terms, one of these edges would also be from the outer loop
     to the inner loop, but the other would be in the opposite direction.  */
  unsigned int in_degree = 0;
  unsigned int out_degree = 0;
};
6229 :
6230 : /* Information about the costs of using a particular layout for a
6231 : particular partition. It can also say that the combination is
6232 : impossible. */
6233 :
struct slpg_partition_layout_costs
{
  /* Return true if this partition/layout combination is (still) feasible.  */
  bool is_possible () const { return internal_cost.is_possible (); }
  /* Record that the combination is infeasible.  Only INTERNAL_COST carries
     the sentinel, since is_possible checks it alone.  */
  void mark_impossible () { internal_cost = slpg_layout_cost::impossible (); }

  /* The costs inherited from predecessor partitions.  */
  slpg_layout_cost in_cost;

  /* The inherent cost of the layout within the node itself.  For example,
     this is nonzero for a load if choosing a particular layout would require
     the load to permute the loaded elements.  It is nonzero for a
     VEC_PERM_EXPR if the permutation cannot be eliminated or converted
     to full-vector moves.  */
  slpg_layout_cost internal_cost;

  /* The costs inherited from successor partitions.  */
  slpg_layout_cost out_cost;
};
6252 :
6253 : /* This class tries to optimize the layout of vectors in order to avoid
6254 : unnecessary shuffling. At the moment, the set of possible layouts are
6255 : restricted to bijective permutations.
6256 :
6257 : The goal of the pass depends on whether we're optimizing for size or
6258 : for speed. When optimizing for size, the goal is to reduce the overall
6259 : number of layout changes (including layout changes implied by things
6260 : like load permutations). When optimizing for speed, the goal is to
6261 : reduce the maximum latency attributable to layout changes on any
6262 : non-cyclical path through the data flow graph.
6263 :
6264 : For example, when optimizing a loop nest for speed, we will prefer
6265 : to make layout changes outside of a loop rather than inside of a loop,
6266 : and will prefer to make layout changes in parallel rather than serially,
6267 : even if that increases the overall number of layout changes.
6268 :
6269 : The high-level procedure is:
6270 :
6271 : (1) Build a graph in which edges go from uses (parents) to definitions
6272 : (children).
6273 :
6274 : (2) Divide the graph into a dag of strongly-connected components (SCCs).
6275 :
6276 : (3) When optimizing for speed, partition the nodes in each SCC based
6277 : on their containing cfg loop. When optimizing for size, treat
6278 : each SCC as a single partition.
6279 :
6280 : This gives us a dag of partitions. The goal is now to assign a
6281 : layout to each partition.
6282 :
6283 : (4) Construct a set of vector layouts that are worth considering.
6284 : Record which nodes must keep their current layout.
6285 :
6286 : (5) Perform a forward walk over the partition dag (from loads to stores)
6287 : accumulating the "forward" cost of using each layout. When visiting
6288 : each partition, assign a tentative choice of layout to the partition
6289 : and use that choice when calculating the cost of using a different
6290 : layout in successor partitions.
6291 :
6292 : (6) Perform a backward walk over the partition dag (from stores to loads),
6293 : accumulating the "backward" cost of using each layout. When visiting
6294 : each partition, make a final choice of layout for that partition based
6295 : on the accumulated forward costs (from (5)) and backward costs
6296 : (from (6)).
6297 :
6298 : (7) Apply the chosen layouts to the SLP graph.
6299 :
6300 : For example, consider the SLP statements:
6301 :
6302 : S1: a_1 = load
6303 : loop:
6304 : S2: a_2 = PHI<a_1, a_3>
6305 : S3: b_1 = load
6306 : S4: a_3 = a_2 + b_1
6307 : exit:
6308 : S5: a_4 = PHI<a_3>
6309 : S6: store a_4
6310 :
6311 : S2 and S4 form an SCC and are part of the same loop. Every other
6312 : statement is in a singleton SCC. In this example there is a one-to-one
6313 : mapping between SCCs and partitions and the partition dag looks like this;
6314 :
6315 : S1 S3
6316 : \ /
6317 : S2+S4
6318 : |
6319 : S5
6320 : |
6321 : S6
6322 :
6323 : S2, S3 and S4 will have a higher execution frequency than the other
6324 : statements, so when optimizing for speed, the goal is to avoid any
6325 : layout changes:
6326 :
6327 : - within S3
6328 : - within S2+S4
6329 : - on the S3->S2+S4 edge
6330 :
6331 : For example, if S3 was originally a reversing load, the goal of the
6332 : pass is to make it an unreversed load and change the layout on the
6333 : S1->S2+S4 and S2+S4->S5 edges to compensate. (Changing the layout
6334 : on S1->S2+S4 and S5->S6 would also be acceptable.)
6335 :
6336 : The difference between SCCs and partitions becomes important if we
6337 : add an outer loop:
6338 :
6339 : S1: a_1 = ...
6340 : loop1:
6341 : S2: a_2 = PHI<a_1, a_6>
6342 : S3: b_1 = load
6343 : S4: a_3 = a_2 + b_1
6344 : loop2:
6345 : S5: a_4 = PHI<a_3, a_5>
6346 : S6: c_1 = load
6347 : S7: a_5 = a_4 + c_1
6348 : exit2:
6349 : S8: a_6 = PHI<a_5>
6350 : S9: store a_6
6351 : exit1:
6352 :
6353 : Here, S2, S4, S5, S7 and S8 form a single SCC. However, when optimizing
6354 : for speed, we usually do not want restrictions in the outer loop to "infect"
6355 : the decision for the inner loop. For example, if an outer-loop node
6356 : in the SCC contains a statement with a fixed layout, that should not
6357 : prevent the inner loop from using a different layout. Conversely,
6358 : the inner loop should not dictate a layout to the outer loop: if the
6359 : outer loop does a lot of computation, then it may not be efficient to
6360 : do all of that computation in the inner loop's preferred layout.
6361 :
6362 : So when optimizing for speed, we partition the SCC into S2+S4+S8 (outer)
6363 : and S5+S7 (inner). We also try to arrange partitions so that:
6364 :
6365 : - the partition for an outer loop comes before the partition for
6366 : an inner loop
6367 :
6368 : - if a sibling loop A dominates a sibling loop B, A's partition
6369 : comes before B's
6370 :
6371 : This gives the following partition dag for the example above:
6372 :
6373 : S1 S3
6374 : \ /
6375 : S2+S4+S8 S6
6376 : | \\ /
6377 : | S5+S7
6378 : |
6379 : S9
6380 :
6381 : There are two edges from S2+S4+S8 to S5+S7: one for the edge S4->S5 and
6382 : one for a reversal of the edge S7->S8.
6383 :
6384 : The backward walk picks a layout for S5+S7 before S2+S4+S8. The choice
6385 : for S2+S4+S8 therefore has to balance the cost of using the outer loop's
6386 : preferred layout against the cost of changing the layout on entry to the
6387 : inner loop (S4->S5) and on exit from the inner loop (S7->S8 reversed).
6388 :
6389 : Although this works well when optimizing for speed, it has the downside
6390 : when optimizing for size that the choice of layout for S5+S7 is completely
6391 : independent of S9, which lessens the chance of reducing the overall number
6392 : of permutations. We therefore do not partition SCCs when optimizing
6393 : for size.
6394 :
6395 : To give a concrete example of the difference between optimizing
6396 : for size and speed, consider:
6397 :
6398 : a[0] = (b[1] << c[3]) - d[1];
6399 : a[1] = (b[0] << c[2]) - d[0];
6400 : a[2] = (b[3] << c[1]) - d[3];
6401 : a[3] = (b[2] << c[0]) - d[2];
6402 :
6403 : There are three different layouts here: one for a, one for b and d,
6404 : and one for c. When optimizing for speed it is better to permute each
6405 : of b, c and d into the order required by a, since those permutations
6406 : happen in parallel. But when optimizing for size, it is better to:
6407 :
6408 : - permute c into the same order as b
6409 : - do the arithmetic
6410 : - permute the result into the order required by a
6411 :
6412 : This gives 2 permutations rather than 3. */
6413 :
class vect_optimize_slp_pass
{
public:
  /* Prepare to optimize the SLP graph recorded in VINFO.  */
  vect_optimize_slp_pass (vec_info *vinfo) : m_vinfo (vinfo) {}
  /* Entry point; performs the steps described in the class comment above.  */
  void run ();

private:
  /* Graph building. */
  struct loop *containing_loop (slp_tree);
  bool is_cfg_latch_edge (graph_edge *);
  void build_vertices (hash_set<slp_tree> &, slp_tree);
  void build_vertices ();
  void build_graph ();

  /* Partitioning. */
  void create_partitions ();
  template<typename T> void for_each_partition_edge (unsigned int, T);

  /* Layout selection. */
  bool is_compatible_layout (slp_tree, unsigned int);
  bool is_compatible_layout (const slpg_partition_info &, unsigned int);
  int change_layout_cost (slp_tree, unsigned int, unsigned int);
  slpg_partition_layout_costs &partition_layout_costs (unsigned int,
						       unsigned int);
  void change_vec_perm_layout (slp_tree, lane_permutation_t &,
			       int, unsigned int);
  int internal_node_cost (slp_tree, int, unsigned int);
  void start_choosing_layouts ();
  bool legitimize ();

  /* Cost propagation. */
  slpg_layout_cost edge_layout_cost (graph_edge *, unsigned int,
				     unsigned int, unsigned int);
  slpg_layout_cost total_in_cost (unsigned int);
  slpg_layout_cost forward_cost (graph_edge *, unsigned int, unsigned int);
  slpg_layout_cost backward_cost (graph_edge *, unsigned int, unsigned int);
  void forward_pass ();
  void backward_pass ();

  /* Rematerialization. */
  slp_tree get_result_with_layout (slp_tree, unsigned int);
  void materialize ();

  /* Clean-up. */
  void remove_redundant_permutations ();

  /* Masked load lanes discovery. */
  void decide_masked_load_lanes ();

  /* Dumping. */
  void dump ();

  /* The vec_info whose SLP graph this pass operates on.  */
  vec_info *m_vinfo;

  /* True if we should optimize the graph for size, false if we should
     optimize it for speed.  (It wouldn't be easy to make this decision
     more locally.)  */
  bool m_optimize_size;

  /* A graph of all SLP nodes, with edges leading from uses to definitions.
     In other words, a node's predecessors are its slp_tree parents and
     a node's successors are its slp_tree children.  */
  graph *m_slpg = nullptr;

  /* The vertices of M_SLPG, indexed by slp_tree::vertex.  */
  auto_vec<slpg_vertex> m_vertices;

  /* The list of all leaves of M_SLPG, such as external definitions, constants,
     and loads.  */
  auto_vec<int> m_leafs;

  /* This array has one entry for every vector layout that we're considering.
     Element 0 is null and indicates "no change".  Other entries describe
     permutations that are inherent in the current graph and that we would
     like to reverse if possible.

     For example, a permutation { 1, 2, 3, 0 } means that something has
     effectively been permuted in that way, such as a load group
     { a[1], a[2], a[3], a[0] } (viewed as a permutation of a[0:3]).
     We'd then like to apply the reverse permutation { 3, 0, 1, 2 }
     in order to put things "back" in order.  */
  auto_vec<vec<unsigned> > m_perms;

  /* A partitioning of the nodes for which a layout must be chosen.
     Each partition represents an <SCC, cfg loop> pair; that is,
     nodes in different SCCs belong to different partitions, and nodes
     within an SCC can be further partitioned according to a containing
     cfg loop.  Partition <SCC1, L1> comes before <SCC2, L2> if:

     - SCC1 != SCC2 and SCC1 is a predecessor of SCC2 in a forward walk
       from leaves (such as loads) to roots (such as stores).

     - SCC1 == SCC2 and L1's header strictly dominates L2's header.  */
  auto_vec<slpg_partition_info> m_partitions;

  /* The list of all nodes for which a layout must be chosen.  Nodes for
     partition P come before the nodes for partition P+1.  Nodes within a
     partition are in reverse postorder.  */
  auto_vec<unsigned int> m_partitioned_nodes;

  /* Index P * num-layouts + L contains the cost of using layout L
     for partition P.  */
  auto_vec<slpg_partition_layout_costs> m_partition_layout_costs;

  /* Index N * num-layouts + L, if nonnull, is a node that provides the
     original output of node N adjusted to have layout L.  */
  auto_vec<slp_tree> m_node_layouts;
};
6521 :
6522 : /* Fill the vertices and leafs vector with all nodes in the SLP graph.
6523 : Also record whether we should optimize anything for speed rather
6524 : than size. */
6525 :
6526 : void
6527 9727212 : vect_optimize_slp_pass::build_vertices (hash_set<slp_tree> &visited,
6528 : slp_tree node)
6529 : {
6530 9727212 : unsigned i;
6531 9727212 : slp_tree child;
6532 :
6533 9727212 : if (visited.add (node))
6534 9727212 : return;
6535 :
6536 9083528 : if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
6537 : {
6538 7044187 : basic_block bb = gimple_bb (vect_orig_stmt (rep)->stmt);
6539 6262027 : if (optimize_bb_for_speed_p (bb))
6540 6144913 : m_optimize_size = false;
6541 : }
6542 :
6543 9083528 : node->vertex = m_vertices.length ();
6544 9083528 : m_vertices.safe_push (slpg_vertex (node));
6545 :
6546 9083528 : bool leaf = true;
6547 9083528 : bool force_leaf = false;
6548 16808002 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6549 7724474 : if (child)
6550 : {
6551 6970640 : leaf = false;
6552 6970640 : build_vertices (visited, child);
6553 : }
6554 : else
6555 : force_leaf = true;
6556 : /* Since SLP discovery works along use-def edges all cycles have an
6557 : entry - but there's the exception of cycles where we do not handle
6558 : the entry explicitly (but with a NULL SLP node), like some reductions
6559 : and inductions. Force those SLP PHIs to act as leafs to make them
6560 : backwards reachable. */
6561 9083528 : if (leaf || force_leaf)
6562 4525802 : m_leafs.safe_push (node->vertex);
6563 : }
6564 :
6565 : /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
6566 :
6567 : void
6568 1248430 : vect_optimize_slp_pass::build_vertices ()
6569 : {
6570 1248430 : hash_set<slp_tree> visited;
6571 1248430 : unsigned i;
6572 1248430 : slp_instance instance;
6573 1248430 : m_vertices.truncate (0);
6574 1248430 : m_leafs.truncate (0);
6575 6501862 : FOR_EACH_VEC_ELT (m_vinfo->slp_instances, i, instance)
6576 2756572 : build_vertices (visited, SLP_INSTANCE_TREE (instance));
6577 1248430 : }
6578 :
   6579                 : /* Apply (reverse) bijective PERM to VEC.  */
6580 :
6581 : template <class T>
6582 : static void
6583 190896 : vect_slp_permute (vec<unsigned> perm,
6584 : vec<T> &vec, bool reverse)
6585 : {
6586 190896 : auto_vec<T, 64> saved;
6587 190896 : saved.create (vec.length ());
6588 623222 : for (unsigned i = 0; i < vec.length (); ++i)
6589 432326 : saved.quick_push (vec[i]);
6590 :
6591 190896 : if (reverse)
6592 : {
6593 1236597 : for (unsigned i = 0; i < vec.length (); ++i)
6594 431114 : vec[perm[i]] = saved[i];
6595 621472 : for (unsigned i = 0; i < vec.length (); ++i)
6596 759695 : gcc_assert (vec[perm[i]] == saved[i]);
6597 : }
6598 : else
6599 : {
6600 3500 : for (unsigned i = 0; i < vec.length (); ++i)
6601 1212 : vec[i] = saved[perm[i]];
6602 192108 : for (unsigned i = 0; i < vec.length (); ++i)
6603 1818 : gcc_assert (vec[i] == saved[perm[i]]);
6604 : }
6605 190896 : }
6606 :
6607 : /* Return the cfg loop that contains NODE. */
6608 :
6609 : struct loop *
6610 3425412 : vect_optimize_slp_pass::containing_loop (slp_tree node)
6611 : {
6612 3425412 : stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
6613 3425412 : if (!rep)
6614 4602 : return ENTRY_BLOCK_PTR_FOR_FN (cfun)->loop_father;
6615 3821043 : return gimple_bb (vect_orig_stmt (rep)->stmt)->loop_father;
6616 : }
6617 :
6618 : /* Return true if UD (an edge from a use to a definition) is associated
6619 : with a loop latch edge in the cfg. */
6620 :
6621 : bool
6622 6970640 : vect_optimize_slp_pass::is_cfg_latch_edge (graph_edge *ud)
6623 : {
6624 6970640 : slp_tree use = m_vertices[ud->src].node;
6625 6970640 : slp_tree def = m_vertices[ud->dest].node;
6626 6970640 : if ((SLP_TREE_DEF_TYPE (use) != vect_internal_def
6627 6970640 : || SLP_TREE_PERMUTE_P (use))
6628 6679160 : || SLP_TREE_DEF_TYPE (def) != vect_internal_def)
6629 : return false;
6630 :
6631 3868608 : stmt_vec_info use_rep = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (use));
6632 3868608 : return (is_a<gphi *> (use_rep->stmt)
6633 319358 : && bb_loop_header_p (gimple_bb (use_rep->stmt))
6634 4025734 : && containing_loop (def) == containing_loop (use));
6635 : }
6636 :
6637 : /* Build the graph. Mark edges that correspond to cfg loop latch edges with
6638 : a nonnull data field. */
6639 :
6640 : void
6641 1248430 : vect_optimize_slp_pass::build_graph ()
6642 : {
6643 1248430 : m_optimize_size = true;
6644 1248430 : build_vertices ();
6645 :
6646 2496860 : m_slpg = new_graph (m_vertices.length ());
6647 12828818 : for (slpg_vertex &v : m_vertices)
6648 26805086 : for (slp_tree child : SLP_TREE_CHILDREN (v.node))
6649 7724474 : if (child)
6650 : {
6651 6970640 : graph_edge *ud = add_edge (m_slpg, v.node->vertex, child->vertex);
6652 6970640 : if (is_cfg_latch_edge (ud))
6653 148672 : ud->data = this;
6654 : }
6655 1248430 : }
6656 :
6657 : /* Return true if E corresponds to a loop latch edge in the cfg. */
6658 :
6659 : static bool
6660 3559387 : skip_cfg_latch_edges (graph_edge *e)
6661 : {
6662 3559387 : return e->data;
6663 : }
6664 :
6665 : /* Create the node partitions. */
6666 :
void
vect_optimize_slp_pass::create_partitions ()
{
  /* Calculate a postorder of the graph, ignoring edges that correspond
     to natural latch edges in the cfg.  Reading the vector from the end
     to the beginning gives the reverse postorder.  */
  auto_vec<int> initial_rpo;
  graphds_dfs (m_slpg, &m_leafs[0], m_leafs.length (), &initial_rpo,
	       false, NULL, skip_cfg_latch_edges);
  gcc_assert (initial_rpo.length () == m_vertices.length ());

  /* Calculate the strongly connected components of the graph.  */
  auto_vec<int> scc_grouping;
  unsigned int num_sccs = graphds_scc (m_slpg, NULL, NULL, &scc_grouping);

  /* Create a new index order in which all nodes from the same SCC are
     consecutive.  Use scc_pos to record the index of the first node in
     each SCC.  */
  auto_vec<unsigned int> scc_pos (num_sccs);
  int last_component = -1;
  unsigned int node_count = 0;
  /* scc_grouping lists each SCC's nodes consecutively, so a change in
     component number marks the start of the next SCC.  */
  for (unsigned int node_i : scc_grouping)
    {
      if (last_component != m_slpg->vertices[node_i].component)
	{
	  last_component = m_slpg->vertices[node_i].component;
	  gcc_assert (last_component == int (scc_pos.length ()));
	  scc_pos.quick_push (node_count);
	}
      node_count += 1;
    }
  gcc_assert (node_count == initial_rpo.length ()
	      && last_component + 1 == int (num_sccs));

  /* Use m_partitioned_nodes to group nodes into SCC order, with the nodes
     inside each SCC following the RPO we calculated above.  The fact that
     we ignored natural latch edges when calculating the RPO should ensure
     that, for natural loop nests:

     - the first node that we encounter in a cfg loop is the loop header phi
     - the loop header phis are in dominance order

     Arranging for this is an optimization (see below) rather than a
     correctness issue.  Unnatural loops with a tangled mess of backedges
     will still work correctly, but might give poorer results.

     Also update scc_pos so that it gives 1 + the index of the last node
     in the SCC.  */
  m_partitioned_nodes.safe_grow (node_count);
  /* Walk initial_rpo backwards (i.e. in reverse postorder) and use each
     scc_pos entry as a per-SCC insertion cursor; the post-increments
     leave scc_pos holding the end indices described above.  */
  for (unsigned int old_i = initial_rpo.length (); old_i-- > 0;)
    {
      unsigned int node_i = initial_rpo[old_i];
      unsigned int new_i = scc_pos[m_slpg->vertices[node_i].component]++;
      m_partitioned_nodes[new_i] = node_i;
    }

  /* When optimizing for speed, partition each SCC based on the containing
     cfg loop.  The order we constructed above should ensure that, for natural
     cfg loops, we'll create sub-SCC partitions for outer loops before
     the corresponding sub-SCC partitions for inner loops.  Similarly,
     when one sibling loop A dominates another sibling loop B, we should
     create a sub-SCC partition for A before a sub-SCC partition for B.

     As above, nothing depends for correctness on whether this achieves
     a natural nesting, but we should get better results when it does.  */
  m_partitions.reserve (m_vertices.length ());
  unsigned int next_partition_i = 0;
  hash_map<struct loop *, int> loop_partitions;
  unsigned int rpo_begin = 0;
  unsigned int num_partitioned_nodes = 0;
  for (unsigned int rpo_end : scc_pos)
    {
      loop_partitions.empty ();
      unsigned int partition_i = next_partition_i;
      for (unsigned int rpo_i = rpo_begin; rpo_i < rpo_end; ++rpo_i)
	{
	  /* Handle externals and constants optimistically throughout.
	     But treat existing vectors as fixed since we do not handle
	     permuting them.  */
	  unsigned int node_i = m_partitioned_nodes[rpo_i];
	  auto &vertex = m_vertices[node_i];
	  if ((SLP_TREE_DEF_TYPE (vertex.node) == vect_external_def
	       && !SLP_TREE_VEC_DEFS (vertex.node).exists ())
	      || SLP_TREE_DEF_TYPE (vertex.node) == vect_constant_def)
	    /* Partition -1 means the node belongs to no partition and is
	       excluded from the layout choice below.  */
	    vertex.partition = -1;
	  else
	    {
	      bool existed;
	      if (m_optimize_size)
		/* For size, put the whole SCC into one partition.  */
		existed = next_partition_i > partition_i;
	      else
		{
		  /* For speed, use one sub-SCC partition per containing
		     cfg loop.  */
		  struct loop *loop = containing_loop (vertex.node);
		  auto &entry = loop_partitions.get_or_insert (loop, &existed);
		  if (!existed)
		    entry = next_partition_i;
		  partition_i = entry;
		}
	      if (!existed)
		{
		  m_partitions.quick_push (slpg_partition_info ());
		  next_partition_i += 1;
		}
	      vertex.partition = partition_i;
	      num_partitioned_nodes += 1;
	      /* node_end doubles as a node counter here; it is turned
		 into a real end index by the loops below.  */
	      m_partitions[partition_i].node_end += 1;
	    }
	}
      rpo_begin = rpo_end;
    }

  /* Assign ranges of consecutive node indices to each partition,
     in partition order.  Start with node_end being the same as
     node_begin so that the next loop can use it as a counter.  */
  unsigned int node_begin = 0;
  for (auto &partition : m_partitions)
    {
      partition.node_begin = node_begin;
      node_begin += partition.node_end;
      partition.node_end = partition.node_begin;
    }
  gcc_assert (node_begin == num_partitioned_nodes);

  /* Finally build the list of nodes in partition order.  */
  m_partitioned_nodes.truncate (num_partitioned_nodes);
  for (unsigned int node_i = 0; node_i < m_vertices.length (); ++node_i)
    {
      int partition_i = m_vertices[node_i].partition;
      if (partition_i >= 0)
	{
	  unsigned int order_i = m_partitions[partition_i].node_end++;
	  m_partitioned_nodes[order_i] = node_i;
	}
    }
}
6802 :
6803 : /* Look for edges from earlier partitions into node NODE_I and edges from
6804 : node NODE_I into later partitions. Call:
6805 :
6806 : FN (ud, other_node_i)
6807 :
6808 : for each such use-to-def edge ud, where other_node_i is the node at the
6809 : other end of the edge. */
6810 :
6811 : template<typename T>
6812 : void
6813 3524504 : vect_optimize_slp_pass::for_each_partition_edge (unsigned int node_i, T fn)
6814 : {
6815 3524504 : int partition_i = m_vertices[node_i].partition;
6816 3524504 : for (graph_edge *pred = m_slpg->vertices[node_i].pred;
6817 5994510 : pred; pred = pred->pred_next)
6818 : {
6819 2470006 : int src_partition_i = m_vertices[pred->src].partition;
6820 2470006 : if (src_partition_i >= 0 && src_partition_i != partition_i)
6821 2244276 : fn (pred, pred->src);
6822 : }
6823 3524504 : for (graph_edge *succ = m_slpg->vertices[node_i].succ;
6824 7547774 : succ; succ = succ->succ_next)
6825 : {
6826 4023270 : int dest_partition_i = m_vertices[succ->dest].partition;
6827 4023270 : if (dest_partition_i >= 0 && dest_partition_i != partition_i)
6828 2266053 : fn (succ, succ->dest);
6829 : }
6830 3524504 : }
6831 :
6832 : /* Return true if layout LAYOUT_I is compatible with the number of SLP lanes
6833 : that NODE would operate on. This test is independent of NODE's actual
6834 : operation. */
6835 :
6836 : bool
6837 1574170 : vect_optimize_slp_pass::is_compatible_layout (slp_tree node,
6838 : unsigned int layout_i)
6839 : {
6840 1574170 : if (layout_i == 0)
6841 : return true;
6842 :
6843 912258 : if (SLP_TREE_LANES (node) != m_perms[layout_i].length ())
6844 11596 : return false;
6845 :
6846 : return true;
6847 : }
6848 :
6849 : /* Return true if layout LAYOUT_I is compatible with the number of SLP lanes
6850 : that NODE would operate on for each NODE in PARTITION.
6851 : This test is independent of NODE's actual operations. */
6852 :
6853 : bool
6854 17263 : vect_optimize_slp_pass::is_compatible_layout (const slpg_partition_info
6855 : &partition,
6856 : unsigned int layout_i)
6857 : {
6858 34760 : for (unsigned int order_i = partition.node_begin;
6859 34760 : order_i < partition.node_end; ++order_i)
6860 : {
6861 17563 : unsigned int node_i = m_partitioned_nodes[order_i];
6862 17563 : auto &vertex = m_vertices[node_i];
6863 :
6864 : /* The layout is incompatible if it is individually incompatible
6865 : with any node in the partition. */
6866 17563 : if (!is_compatible_layout (vertex.node, layout_i))
6867 : return false;
6868 : }
6869 : return true;
6870 : }
6871 :
/* Return the cost (in arbitrary units) of going from layout FROM_LAYOUT_I
6873 : to layout TO_LAYOUT_I for a node like NODE. Return -1 if either of the
6874 : layouts is incompatible with NODE or if the change is not possible for
6875 : some other reason.
6876 :
6877 : The properties taken from NODE include the number of lanes and the
6878 : vector type. The actual operation doesn't matter. */
6879 :
int
vect_optimize_slp_pass::change_layout_cost (slp_tree node,
					    unsigned int from_layout_i,
					    unsigned int to_layout_i)
{
  if (!is_compatible_layout (node, from_layout_i)
      || !is_compatible_layout (node, to_layout_i))
    return -1;

  if (from_layout_i == to_layout_i)
    return 0;

  /* Model the change as a single-input VEC_PERM_EXPR on NODE and ask
     the target how many permute operations it would need.  */
  auto_vec<slp_tree, 1> children (1);
  children.quick_push (node);
  auto_lane_permutation_t perm (SLP_TREE_LANES (node));
  /* Start with the lane selection implied by FROM_LAYOUT_I (the
     identity when FROM_LAYOUT_I is 0)...  */
  if (from_layout_i > 0)
    for (unsigned int i : m_perms[from_layout_i])
      perm.quick_push ({ 0, i });
  else
    for (unsigned int i = 0; i < SLP_TREE_LANES (node); ++i)
      perm.quick_push ({ 0, i });
  /* ...then reorder it so that the output has layout TO_LAYOUT_I.  */
  if (to_layout_i > 0)
    vect_slp_permute (m_perms[to_layout_i], perm, true);
  auto count = vectorizable_slp_permutation_1 (m_vinfo, nullptr, node, perm,
					       children, false);
  if (count >= 0)
    /* Charge at least one unit: a layout change is never free.  */
    return MAX (count, 1);

  /* ??? In principle we could try changing via layout 0, giving two
     layout changes rather than 1.  Doing that would require
     corresponding support in get_result_with_layout.  */
  return -1;
}
6913 :
6914 : /* Return the costs of assigning layout LAYOUT_I to partition PARTITION_I. */
6915 :
6916 : inline slpg_partition_layout_costs &
6917 972710 : vect_optimize_slp_pass::partition_layout_costs (unsigned int partition_i,
6918 : unsigned int layout_i)
6919 : {
6920 1945420 : return m_partition_layout_costs[partition_i * m_perms.length () + layout_i];
6921 : }
6922 :
6923 : /* Change PERM in one of two ways:
6924 :
6925 : - if IN_LAYOUT_I < 0, accept input operand I in the layout that has been
6926 : chosen for child I of NODE.
6927 :
   - if IN_LAYOUT_I >= 0, accept all input operands with that layout.

   In both cases, arrange for the output to have layout OUT_LAYOUT_I.  */
6931 :
void
vect_optimize_slp_pass::
change_vec_perm_layout (slp_tree node, lane_permutation_t &perm,
			int in_layout_i, unsigned int out_layout_i)
{
  for (auto &entry : perm)
    {
      int this_in_layout_i = in_layout_i;
      if (this_in_layout_i < 0)
	{
	  /* IN_LAYOUT_I < 0: use the layout chosen for the partition
	     that contains this particular input of NODE.  */
	  slp_tree in_node = SLP_TREE_CHILDREN (node)[entry.first];
	  unsigned int in_partition_i = m_vertices[in_node->vertex].partition;
	  /* Inputs without a partition keep their original element
	     order, so the selection needs no adjustment.  */
	  if (in_partition_i == -1u)
	    continue;
	  this_in_layout_i = m_partitions[in_partition_i].layout;
	}
      if (this_in_layout_i > 0)
	/* Redirect the lane selection to where the wanted element
	   lives under the input's layout.  */
	entry.second = m_perms[this_in_layout_i][entry.second];
    }
  /* Finally rearrange the output lanes to match OUT_LAYOUT_I.  */
  if (out_layout_i > 0)
    vect_slp_permute (m_perms[out_layout_i], perm, true);
}
6954 :
6955 : /* Check whether the target allows NODE to be rearranged so that the node's
6956 : output has layout OUT_LAYOUT_I. Return the cost of the change if so,
6957 : in the same arbitrary units as for change_layout_cost. Return -1 otherwise.
6958 :
6959 : If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I < 0, also check whether
6960 : NODE can adapt to the layout changes that have (perhaps provisionally)
6961 : been chosen for NODE's children, so that no extra permutations are
6962 : needed on either the input or the output of NODE.
6963 :
6964 : If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I >= 0, instead assume
6965 : that all inputs will be forced into layout IN_LAYOUT_I beforehand.
6966 :
6967 : IN_LAYOUT_I has no meaning for other types of node.
6968 :
6969 : Keeping the node as-is is always valid. If the target doesn't appear
6970 : to support the node as-is, but might realistically support other layouts,
6971 : then layout 0 instead has the cost of a worst-case permutation. On the
6972 : one hand, this ensures that every node has at least one valid layout,
6973 : avoiding what would otherwise be an awkward special case. On the other,
6974 : it still encourages the pass to change an invalid pre-existing layout
6975 : choice into a valid one. */
6976 :
int
vect_optimize_slp_pass::internal_node_cost (slp_tree node, int in_layout_i,
					    unsigned int out_layout_i)
{
  const int fallback_cost = 1;

  /* VEC_PERM_EXPR nodes: fold the layout changes into the node's own
     permutation and ask the target whether the result is supported.  */
  if (SLP_TREE_PERMUTE_P (node))
    {
      auto_lane_permutation_t tmp_perm;
      tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));

      /* Check that the child nodes support the chosen layout.  Checking
	 the first child is enough, since any second child would have the
	 same shape.  */
      auto first_child = SLP_TREE_CHILDREN (node)[0];
      if (in_layout_i > 0
	  && !is_compatible_layout (first_child, in_layout_i))
	return -1;

      change_vec_perm_layout (node, tmp_perm, in_layout_i, out_layout_i);
      int count = vectorizable_slp_permutation_1 (m_vinfo, nullptr,
						  node, tmp_perm,
						  SLP_TREE_CHILDREN (node),
						  false);
      if (count < 0)
	{
	  if (in_layout_i == 0 && out_layout_i == 0)
	    {
	      /* Use the fallback cost if the node could in principle support
		 some nonzero layout for both the inputs and the outputs.
		 Otherwise assume that the node will be rejected later
		 and rebuilt from scalars.  */
	      if (SLP_TREE_LANES (node) == SLP_TREE_LANES (first_child))
		return fallback_cost;
	      return 0;
	    }
	  return -1;
	}

      /* We currently have no way of telling whether the new layout is cheaper
	 or more expensive than the old one.  But at least in principle,
	 it should be worth making zero permutations (whole-vector shuffles)
	 cheaper than real permutations, in case the pass is able to remove
	 the latter.  */
      return count == 0 ? 0 : 1;
    }

  /* Permuted loads: apply OUT_LAYOUT_I to the load permutation and check
     whether the combined permutation is still implementable.  */
  stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
  if (rep
      && STMT_VINFO_DATA_REF (rep)
      && DR_IS_READ (STMT_VINFO_DATA_REF (rep))
      && SLP_TREE_LOAD_PERMUTATION (node).exists ())
    {
      auto_load_permutation_t tmp_perm;
      tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
      if (out_layout_i > 0)
	vect_slp_permute (m_perms[out_layout_i], tmp_perm, true);

      poly_uint64 vf = 1;
      if (auto loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo))
	vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
      unsigned int n_perms;
      /* Cost-only query (no statement generation) of the permuted load.  */
      if (!vect_transform_slp_perm_load_1 (m_vinfo, node, tmp_perm, vNULL,
					   nullptr, vf, true, false, &n_perms))
	{
	  auto rep = SLP_TREE_REPRESENTATIVE (node);
	  if (out_layout_i == 0)
	    {
	      /* Use the fallback cost if the load is an N-to-N permutation.
		 Otherwise assume that the node will be rejected later
		 and rebuilt from scalars.  */
	      if (STMT_VINFO_GROUPED_ACCESS (rep)
		  && (DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (rep))
		      == SLP_TREE_LANES (node)))
		return fallback_cost;
	      return 0;
	    }
	  return -1;
	}

      /* See the comment above the corresponding VEC_PERM_EXPR handling.  */
      return n_perms == 0 ? 0 : 1;
    }

  /* All other nodes accept any layout without internal cost.  */
  return 0;
}
7063 :
7064 : /* Decide which element layouts we should consider using. Calculate the
7065 : weights associated with inserting layout changes on partition edges.
7066 : Also mark partitions that cannot change layout, by setting their
7067 : layout to zero. */
7068 :
void
vect_optimize_slp_pass::start_choosing_layouts ()
{
  /* Used to assign unique permutation indices.  */
  using perm_hash = unbounded_hashmap_traits<
    vec_free_hash_base<int_hash_base<unsigned>>,
    int_hash<int, -1, -2>
  >;
  hash_map<vec<unsigned>, int, perm_hash> layout_ids;

  /* Layout 0 is "no change".  */
  m_perms.safe_push (vNULL);

  /* Create layouts from existing permutations.  */
  auto_load_permutation_t tmp_perm;
  for (unsigned int node_i : m_partitioned_nodes)
    {
      /* Leafs also double as entries to the reverse graph.  Allow the
	 layout of those to be changed.  */
      auto &vertex = m_vertices[node_i];
      auto &partition = m_partitions[vertex.partition];
      if (!m_slpg->vertices[node_i].succ)
	partition.layout = 0;

      /* Loads and VEC_PERM_EXPRs are the only things generating permutes.  */
      slp_tree node = vertex.node;
      stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
      slp_tree child;
      unsigned HOST_WIDE_INT imin, imax = 0;
      bool any_permute = false;
      tmp_perm.truncate (0);
      if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
	{
	  /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the node
	     unpermuted, record a layout that reverses this permutation.

	     We would need more work to cope with loads that are internally
	     permuted and also have inputs (such as masks for
	     IFN_MASK_LOADs).  */
	  gcc_assert (partition.layout == 0 && !m_slpg->vertices[node_i].succ);
	  if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
	    {
	      partition.layout = -1;
	      continue;
	    }
	  dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
	  /* Start IMIN above every valid group index so that the MIN
	     loop below finds the smallest index actually used.  */
	  imin = DR_GROUP_SIZE (dr_stmt) + 1;
	  tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
	}
      else if (SLP_TREE_PERMUTE_P (node)
	       && SLP_TREE_CHILDREN (node).length () == 1
	       && (child = SLP_TREE_CHILDREN (node)[0])
	       && (TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (child))
		   .is_constant (&imin)))
	{
	  /* If the child has the same vector size as this node,
	     reversing the permutation can make the permutation a no-op.
	     In other cases it can change a true permutation into a
	     full-vector extract.  */
	  tmp_perm.reserve (SLP_TREE_LANES (node));
	  for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
	    tmp_perm.quick_push (SLP_TREE_LANE_PERMUTATION (node)[j].second);
	}
      else
	continue;

      /* Compute the index range used and check whether the selection
	 is already in consecutive ascending order.  */
      for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
	{
	  unsigned idx = tmp_perm[j];
	  imin = MIN (imin, idx);
	  imax = MAX (imax, idx);
	  if (idx - tmp_perm[0] != j)
	    any_permute = true;
	}
      /* If the span doesn't match we'd disrupt VF computation, avoid
	 that for now.  */
      if (imax - imin + 1 != SLP_TREE_LANES (node))
	continue;
      /* If there's no permute no need to split one out.  In this case
	 we can consider turning a load into a permuted load, if that
	 turns out to be cheaper than alternatives.  */
      if (!any_permute)
	{
	  partition.layout = -1;
	  continue;
	}

      /* For now only handle true permutes, like
	 vect_attempt_slp_rearrange_stmts did.  This allows us to be lazy
	 when permuting constants and invariants keeping the permute
	 bijective.  */
      auto_sbitmap load_index (SLP_TREE_LANES (node));
      bitmap_clear (load_index);
      for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
	bitmap_set_bit (load_index, tmp_perm[j] - imin);
      unsigned j;
      for (j = 0; j < SLP_TREE_LANES (node); ++j)
	if (!bitmap_bit_p (load_index, j))
	  break;
      if (j != SLP_TREE_LANES (node))
	continue;

      /* Normalize the permutation so that it is based at zero.  */
      vec<unsigned> perm = vNULL;
      perm.safe_grow (SLP_TREE_LANES (node), true);
      for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
	perm[j] = tmp_perm[j] - imin;

      if (int (m_perms.length ()) >= param_vect_max_layout_candidates)
	{
	  /* Continue to use existing layouts, but don't add any more.  */
	  int *entry = layout_ids.get (perm);
	  partition.layout = entry ? *entry : 0;
	  perm.release ();
	}
      else
	{
	  bool existed;
	  int &layout_i = layout_ids.get_or_insert (perm, &existed);
	  if (existed)
	    perm.release ();
	  else
	    {
	      layout_i = m_perms.length ();
	      m_perms.safe_push (perm);
	    }
	  partition.layout = layout_i;
	}
    }

  /* Initially assume that every layout is possible and has zero cost
     in every partition.  */
  m_partition_layout_costs.safe_grow_cleared (m_partitions.length ()
					      * m_perms.length ());

  /* We have to mark outgoing permutations facing non-associating-reduction
     graph entries that are not represented as to be materialized.
     slp_inst_kind_bb_reduc currently only covers associatable reductions.  */
  for (slp_instance instance : m_vinfo->slp_instances)
    if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor)
      {
	unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
	m_partitions[m_vertices[node_i].partition].layout = 0;
      }
    else if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_reduc_chain)
      {
	stmt_vec_info stmt_info
	  = SLP_TREE_REPRESENTATIVE (SLP_INSTANCE_TREE (instance));
	vect_reduc_info reduc_info
	  = info_for_reduction (as_a <loop_vec_info> (m_vinfo),
				SLP_INSTANCE_TREE (instance));
	/* Fold-left reductions are order-sensitive, so their entry node
	   must keep the original layout.  */
	if (needs_fold_left_reduction_p (TREE_TYPE
					   (gimple_get_lhs (stmt_info->stmt)),
					 VECT_REDUC_INFO_CODE (reduc_info)))
	  {
	    unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
	    m_partitions[m_vertices[node_i].partition].layout = 0;
	  }
      }

  /* Check which layouts each node and partition can handle.  Calculate the
     weights associated with inserting layout changes on edges.  */
  for (unsigned int node_i : m_partitioned_nodes)
    {
      auto &vertex = m_vertices[node_i];
      auto &partition = m_partitions[vertex.partition];
      slp_tree node = vertex.node;

      if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
	{
	  vertex.weight = vect_slp_node_weight (node);

	  /* We do not handle stores with a permutation, so all
	     incoming permutations must have been materialized.

	     We also don't handle masked grouped loads, which lack a
	     permutation vector.  In this case the memory locations
	     form an implicit second input to the loads, on top of the
	     explicit mask input, and the memory input's layout cannot
	     be changed.

	     On the other hand, we do support permuting gather loads and
	     masked gather loads, where each scalar load is independent
	     of the others.  This can be useful if the address/index input
	     benefits from permutation.  */
	  if (STMT_VINFO_DATA_REF (rep)
	      && STMT_VINFO_GROUPED_ACCESS (rep)
	      && !SLP_TREE_LOAD_PERMUTATION (node).exists ())
	    partition.layout = 0;

	  /* We cannot change the layout of an operation that is
	     not independent on lanes.  Note this is an explicit
	     negative list since that's much shorter than the respective
	     positive one but it's critical to keep maintaining it.  */
	  if (is_gimple_call (STMT_VINFO_STMT (rep)))
	    switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep)))
	      {
	      case CFN_COMPLEX_ADD_ROT90:
	      case CFN_COMPLEX_ADD_ROT270:
	      case CFN_COMPLEX_MUL:
	      case CFN_COMPLEX_MUL_CONJ:
	      case CFN_VEC_ADDSUB:
	      case CFN_VEC_FMADDSUB:
	      case CFN_VEC_FMSUBADD:
		partition.layout = 0;
	      default:;
	      }
	}

      auto process_edge = [&](graph_edge *ud, unsigned int other_node_i)
	{
	  auto &other_vertex = m_vertices[other_node_i];

	  /* Count the number of edges from earlier partitions and the number
	     of edges to later partitions.  */
	  if (other_vertex.partition < vertex.partition)
	    partition.in_degree += 1;
	  else
	    partition.out_degree += 1;

	  /* If the current node uses the result of OTHER_NODE_I, accumulate
	     the effects of that.  */
	  if (ud->src == int (node_i))
	    {
	      other_vertex.out_weight += vertex.weight;
	      other_vertex.out_degree += 1;
	    }
	};
      for_each_partition_edge (node_i, process_edge);
    }
}
7299 :
7300 : /* Return the incoming costs for node NODE_I, assuming that each input keeps
7301 : its current (provisional) choice of layout. The inputs do not necessarily
7302 : have the same layout as each other. */
7303 :
7304 : slpg_layout_cost
7305 3116 : vect_optimize_slp_pass::total_in_cost (unsigned int node_i)
7306 : {
7307 3116 : auto &vertex = m_vertices[node_i];
7308 3116 : slpg_layout_cost cost;
7309 11365 : auto add_cost = [&](graph_edge *, unsigned int other_node_i)
7310 : {
7311 8249 : auto &other_vertex = m_vertices[other_node_i];
7312 8249 : if (other_vertex.partition < vertex.partition)
7313 : {
7314 5228 : auto &other_partition = m_partitions[other_vertex.partition];
7315 10456 : auto &other_costs = partition_layout_costs (other_vertex.partition,
7316 5228 : other_partition.layout);
7317 5228 : slpg_layout_cost this_cost = other_costs.in_cost;
7318 5228 : this_cost.add_serial_cost (other_costs.internal_cost);
7319 5228 : this_cost.split (other_partition.out_degree);
7320 5228 : cost.add_parallel_cost (this_cost);
7321 : }
7322 11365 : };
7323 3116 : for_each_partition_edge (node_i, add_cost);
7324 3116 : return cost;
7325 : }
7326 :
7327 : /* Return the cost of switching between layout LAYOUT1_I (at node NODE1_I)
7328 : and layout LAYOUT2_I on cross-partition use-to-def edge UD. Return
7329 : slpg_layout_cost::impossible () if the change isn't possible. */
7330 :
slpg_layout_cost
vect_optimize_slp_pass::
edge_layout_cost (graph_edge *ud, unsigned int node1_i, unsigned int layout1_i,
		  unsigned int layout2_i)
{
  auto &def_vertex = m_vertices[ud->dest];
  auto &use_vertex = m_vertices[ud->src];
  /* NODE1_I identifies which end of UD LAYOUT1_I belongs to; assign the
     two layouts to the definition and use ends accordingly.  */
  auto def_layout_i = ud->dest == int (node1_i) ? layout1_i : layout2_i;
  auto use_layout_i = ud->dest == int (node1_i) ? layout2_i : layout1_i;
  auto factor = change_layout_cost (def_vertex.node, def_layout_i,
				    use_layout_i);
  if (factor < 0)
    return slpg_layout_cost::impossible ();

  /* We have a choice of putting the layout change at the site of the
     definition or at the site of the use.  Prefer the former when
     optimizing for size or when the execution frequency of the
     definition is no greater than the combined execution frequencies of
     the uses.  When putting the layout change at the site of the definition,
     divvy up the cost among all consumers.  */
  if (m_optimize_size || def_vertex.weight <= def_vertex.out_weight)
    {
      slpg_layout_cost cost = { def_vertex.weight * factor, m_optimize_size };
      cost.split (def_vertex.out_degree);
      return cost;
    }
  /* Otherwise charge the full cost at the use site.  */
  return { use_vertex.weight * factor, m_optimize_size };
}
7359 :
7360 : /* UD represents a use-def link between FROM_NODE_I and a node in a later
7361 : partition; FROM_NODE_I could be the definition node or the use node.
7362 : The node at the other end of the link wants to use layout TO_LAYOUT_I.
7363 : Return the cost of any necessary fix-ups on edge UD, or return
7364 : slpg_layout_cost::impossible () if the change isn't possible.
7365 :
7366 : At this point, FROM_NODE_I's partition has chosen the cheapest
7367 : layout based on the information available so far, but this choice
7368 : is only provisional. */
7369 :
slpg_layout_cost
vect_optimize_slp_pass::forward_cost (graph_edge *ud, unsigned int from_node_i,
				      unsigned int to_layout_i)
{
  auto &from_vertex = m_vertices[from_node_i];
  unsigned int from_partition_i = from_vertex.partition;
  slpg_partition_info &from_partition = m_partitions[from_partition_i];
  gcc_assert (from_partition.layout >= 0);

  /* First calculate the cost on the assumption that FROM_PARTITION sticks
     with its current layout preference.  */
  slpg_layout_cost cost = slpg_layout_cost::impossible ();
  auto edge_cost = edge_layout_cost (ud, from_node_i,
				     from_partition.layout, to_layout_i);
  if (edge_cost.is_possible ())
    {
      auto &from_costs = partition_layout_costs (from_partition_i,
						 from_partition.layout);
      cost = from_costs.in_cost;
      cost.add_serial_cost (from_costs.internal_cost);
      /* Only charge this edge's share of FROM_PARTITION's cost.  */
      cost.split (from_partition.out_degree);
      cost.add_serial_cost (edge_cost);
    }
  else if (from_partition.layout == 0)
    /* We must allow the source partition to have layout 0 as a fallback,
       in case all other options turn out to be impossible.  */
    return cost;

  /* Take the minimum of that cost and the cost that applies if
     FROM_PARTITION instead switches to TO_LAYOUT_I.  */
  auto &direct_layout_costs = partition_layout_costs (from_partition_i,
						      to_layout_i);
  if (direct_layout_costs.is_possible ())
    {
      slpg_layout_cost direct_cost = direct_layout_costs.in_cost;
      direct_cost.add_serial_cost (direct_layout_costs.internal_cost);
      direct_cost.split (from_partition.out_degree);
      /* No edge cost here: both ends would then share TO_LAYOUT_I.  */
      if (!cost.is_possible ()
	  || direct_cost.is_better_than (cost, m_optimize_size))
	cost = direct_cost;
    }

  return cost;
}
7414 :
7415 : /* UD represents a use-def link between TO_NODE_I and a node in an earlier
7416 : partition; TO_NODE_I could be the definition node or the use node.
7417 : The node at the other end of the link wants to use layout FROM_LAYOUT_I;
7418 : return the cost of any necessary fix-ups on edge UD, or
7419 : slpg_layout_cost::impossible () if the choice cannot be made.
7420 :
7421 : At this point, TO_NODE_I's partition has a fixed choice of layout. */
7422 :
slpg_layout_cost
vect_optimize_slp_pass::backward_cost (graph_edge *ud, unsigned int to_node_i,
				       unsigned int from_layout_i)
{
  auto &to_vertex = m_vertices[to_node_i];
  unsigned int to_partition_i = to_vertex.partition;
  slpg_partition_info &to_partition = m_partitions[to_partition_i];
  gcc_assert (to_partition.layout >= 0);

  /* If TO_NODE_I is a VEC_PERM_EXPR consumer, see whether it can be
     adjusted for this input having layout FROM_LAYOUT_I.  Assume that
     any other inputs keep their current choice of layout.  */
  auto &to_costs = partition_layout_costs (to_partition_i,
					   to_partition.layout);
  if (ud->src == int (to_node_i)
      && SLP_TREE_PERMUTE_P (to_vertex.node))
    {
      auto &from_partition = m_partitions[m_vertices[ud->dest].partition];
      auto old_layout = from_partition.layout;
      /* Temporarily pretend the input partition uses FROM_LAYOUT_I so
	 that internal_node_cost sees that layout for this input, then
	 restore the original choice.  */
      from_partition.layout = from_layout_i;
      int factor = internal_node_cost (to_vertex.node, -1,
				       to_partition.layout);
      from_partition.layout = old_layout;
      if (factor >= 0)
	{
	  /* The permutation absorbs the layout change, so charge only
	     TO_NODE_I's internal cost plus the partition's output cost.  */
	  slpg_layout_cost cost = to_costs.out_cost;
	  cost.add_serial_cost ({ to_vertex.weight * factor,
				  m_optimize_size });
	  cost.split (to_partition.in_degree);
	  return cost;
	}
    }

  /* Compute the cost if we insert any necessary layout change on edge UD.  */
  auto edge_cost = edge_layout_cost (ud, to_node_i,
				     to_partition.layout, from_layout_i);
  if (edge_cost.is_possible ())
    {
      slpg_layout_cost cost = to_costs.out_cost;
      cost.add_serial_cost (to_costs.internal_cost);
      cost.split (to_partition.in_degree);
      cost.add_serial_cost (edge_cost);
      return cost;
    }

  return slpg_layout_cost::impossible ();
}
7470 :
/* Make a forward pass through the partitions, accumulating input costs.
   Make a tentative (provisional) choice of layout for each partition,
   ensuring that this choice still allows later partitions to keep
   their original layout.  */

void
vect_optimize_slp_pass::forward_pass ()
{
  for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
       ++partition_i)
    {
      auto &partition = m_partitions[partition_i];

      /* If the partition consists of a single VEC_PERM_EXPR, precompute
	 the incoming cost that would apply if every predecessor partition
	 keeps its current layout.  This is used within the loop below.  */
      slpg_layout_cost in_cost;
      slp_tree single_node = nullptr;
      if (partition.node_end == partition.node_begin + 1)
	{
	  unsigned int node_i = m_partitioned_nodes[partition.node_begin];
	  single_node = m_vertices[node_i].node;
	  if (SLP_TREE_PERMUTE_P (single_node))
	    in_cost = total_in_cost (node_i);
	}

      /* Go through the possible layouts.  Decide which ones are valid
	 for this partition and record which of the valid layouts has
	 the lowest cost.  */
      unsigned int min_layout_i = 0;
      slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
      for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
	{
	  auto &layout_costs = partition_layout_costs (partition_i, layout_i);
	  if (!layout_costs.is_possible ())
	    continue;

	  /* If the recorded layout is already 0 then the layout cannot
	     change.  */
	  if (partition.layout == 0 && layout_i != 0)
	    {
	      layout_costs.mark_impossible ();
	      continue;
	    }

	  bool is_possible = true;
	  for (unsigned int order_i = partition.node_begin;
	       order_i < partition.node_end; ++order_i)
	    {
	      unsigned int node_i = m_partitioned_nodes[order_i];
	      auto &vertex = m_vertices[node_i];

	      /* Reject the layout if it is individually incompatible
		 with any node in the partition.  */
	      if (!is_compatible_layout (vertex.node, layout_i))
		{
		  is_possible = false;
		  break;
		}

	      auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
		{
		  auto &other_vertex = m_vertices[other_node_i];
		  if (other_vertex.partition < vertex.partition)
		    {
		      /* Accumulate the incoming costs from earlier
			 partitions, plus the cost of any layout changes
			 on UD itself.  */
		      auto cost = forward_cost (ud, other_node_i, layout_i);
		      if (!cost.is_possible ())
			is_possible = false;
		      else
			layout_costs.in_cost.add_parallel_cost (cost);
		    }
		  else
		    /* Reject the layout if it would make layout 0 impossible
		       for later partitions.  This amounts to testing that the
		       target supports reversing the layout change on edges
		       to later partitions.

		       In principle, it might be possible to push a layout
		       change all the way down a graph, so that it never
		       needs to be reversed and so that the target doesn't
		       need to support the reverse operation.  But it would
		       be awkward to bail out if we hit a partition that
		       does not support the new layout, especially since
		       we are not dealing with a lattice.  */
		    is_possible &= edge_layout_cost (ud, other_node_i, 0,
						     layout_i).is_possible ();
		};
	      for_each_partition_edge (node_i, add_cost);

	      /* Accumulate the cost of using LAYOUT_I within NODE,
		 both for the inputs and the outputs.  */
	      int factor = internal_node_cost (vertex.node, layout_i,
					       layout_i);
	      if (factor < 0)
		{
		  is_possible = false;
		  break;
		}
	      else if (factor)
		layout_costs.internal_cost.add_serial_cost
		  ({ vertex.weight * factor, m_optimize_size });
	    }
	  if (!is_possible)
	    {
	      layout_costs.mark_impossible ();
	      continue;
	    }

	  /* Combine the incoming and partition-internal costs.  */
	  slpg_layout_cost combined_cost = layout_costs.in_cost;
	  combined_cost.add_serial_cost (layout_costs.internal_cost);

	  /* If this partition consists of a single VEC_PERM_EXPR, see
	     if the VEC_PERM_EXPR can be changed to support output layout
	     LAYOUT_I while keeping all the provisional choices of input
	     layout.  */
	  if (single_node && SLP_TREE_PERMUTE_P (single_node))
	    {
	      int factor = internal_node_cost (single_node, -1, layout_i);
	      if (factor >= 0)
		{
		  auto weight = m_vertices[single_node->vertex].weight;
		  slpg_layout_cost internal_cost
		    = { weight * factor, m_optimize_size };

		  slpg_layout_cost alt_cost = in_cost;
		  alt_cost.add_serial_cost (internal_cost);
		  if (alt_cost.is_better_than (combined_cost, m_optimize_size))
		    {
		      /* Absorbing the input layouts into the permutation
			 is cheaper; record that alternative instead.  */
		      combined_cost = alt_cost;
		      layout_costs.in_cost = in_cost;
		      layout_costs.internal_cost = internal_cost;
		    }
		}
	    }

	  /* Record the layout with the lowest cost.  Prefer layout 0 in
	     the event of a tie between it and another layout.  */
	  if (!min_layout_cost.is_possible ()
	      || combined_cost.is_better_than (min_layout_cost,
					       m_optimize_size))
	    {
	      min_layout_i = layout_i;
	      min_layout_cost = combined_cost;
	    }
	}

      /* This loop's handling of earlier partitions should ensure that
	 choosing the original layout for the current partition is no
	 less valid than it was in the original graph, even with the
	 provisional layout choices for those earlier partitions.  */
      gcc_assert (min_layout_cost.is_possible ());
      partition.layout = min_layout_i;
    }
}
7629 :
/* Make a backward pass through the partitions, accumulating output costs.
   Make a final choice of layout for each partition.  */

void
vect_optimize_slp_pass::backward_pass ()
{
  /* Visit partitions in reverse order so that all successor partitions
     already have their final layout when we cost a partition.  */
  for (unsigned int partition_i = m_partitions.length (); partition_i-- > 0;)
    {
      auto &partition = m_partitions[partition_i];

      unsigned int min_layout_i = 0;
      slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
      for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
	{
	  auto &layout_costs = partition_layout_costs (partition_i, layout_i);
	  if (!layout_costs.is_possible ())
	    continue;

	  /* Accumulate the costs from successor partitions.  */
	  bool is_possible = true;
	  for (unsigned int order_i = partition.node_begin;
	       order_i < partition.node_end; ++order_i)
	    {
	      unsigned int node_i = m_partitioned_nodes[order_i];
	      auto &vertex = m_vertices[node_i];
	      auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
		{
		  auto &other_vertex = m_vertices[other_node_i];
		  auto &other_partition = m_partitions[other_vertex.partition];
		  if (other_vertex.partition > vertex.partition)
		    {
		      /* Accumulate the incoming costs from later
			 partitions, plus the cost of any layout changes
			 on UD itself.  */
		      auto cost = backward_cost (ud, other_node_i, layout_i);
		      if (!cost.is_possible ())
			is_possible = false;
		      else
			layout_costs.out_cost.add_parallel_cost (cost);
		    }
		  else
		    /* Make sure that earlier partitions can (if necessary
		       or beneficial) keep the layout that they chose in
		       the forward pass.  This ensures that there is at
		       least one valid choice of layout.  */
		    is_possible &= edge_layout_cost (ud, other_node_i,
						     other_partition.layout,
						     layout_i).is_possible ();
		};
	      for_each_partition_edge (node_i, add_cost);
	    }
	  if (!is_possible)
	    {
	      layout_costs.mark_impossible ();
	      continue;
	    }

	  /* Locally combine the costs from the forward and backward passes.
	     (This combined cost is not passed on, since that would lead
	     to double counting.)  */
	  slpg_layout_cost combined_cost = layout_costs.in_cost;
	  combined_cost.add_serial_cost (layout_costs.internal_cost);
	  combined_cost.add_serial_cost (layout_costs.out_cost);

	  /* Record the layout with the lowest cost.  Prefer layout 0 in
	     the event of a tie between it and another layout.  */
	  if (!min_layout_cost.is_possible ()
	      || combined_cost.is_better_than (min_layout_cost,
					       m_optimize_size))
	    {
	      min_layout_i = layout_i;
	      min_layout_cost = combined_cost;
	    }
	}

      /* The compatibility checks above keep the forward-pass choice
	 viable, so at least one layout must remain possible.  */
      gcc_assert (min_layout_cost.is_possible ());
      partition.layout = min_layout_i;
    }
}
7709 :
/* Return a node that applies layout TO_LAYOUT_I to the original form of NODE.
   NODE already has the layout that was selected for its partition.  */

slp_tree
vect_optimize_slp_pass::get_result_with_layout (slp_tree node,
						unsigned int to_layout_i)
{
  /* Memoize per (node, layout) pair so that multiple users of the same
     laid-out value share one result node.  */
  unsigned int result_i = node->vertex * m_perms.length () + to_layout_i;
  slp_tree result = m_node_layouts[result_i];
  if (result)
    return result;

  if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
      || (SLP_TREE_DEF_TYPE (node) == vect_external_def
	  /* We can't permute vector defs in place.  */
	  && SLP_TREE_VEC_DEFS (node).is_empty ()))
    {
      /* If the vector is uniform or unchanged, there's nothing to do.  */
      if (to_layout_i == 0 || vect_slp_tree_uniform_p (node))
	result = node;
      else
	{
	  /* Constant/external scalars can simply be reordered up front;
	     no runtime permutation is needed.  */
	  auto scalar_ops = SLP_TREE_SCALAR_OPS (node).copy ();
	  result = vect_create_new_slp_node (scalar_ops);
	  vect_slp_permute (m_perms[to_layout_i], scalar_ops, true);
	}
    }
  else
    {
      unsigned int partition_i = m_vertices[node->vertex].partition;
      unsigned int from_layout_i = m_partitions[partition_i].layout;
      if (from_layout_i == to_layout_i)
	return node;

      /* If NODE is itself a VEC_PERM_EXPR, try to create a parallel
	 permutation instead of a serial one.  Leave the new permutation
	 in TMP_PERM on success.  */
      auto_lane_permutation_t tmp_perm;
      unsigned int num_inputs = 1;
      if (SLP_TREE_PERMUTE_P (node))
	{
	  tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
	  if (from_layout_i != 0)
	    vect_slp_permute (m_perms[from_layout_i], tmp_perm, false);
	  if (to_layout_i != 0)
	    vect_slp_permute (m_perms[to_layout_i], tmp_perm, true);
	  /* Check that the target can handle the merged permutation;
	     a NULL gsi means "test only, don't emit".  */
	  if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
					      tmp_perm,
					      SLP_TREE_CHILDREN (node),
					      false) >= 0)
	    num_inputs = SLP_TREE_CHILDREN (node).length ();
	  else
	    tmp_perm.truncate (0);
	}

      if (dump_enabled_p ())
	{
	  if (tmp_perm.length () > 0)
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "duplicating permutation node %p with"
			     " layout %d\n",
			     (void *) node, to_layout_i);
	  else
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "inserting permutation node in place of %p\n",
			     (void *) node);
	}

      unsigned int num_lanes = SLP_TREE_LANES (node);
      result = vect_create_new_slp_node (num_inputs, VEC_PERM_EXPR);
      if (SLP_TREE_SCALAR_STMTS (node).length ())
	{
	  /* Give the new node scalar stmts in the target layout, derived
	     from NODE's stmts (which are currently in FROM_LAYOUT_I).  */
	  auto &stmts = SLP_TREE_SCALAR_STMTS (result);
	  stmts.safe_splice (SLP_TREE_SCALAR_STMTS (node));
	  if (from_layout_i != 0)
	    vect_slp_permute (m_perms[from_layout_i], stmts, false);
	  if (to_layout_i != 0)
	    vect_slp_permute (m_perms[to_layout_i], stmts, true);
	}
      SLP_TREE_REPRESENTATIVE (result) = SLP_TREE_REPRESENTATIVE (node);
      SLP_TREE_LANES (result) = num_lanes;
      SLP_TREE_VECTYPE (result) = SLP_TREE_VECTYPE (node);
      result->vertex = -1;

      auto &lane_perm = SLP_TREE_LANE_PERMUTATION (result);
      if (tmp_perm.length ())
	{
	  /* Parallel case: RESULT replaces NODE's permutation entirely
	     and reuses NODE's children directly.  */
	  lane_perm.safe_splice (tmp_perm);
	  SLP_TREE_CHILDREN (result).safe_splice (SLP_TREE_CHILDREN (node));
	}
      else
	{
	  /* Serial case: RESULT is a new permutation applied on top of
	     NODE's output.  */
	  lane_perm.create (num_lanes);
	  for (unsigned j = 0; j < num_lanes; ++j)
	    lane_perm.quick_push ({ 0, j });
	  if (from_layout_i != 0)
	    vect_slp_permute (m_perms[from_layout_i], lane_perm, false);
	  if (to_layout_i != 0)
	    vect_slp_permute (m_perms[to_layout_i], lane_perm, true);
	  SLP_TREE_CHILDREN (result).safe_push (node);
	}
      for (slp_tree child : SLP_TREE_CHILDREN (result))
	child->refcnt++;
    }
  m_node_layouts[result_i] = result;
  return result;
}
7817 :
/* Apply the chosen vector layouts to the SLP graph.  */

void
vect_optimize_slp_pass::materialize ()
{
  /* We no longer need the costs, so avoid having two O(N * P) arrays
     live at the same time.  */
  m_partition_layout_costs.release ();
  m_node_layouts.safe_grow_cleared (m_vertices.length () * m_perms.length ());

  /* FULLY_FOLDED records VEC_PERM_EXPR nodes that absorbed their input
     layouts in the first walk; they must not be revisited below.  */
  auto_sbitmap fully_folded (m_vertices.length ());
  bitmap_clear (fully_folded);
  for (unsigned int node_i : m_partitioned_nodes)
    {
      auto &vertex = m_vertices[node_i];
      slp_tree node = vertex.node;
      int layout_i = m_partitions[vertex.partition].layout;
      gcc_assert (layout_i >= 0);

      /* Rearrange the scalar statements to match the chosen layout.  */
      if (layout_i > 0)
	vect_slp_permute (m_perms[layout_i],
			  SLP_TREE_SCALAR_STMTS (node), true);

      /* Update load and lane permutations.  */
      if (SLP_TREE_PERMUTE_P (node))
	{
	  /* First try to absorb the input vector layouts.  If that fails,
	     force the inputs to have layout LAYOUT_I too.  We checked that
	     that was possible before deciding to use nonzero output layouts.
	     (Note that at this stage we don't really have any guarantee that
	     the target supports the original VEC_PERM_EXPR.)  */
	  auto &perm = SLP_TREE_LANE_PERMUTATION (node);
	  auto_lane_permutation_t tmp_perm;
	  tmp_perm.safe_splice (perm);
	  change_vec_perm_layout (node, tmp_perm, -1, layout_i);
	  if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
					      tmp_perm,
					      SLP_TREE_CHILDREN (node),
					      false) >= 0)
	    {
	      if (dump_enabled_p ()
		  && !std::equal (tmp_perm.begin (), tmp_perm.end (),
				  perm.begin ()))
		dump_printf_loc (MSG_NOTE, vect_location,
				 "absorbing input layouts into %p\n",
				 (void *) node);
	      std::copy (tmp_perm.begin (), tmp_perm.end (), perm.begin ());
	      bitmap_set_bit (fully_folded, node_i);
	    }
	  else
	    {
	      /* Not MSG_MISSED because it would make no sense to users.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "failed to absorb input layouts into %p\n",
				 (void *) node);
	      change_vec_perm_layout (nullptr, perm, layout_i, layout_i);
	    }
	}
      else
	{
	  gcc_assert (!SLP_TREE_LANE_PERMUTATION (node).exists ());
	  auto &load_perm = SLP_TREE_LOAD_PERMUTATION (node);
	  if (layout_i > 0)
	    /* ???  When we handle non-bijective permutes the idea
	       is that we can force the load-permutation to be
	       { min, min + 1, min + 2, ... max }.  But then the
	       scalar defs might no longer match the lane content
	       which means wrong-code with live lane vectorization.
	       So we possibly have to have NULL entries for those.  */
	    vect_slp_permute (m_perms[layout_i], load_perm, true);
	}
    }

  /* Do this before any nodes disappear, since it involves a walk
     over the leaves.  */
  remove_redundant_permutations ();

  /* Replace each child with a correctly laid-out version.  */
  for (unsigned int node_i : m_partitioned_nodes)
    {
      /* Skip nodes that have already been handled above.  */
      if (bitmap_bit_p (fully_folded, node_i))
	continue;

      auto &vertex = m_vertices[node_i];
      int in_layout_i = m_partitions[vertex.partition].layout;
      gcc_assert (in_layout_i >= 0);

      unsigned j;
      slp_tree child;
      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (vertex.node), j, child)
	{
	  if (!child)
	    continue;

	  slp_tree new_child = get_result_with_layout (child, in_layout_i);
	  if (new_child != child)
	    {
	      /* Transfer the reference from CHILD to NEW_CHILD.  */
	      vect_free_slp_tree (child);
	      SLP_TREE_CHILDREN (vertex.node)[j] = new_child;
	      new_child->refcnt += 1;
	    }
	}
    }
}
7925 :
/* Elide load permutations that are not necessary.  Such permutations might
   be pre-existing, rather than created by the layout optimizations.  */

void
vect_optimize_slp_pass::remove_redundant_permutations ()
{
  for (unsigned int node_i : m_leafs)
    {
      slp_tree node = m_vertices[node_i].node;
      if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
	continue;

      /* In basic block vectorization we allow any subchain of an interleaving
	 chain.
	 FORNOW: not in loop SLP because of realignment complications.  */
      if (is_a <bb_vec_info> (m_vinfo))
	{
	  /* Check whether the loads form a contiguous subchain of their
	     interleaving group: each stmt must be the group successor of
	     the previous one, with no gap in between.  */
	  bool subchain_p = true;
	  stmt_vec_info next_load_info = NULL;
	  stmt_vec_info load_info;
	  unsigned j;
	  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
	    {
	      if (j != 0
		  && (next_load_info != load_info
		      || ! load_info
		      || DR_GROUP_GAP (load_info) != 1))
		{
		  subchain_p = false;
		  break;
		}
	      next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
	    }
	  if (subchain_p)
	    {
	      SLP_TREE_LOAD_PERMUTATION (node).release ();
	      continue;
	    }
	}
      else
	{
	  loop_vec_info loop_vinfo = as_a<loop_vec_info> (m_vinfo);
	  bool this_load_permuted = !vect_load_perm_consecutive_p (node, 0);
	  /* When this isn't a grouped access we know it's single element
	     and contiguous.  */
	  if (!STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (node)[0]))
	    {
	      if (!this_load_permuted
		  && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
		      || SLP_TREE_LANES (node) == 1))
		SLP_TREE_LOAD_PERMUTATION (node).release ();
	      continue;
	    }
	  stmt_vec_info first_stmt_info
	    = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
	  if (!this_load_permuted
	      /* The load requires permutation when unrolling exposes
		 a gap either because the group is larger than the SLP
		 group-size or because there is a gap between the groups.  */
	      && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
		  || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
		      && DR_GROUP_GAP (first_stmt_info) == 0)))
	    {
	      SLP_TREE_LOAD_PERMUTATION (node).release ();
	      continue;
	    }
	}
    }
}
7995 :
/* Print the partition graph and layout information to the dump file.  */

void
vect_optimize_slp_pass::dump ()
{
  dump_printf_loc (MSG_NOTE, vect_location,
		   "SLP optimize permutations:\n");
  /* Layout 0 is the identity and is not printed.  */
  for (unsigned int layout_i = 1; layout_i < m_perms.length (); ++layout_i)
    {
      dump_printf_loc (MSG_NOTE, vect_location, "  %d: { ", layout_i);
      const char *sep = "";
      for (unsigned int idx : m_perms[layout_i])
	{
	  dump_printf (MSG_NOTE, "%s%d", sep, idx);
	  sep = ", ";
	}
      dump_printf (MSG_NOTE, " }\n");
    }
  dump_printf_loc (MSG_NOTE, vect_location,
		   "SLP optimize partitions:\n");
  for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
       ++partition_i)
    {
      auto &partition = m_partitions[partition_i];
      dump_printf_loc (MSG_NOTE, vect_location,  "  -------------\n");
      dump_printf_loc (MSG_NOTE, vect_location,
		       "  partition %d (layout %d):\n",
		       partition_i, partition.layout);
      dump_printf_loc (MSG_NOTE, vect_location, "    nodes:\n");
      for (unsigned int order_i = partition.node_begin;
	   order_i < partition.node_end; ++order_i)
	{
	  auto &vertex = m_vertices[m_partitioned_nodes[order_i]];
	  dump_printf_loc (MSG_NOTE, vect_location, "      - %p:\n",
			   (void *) vertex.node);
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "          weight: %f\n",
			   vertex.weight.to_double ());
	  if (vertex.out_degree)
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "          out weight: %f (degree %d)\n",
			     vertex.out_weight.to_double (),
			     vertex.out_degree);
	  if (SLP_TREE_PERMUTE_P (vertex.node))
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "          op: VEC_PERM_EXPR\n");
	  else if (auto rep = SLP_TREE_REPRESENTATIVE (vertex.node))
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "          op template: %G", rep->stmt);
	}
      dump_printf_loc (MSG_NOTE, vect_location, "    edges:\n");
      for (unsigned int order_i = partition.node_begin;
	   order_i < partition.node_end; ++order_i)
	{
	  unsigned int node_i = m_partitioned_nodes[order_i];
	  auto &vertex = m_vertices[node_i];
	  /* Print each edge once, oriented from the earlier partition
	     to the later one.  */
	  auto print_edge = [&](graph_edge *, unsigned int other_node_i)
	    {
	      auto &other_vertex = m_vertices[other_node_i];
	      if (other_vertex.partition < vertex.partition)
		dump_printf_loc (MSG_NOTE, vect_location,
				 "      - %p [%d] --> %p\n",
				 (void *) other_vertex.node,
				 other_vertex.partition,
				 (void *) vertex.node);
	      else
		dump_printf_loc (MSG_NOTE, vect_location,
				 "      - %p --> [%d] %p\n",
				 (void *) vertex.node,
				 other_vertex.partition,
				 (void *) other_vertex.node);
	    };
	  for_each_partition_edge (node_i, print_edge);
	}

      for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
	{
	  auto &layout_costs = partition_layout_costs (partition_i, layout_i);
	  if (layout_costs.is_possible ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "    layout %d:%s\n", layout_i,
			       partition.layout == int (layout_i)
			       ? " (*)" : "");
	      slpg_layout_cost combined_cost = layout_costs.in_cost;
	      combined_cost.add_serial_cost (layout_costs.internal_cost);
	      combined_cost.add_serial_cost (layout_costs.out_cost);
#define TEMPLATE "{depth: %f, total: %f}"
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "        " TEMPLATE "\n",
			       layout_costs.in_cost.depth.to_double (),
			       layout_costs.in_cost.total.to_double ());
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "      + " TEMPLATE "\n",
			       layout_costs.internal_cost.depth.to_double (),
			       layout_costs.internal_cost.total.to_double ());
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "      + " TEMPLATE "\n",
			       layout_costs.out_cost.depth.to_double (),
			       layout_costs.out_cost.total.to_double ());
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "      = " TEMPLATE "\n",
			       combined_cost.depth.to_double (),
			       combined_cost.total.to_double ());
#undef TEMPLATE
	    }
	  else
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "    layout %d: rejected\n", layout_i);
	}
    }
}
8108 :
/* Masked load lanes discovery.  */

void
vect_optimize_slp_pass::decide_masked_load_lanes ()
{
  for (auto v : m_vertices)
    {
      slp_tree node = v.node;
      if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
	  || SLP_TREE_PERMUTE_P (node))
	continue;
      stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
      /* Only grouped IFN_MASK_LOAD calls are candidates.  */
      if (! STMT_VINFO_GROUPED_ACCESS (stmt_info)
	  /* The mask has to be uniform.  */
	  || STMT_VINFO_SLP_VECT_ONLY (stmt_info)
	  || ! is_a <gcall *> (STMT_VINFO_STMT (stmt_info))
	  || ! gimple_call_internal_p (STMT_VINFO_STMT (stmt_info),
				       IFN_MASK_LOAD))
	continue;
      stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
      /* The group must be contiguous with a positive step and the target
	 must support masked load-lanes for this group size.  */
      if (STMT_VINFO_STRIDED_P (stmt_info)
	  || compare_step_with_zero (m_vinfo, stmt_info) <= 0
	  || vect_load_lanes_supported (SLP_TREE_VECTYPE (node),
					DR_GROUP_SIZE (stmt_info),
					true) == IFN_LAST)
	continue;

      /* Uniform masks need to be suitably represented.  */
      slp_tree mask = SLP_TREE_CHILDREN (node)[0];
      if (!SLP_TREE_PERMUTE_P (mask)
	  || SLP_TREE_CHILDREN (mask).length () != 1)
	continue;
      bool match = true;
      /* A splat permutation selects lane 0 of operand 0 everywhere.  */
      for (auto perm : SLP_TREE_LANE_PERMUTATION (mask))
	if (perm.first != 0 || perm.second != 0)
	  {
	    match = false;
	    break;
	  }
      if (!match)
	continue;

      /* Now see if the consumer side matches.  */
      for (graph_edge *pred = m_slpg->vertices[node->vertex].pred;
	   pred; pred = pred->pred_next)
	{
	  slp_tree pred_node = m_vertices[pred->src].node;
	  /* All consumers should be a permute with a single outgoing lane.  */
	  if (!SLP_TREE_PERMUTE_P (pred_node)
	      || SLP_TREE_LANES (pred_node) != 1)
	    {
	      match = false;
	      break;
	    }
	  gcc_assert (SLP_TREE_CHILDREN (pred_node).length () == 1);
	}
      if (!match)
	continue;
      /* Now we can mark the nodes as to use load lanes.  */
      node->ldst_lanes = true;
      for (graph_edge *pred = m_slpg->vertices[node->vertex].pred;
	   pred; pred = pred->pred_next)
	m_vertices[pred->src].node->ldst_lanes = true;
      /* The catch is we have to massage the mask.  We have arranged
	 analyzed uniform masks to be represented by a splat VEC_PERM
	 which we can now simply elide as we cannot easily re-do SLP
	 discovery here.  */
      slp_tree new_mask = SLP_TREE_CHILDREN (mask)[0];
      SLP_TREE_REF_COUNT (new_mask)++;
      SLP_TREE_CHILDREN (node)[0] = new_mask;
      vect_free_slp_tree (mask);
    }
}
8182 :
/* Perform legitimizing attempts.  This is intended to improve the
   situation when layout 0 is not valid which is a situation the cost
   based propagation does not handle well.
   Return true if further layout optimization is possible, false if
   the layout configuration should be considered final.  */

bool
vect_optimize_slp_pass::legitimize ()
{
  /* Perform a very simple legitimizing attempt by attempting to choose
     a single layout for all partitions that will make all permutations
     a noop.  That should also be the optimal layout choice in case
     layout zero is legitimate.
     ???  Disconnected components of the SLP graph could have distinct
     single layouts.  */
  int single_layout_i = -1;
  /* Partitions seen while SINGLE_LAYOUT_I was still undecided (-1);
     their compatibility is re-checked in the second loop below.  */
  unsigned deferred_up_to = -1U;
  for (unsigned partition_i = 0; partition_i < m_partitions.length ();
       ++partition_i)
    {
      auto &partition = m_partitions[partition_i];
      if (single_layout_i == -1)
	{
	  single_layout_i = partition.layout;
	  deferred_up_to = partition_i;
	}
      else if (partition.layout == single_layout_i || partition.layout == -1)
	;
      else
	/* Conflicting layout preferences; fall back to layout 0.  */
	single_layout_i = 0;
      /* Layout 0 means the cost-based passes should run.  */
      if (single_layout_i == 0)
	return true;

      if (single_layout_i != -1
	  && !is_compatible_layout (partition, single_layout_i))
	return true;
    }

  if (single_layout_i <= 0)
    return true;

  /* Re-check the partitions that were scanned before the single layout
     was known.  */
  for (unsigned partition_i = 0; partition_i < deferred_up_to; ++partition_i)
    if (!is_compatible_layout (m_partitions[partition_i],
			       single_layout_i))
      return true;

  /* Commit the single layout everywhere and skip the cost passes.  */
  for (unsigned partition_i = 0; partition_i < m_partitions.length ();
       ++partition_i)
    {
      auto &partition = m_partitions[partition_i];
      partition.layout = single_layout_i;
    }

  return false;
}
8238 :
/* Main entry point for the SLP graph optimization pass.  */

void
vect_optimize_slp_pass::run ()
{
  build_graph ();
  create_partitions ();
  start_choosing_layouts ();
  /* m_perms always contains the identity layout; more than one entry
     means there are alternative layouts worth costing.  */
  if (m_perms.length () > 1)
    {
      /* legitimize either fixes a single layout for all partitions
	 (returning false) or requests the cost-based passes.  */
      if (legitimize ())
	{
	  forward_pass ();
	  backward_pass ();
	}
      if (dump_enabled_p ())
	dump ();
      materialize ();
      while (!m_perms.is_empty ())
	m_perms.pop ().release ();
    }
  else
    remove_redundant_permutations ();
  free_graph (m_slpg);
  /* materialize can have rewritten the graph; rebuild it for the
     masked load-lanes analysis.  */
  build_graph ();
  decide_masked_load_lanes ();
  free_graph (m_slpg);
}
8267 :
/* Apply CSE to NODE and its children using BST_MAP, which maps sets of
   scalar stmts to their canonical (leader) SLP node.  NODE is passed by
   reference so that when a leader is found the parent's child slot is
   redirected to it in place.  */

static void
vect_cse_slp_nodes (scalar_stmts_to_slp_tree_map_t *bst_map, slp_tree& node)
{
  bool put_p = false;
  if (SLP_TREE_DEF_TYPE (node) == vect_internal_def
      /* Besides some VEC_PERM_EXPR, two-operator nodes also
	 lack scalar stmts and thus CSE doesn't work via bst_map.  Ideally
	 we'd have sth that works for all internal and external nodes.  */
      && !SLP_TREE_SCALAR_STMTS (node).is_empty ())
    {
      slp_tree *leader = bst_map->get (SLP_TREE_SCALAR_STMTS (node));
      if (leader)
	{
	  /* We've visited this node already.  A null slot means NODE is
	     currently on the recursion stack (see the cycle-avoidance
	     put below), so there is nothing to replace.  */
	  if (!*leader || *leader == node)
	    return;

	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "re-using SLP tree %p for %p\n",
			     (void *)*leader, (void *)node);
	  /* Replace NODE by the leader: drop our reference and take
	     one on the leader instead.  */
	  vect_free_slp_tree (node);
	  (*leader)->refcnt += 1;
	  node = *leader;
	  return;
	}

      /* Avoid creating a cycle by populating the map only after recursion.  */
      bst_map->put (SLP_TREE_SCALAR_STMTS (node).copy (), nullptr);
      node->refcnt += 1;
      put_p = true;
      /* And recurse.  */
    }

  for (slp_tree &child : SLP_TREE_CHILDREN (node))
    if (child)
      vect_cse_slp_nodes (bst_map, child);

  /* Now record the node for CSE in other siblings.  */
  if (put_p)
    *bst_map->get (SLP_TREE_SCALAR_STMTS (node)) = node;
}
8312 :
/* Optimize the SLP graph of VINFO: run the layout optimization pass and
   then re-apply CSE, since permute materialization can create nodes with
   identical scalar stmt sets.  */

void
vect_optimize_slp (vec_info *vinfo)
{
  if (vinfo->slp_instances.is_empty ())
    return;
  /* The pass object owns all intermediate state; its destructor runs
     at the end of the full expression.  */
  vect_optimize_slp_pass (vinfo).run ();

  /* Apply CSE again to nodes after permute optimization.  */
  scalar_stmts_to_slp_tree_map_t *bst_map
    = new scalar_stmts_to_slp_tree_map_t ();

  for (auto inst : vinfo->slp_instances)
    vect_cse_slp_nodes (bst_map, SLP_INSTANCE_TREE (inst));

  /* Releases the extra references taken by vect_cse_slp_nodes as well
     as the map itself.  */
  release_scalar_stmts_to_slp_tree_map (bst_map);
}
8331 :
8332 : /* Gather loads reachable from the individual SLP graph entries. */
8333 :
8334 : void
8335 967687 : vect_gather_slp_loads (vec_info *vinfo)
8336 : {
8337 967687 : unsigned i;
8338 967687 : slp_instance instance;
8339 2345973 : FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
8340 : {
8341 1378286 : hash_set<slp_tree> visited;
8342 1378286 : vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
8343 : SLP_INSTANCE_TREE (instance), visited);
8344 1378286 : }
8345 967687 : }
8346 :
/* For NODE update VF based on the number of lanes and the vector types
   used.  VF is updated in place to the least common multiple of the
   current value and each node's required unrolling factor.  VISITED
   guards against revisiting shared sub-trees.  */

static void
vect_update_slp_vf_for_node (slp_tree node, poly_uint64 &vf,
			     hash_set<slp_tree> &visited)
{
  /* Only internal nodes carry max_nunits; see the comment below.  */
  if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;
  if (visited.add (node))
    return;

  for (slp_tree child : SLP_TREE_CHILDREN (node))
    vect_update_slp_vf_for_node (child, vf, visited);

  /* We do not visit SLP nodes for constants or externals - those neither
     have a vector type set yet (vectorizable_* does this) nor do they
     have max_nunits set.  Instead we rely on internal nodes max_nunit
     to cover constant/external operands.
     Note that when we stop using fixed size vectors externs and constants
     shouldn't influence the (minimum) vectorization factor, instead
     vectorizable_* should honor the vectorization factor when trying to
     assign vector types to constants and externals and cause iteration
     to a higher vectorization factor when required.  */
  poly_uint64 node_vf
    = calculate_unrolling_factor (node->max_nunits, SLP_TREE_LANES (node));
  vf = force_common_multiple (vf, node_vf);

  /* For permute nodes that are fed from externs or constants we have to
     consider their number of lanes as well.  Likewise for store-lanes.  */
  if (SLP_TREE_PERMUTE_P (node) || node->ldst_lanes)
    for (slp_tree child : SLP_TREE_CHILDREN (node))
      if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
	{
	  /* Note this uses the parent's max_nunits with the child's
	     lane count since the child has no max_nunits of its own.  */
	  poly_uint64 child_vf
	    = calculate_unrolling_factor (node->max_nunits,
					  SLP_TREE_LANES (child));
	  vf = force_common_multiple (vf, child_vf);
	}
}
8387 :
/* For each possible SLP instance decide whether to SLP it and calculate overall
   unrolling factor needed to SLP the loop.  Return TRUE if decided to SLP at
   least one instance.  Also sets LOOP_VINFO_VECT_FACTOR as a side effect.  */

bool
vect_make_slp_decision (loop_vec_info loop_vinfo)
{
  unsigned int i;
  poly_uint64 unrolling_factor = 1;
  const vec<slp_instance> &slp_instances
    = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
  slp_instance instance;
  int decided_to_slp = 0;

  DUMP_VECT_SCOPE ("vect_make_slp_decision");

  /* Shared across instances so common sub-trees are accounted once.  */
  hash_set<slp_tree> visited;
  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      slp_tree root = SLP_INSTANCE_TREE (instance);

      /* All unroll factors have the form:

	   GET_MODE_SIZE (vinfo->vector_mode) * X

	 for some rational X, so they must have a common multiple.  */
      vect_update_slp_vf_for_node (root, unrolling_factor, visited);

      /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts.  Later we
	 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
	 loop-based vectorization.  Such stmts will be marked as HYBRID.  */
      vect_mark_slp_stmts (loop_vinfo, root);

      /* If all instances ended up with vector(1) T roots make sure to
	 not vectorize.  RVV for example relies on loop vectorization
	 when some instances are essentially kept scalar.  See PR121048.  */
      if (SLP_TREE_VECTYPE (root)
	  && known_gt (TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (root)), 1U))
	decided_to_slp++;
    }

  LOOP_VINFO_VECT_FACTOR (loop_vinfo) = unrolling_factor;

  if (decided_to_slp && dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
		       "Decided to SLP %d instances. Unrolling factor ",
		       decided_to_slp);
      dump_dec (MSG_NOTE, unrolling_factor);
      dump_printf (MSG_NOTE, "\n");
    }

  return (decided_to_slp > 0);
}
8442 :
/* Initialize a bb_vec_info struct for the statements in BBS basic blocks.
   Marks every statement in the region with uid 0 and creates stmt_vec_info
   for the non-debug ones; a uid of -1 (reset in the destructor) means
   "outside the region".  */

_bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
  : vec_info (vec_info::bb, shared),
    roots (vNULL)
{
  /* The region we are operating on.  bbs[0] is the entry, excluding
     its PHI nodes.  In the future we might want to track an explicit
     entry edge to cover bbs[0] PHI nodes and have a region entry
     insert location.  */
  bbs = _bbs.address ();
  nbbs = _bbs.length ();

  for (unsigned i = 0; i < nbbs; ++i)
    {
      /* PHIs of the entry block are excluded from the region, see the
	 comment above.  */
      if (i != 0)
	for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
	     gsi_next (&si))
	  {
	    gphi *phi = si.phi ();
	    gimple_set_uid (phi, 0);
	    add_stmt (phi);
	  }
      for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
	   !gsi_end_p (gsi); gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);
	  gimple_set_uid (stmt, 0);
	  /* Debug stmts get the region uid but no stmt_vec_info.  */
	  if (is_gimple_debug (stmt))
	    continue;
	  add_stmt (stmt);
	}
    }
}
8477 :
8478 :
/* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
   stmts in the basic block.  Restores uid -1 on every statement to mark
   it as no longer belonging to a vectorization region.  */

_bb_vec_info::~_bb_vec_info ()
{
  /* Reset region marker.  */
  for (unsigned i = 0; i < nbbs; ++i)
    {
      /* Mirrors the constructor: entry-block PHIs were never marked.  */
      if (i != 0)
	for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
	     gsi_next (&si))
	  {
	    gphi *phi = si.phi ();
	    gimple_set_uid (phi, -1);
	  }
      for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
	   !gsi_end_p (gsi); gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);
	  gimple_set_uid (stmt, -1);
	}
    }

  /* Release the vectors owned by each recorded SLP instance root.  */
  for (unsigned i = 0; i < roots.length (); ++i)
    {
      roots[i].stmts.release ();
      roots[i].roots.release ();
      roots[i].remain.release ();
    }
  roots.release ();
}
8510 :
/* Subroutine of vect_slp_analyze_node_operations.  Handle the root of NODE,
   given then that child nodes have already been processed, and that
   their def types currently match their SLP node's def type.
   Return true if the node's operation is supported.  */

static bool
vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
				    slp_instance node_instance,
				    stmt_vector_for_cost *cost_vec)
{
  /* Handle purely internal nodes.  */
  if (SLP_TREE_PERMUTE_P (node))
    {
      /* NULL gsi means analysis only - no code is emitted.  */
      if (!vectorizable_slp_permutation (vinfo, NULL, node, cost_vec))
	return false;

      /* Permute nodes can carry live lanes; verify those can be
	 code-generated as well.  */
      stmt_vec_info slp_stmt_info;
      unsigned int i;
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
	{
	  if (slp_stmt_info
	      && STMT_VINFO_LIVE_P (slp_stmt_info)
	      && !vectorizable_live_operation (vinfo, slp_stmt_info, node,
					       node_instance, i,
					       false, cost_vec))
	    return false;
	}
      SLP_TREE_TYPE (node) = permute_info_type;
      return true;
    }

  /* Everything else is handled by the generic stmt analysis.  */
  return vect_analyze_stmt (vinfo, node, node_instance, cost_vec);
}
8543 :
/* qsort comparator sorting ints in ascending order.  Uses an explicit
   three-way comparison instead of "a - b", which invokes signed-integer
   overflow (undefined behavior) for operands of large opposite
   magnitude, e.g. INT_MAX vs. a negative value.  */

static int
sort_ints (const void *a_, const void *b_)
{
  int a = *(const int *)a_;
  int b = *(const int *)b_;
  /* Branch-free three-way compare: -1, 0 or 1.  */
  return (a > b) - (a < b);
}
8551 :
/* Verify if we can externalize a set of internal defs.  Returns true when
   the defining statements of STMTS can be totally ordered, which is what
   constant/external code generation requires.  */

static bool
vect_slp_can_convert_to_external (const vec<stmt_vec_info> &stmts)
{
  /* Constant generation uses get_later_stmt which can only handle
     defs from the same BB or a set of defs that can be ordered
     with a dominance query.  */
  basic_block bb = NULL;
  bool all_same = true;
  auto_vec<int> bbs;
  bbs.reserve_exact (stmts.length ());
  for (stmt_vec_info stmt : stmts)
    {
      /* A gap (NULL stmt) cannot be externalized.  */
      if (!stmt)
	return false;
      else if (!bb)
	bb = gimple_bb (stmt->stmt);
      else if (gimple_bb (stmt->stmt) != bb)
	all_same = false;
      bbs.quick_push (gimple_bb (stmt->stmt)->index);
    }
  /* All defs in one BB - trivially orderable.  */
  if (all_same)
    return true;

  /* Produce a vector of unique BB indexes for the defs.  */
  bbs.qsort (sort_ints);
  unsigned i, j;
  for (i = 1, j = 1; i < bbs.length (); ++i)
    if (bbs[i] != bbs[j-1])
      bbs[j++] = bbs[i];
  /* !all_same guarantees at least two distinct BBs.  */
  gcc_assert (j >= 2);
  bbs.truncate (j);

  /* For exactly two BBs dominance in either direction suffices.  */
  if (bbs.length () == 2)
    return (dominated_by_p (CDI_DOMINATORS,
			    BASIC_BLOCK_FOR_FN (cfun, bbs[0]),
			    BASIC_BLOCK_FOR_FN (cfun, bbs[1]))
	    || dominated_by_p (CDI_DOMINATORS,
			       BASIC_BLOCK_FOR_FN (cfun, bbs[1]),
			       BASIC_BLOCK_FOR_FN (cfun, bbs[0])));

  /* ??? For more than two BBs we can sort the vector and verify the
     result is a total order.  But we can't use vec::qsort with a
     compare function using a dominance query since there's no way to
     signal failure and any fallback for an unordered pair would
     fail qsort_chk later.
     For now simply hope that ordering after BB index provides the
     best candidate total order.  If required we can implement our
     own mergesort or export an entry without checking.  */
  for (unsigned i = 1; i < bbs.length (); ++i)
    if (!dominated_by_p (CDI_DOMINATORS,
			 BASIC_BLOCK_FOR_FN (cfun, bbs[i]),
			 BASIC_BLOCK_FOR_FN (cfun, bbs[i-1])))
      return false;

  return true;
}
8610 :
/* Try to build NODE from scalars, returning true on success.
   NODE_INSTANCE is the SLP instance that contains NODE.  On success NODE
   is turned into a vect_external_def node whose scalar ops are the lhs of
   the original scalar stmts.  */

static bool
vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
			      slp_instance node_instance)
{
  stmt_vec_info stmt_info;
  unsigned int i;

  /* Only BB vectorization externalizes; never externalize the instance
     root, pattern stmts (their scalar lhs is not computed), mask nodes
     or nodes whose defs cannot be ordered.  */
  if (!is_a <bb_vec_info> (vinfo)
      || node == SLP_INSTANCE_TREE (node_instance)
      || !SLP_TREE_SCALAR_STMTS (node).exists ()
      || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node))
      /* Force the mask use to be built from scalars instead.  */
      || VECTOR_BOOLEAN_TYPE_P (SLP_TREE_VECTYPE (node))
      || !vect_slp_can_convert_to_external (SLP_TREE_SCALAR_STMTS (node)))
    return false;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "Building vector operands of %p from scalars instead\n",
		     (void *) node);

  /* Don't remove and free the child nodes here, since they could be
     referenced by other structures.  The analysis and scheduling phases
     (need to) ignore child nodes of anything that isn't vect_internal_def.  */
  unsigned int group_size = SLP_TREE_LANES (node);
  SLP_TREE_DEF_TYPE (node) = vect_external_def;
  /* Invariants get their vector type from the uses.  */
  SLP_TREE_VECTYPE (node) = NULL_TREE;
  SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
  SLP_TREE_LOAD_PERMUTATION (node).release ();
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    {
      /* Use the original (non-pattern) stmt's lhs as the external op.  */
      tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
      SLP_TREE_SCALAR_OPS (node)[i] = lhs;
    }
  return true;
}
8651 :
/* Return true if all elements of the slice are the same.  A slice of
   length 0 or 1 is trivially uniform.  */
bool
vect_scalar_ops_slice::all_same_p () const
{
  /* Compare every element against the first.  */
  for (unsigned int i = 1; i < length; ++i)
    if (!operand_equal_p (op (0), op (i)))
      return false;
  return true;
}
8661 :
/* Hash a scalar-ops slice S by folding all element trees into one
   hash value; consistent with the element-wise equal () below.  */
hashval_t
vect_scalar_ops_slice_hash::hash (const value_type &s)
{
  hashval_t hash = 0;
  for (unsigned i = 0; i < s.length; ++i)
    hash = iterative_hash_expr (s.op (i), hash);
  return hash;
}
8670 :
8671 : bool
8672 214115 : vect_scalar_ops_slice_hash::equal (const value_type &s1,
8673 : const compare_type &s2)
8674 : {
8675 214115 : if (s1.length != s2.length)
8676 : return false;
8677 371814 : for (unsigned i = 0; i < s1.length; ++i)
8678 324860 : if (!operand_equal_p (s1.op (i), s2.op (i)))
8679 : return false;
8680 : return true;
8681 : }
8682 :
/* Compute the prologue cost for invariant or constant operands represented
   by NODE, recording the costs into COST_VEC.  */

static void
vect_prologue_cost_for_slp (vec_info *vinfo, slp_tree node,
			    stmt_vector_for_cost *cost_vec)
{
  /* There's a special case of an existing vector, that costs nothing.  */
  if (SLP_TREE_SCALAR_OPS (node).length () == 0
      && !SLP_TREE_VEC_DEFS (node).is_empty ())
    return;
  /* Without looking at the actual initializer a vector of
     constants can be implemented as load from the constant pool.
     When all elements are the same we can use a splat.  */
  tree vectype = SLP_TREE_VECTYPE (node);
  unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
  unsigned HOST_WIDE_INT const_nunits;
  unsigned nelt_limit;
  unsigned nvectors = vect_get_num_copies (vinfo, node);
  auto ops = &SLP_TREE_SCALAR_OPS (node);
  /* STARTS collects the slice offsets of the distinct vectors we
     actually have to cost.  */
  auto_vec<unsigned int> starts (nvectors);
  if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
      && ! multiple_p (const_nunits, group_size))
    {
      nelt_limit = const_nunits;
      hash_set<vect_scalar_ops_slice_hash> vector_ops;
      /* Deduplicate identical slices - each distinct vector is only
	 costed once.  */
      for (unsigned int i = 0; i < nvectors; ++i)
	if (!vector_ops.add ({ ops, i * nelt_limit, nelt_limit }))
	  starts.quick_push (i * nelt_limit);
    }
  else
    {
      /* If either the vector has variable length or the vectors
	 are composed of repeated whole groups we only need to
	 cost construction once.  All vectors will be the same.  */
      nelt_limit = group_size;
      starts.quick_push (0);
    }
  /* ??? We're just tracking whether vectors in a single node are the same.
     Ideally we'd do something more global.  */
  bool passed = false;
  for (unsigned int start : starts)
    {
      vect_cost_for_stmt kind;
      if (SLP_TREE_DEF_TYPE (node) == vect_constant_def)
	kind = vector_load;
      else if (vect_scalar_ops_slice { ops, start, nelt_limit }.all_same_p ())
	kind = scalar_to_vec;
      else
	kind = vec_construct;
      /* The target cost hook has no idea which part of the SLP node
	 we are costing so avoid passing it down more than once.  Pass
	 it to the first vec_construct or scalar_to_vec part since for those
	 the x86 backend tries to account for GPR to XMM register moves.  */
      record_stmt_cost (cost_vec, 1, kind, nullptr,
			(kind != vector_load && !passed) ? node : nullptr,
			vectype, 0, vect_prologue);
      if (kind != vector_load)
	passed = true;
    }
}
8744 :
/* Analyze statements contained in SLP tree NODE after recursively analyzing
   the subtree.  NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
   VISITED_SET/VISITED_VEC together implement a rollback-able visited cache:
   on failure all nodes visited by the failing sub-analysis are removed
   again, as are their recorded costs in COST_VEC.

   Return true if the operations are supported.  */

static bool
vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
				  slp_instance node_instance,
				  hash_set<slp_tree> &visited_set,
				  vec<slp_tree> &visited_vec,
				  stmt_vector_for_cost *cost_vec)
{
  int i, j;
  slp_tree child;

  /* Assume we can code-generate all invariants.  */
  if (!node
      || SLP_TREE_DEF_TYPE (node) == vect_constant_def
      || SLP_TREE_DEF_TYPE (node) == vect_external_def)
    return true;

  if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Failed cyclic SLP reference in %p\n", (void *) node);
      return false;
    }
  gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);

  /* If we already analyzed the exact same set of scalar stmts we're done.
     We share the generated vector stmts for those.  */
  if (visited_set.add (node))
    return true;
  visited_vec.safe_push (node);

  bool res = true;
  /* Remember the high-water marks so failure below can roll back.  */
  unsigned visited_rec_start = visited_vec.length ();
  unsigned cost_vec_rec_start = cost_vec->length ();
  bool seen_non_constant_child = false;
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    {
      res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
					      visited_set, visited_vec,
					      cost_vec);
      if (!res)
	break;
      if (child && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
	seen_non_constant_child = true;
    }
  /* We're having difficulties scheduling nodes with just constant
     operands and no scalar stmts since we then cannot compute a stmt
     insertion place.  */
  if (res
      && !seen_non_constant_child
      && SLP_TREE_SCALAR_STMTS (node).is_empty ())
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Cannot vectorize all-constant op node %p\n",
			 (void *) node);
      res = false;
    }

  if (res)
    res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
					      cost_vec);
  /* If analysis failed we have to pop all recursive visited nodes
     plus ourselves.  */
  if (!res)
    {
      while (visited_vec.length () >= visited_rec_start)
	visited_set.remove (visited_vec.pop ());
      cost_vec->truncate (cost_vec_rec_start);
    }

  /* When the node can be vectorized cost invariant nodes it references.
     This is not done in DFS order to allow the referring node
     vectorizable_* calls to nail down the invariant nodes vector type
     and possibly unshare it if it needs a different vector type than
     other referrers.  */
  if (res)
    FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
      if (child
	  && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
	      || SLP_TREE_DEF_TYPE (child) == vect_external_def)
	  /* Perform usual caching, note code-generation still
	     code-gens these nodes multiple times but we expect
	     to CSE them later.  */
	  && !visited_set.add (child))
	{
	  visited_vec.safe_push (child);
	  /* ??? After auditing more code paths make a "default"
	     and push the vector type from NODE to all children
	     if it is not already set.  */
	  /* Compute the number of vectors to be generated.  */
	  tree vector_type = SLP_TREE_VECTYPE (child);
	  if (!vector_type)
	    {
	      /* Masked loads can have an undefined (default SSA definition)
		 else operand.  We do not need to cost it.  */
	      vec<tree> ops = SLP_TREE_SCALAR_OPS (child);
	      if (SLP_TREE_TYPE (node) == load_vec_info_type
		  && ((ops.length ()
		       && TREE_CODE (ops[0]) == SSA_NAME
		       && SSA_NAME_IS_DEFAULT_DEF (ops[0])
		       && VAR_P (SSA_NAME_VAR (ops[0])))
		      || SLP_TREE_DEF_TYPE (child) == vect_constant_def))
		continue;

	      /* For shifts with a scalar argument we don't need
		 to cost or code-generate anything.
		 ??? Represent this more explicitly.  */
	      gcc_assert (SLP_TREE_TYPE (node) == shift_vec_info_type
			  && j == 1);
	      continue;
	    }

	  /* And cost them.  */
	  vect_prologue_cost_for_slp (vinfo, child, cost_vec);
	}

  /* If this node or any of its children can't be vectorized, try pruning
     the tree here rather than felling the whole thing.  */
  if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
    {
      /* We'll need to revisit this for invariant costing and number
	 of vectorized stmt setting.  */
      res = true;
    }

  return res;
}
8878 :
/* Given a definition DEF, analyze if it will have any live scalar use after
   performing SLP vectorization whose information is represented by BB_VINFO,
   and record result into hash map SCALAR_USE_MAP as cache for later fast
   check.  If recursion DEPTH exceeds a limit, stop analysis and make a
   conservative assumption.  Return 0 if no scalar use, 1 if there is, -1
   means recursion is limited.  */

static int
vec_slp_has_scalar_use (bb_vec_info bb_vinfo, tree def,
			hash_map<tree, int> &scalar_use_map,
			int depth = 0)
{
  const int depth_limit = 3;
  imm_use_iterator use_iter;
  gimple *use_stmt;

  /* Fast path: a cached answer from a previous walk.  */
  if (int *res = scalar_use_map.get (def))
    return *res;

  /* Assume a scalar use until proven otherwise below.  */
  int scalar_use = 1;

  FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
    {
      if (is_gimple_debug (use_stmt))
	continue;

      stmt_vec_info use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);

      /* A use outside the vectorization region is a scalar use.  */
      if (!use_stmt_info)
	break;

      if (PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
	continue;

      /* Do not step forward when encounter PHI statement, since it may
	 involve cyclic reference and cause infinite recursive invocation.  */
      if (gimple_code (use_stmt) == GIMPLE_PHI)
	break;

      /* When pattern recognition is involved, a statement whose definition is
	 consumed in some pattern, may not be included in the final replacement
	 pattern statements, so would be skipped when building SLP graph.

	 * Original
	   char a_c = *(char *) a;
	   char b_c = *(char *) b;
	   unsigned short a_s = (unsigned short) a_c;
	   int a_i = (int) a_s;
	   int b_i = (int) b_c;
	   int r_i = a_i - b_i;

	 * After pattern replacement
	   a_s = (unsigned short) a_c;
	   a_i = (int) a_s;

	   patt_b_s = (unsigned short) b_c;    // b_i = (int) b_c
	   patt_b_i = (int) patt_b_s;          // b_i = (int) b_c

	   patt_r_s = widen_minus(a_c, b_c);   // r_i = a_i - b_i
	   patt_r_i = (int) patt_r_s;          // r_i = a_i - b_i

	 The definitions of a_i(original statement) and b_i(pattern statement)
	 are related to, but actually not part of widen_minus pattern.
	 Vectorizing the pattern does not cause these definition statements to
	 be marked as PURE_SLP.  For this case, we need to recursively check
	 whether their uses are all absorbed into vectorized code.  But there
	 is an exception that some use may participate in an vectorized
	 operation via an external SLP node containing that use as an element.
	 The parameter "scalar_use_map" tags such kind of SSA as having scalar
	 use in advance.  */
      tree lhs = gimple_get_lhs (use_stmt);

      if (!lhs || TREE_CODE (lhs) != SSA_NAME)
	break;

      /* Beyond the depth limit conservatively report -1 without
	 caching (see below).  */
      if (depth_limit && depth >= depth_limit)
	return -1;

      if ((scalar_use = vec_slp_has_scalar_use (bb_vinfo, lhs, scalar_use_map,
						depth + 1)))
	break;
    }

  /* Iteration ran to completion - every use was absorbed, so there is
     no scalar use.  */
  if (end_imm_use_stmt_p (&use_iter))
    scalar_use = 0;

  /* If recursion is limited, do not cache result for non-root defs.  */
  if (!depth || scalar_use >= 0)
    {
      /* hash_map::put returns whether the key already existed; DEF must
	 not have been cached yet at this point.  */
      bool added = scalar_use_map.put (def, scalar_use);
      gcc_assert (!added);
    }

  return scalar_use;
}
8974 :
/* Mark lanes of NODE that are live outside of the basic-block vectorized
   region and that can be vectorized using vectorizable_live_operation
   with STMT_VINFO_LIVE_P.  Not handled live operations will cause the
   scalar code computing it to be retained.  SVISITED tracks scalar stmts
   already decided; VISITED tracks already processed SLP nodes.  */

static void
vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
			     slp_instance instance,
			     stmt_vector_for_cost *cost_vec,
			     hash_map<tree, int> &scalar_use_map,
			     hash_set<stmt_vec_info> &svisited,
			     hash_set<slp_tree> &visited)
{
  if (visited.add (node))
    return;

  unsigned i;
  stmt_vec_info stmt_info;
  stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    {
      if (!stmt_info || svisited.contains (stmt_info))
	continue;
      stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
      if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
	  && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
	/* Only the pattern root stmt computes the original scalar value.  */
	continue;
      bool mark_visited = true;
      gimple *orig_stmt = orig_stmt_info->stmt;
      ssa_op_iter op_iter;
      def_operand_p def_p;
      FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
	{
	  /* Any non-zero (1 or -1) result counts as a scalar use.  */
	  if (vec_slp_has_scalar_use (bb_vinfo, DEF_FROM_PTR (def_p),
				      scalar_use_map))
	    {
	      STMT_VINFO_LIVE_P (stmt_info) = true;
	      if (vectorizable_live_operation (bb_vinfo, stmt_info, node,
					       instance, i, false, cost_vec))
		/* ??? So we know we can vectorize the live stmt from one SLP
		   node.  If we cannot do so from all or none consistently
		   we'd have to record which SLP node (and lane) we want to
		   use for the live operation.  So make sure we can
		   code-generate from all nodes.  */
		mark_visited = false;
	      else
		STMT_VINFO_LIVE_P (stmt_info) = false;
	    }

	  /* We have to verify whether we can insert the lane extract
	     before all uses.  The following is a conservative approximation.
	     We cannot put this into vectorizable_live_operation because
	     iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
	     doesn't work.
	     Note that while the fact that we emit code for loads at the
	     first load should make this a non-problem leafs we construct
	     from scalars are vectorized after the last scalar def.
	     ??? If we'd actually compute the insert location during
	     analysis we could use sth less conservative than the last
	     scalar stmt in the node for the dominance check.  */
	  /* ??? What remains is "live" uses in vector CTORs in the same
	     SLP graph which is where those uses can end up code-generated
	     right after their definition instead of close to their original
	     use.  But that would restrict us to code-generate lane-extracts
	     from the latest stmt in a node.  So we compensate for this
	     during code-generation, simply not replacing uses for those
	     hopefully rare cases.  */
	  imm_use_iterator use_iter;
	  gimple *use_stmt;
	  stmt_vec_info use_stmt_info;

	  if (STMT_VINFO_LIVE_P (stmt_info))
	    FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
	      if (!is_gimple_debug (use_stmt)
		  && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
		      || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
		  && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Cannot determine insertion place for "
				     "lane extract\n");
		  STMT_VINFO_LIVE_P (stmt_info) = false;
		  mark_visited = true;
		}
	}
      if (mark_visited)
	svisited.add (stmt_info);
    }

  /* Recurse into internal children.  */
  slp_tree child;
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
      vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance, cost_vec,
				   scalar_use_map, svisited, visited);
}
9072 :
9073 : /* Traverse all slp instances of BB_VINFO, and mark lanes of every node that
9074 : are live outside of the basic-block vectorized region and that can be
9075 : vectorized using vectorizable_live_operation with STMT_VINFO_LIVE_P. */
9076 :
9077 : static void
9078 263665 : vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo)
9079 : {
9080 263665 : if (bb_vinfo->slp_instances.is_empty ())
9081 29789 : return;
9082 :
9083 233876 : hash_set<stmt_vec_info> svisited;
9084 233876 : hash_set<slp_tree> visited;
9085 233876 : hash_map<tree, int> scalar_use_map; /* SSA names used by non-vectorized (external/remain) defs.  */
9086 233876 : auto_vec<slp_tree> worklist;
9087 :
9088 1375647 : for (slp_instance instance : bb_vinfo->slp_instances)
9089 : {
9090 674019 : if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc)
9091 58699 : for (tree op : SLP_INSTANCE_REMAIN_DEFS (instance)) /* Remaining scalar defs of a BB reduction stay scalar uses.  */
9092 16748 : if (TREE_CODE (op) == SSA_NAME)
9093 14100 : scalar_use_map.put (op, 1)
9094 674019 : if (!visited.add (SLP_INSTANCE_TREE (instance)))
9095 671925 : worklist.safe_push (SLP_INSTANCE_TREE (instance));
9096 : }
9097 :
9098 1505754 : do /* DFS over the whole SLP graph collecting scalar uses from external nodes.  */
9099 : {
9100 1505754 : slp_tree node = worklist.pop ();
9101 :
9102 1505754 : if (SLP_TREE_DEF_TYPE (node) == vect_external_def)
9103 : {
9104 1542075 : for (tree op : SLP_TREE_SCALAR_OPS (node))
9105 680932 : if (TREE_CODE (op) == SSA_NAME)
9106 458685 : scalar_use_map.put (op, 1);
9107 : }
9108 : else
9109 : {
9110 3610303 : for (slp_tree child : SLP_TREE_CHILDREN (node))
9111 874402 : if (child && !visited.add (child))
9112 833829 : worklist.safe_push (child);
9113 : }
9114 : }
9115 3011508 : while (!worklist.is_empty ());
9116 :
9117 233876 : visited.empty (); /* Reuse VISITED for the per-instance marking walk below.  */
9118 :
9119 1375647 : for (slp_instance instance : bb_vinfo->slp_instances)
9120 : {
9121 674019 : vect_location = instance->location ();
9122 674019 : vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
9123 : instance, &instance->cost_vec,
9124 : scalar_use_map, svisited, visited);
9125 : }
9126 233876 : }
9127 :
9128 : /* Determine whether we can vectorize the reduction epilogue for INSTANCE. */
9129 :
9130 : static bool
9131 73949 : vectorizable_bb_reduc_epilogue (slp_instance instance,
9132 : stmt_vector_for_cost *cost_vec)
9133 : {
9134 73949 : gassign *stmt = as_a <gassign *> (instance->root_stmts[0]->stmt);
9135 73949 : enum tree_code reduc_code = gimple_assign_rhs_code (stmt);
9136 73949 : if (reduc_code == MINUS_EXPR)
9137 0 : reduc_code = PLUS_EXPR; /* A MINUS reduction is handled as PLUS with negated inputs.  */
9138 73949 : internal_fn reduc_fn;
9139 73949 : tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
9140 73949 : if (!vectype
9141 73937 : || !reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
9142 73937 : || reduc_fn == IFN_LAST
9143 73937 : || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH)
9144 108848 : || !useless_type_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
9145 34899 : TREE_TYPE (vectype)))
9146 : {
9147 49363 : if (dump_enabled_p ())
9148 271 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9149 : "not vectorized: basic block reduction epilogue "
9150 : "operation unsupported.\n");
9151 49363 : return false;
9152 : }
9153 :
9154 : /* There's no way to cost a horizontal vector reduction via REDUC_FN so
9155 : cost log2 vector operations plus shuffles and one extraction. */
9156 24586 : unsigned steps = floor_log2 (vect_nunits_for_cost (vectype));
9157 24586 : record_stmt_cost (cost_vec, steps, vector_stmt, instance->root_stmts[0],
9158 : vectype, 0, vect_body);
9159 24586 : record_stmt_cost (cost_vec, steps, vec_perm, instance->root_stmts[0],
9160 : vectype, 0, vect_body);
9161 24586 : record_stmt_cost (cost_vec, 1, vec_to_scalar, instance->root_stmts[0],
9162 : vectype, 0, vect_body);
9163 :
9164 : /* Since we replace all stmts of a possibly longer scalar reduction
9165 : chain account for the extra scalar stmts for that. */
9166 24586 : if (!instance->remain_defs.is_empty ())
9167 19790 : record_stmt_cost (cost_vec, instance->remain_defs.length (), scalar_stmt,
9168 9895 : instance->root_stmts[0], 0, vect_body);
9169 : return true;
9170 : }
9171 :
9172 : /* Prune from ROOTS all stmts that are computed as part of lanes of NODE
9173 : and recurse to children. VISITED guards against re-walking shared
9174 : subtrees of the SLP graph. */
9175 :
9176 : static void
9177 182495 : vect_slp_prune_covered_roots (slp_tree node, hash_set<stmt_vec_info> &roots,
9178 : hash_set<slp_tree> &visited)
9179 : {
9180 182495 : if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
9181 182495 : || visited.add (node))
9182 81051 : return;
9183 :
9184 : stmt_vec_info stmt;
9185 : unsigned i;
9186 344026 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
9187 242582 : if (stmt)
9188 246336 : roots.remove (vect_orig_stmt (stmt)); /* Lanes are recorded via their original (pre-pattern) stmt.  */
9189 :
9190 : slp_tree child;
9191 225162 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9192 123718 : if (child)
9193 122342 : vect_slp_prune_covered_roots (child, roots, visited);
9194 : }
9193 : }
9194 :
9195 : /* Analyze statements in SLP instances of VINFO. Return true if the
9196 : operations are supported. Unsupported BB instances are removed;
9197 : for a loop_vec_info any failure fails the whole analysis. */
9198 :
9199 : bool
9200 605121 : vect_slp_analyze_operations (vec_info *vinfo)
9201 : {
9202 605121 : slp_instance instance;
9203 605121 : int i;
9204 :
9205 605121 : DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
9206 :
9207 605121 : hash_set<slp_tree> visited;
9208 1600395 : for (i = 0; vinfo->slp_instances.iterate (i, &instance); ) /* I is only advanced when the instance is kept.  */
9209 : {
9210 1224065 : auto_vec<slp_tree> visited_vec;
9211 1224065 : stmt_vector_for_cost cost_vec;
9212 1224065 : cost_vec.create (2);
9213 1224065 : if (is_a <bb_vec_info> (vinfo))
9214 773228 : vect_location = instance->location ();
9215 1224065 : if (!vect_slp_analyze_node_operations (vinfo,
9216 : SLP_INSTANCE_TREE (instance),
9217 : instance, visited, visited_vec,
9218 : &cost_vec)
9219 : /* CTOR instances require vectorized defs for the SLP tree root. */
9220 1006835 : || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor
9221 5236 : && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
9222 : != vect_internal_def
9223 : /* Make sure we vectorized with the expected type. */
9224 5236 : || !useless_type_conversion_p
9225 5236 : (TREE_TYPE (TREE_TYPE (gimple_assign_rhs1
9226 : (instance->root_stmts[0]->stmt))),
9227 5236 : TREE_TYPE (SLP_TREE_VECTYPE
9228 : (SLP_INSTANCE_TREE (instance))))))
9229 : /* Check we can vectorize the reduction. */
9230 1006820 : || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc
9231 73949 : && !vectorizable_bb_reduc_epilogue (instance, &cost_vec))
9232 : /* Check we can vectorize the gcond. */
9233 2181522 : || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_gcond
9234 61946 : && !vectorizable_early_exit (as_a <loop_vec_info> (vinfo),
9235 61946 : SLP_INSTANCE_ROOT_STMTS (instance)[0],
9236 : NULL,
9237 : SLP_INSTANCE_TREE (instance),
9238 : &cost_vec)))
9239 : {
9240 326497 : cost_vec.release ();
9241 326497 : slp_tree node = SLP_INSTANCE_TREE (instance);
9242 326497 : stmt_vec_info stmt_info;
9243 326497 : if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
9244 253981 : stmt_info = SLP_INSTANCE_ROOT_STMTS (instance)[0];
9245 72516 : else if (!SLP_TREE_SCALAR_STMTS (node).is_empty ()
9246 72516 : && SLP_TREE_SCALAR_STMTS (node)[0])
9247 : stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
9248 : else
9249 0 : stmt_info = SLP_TREE_REPRESENTATIVE (node);
9250 326497 : if (is_a <loop_vec_info> (vinfo))
9251 : {
9252 228791 : if (dump_enabled_p ())
9253 6319 : dump_printf_loc (MSG_NOTE, vect_location,
9254 : "unsupported SLP instance starting from: %G",
9255 : stmt_info->stmt);
9256 228791 : return false; /* In a loop all instances must vectorize.  */
9257 : }
9258 97706 : if (dump_enabled_p ())
9259 325 : dump_printf_loc (MSG_NOTE, vect_location,
9260 : "removing SLP instance operations starting from: %G",
9261 : stmt_info->stmt);
9262 435712 : while (!visited_vec.is_empty ()) /* Roll back analysis state for nodes visited for this instance.  */
9263 : {
9264 338006 : slp_tree node = visited_vec.pop ();
9265 338006 : SLP_TREE_TYPE (node) = undef_vec_info_type;
9266 338006 : if (node->data)
9267 : {
9268 12285 : delete node->data;
9269 12285 : node->data = nullptr;
9270 : }
9271 338006 : visited.remove (node);
9272 : }
9273 97706 : vect_free_slp_instance (instance);
9274 97706 : vinfo->slp_instances.ordered_remove (i);
9275 : }
9276 : else
9277 : {
9278 897568 : i++;
9279 897568 : if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
9280 : {
9281 222046 : add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
9282 222046 : cost_vec.release ();
9283 : }
9284 : else
9285 : /* For BB vectorization remember the SLP graph entry
9286 : cost for later. */
9287 675522 : instance->cost_vec = cost_vec;
9288 : }
9289 1224065 : }
9290 :
9291 : /* Now look for SLP instances with a root that are covered by other
9292 : instances and remove them. */
9293 376330 : hash_set<stmt_vec_info> roots;
9294 1581961 : for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
9295 861153 : if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
9296 31852 : roots.add (SLP_INSTANCE_ROOT_STMTS (instance)[0]);
9297 376330 : if (!roots.is_empty ())
9298 : {
9299 12368 : visited.empty ();
9300 72521 : for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
9301 60153 : vect_slp_prune_covered_roots (SLP_INSTANCE_TREE (instance), roots,
9302 : visited);
9303 72521 : for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
9304 60153 : if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()
9305 31852 : && !roots.contains (SLP_INSTANCE_ROOT_STMTS (instance)[0]))
9306 : {
9307 1503 : stmt_vec_info root = SLP_INSTANCE_ROOT_STMTS (instance)[0];
9308 1503 : if (dump_enabled_p ())
9309 20 : dump_printf_loc (MSG_NOTE, vect_location,
9310 : "removing SLP instance operations starting "
9311 : "from: %G", root->stmt);
9312 1503 : vect_free_slp_instance (instance);
9313 1503 : vinfo->slp_instances.ordered_remove (i);
9314 : }
9315 : else
9316 58650 : ++i;
9317 : }
9318 :
9319 : /* Compute vectorizable live stmts. */
9320 376330 : if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
9321 263665 : vect_bb_slp_mark_live_stmts (bb_vinfo);
9322 :
9323 752660 : return !vinfo->slp_instances.is_empty ();
9324 981451 : }
9324 :
9325 : /* Get the SLP instance leader from INSTANCE_LEADER thereby transitively
9326 : closing the eventual chain. Performs union-find style path
9327 : compression: every chain element is rewritten to point directly at
9328 : the ultimate leader. */
9329 :
9330 : static slp_instance
9331 737632 : get_ultimate_leader (slp_instance instance,
9332 : hash_map<slp_instance, slp_instance> &instance_leader)
9333 : {
9334 737632 : auto_vec<slp_instance *, 8> chain;
9335 737632 : slp_instance *tem;
9336 812980 : while (*(tem = instance_leader.get (instance)) != instance) /* Follow leaders until a self-mapped root.  */
9337 : {
9338 75348 : chain.safe_push (tem);
9339 75348 : instance = *tem;
9340 : }
9341 812980 : while (!chain.is_empty ())
9342 75348 : *chain.pop () = instance; /* Path compression.  */
9343 737632 : return instance;
9344 737632 : }
9343 :
9344 : namespace {
9345 : /* Subroutine of vect_bb_partition_graph_r. Map KEY to INSTANCE in
9346 : KEY_TO_INSTANCE, making INSTANCE the leader of any previous mapping
9347 : for KEY. Return true if KEY was already in KEY_TO_INSTANCE.
9348 :
9349 : INSTANCE_LEADER is as for get_ultimate_leader. */
9350 :
9351 : template<typename T>
9352 : bool
9353 3261123 : vect_map_to_instance (slp_instance instance, T key,
9354 : hash_map<T, slp_instance> &key_to_instance,
9355 : hash_map<slp_instance, slp_instance> &instance_leader)
9356 : {
9357 : bool existed_p;
9358 3261123 : slp_instance &key_instance = key_to_instance.get_or_insert (key, &existed_p);
9359 3261123 : if (!existed_p)
9360 : ; /* New key, simply claim it below.  */
9361 172786 : else if (key_instance != instance)
9362 : {
9363 : /* If we're running into a previously marked key make us the
9364 : leader of the current ultimate leader. This keeps the
9365 : leader chain acyclic and works even when the current instance
9366 : connects two previously independent graph parts. */
9367 63613 : slp_instance key_leader
9368 63613 : = get_ultimate_leader (key_instance, instance_leader);
9369 63613 : if (key_leader != instance)
9370 19144 : instance_leader.put (key_leader, instance);
9371 : }
9372 3261123 : key_instance = instance;
9373 3261123 : return existed_p;
9374 : }
9375 : }
9376 :
9377 : /* Worker of vect_bb_partition_graph, recurse on NODE. Assigns all
9378 : scalar stmts and nodes of the subgraph rooted at NODE to INSTANCE,
9379 : merging leaders where subgraphs share stmts or nodes. */
9380 :
9381 : static void
9382 904003 : vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
9383 : slp_instance instance, slp_tree node,
9384 : hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
9385 : hash_map<slp_tree, slp_instance> &node_to_instance,
9386 : hash_map<slp_instance, slp_instance> &instance_leader)
9387 : {
9388 904003 : stmt_vec_info stmt_info;
9389 904003 : unsigned i;
9390 :
9391 3261123 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
9392 2357120 : if (stmt_info)
9393 2357120 : vect_map_to_instance (instance, stmt_info, stmt_to_instance,
9394 : instance_leader);
9395 :
9396 904003 : if (vect_map_to_instance (instance, node, node_to_instance,
9397 : instance_leader))
9398 904003 : return; /* NODE already processed for some instance; leaders merged above.  */
9399 :
9400 : slp_tree child;
9401 1737087 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9402 874426 : if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
9403 229984 : vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
9404 : node_to_instance, instance_leader);
9405 : }
9404 :
9405 : /* Partition the SLP graph into pieces that can be costed independently. */
9406 :
9407 : static void
9408 233876 : vect_bb_partition_graph (bb_vec_info bb_vinfo)
9409 : {
9410 233876 : DUMP_VECT_SCOPE ("vect_bb_partition_graph");
9411 :
9412 : /* First walk the SLP graph assigning each involved scalar stmt a
9413 : corresponding SLP graph entry and upon visiting a previously
9414 : marked stmt, make the stmts leader the current SLP graph entry. */
9415 233876 : hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
9416 233876 : hash_map<slp_tree, slp_instance> node_to_instance;
9417 233876 : hash_map<slp_instance, slp_instance> instance_leader;
9418 233876 : slp_instance instance;
9419 907895 : for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
9420 : {
9421 674019 : instance_leader.put (instance, instance); /* Each instance initially leads itself.  */
9422 674019 : vect_bb_partition_graph_r (bb_vinfo,
9423 : instance, SLP_INSTANCE_TREE (instance),
9424 : stmt_to_instance, node_to_instance,
9425 : instance_leader);
9426 : }
9427 :
9428 : /* Then collect entries to each independent subgraph. */
9429 1141771 : for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
9430 : {
9431 674019 : slp_instance leader = get_ultimate_leader (instance, instance_leader);
9432 674019 : leader->subgraph_entries.safe_push (instance);
9433 674019 : if (dump_enabled_p ()
9434 674019 : && leader != instance)
9435 69 : dump_printf_loc (MSG_NOTE, vect_location,
9436 : "instance %p is leader of %p\n",
9437 : (void *) leader, (void *) instance);
9438 : }
9439 233876 : }
9440 :
9441 : /* Compute the set of scalar stmts participating in internal and external
9442 : nodes. Internal-node stmts go to VSTMTS (will be replaced by vector
9443 : code), defs feeding external nodes go to ESTMTS (must be kept). */
9444 :
9445 : static void
9446 1534727 : vect_slp_gather_vectorized_scalar_stmts (vec_info *vinfo, slp_tree node,
9447 : hash_set<slp_tree> &visited,
9448 : hash_set<stmt_vec_info> &vstmts,
9449 : hash_set<stmt_vec_info> &estmts)
9450 : {
9451 1534727 : int i;
9452 1534727 : stmt_vec_info stmt_info;
9453 1534727 : slp_tree child;
9454 :
9455 1534727 : if (visited.add (node))
9456 41271 : return;
9457 :
9458 1493456 : if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
9459 : {
9460 3058657 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
9461 2205062 : if (stmt_info)
9462 2205062 : vstmts.add (stmt_info);
9463 :
9464 3109305 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9465 864177 : if (child)
9466 864177 : vect_slp_gather_vectorized_scalar_stmts (vinfo, child, visited,
9467 : vstmts, estmts);
9468 : }
9469 : else
9470 3588113 : for (tree def : SLP_TREE_SCALAR_OPS (node))
9471 : {
9472 1669566 : stmt_vec_info def_stmt = vinfo->lookup_def (def); /* Only SSA defs inside the region are of interest.  */
9473 1669566 : if (def_stmt)
9474 329824 : estmts.add (def_stmt);
9475 : }
9476 : }
9476 :
9477 :
9478 : /* Compute the scalar cost of the SLP node NODE and its children
9479 : and return it. Do not account defs that are marked in LIFE and
9480 : update LIFE according to uses of NODE. A lane marked in LIFE
9481 : means its scalar stmt must be kept even after vectorization, so
9482 : vectorizing does not save its scalar cost. */
9483 :
9484 : static void
9485 894093 : vect_bb_slp_scalar_cost (vec_info *vinfo,
9486 : slp_tree node, vec<bool, va_heap> *life,
9487 : stmt_vector_for_cost *cost_vec,
9488 : hash_set<stmt_vec_info> &vectorized_scalar_stmts,
9489 : hash_set<stmt_vec_info> &scalar_stmts_in_externs,
9490 : hash_set<slp_tree> &visited)
9491 : {
9492 894093 : unsigned i;
9493 894093 : stmt_vec_info stmt_info;
9494 894093 : slp_tree child;
9495 :
9496 894093 : if (visited.add (node))
9497 40481 : return;
9498 :
9499 3058708 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
9500 : {
9501 2205096 : ssa_op_iter op_iter;
9502 2205096 : def_operand_p def_p;
9503 :
9504 2236402 : if (!stmt_info
9505 2205096 : || (*life)[i]
9506 : /* Defs also used in external nodes are not in the
9507 : vectorized_scalar_stmts set as they need to be preserved.
9508 : Honor that. */
9509 4381703 : || scalar_stmts_in_externs.contains (stmt_info))
9510 104211 : continue;
9511 :
9512 2173790 : stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
9513 2173790 : gimple *orig_stmt = orig_stmt_info->stmt;
9514 :
9515 : /* If there is a non-vectorized use of the defs then the scalar
9516 : stmt is kept live in which case we do not account it or any
9517 : required defs in the SLP children in the scalar cost. This
9518 : way we make the vectorization more costly when compared to
9519 : the scalar cost. */
9520 2173790 : if (!STMT_VINFO_LIVE_P (stmt_info))
9521 : {
9522 2107047 : auto_vec<gimple *, 8> worklist;
9523 2107047 : hash_set<gimple *> *worklist_visited = NULL; /* Lazily allocated; most lanes never need it.  */
9524 2107047 : worklist.quick_push (orig_stmt);
9525 2112047 : do
9526 : {
9527 2112047 : gimple *work_stmt = worklist.pop ();
9528 4619010 : FOR_EACH_PHI_OR_STMT_DEF (def_p, work_stmt, op_iter, SSA_OP_DEF)
9529 : {
9530 415261 : imm_use_iterator use_iter;
9531 415261 : gimple *use_stmt;
9532 1036643 : FOR_EACH_IMM_USE_STMT (use_stmt, use_iter,
9533 : DEF_FROM_PTR (def_p))
9534 641727 : if (!is_gimple_debug (use_stmt))
9535 : {
9536 489035 : stmt_vec_info use_stmt_info
9537 489035 : = vinfo->lookup_stmt (use_stmt);
9538 489035 : if (!use_stmt_info
9539 489035 : || !vectorized_scalar_stmts.contains (use_stmt_info))
9540 : {
9541 25445 : if (use_stmt_info
9542 22368 : && STMT_VINFO_IN_PATTERN_P (use_stmt_info))
9543 : {
9544 : /* For stmts participating in patterns we have
9545 : to check its uses recursively. */
9546 5100 : if (!worklist_visited)
9547 3959 : worklist_visited = new hash_set<gimple *> ();
9548 5100 : if (!worklist_visited->add (use_stmt))
9549 5100 : worklist.safe_push (use_stmt);
9550 5100 : continue;
9551 : }
9552 20345 : (*life)[i] = true; /* Found a scalar use outside the vectorized set.  */
9553 20345 : goto next_lane;
9554 : }
9555 415261 : }
9556 : }
9557 : }
9558 4183404 : while (!worklist.is_empty ());
9559 2086702 : next_lane:
9560 2107047 : if (worklist_visited)
9561 3959 : delete worklist_visited;
9562 2107047 : if ((*life)[i])
9563 20345 : continue;
9564 2107047 : }
9565 :
9566 : /* Count scalar stmts only once. */
9567 2153445 : if (gimple_visited_p (orig_stmt))
9568 24620 : continue;
9569 2128825 : gimple_set_visited (orig_stmt, true);
9570 :
9571 2128825 : vect_cost_for_stmt kind;
9572 2128825 : if (STMT_VINFO_DATA_REF (orig_stmt_info))
9573 : {
9574 1931889 : data_reference_p dr = STMT_VINFO_DATA_REF (orig_stmt_info);
9575 1931889 : tree base = get_base_address (DR_REF (dr));
9576 : /* When the scalar access is to a non-global not address-taken
9577 : decl that is not BLKmode assume we can access it with a single
9578 : non-load/store instruction. */
9579 1931889 : if (DECL_P (base)
9580 1495890 : && !is_global_var (base)
9581 1419844 : && !TREE_ADDRESSABLE (base)
9582 2482419 : && DECL_MODE (base) != BLKmode)
9583 : kind = scalar_stmt;
9584 1788582 : else if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
9585 : kind = scalar_load;
9586 : else
9587 1569876 : kind = scalar_store;
9588 : }
9589 196936 : else if (vect_nop_conversion_p (orig_stmt_info))
9590 19945 : continue; /* No-op conversions are free in both scalar and vector code.  */
9591 : /* For single-argument PHIs assume coalescing which means zero cost
9592 : for the scalar and the vector PHIs. This avoids artificially
9593 : favoring the vector path (but may pessimize it in some cases). */
9594 176991 : else if (is_a <gphi *> (orig_stmt_info->stmt)
9595 176991 : && gimple_phi_num_args
9596 83469 : (as_a <gphi *> (orig_stmt_info->stmt)) == 1)
9597 7995 : continue;
9598 : else
9599 : kind = scalar_stmt;
9600 2100885 : record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
9601 : SLP_TREE_VECTYPE (node), 0, vect_body);
9602 : }
9603 :
9604 1707224 : auto_vec<bool, 20> subtree_life;
9605 2469494 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9606 : {
9607 864201 : if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
9608 : {
9609 : /* Do not directly pass LIFE to the recursive call, copy it to
9610 : confine changes in the callee to the current child/subtree. */
9611 223543 : if (SLP_TREE_PERMUTE_P (node))
9612 : {
9613 3496 : subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
9614 12240 : for (unsigned j = 0;
9615 12240 : j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
9616 : {
9617 8744 : auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
9618 8744 : if (perm.first == i)
9619 4600 : subtree_life[perm.second] = (*life)[j]; /* Translate lane liveness through the permute.  */
9620 : }
9621 : }
9622 : else
9623 : {
9624 220047 : gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
9625 220047 : subtree_life.safe_splice (*life);
9626 : }
9627 223543 : vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
9628 : vectorized_scalar_stmts,
9629 : scalar_stmts_in_externs, visited);
9630 223543 : subtree_life.truncate (0);
9631 : }
9632 : }
9633 : }
9632 :
9633 : /* Comparator for the loop-index sorted cost vectors. Orders entries
9634 : by the loop number stored in the pair's first member; the cost
9635 : pointer in second is ignored (qsort stability is not required). */
9636 :
9637 : static int
9638 17466825 : li_cost_vec_cmp (const void *a_, const void *b_)
9639 : {
9640 17466825 : auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
9641 17466825 : auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
9642 17466825 : if (a->first < b->first)
9643 : return -1;
9644 16707272 : else if (a->first == b->first)
9645 16036403 : return 0;
9646 : return 1;
9647 : }
9646 :
9647 : /* Check if vectorization of the basic block is profitable for the
9648 : subgraph denoted by SLP_INSTANCES. Each containing loop is costed
9649 : separately; ORIG_LOOP non-NULL delays resetting the stmt visited
9650 : flags when profitable (for later if-conversion handling). */
9651 :
9652 : static bool
9653 651543 : vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
9654 : vec<slp_instance> slp_instances,
9655 : loop_p orig_loop)
9656 : {
9657 651543 : slp_instance instance;
9658 651543 : int i;
9659 651543 : unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
9660 651543 : unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
9661 :
9662 651543 : if (dump_enabled_p ())
9663 : {
9664 98 : dump_printf_loc (MSG_NOTE, vect_location, "Costing subgraph: \n");
9665 98 : hash_set<slp_tree> visited;
9666 395 : FOR_EACH_VEC_ELT (slp_instances, i, instance)
9667 101 : vect_print_slp_graph (MSG_NOTE, vect_location,
9668 : SLP_INSTANCE_TREE (instance), visited);
9669 98 : }
9670 :
9671 : /* Compute the set of scalar stmts we know will go away 'locally' when
9672 : vectorizing. This used to be tracked with just PURE_SLP_STMT but that's
9673 : not accurate for nodes promoted extern late or for scalar stmts that
9674 : are used both in extern defs and in vectorized defs. */
9675 651543 : hash_set<stmt_vec_info> vectorized_scalar_stmts;
9676 651543 : hash_set<stmt_vec_info> scalar_stmts_in_externs;
9677 651543 : hash_set<slp_tree> visited;
9678 1322093 : FOR_EACH_VEC_ELT (slp_instances, i, instance)
9679 : {
9680 670550 : vect_slp_gather_vectorized_scalar_stmts (bb_vinfo,
9681 : SLP_INSTANCE_TREE (instance),
9682 : visited,
9683 : vectorized_scalar_stmts,
9684 : scalar_stmts_in_externs);
9685 777988 : for (stmt_vec_info rstmt : SLP_INSTANCE_ROOT_STMTS (instance))
9686 51350 : vectorized_scalar_stmts.add (rstmt);
9687 : }
9688 : /* Scalar stmts used as defs in external nodes need to be preseved, so
9689 : remove them from vectorized_scalar_stmts. */
9690 946169 : for (stmt_vec_info stmt : scalar_stmts_in_externs)
9691 294626 : vectorized_scalar_stmts.remove (stmt);
9692 :
9693 : /* Calculate scalar cost and sum the cost for the vector stmts
9694 : previously collected. */
9695 651543 : stmt_vector_for_cost scalar_costs = vNULL;
9696 651543 : stmt_vector_for_cost vector_costs = vNULL;
9697 651543 : visited.empty ();
9698 1322093 : FOR_EACH_VEC_ELT (slp_instances, i, instance)
9699 : {
9700 670550 : auto_vec<bool, 20> life;
9701 670550 : life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
9702 : true);
9703 670550 : if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
9704 56088 : record_stmt_cost (&scalar_costs,
9705 28044 : SLP_INSTANCE_ROOT_STMTS (instance).length (),
9706 : scalar_stmt,
9707 28044 : SLP_INSTANCE_ROOT_STMTS (instance)[0], 0, vect_body);
9708 670550 : vect_bb_slp_scalar_cost (bb_vinfo,
9709 : SLP_INSTANCE_TREE (instance),
9710 : &life, &scalar_costs, vectorized_scalar_stmts,
9711 : scalar_stmts_in_externs, visited);
9712 670550 : vector_costs.safe_splice (instance->cost_vec);
9713 670550 : instance->cost_vec.release ();
9714 670550 : }
9715 :
9716 651543 : if (dump_enabled_p ())
9717 98 : dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
9718 :
9719 : /* When costing non-loop vectorization we need to consider each covered
9720 : loop independently and make sure vectorization is profitable. For
9721 : now we assume a loop may be not entered or executed an arbitrary
9722 : number of iterations (??? static information can provide more
9723 : precise info here) which means we can simply cost each containing
9724 : loops stmts separately. */
9725 :
9726 : /* First produce cost vectors sorted by loop index. */
9727 651543 : auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
9728 651543 : li_scalar_costs (scalar_costs.length ());
9729 651543 : auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
9730 651543 : li_vector_costs (vector_costs.length ());
9731 651543 : stmt_info_for_cost *cost;
9732 2780472 : FOR_EACH_VEC_ELT (scalar_costs, i, cost)
9733 : {
9734 2128929 : unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
9735 2128929 : li_scalar_costs.quick_push (std::make_pair (l, cost));
9736 : }
9737 : /* Use a random used loop as fallback in case the first vector_costs
9738 : entry does not have a stmt_info associated with it. */
9739 651543 : unsigned l = li_scalar_costs[0].first;
9740 2402080 : FOR_EACH_VEC_ELT (vector_costs, i, cost)
9741 : {
9742 : /* We inherit from the previous COST, invariants, externals and
9743 : extracts immediately follow the cost for the related stmt. */
9744 1750537 : if (cost->stmt_info)
9745 1036525 : l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
9746 1750537 : li_vector_costs.quick_push (std::make_pair (l, cost));
9747 : }
9748 651543 : li_scalar_costs.qsort (li_cost_vec_cmp);
9749 651543 : li_vector_costs.qsort (li_cost_vec_cmp);
9750 :
9751 : /* Now cost the portions individually. */
9752 : unsigned vi = 0;
9753 : unsigned si = 0;
9754 1131665 : bool profitable = true;
9755 1131665 : while (si < li_scalar_costs.length ()
9756 1788006 : && vi < li_vector_costs.length ())
9757 : {
9758 656341 : unsigned sl = li_scalar_costs[si].first;
9759 656341 : unsigned vl = li_vector_costs[vi].first;
9760 656341 : if (sl != vl)
9761 : {
9762 1219 : if (dump_enabled_p ())
9763 0 : dump_printf_loc (MSG_NOTE, vect_location,
9764 : "Scalar %d and vector %d loop part do not "
9765 : "match up, skipping scalar part\n", sl, vl);
9766 : /* Skip the scalar part, assuming zero cost on the vector side. */
9767 2640 : do
9768 : {
9769 2640 : si++;
9770 : }
9771 2640 : while (si < li_scalar_costs.length ()
9772 4730 : && li_scalar_costs[si].first == sl);
9773 1219 : continue;
9774 : }
9775 :
9776 655122 : class vector_costs *scalar_target_cost_data = init_cost (bb_vinfo, true);
9777 2109151 : do /* Accumulate all scalar costs belonging to loop SL.  */
9778 : {
9779 2109151 : add_stmt_cost (scalar_target_cost_data, li_scalar_costs[si].second);
9780 2109151 : si++;
9781 : }
9782 2109151 : while (si < li_scalar_costs.length ()
9783 4225835 : && li_scalar_costs[si].first == sl);
9784 655122 : scalar_target_cost_data->finish_cost (nullptr);
9785 655122 : scalar_cost = (scalar_target_cost_data->body_cost ()
9786 655122 : * param_vect_scalar_cost_multiplier) / 100;
9787 :
9788 : /* Complete the target-specific vector cost calculation. */
9789 655122 : class vector_costs *vect_target_cost_data = init_cost (bb_vinfo, false);
9790 1716563 : do /* Accumulate all vector costs belonging to loop VL.  */
9791 : {
9792 1716563 : add_stmt_cost (vect_target_cost_data, li_vector_costs[vi].second);
9793 1716563 : vi++;
9794 : }
9795 1716563 : while (vi < li_vector_costs.length ()
9796 3441792 : && li_vector_costs[vi].first == vl);
9797 655122 : vect_target_cost_data->finish_cost (scalar_target_cost_data);
9798 655122 : vec_prologue_cost = vect_target_cost_data->prologue_cost ();
9799 655122 : vec_inside_cost = vect_target_cost_data->body_cost ();
9800 655122 : vec_epilogue_cost = vect_target_cost_data->epilogue_cost ();
9801 655122 : delete scalar_target_cost_data;
9802 655122 : delete vect_target_cost_data;
9803 :
9804 655122 : vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
9805 :
9806 655122 : if (dump_enabled_p ())
9807 : {
9808 98 : dump_printf_loc (MSG_NOTE, vect_location,
9809 : "Cost model analysis for part in loop %d:\n", sl);
9810 98 : dump_printf (MSG_NOTE, " Vector cost: %d\n",
9811 : vec_inside_cost + vec_outside_cost);
9812 98 : dump_printf (MSG_NOTE, " Scalar cost: %d\n", scalar_cost);
9813 : }
9814 :
9815 : /* Vectorization is profitable if its cost is more than the cost of scalar
9816 : version. Note that we err on the vector side for equal cost because
9817 : the cost estimate is otherwise quite pessimistic (constant uses are
9818 : free on the scalar side but cost a load on the vector side for
9819 : example). */
9820 655122 : if (vec_outside_cost + vec_inside_cost > scalar_cost)
9821 : {
9822 : profitable = false;
9823 : break;
9824 : }
9825 : }
9826 1126854 : if (profitable && vi < li_vector_costs.length ())
9827 : {
9828 1151 : if (dump_enabled_p ())
9829 12 : dump_printf_loc (MSG_NOTE, vect_location,
9830 : "Excess vector cost for part in loop %d:\n",
9831 6 : li_vector_costs[vi].first);
9832 : profitable = false; /* Leftover vector costs with no matching scalar part.  */
9833 : }
9834 :
9835 : /* Unset visited flag. This is delayed when the subgraph is profitable
9836 : and we process the loop for remaining unvectorized if-converted code. */
9837 651543 : if (!orig_loop || !profitable)
9838 2779185 : FOR_EACH_VEC_ELT (scalar_costs, i, cost)
9839 2127731 : gimple_set_visited (cost->stmt_info->stmt, false);
9840 :
9841 651543 : scalar_costs.release ();
9842 651543 : vector_costs.release ();
9843 :
9844 651543 : return profitable;
9845 651543 : }
9844 :
9845 : /* qsort comparator for lane defs. Sorts pairs of (lane number, def)
9846 : by ascending lane number. */
9847 :
9848 : static int
9849 40 : vld_cmp (const void *a_, const void *b_)
9850 : {
9851 40 : auto *a = (const std::pair<unsigned, tree> *)a_;
9852 40 : auto *b = (const std::pair<unsigned, tree> *)b_;
9853 40 : return a->first - b->first; /* Lane numbers are small, no overflow concern.  */
9854 : }
9854 :
9855 : /* Return true if USE_STMT is a vector lane insert into VEC and set
9856 : *THIS_LANE to the lane number that is set. If VEC is NULL_TREE it
9857 : is picked up from the BIT_INSERT_EXPRs first operand instead of
9858 : being compared against it. */
9859 :
9860 : static bool
9861 248 : vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
9862 : {
9863 248 : gassign *use_ass = dyn_cast <gassign *> (use_stmt);
9864 91 : if (!use_ass
9865 91 : || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
9866 22 : || (vec
9867 22 : ? gimple_assign_rhs1 (use_ass) != vec
9868 24 : : ((vec = gimple_assign_rhs1 (use_ass)), false))
9869 46 : || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
9870 46 : TREE_TYPE (gimple_assign_rhs2 (use_ass)))
9871 46 : || !constant_multiple_p
9872 46 : (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
9873 92 : tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
9874 : this_lane)) /* Bit position must be an exact multiple of the element size.  */
9875 202 : return false;
9876 : return true;
9877 : }
9876 :
9877 : /* Find any vectorizable constructors and add them to the grouped_store
9878 : array. */
9879 :
9880 : static void
9881 2197543 : vect_slp_check_for_roots (bb_vec_info bb_vinfo)
9882 : {
9883 17788150 : for (unsigned i = 0; i < bb_vinfo->nbbs; ++i)
9884 31181214 : for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
9885 135793589 : !gsi_end_p (gsi); gsi_next (&gsi))
9886 : {
9887 120202982 : gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
9888 : /* This can be used to start SLP discovery for early breaks for BB early breaks
9889 : when we get that far. */
9890 120202982 : if (!assign)
9891 180435275 : continue;
9892 :
9893 30899947 : tree rhs = gimple_assign_rhs1 (assign);
9894 30899947 : enum tree_code code = gimple_assign_rhs_code (assign);
9895 30899947 : use_operand_p use_p;
9896 30899947 : gimple *use_stmt;
/* Case 1: a full-width vector CONSTRUCTOR whose elements are all SSA
   names defined inside the region becomes an slp_inst_kind_ctor root.
   Partially-initialized, nested-vector and uniform CTORs are rejected.  */
9897 30899947 : if (code == CONSTRUCTOR)
9898 : {
9899 1564360 : if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
9900 62281 : || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
9901 91159 : CONSTRUCTOR_NELTS (rhs))
9902 42169 : || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
9903 1606529 : || uniform_vector_p (rhs))
9904 1551909 : continue;
9905 :
9906 : unsigned j;
9907 : tree val;
9908 61225 : FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
9909 48774 : if (TREE_CODE (val) != SSA_NAME
9910 48774 : || !bb_vinfo->lookup_def (val))
9911 : break;
9912 30674 : if (j != CONSTRUCTOR_NELTS (rhs))
9913 2886 : continue;
9914 :
9915 12451 : vec<stmt_vec_info> roots = vNULL;
9916 12451 : roots.safe_push (bb_vinfo->lookup_stmt (assign));
9917 12451 : vec<stmt_vec_info> stmts;
9918 12451 : stmts.create (CONSTRUCTOR_NELTS (rhs));
9919 69216 : FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
9920 44314 : stmts.quick_push
9921 44314 : (vect_stmt_to_vectorize (bb_vinfo->lookup_def (val)));
9922 12451 : bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
9923 12451 : stmts, roots));
9924 : }
/* Case 2: a chain of BIT_INSERT_EXPRs assembling a constant-width
   vector lane by lane, anchored at the lane-zero insert.  Both the
   use chain (forward) and the def chain (backward) are searched since
   the inserts need not appear in lane order.  */
9925 29335587 : else if (code == BIT_INSERT_EXPR
9926 924 : && VECTOR_TYPE_P (TREE_TYPE (rhs))
9927 606 : && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
9928 606 : && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
9929 603 : && integer_zerop (gimple_assign_rhs3 (assign))
9930 341 : && useless_type_conversion_p
9931 341 : (TREE_TYPE (TREE_TYPE (rhs)),
9932 341 : TREE_TYPE (gimple_assign_rhs2 (assign)))
9933 29336209 : && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
9934 : {
9935 : /* We start to match on insert to lane zero but since the
9936 : inserts need not be ordered we'd have to search both
9937 : the def and the use chains. */
9938 215 : tree vectype = TREE_TYPE (rhs);
9939 215 : unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
9940 215 : auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
9941 215 : auto_sbitmap lanes (nlanes);
9942 215 : bitmap_clear (lanes);
9943 215 : bitmap_set_bit (lanes, 0);
9944 215 : tree def = gimple_assign_lhs (assign);
9945 215 : lane_defs.quick_push
9946 215 : (std::make_pair (0, gimple_assign_rhs2 (assign)));
9947 215 : unsigned lanes_found = 1;
9948 : /* Start with the use chains, the last stmt will be the root. */
9949 215 : stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
9950 215 : vec<stmt_vec_info> roots = vNULL;
9951 215 : roots.safe_push (last);
9952 217 : do
9953 : {
9954 217 : use_operand_p use_p;
9955 217 : gimple *use_stmt;
9956 217 : if (!single_imm_use (def, &use_p, &use_stmt))
9957 : break;
9958 211 : unsigned this_lane;
9959 211 : if (!bb_vinfo->lookup_stmt (use_stmt)
9960 211 : || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
9961 233 : || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
9962 : break;
/* A lane set twice means the earlier insert is dead; stop here.  */
9963 22 : if (bitmap_bit_p (lanes, this_lane))
9964 : break;
9965 2 : lanes_found++;
9966 2 : bitmap_set_bit (lanes, this_lane);
9967 2 : gassign *use_ass = as_a <gassign *> (use_stmt);
9968 2 : lane_defs.quick_push (std::make_pair
9969 2 : (this_lane, gimple_assign_rhs2 (use_ass)))
9970 2 : last = bb_vinfo->lookup_stmt (use_ass);
9971 2 : roots.safe_push (last);
9972 2 : def = gimple_assign_lhs (use_ass);
9973 : }
9974 2 : while (lanes_found < nlanes);
/* The last insert in the chain is the SLP root; move it to slot 0.  */
9975 215 : if (roots.length () > 1)
9976 2 : std::swap(roots[0], roots[roots.length () - 1]);
9977 215 : if (lanes_found < nlanes)
9978 : {
9979 : /* Now search the def chain. */
9980 215 : def = gimple_assign_rhs1 (assign);
9981 217 : do
9982 : {
9983 217 : if (TREE_CODE (def) != SSA_NAME
9984 217 : || !has_single_use (def))
9985 : break;
9986 56 : gimple *def_stmt = SSA_NAME_DEF_STMT (def);
9987 56 : unsigned this_lane;
9988 56 : if (!bb_vinfo->lookup_stmt (def_stmt)
9989 37 : || !vect_slp_is_lane_insert (def_stmt,
9990 : NULL_TREE, &this_lane)
9991 80 : || !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
9992 : break;
9993 24 : if (bitmap_bit_p (lanes, this_lane))
9994 : break;
9995 4 : lanes_found++;
9996 4 : bitmap_set_bit (lanes, this_lane);
9997 8 : lane_defs.quick_push (std::make_pair
9998 4 : (this_lane,
9999 4 : gimple_assign_rhs2 (def_stmt)));
10000 4 : roots.safe_push (bb_vinfo->lookup_stmt (def_stmt));
10001 4 : def = gimple_assign_rhs1 (def_stmt);
10002 : }
10003 4 : while (lanes_found < nlanes);
10004 : }
10005 215 : if (lanes_found == nlanes)
10006 : {
10007 : /* Sort lane_defs after the lane index and register the root. */
10008 2 : lane_defs.qsort (vld_cmp);
10009 2 : vec<stmt_vec_info> stmts;
10010 2 : stmts.create (nlanes);
10011 10 : for (unsigned i = 0; i < nlanes; ++i)
10012 8 : stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
10013 2 : bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
10014 2 : stmts, roots));
10015 : }
10016 : else
10017 213 : roots.release ();
10018 215 : }
/* Case 3: the end of an associative (or PLUS/MINUS) scalar operation
   chain; linearize it and record an slp_inst_kind_bb_reduc root.  The
   single_imm_use test ensures ASSIGN is really the chain's last stmt.  */
10019 29335372 : else if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
10020 28371967 : && (associative_tree_code (code) || code == MINUS_EXPR)
10021 : /* ??? This pessimizes a two-element reduction. PR54400.
10022 : ??? In-order reduction could be handled if we only
10023 : traverse one operand chain in vect_slp_linearize_chain. */
10024 33276033 : && !needs_fold_left_reduction_p (TREE_TYPE (rhs), code)
10025 : /* Ops with constants at the tail can be stripped here. */
10026 5803578 : && TREE_CODE (rhs) == SSA_NAME
10027 5744083 : && TREE_CODE (gimple_assign_rhs2 (assign)) == SSA_NAME
10028 : /* Should be the chain end. */
10029 31605333 : && (!single_imm_use (gimple_assign_lhs (assign),
10030 : &use_p, &use_stmt)
10031 1750982 : || !is_gimple_assign (use_stmt)
10032 1191228 : || (gimple_assign_rhs_code (use_stmt) != code
10033 882799 : && ((code != PLUS_EXPR && code != MINUS_EXPR)
10034 500043 : || (gimple_assign_rhs_code (use_stmt)
10035 500043 : != (code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR))))))
10036 : {
10037 : /* We start the match at the end of a possible association
10038 : chain. */
10039 1862917 : auto_vec<chain_op_t> chain;
10040 1862917 : auto_vec<std::pair<tree_code, gimple *> > worklist;
10041 1862917 : auto_vec<gimple *> chain_stmts;
10042 1862917 : gimple *code_stmt = NULL, *alt_code_stmt = NULL;
/* MINUS is handled as PLUS of the negated operand downstream.  */
10043 1862917 : if (code == MINUS_EXPR)
10044 306702 : code = PLUS_EXPR;
10045 1862917 : internal_fn reduc_fn;
10046 2140213 : if (!reduction_fn_for_scalar_code (code, &reduc_fn)
10047 1862917 : || reduc_fn == IFN_LAST)
10048 277296 : continue;
10049 1585621 : vect_slp_linearize_chain (bb_vinfo, worklist, chain, code, assign,
10050 : /* ??? */
10051 : code_stmt, alt_code_stmt, &chain_stmts);
10052 3171242 : if (chain.length () > 1)
10053 : {
10054 : /* Sort the chain according to def_type and operation. */
10055 1585621 : chain.sort (dt_sort_cmp, bb_vinfo);
10056 : /* ??? Now we'd want to strip externals and constants
10057 : but record those to be handled in the epilogue. */
10058 : /* ??? For now do not allow mixing ops or externs/constants. */
10059 1585621 : bool invalid = false;
10060 1585621 : unsigned remain_cnt = 0;
10061 1585621 : unsigned last_idx = 0;
10062 4781609 : for (unsigned i = 0; i < chain.length (); ++i)
10063 : {
10064 3525796 : if (chain[i].code != code)
10065 : {
10066 : invalid = true;
10067 : break;
10068 : }
10069 3195988 : if (chain[i].dt != vect_internal_def
10070 : /* Avoid stmts where the def is not the LHS, like
10071 : ASMs. */
10072 6161971 : || (gimple_get_lhs (bb_vinfo->lookup_def
10073 2965983 : (chain[i].op)->stmt)
10074 2965983 : != chain[i].op))
10075 232949 : remain_cnt++;
10076 : else
10077 : last_idx = i;
10078 : }
10079 : /* Make sure to have an even number of lanes as we later do
10080 : all-or-nothing discovery, not trying to split further. */
10081 1585621 : if ((chain.length () - remain_cnt) & 1)
10082 185443 : remain_cnt++;
10083 1585621 : if (!invalid && chain.length () - remain_cnt > 1)
10084 : {
10085 1187636 : vec<stmt_vec_info> stmts;
10086 1187636 : vec<tree> remain = vNULL;
10087 1187636 : stmts.create (chain.length ());
10088 1187636 : if (remain_cnt > 0)
10089 110141 : remain.create (remain_cnt);
10090 3816764 : for (unsigned i = 0; i < chain.length (); ++i)
10091 : {
10092 2629128 : stmt_vec_info stmt_info;
10093 2629128 : if (chain[i].dt == vect_internal_def
10094 2592312 : && ((stmt_info = bb_vinfo->lookup_def (chain[i].op)),
10095 2592312 : gimple_get_lhs (stmt_info->stmt) == chain[i].op)
10096 5221356 : && (i != last_idx
10097 1187636 : || (stmts.length () & 1)))
10098 2507650 : stmts.quick_push (stmt_info);
10099 : else
10100 121478 : remain.quick_push (chain[i].op);
10101 : }
10102 1187636 : vec<stmt_vec_info> roots;
10103 1187636 : roots.create (chain_stmts.length ());
10104 2629128 : for (unsigned i = 0; i < chain_stmts.length (); ++i)
10105 1441492 : roots.quick_push (bb_vinfo->lookup_stmt (chain_stmts[i]));
10106 1187636 : bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_bb_reduc,
10107 1187636 : stmts, roots, remain));
10108 : }
10109 : }
10110 1862917 : }
10111 : }
10112 2197543 : }
10113 :
10114 : /* Walk the grouped store chains and replace entries with their
10115 : pattern variant if any. */
10116 :
10117 : static void
10118 609003 : vect_fixup_store_groups_with_patterns (vec_info *vinfo)
10119 : {
10120 609003 : stmt_vec_info first_element;
10121 609003 : unsigned i;
10122 :
10123 1491620 : FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
10124 : {
10125 : /* We also have CTORs in this array. */
10126 882617 : if (!STMT_VINFO_GROUPED_ACCESS (first_element))
10127 0 : continue;
/* If the group head was replaced by a pattern stmt, transfer the
   group bookkeeping (size, gap, next link) to the pattern stmt and
   install it as the new head in the grouped_stores array.  */
10128 882617 : if (STMT_VINFO_IN_PATTERN_P (first_element))
10129 : {
10130 254 : stmt_vec_info orig = first_element;
10131 254 : first_element = STMT_VINFO_RELATED_STMT (first_element);
10132 254 : DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
10133 254 : DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
10134 254 : DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
10135 254 : DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
10136 254 : vinfo->grouped_stores[i] = first_element;
10137 : }
/* Walk the remainder of the chain, splicing pattern variants in
   place and re-pointing every element at the (possibly new) head.  */
10138 882617 : stmt_vec_info prev = first_element;
10139 2478507 : while (DR_GROUP_NEXT_ELEMENT (prev))
10140 : {
10141 1595890 : stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
10142 1595890 : if (STMT_VINFO_IN_PATTERN_P (elt))
10143 : {
10144 893 : stmt_vec_info orig = elt;
10145 893 : elt = STMT_VINFO_RELATED_STMT (elt);
10146 893 : DR_GROUP_NEXT_ELEMENT (prev) = elt;
10147 893 : DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
10148 893 : DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
10149 : }
10150 1595890 : DR_GROUP_FIRST_ELEMENT (elt) = first_element;
10151 1595890 : prev = elt;
10152 : }
10153 : }
10154 609003 : }
10155 :
10156 : /* Check if the region described by BB_VINFO can be vectorized, returning
10157 : true if so. When returning false, set FATAL to true if the same failure
10158 : would prevent vectorization at other vector sizes, false if it is still
10159 : worth trying other sizes. N_STMTS is the number of statements in the
10160 : region. */
10161 :
10162 : static bool
10163 2197543 : vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
10164 : vec<int> *dataref_groups)
10165 : {
10166 2197543 : DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
10167 :
10168 2197543 : slp_instance instance;
10169 2197543 : int i;
10170 :
10171 : /* The first group of checks is independent of the vector size. */
10172 2197543 : fatal = true;
10173 :
10174 : /* Analyze the data references. */
10175 :
10176 2197543 : if (!vect_analyze_data_refs (bb_vinfo, NULL))
10177 : {
10178 0 : if (dump_enabled_p ())
10179 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10180 : "not vectorized: unhandled data-ref in basic "
10181 : "block.\n");
10182 0 : return false;
10183 : }
10184 :
10185 2197543 : if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
10186 : {
10187 0 : if (dump_enabled_p ())
10188 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10189 : "not vectorized: unhandled data access in "
10190 : "basic block.\n");
10191 0 : return false;
10192 : }
10193 :
/* Record CTOR / lane-insert / reduction-chain roots for SLP discovery.  */
10194 2197543 : vect_slp_check_for_roots (bb_vinfo);
10195 :
10196 : /* If there are no grouped stores and no constructors in the region
10197 : there is no need to continue with pattern recog as vect_analyze_slp
10198 : will fail anyway. */
10199 2197543 : if (bb_vinfo->grouped_stores.is_empty ()
10200 1856767 : && bb_vinfo->roots.is_empty ())
10201 : {
10202 1588540 : if (dump_enabled_p ())
10203 1022 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10204 : "not vectorized: no grouped stores in "
10205 : "basic block.\n");
10206 1588540 : return false;
10207 : }
10208 :
10209 : /* While the rest of the analysis below depends on it in some way. */
10210 609003 : fatal = false;
10211 :
10212 609003 : vect_pattern_recog (bb_vinfo);
10213 :
10214 : /* Update store groups from pattern processing. */
10215 609003 : vect_fixup_store_groups_with_patterns (bb_vinfo);
10216 :
10217 : /* Check the SLP opportunities in the basic block, analyze and build SLP
10218 : trees. */
10219 609003 : if (!vect_analyze_slp (bb_vinfo, n_stmts, false))
10220 : {
10221 0 : if (dump_enabled_p ())
10222 : {
10223 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10224 : "Failed to SLP the basic block.\n");
10225 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10226 : "not vectorized: failed to find SLP opportunities "
10227 : "in basic block.\n");
10228 : }
10229 0 : return false;
10230 : }
10231 :
10232 : /* Optimize permutations. */
10233 609003 : vect_optimize_slp (bb_vinfo);
10234 :
10235 : /* Gather the loads reachable from the SLP graph entries. */
10236 609003 : vect_gather_slp_loads (bb_vinfo);
10237 :
10238 609003 : vect_record_base_alignments (bb_vinfo);
10239 :
10240 : /* Analyze and verify the alignment of data references and the
10241 : dependence in the SLP instances. */
/* Note: no i++ in the for-header -- the index only advances when an
   instance is kept; removal via ordered_remove shifts the next one in.  */
10242 1390636 : for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
10243 : {
10244 781633 : vect_location = instance->location ();
10245 781633 : if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
10246 781633 : || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
10247 : {
10248 8405 : slp_tree node = SLP_INSTANCE_TREE (instance);
10249 8405 : stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
10250 8405 : if (dump_enabled_p ())
10251 4 : dump_printf_loc (MSG_NOTE, vect_location,
10252 : "removing SLP instance operations starting from: %G",
10253 : stmt_info->stmt);
10254 8405 : vect_free_slp_instance (instance);
10255 8405 : BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
10256 8405 : continue;
10257 8405 : }
10258 :
10259 : /* Mark all the statements that we want to vectorize as pure SLP and
10260 : relevant. */
10261 773228 : vect_mark_slp_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance));
10262 773228 : vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
10263 773228 : unsigned j;
10264 773228 : stmt_vec_info root;
10265 : /* Likewise consider instance root stmts as vectorized. */
10266 1707544 : FOR_EACH_VEC_ELT (SLP_INSTANCE_ROOT_STMTS (instance), j, root)
10267 161088 : STMT_SLP_TYPE (root) = pure_slp;
10268 :
10269 773228 : i++;
10270 : }
10271 2227332 : if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
10272 : return false;
10273 :
10274 263665 : if (!vect_slp_analyze_operations (bb_vinfo))
10275 : {
10276 29789 : if (dump_enabled_p ())
10277 81 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10278 : "not vectorized: bad operation in basic block.\n");
10279 29789 : return false;
10280 : }
10281 :
/* Split the SLP graph into independently-costed subgraphs.  */
10282 233876 : vect_bb_partition_graph (bb_vinfo);
10283 :
10284 233876 : return true;
10285 : }
10286 :
10287 : /* Subroutine of vect_slp_bb. Try to vectorize the statements for all
10288 : basic blocks in BBS, returning true on success.
10289 : The region has N_STMTS statements and has the datarefs given by DATAREFS. */
10290 :
10291 : static bool
10292 1877972 : vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
10293 : vec<int> *dataref_groups, unsigned int n_stmts,
10294 : loop_p orig_loop)
10295 : {
10296 1877972 : bb_vec_info bb_vinfo;
10297 1877972 : auto_vector_modes vector_modes;
10298 :
10299 : /* Autodetect first vector size we try. */
10300 1877972 : machine_mode next_vector_mode = VOIDmode;
10301 1877972 : targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
10302 1877972 : unsigned int mode_i = 0;
10303 :
10304 1877972 : vec_info_shared shared;
10305 :
10306 1877972 : machine_mode autodetected_vector_mode = VOIDmode;
/* Outer loop: retry the whole region analysis once per candidate
   vector mode until we vectorize or run out of modes.  */
10307 2517114 : while (1)
10308 : {
10309 2197543 : bool vectorized = false;
10310 2197543 : bool fatal = false;
10311 2197543 : bb_vinfo = new _bb_vec_info (bbs, &shared);
10312 :
10313 2197543 : bool first_time_p = shared.datarefs.is_empty ();
10314 2197543 : BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
10315 2197543 : if (first_time_p)
10316 1900248 : bb_vinfo->shared->save_datarefs ();
10317 : else
10318 297295 : bb_vinfo->shared->check_datarefs ();
10319 2197543 : bb_vinfo->vector_mode = next_vector_mode;
10320 :
10321 2197543 : if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
10322 : {
10323 233876 : if (dump_enabled_p ())
10324 : {
10325 1502 : dump_printf_loc (MSG_NOTE, vect_location,
10326 : "***** Analysis succeeded with vector mode"
10327 751 : " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
10328 751 : dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
10329 : }
10330 :
10331 233876 : bb_vinfo->shared->check_datarefs ();
10332 :
/* Cost each independent subgraph and keep only the profitable ones.  */
10333 233876 : bool force_clear = false;
10334 233876 : auto_vec<slp_instance> profitable_subgraphs;
10335 1375647 : for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo))
10336 : {
10337 674019 : if (instance->subgraph_entries.is_empty ())
10338 215658 : continue;
10339 :
10340 654875 : dump_user_location_t saved_vect_location = vect_location;
10341 654875 : vect_location = instance->location ();
10342 654875 : if (!unlimited_cost_model (NULL)
10343 1306418 : && !vect_bb_vectorization_profitable_p
10344 651543 : (bb_vinfo, instance->subgraph_entries, orig_loop))
10345 : {
10346 177370 : if (dump_enabled_p ())
10347 28 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10348 : "not vectorized: vectorization is not "
10349 : "profitable.\n");
10350 177370 : vect_location = saved_vect_location;
10351 177370 : continue;
10352 : }
10353 :
10354 477505 : vect_location = saved_vect_location;
10355 477505 : if (!dbg_cnt (vect_slp))
10356 : {
10357 0 : force_clear = true;
10358 0 : continue;
10359 : }
10360 :
10361 477505 : profitable_subgraphs.safe_push (instance);
10362 : }
10363 :
10364 : /* When we're vectorizing an if-converted loop body make sure
10365 : we vectorized all if-converted code. */
10366 391766 : if ((!profitable_subgraphs.is_empty () || force_clear) && orig_loop)
10367 : {
10368 97 : gcc_assert (bb_vinfo->nbbs == 1);
10369 194 : for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]);
10370 4084 : !gsi_end_p (gsi); gsi_next (&gsi))
10371 : {
10372 : /* The costing above left us with DCEable vectorized scalar
10373 : stmts having the visited flag set on profitable
10374 : subgraphs. Do the delayed clearing of the flag here. */
10375 3987 : if (gimple_visited_p (gsi_stmt (gsi)))
10376 : {
10377 1172 : gimple_set_visited (gsi_stmt (gsi), false);
10378 1172 : continue;
10379 : }
10380 2815 : if (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED)
10381 813 : continue;
10382 :
/* A remaining un-vectorized COND_EXPR means if-conversion scalar
   code would survive; give up on all subgraphs in that case.  */
10383 5859 : if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi)))
10384 2450 : if (gimple_assign_rhs_code (ass) == COND_EXPR)
10385 : {
10386 51 : if (!profitable_subgraphs.is_empty ()
10387 22 : && dump_enabled_p ())
10388 0 : dump_printf_loc (MSG_NOTE, vect_location,
10389 : "not profitable because of "
10390 : "unprofitable if-converted scalar "
10391 : "code\n");
10392 29 : profitable_subgraphs.truncate (0);
10393 : }
10394 : }
10395 : }
10396 :
10397 : /* Finally schedule the profitable subgraphs. */
10398 1027129 : for (slp_instance instance : profitable_subgraphs)
10399 : {
10400 477473 : if (!vectorized && dump_enabled_p ())
10401 726 : dump_printf_loc (MSG_NOTE, vect_location,
10402 : "Basic block will be vectorized "
10403 : "using SLP\n");
10404 477473 : vectorized = true;
10405 :
10406 : /* Dump before scheduling as store vectorization will remove
10407 : the original stores and mess with the instance tree
10408 : so querying its location will eventually ICE. */
10409 477473 : if (flag_checking)
10410 1920263 : for (slp_instance sub : instance->subgraph_entries)
10411 487844 : gcc_assert (SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub)));
10412 477473 : unsigned HOST_WIDE_INT bytes;
10413 477473 : if (dump_enabled_p ())
10414 3457 : for (slp_instance sub : instance->subgraph_entries)
10415 : {
10416 916 : tree vtype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub));
10417 1832 : if (GET_MODE_SIZE (TYPE_MODE (vtype)).is_constant (&bytes))
10418 916 : dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
10419 916 : sub->location (),
10420 : "basic block part vectorized using %wu "
10421 : "byte vectors\n", bytes);
10422 : else
10423 : dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
10424 : sub->location (),
10425 : "basic block part vectorized using "
10426 : "variable length vectors\n");
10427 : }
10428 :
10429 477473 : dump_user_location_t saved_vect_location = vect_location;
10430 477473 : vect_location = instance->location ();
10431 :
10432 477473 : vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
10433 :
10434 477473 : vect_location = saved_vect_location;
10435 : }
10436 :
10437 :
10438 : /* Generate the invariant statements. */
10439 233876 : if (!gimple_seq_empty_p (bb_vinfo->inv_pattern_def_seq))
10440 : {
10441 23 : if (dump_enabled_p ())
10442 0 : dump_printf_loc (MSG_NOTE, vect_location,
10443 : "------>generating invariant statements\n");
10444 :
10445 23 : bb_vinfo->insert_seq_on_entry (NULL,
10446 : bb_vinfo->inv_pattern_def_seq);
10447 : }
10448 233876 : }
10449 : else
10450 : {
10451 1963667 : if (dump_enabled_p ())
10452 1314 : dump_printf_loc (MSG_NOTE, vect_location,
10453 : "***** Analysis failed with vector mode %s\n",
10454 1314 : GET_MODE_NAME (bb_vinfo->vector_mode));
10455 : }
10456 :
10457 2197543 : if (mode_i == 0)
10458 1877972 : autodetected_vector_mode = bb_vinfo->vector_mode;
10459 :
/* Skip candidate modes that would pick exactly the same vector types.  */
10460 2197543 : if (!fatal)
10461 3139013 : while (mode_i < vector_modes.length ()
10462 1751329 : && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
10463 : {
10464 332467 : if (dump_enabled_p ())
10465 1654 : dump_printf_loc (MSG_NOTE, vect_location,
10466 : "***** The result for vector mode %s would"
10467 : " be the same\n",
10468 827 : GET_MODE_NAME (vector_modes[mode_i]));
10469 332467 : mode_i += 1;
10470 : }
10471 :
10472 2197543 : delete bb_vinfo;
10473 :
/* Also skip a mode that is mutually related to the autodetected one,
   i.e. would merely repeat the analysis already done.  */
10474 2197543 : if (mode_i < vector_modes.length ()
10475 2020865 : && VECTOR_MODE_P (autodetected_vector_mode)
10476 1997372 : && (related_vector_mode (vector_modes[mode_i],
10477 : GET_MODE_INNER (autodetected_vector_mode))
10478 998686 : == autodetected_vector_mode)
10479 4218408 : && (related_vector_mode (autodetected_vector_mode,
10480 520833 : GET_MODE_INNER (vector_modes[mode_i]))
10481 1041666 : == vector_modes[mode_i]))
10482 : {
10483 520833 : if (dump_enabled_p ())
10484 205 : dump_printf_loc (MSG_NOTE, vect_location,
10485 : "***** Skipping vector mode %s, which would"
10486 : " repeat the analysis for %s\n",
10487 205 : GET_MODE_NAME (vector_modes[mode_i]),
10488 205 : GET_MODE_NAME (autodetected_vector_mode));
10489 520833 : mode_i += 1;
10490 : }
10491 :
10492 2197543 : if (vectorized
10493 2039675 : || mode_i == vector_modes.length ()
10494 1863044 : || autodetected_vector_mode == VOIDmode
10495 : /* If vect_slp_analyze_bb_1 signaled that analysis for all
10496 : vector sizes will fail do not bother iterating. */
10497 3038408 : || fatal)
10498 3755944 : return vectorized;
10499 :
10500 : /* Try the next biggest vector size. */
10501 319571 : next_vector_mode = vector_modes[mode_i++];
10502 319571 : if (dump_enabled_p ())
10503 218 : dump_printf_loc (MSG_NOTE, vect_location,
10504 : "***** Re-trying analysis with vector mode %s\n",
10505 218 : GET_MODE_NAME (next_vector_mode));
10506 319571 : }
10507 1877972 : }
10508 :
10509 :
10510 : /* Main entry for the BB vectorizer. Analyze and transform BBS, returns
10511 : true if anything in the basic-block was vectorized. */
10512 :
10513 : static bool
10514 1877972 : vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop)
10515 : {
10516 1877972 : vec<data_reference_p> datarefs = vNULL;
10517 1877972 : auto_vec<int> dataref_groups;
10518 1877972 : int insns = 0;
10519 1877972 : int current_group = 0;
10520 :
10521 12545866 : for (unsigned i = 0; i < bbs.length (); i++)
10522 : {
10523 10667894 : basic_block bb = bbs[i];
10524 88915180 : for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
10525 78247286 : gsi_next (&gsi))
10526 : {
10527 78247286 : gimple *stmt = gsi_stmt (gsi);
10528 78247286 : if (is_gimple_debug (stmt))
10529 48554624 : continue;
10530 :
10531 29692662 : insns++;
10532 :
10533 29692662 : if (gimple_location (stmt) != UNKNOWN_LOCATION)
10534 26690447 : vect_location = stmt;
10535 :
10536 29692662 : if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
10537 : &dataref_groups, current_group))
10538 5095393 : ++current_group;
10539 : }
10540 : /* New BBs always start a new DR group. */
10541 10667894 : ++current_group;
10542 : }
10543 :
10544 1877972 : return vect_slp_region (bbs, datarefs, &dataref_groups, insns, orig_loop);
10545 1877972 : }
10546 :
10547 : /* Special entry for the BB vectorizer. Analyze and transform a single
10548 : if-converted BB with ORIG_LOOPs body being the not if-converted
10549 : representation. Returns true if anything in the basic-block was
10550 : vectorized. */
10551 :
10552 : bool
10553 19383 : vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop)
10554 : {
10555 19383 : auto_vec<basic_block> bbs;
10556 19383 : bbs.safe_push (bb);
10557 19383 : return vect_slp_bbs (bbs, orig_loop);
10558 19383 : }
10559 :
10560 : /* Main entry for the BB vectorizer. Analyze and transform BB, returns
10561 : true if anything in the basic-block was vectorized. */
10562 :
10563 : bool
10564 909169 : vect_slp_function (function *fun)
10565 : {
10566 909169 : bool r = false;
10567 909169 : int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
10568 909169 : auto_bitmap exit_bbs;
10569 909169 : bitmap_set_bit (exit_bbs, EXIT_BLOCK);
10570 909169 : edge entry = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (fun));
10571 909169 : unsigned n = rev_post_order_and_mark_dfs_back_seme (fun, entry, exit_bbs,
10572 909169 : true, rpo, NULL);
10573 :
10574 : /* For the moment split the function into pieces to avoid making
10575 : the iteration on the vector mode moot. Split at points we know
10576 : to not handle well which is CFG merges (SLP discovery doesn't
10577 : handle non-loop-header PHIs) and loop exits. Since pattern
10578 : recog requires reverse iteration to visit uses before defs
10579 : simply chop RPO into pieces. */
10580 909169 : auto_vec<basic_block> bbs;
10581 11588604 : for (unsigned i = 0; i < n; i++)
10582 : {
10583 10679435 : basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
10584 10679435 : bool split = false;
10585 :
10586 : /* Split when a BB is not dominated by the first block. */
10587 20143687 : if (!bbs.is_empty ()
10588 9464252 : && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
10589 : {
10590 663332 : if (dump_enabled_p ())
10591 146 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10592 : "splitting region at dominance boundary bb%d\n",
10593 : bb->index);
10594 : split = true;
10595 : }
10596 : /* Split when the loop determined by the first block
10597 : is exited. This is because we eventually insert
10598 : invariants at region begin. */
10599 18817023 : else if (!bbs.is_empty ()
10600 8800920 : && bbs[0]->loop_father != bb->loop_father
10601 2286617 : && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
10602 : {
10603 3731 : if (dump_enabled_p ())
10604 6 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10605 : "splitting region at loop %d exit at bb%d\n",
10606 3 : bbs[0]->loop_father->num, bb->index);
10607 : split = true;
10608 : }
/* Also split when entering a loop that is marked dont_vectorize.  */
10609 10012372 : else if (!bbs.is_empty ()
10610 8797189 : && bb->loop_father->header == bb
10611 473774 : && bb->loop_father->dont_vectorize)
10612 : {
10613 7267 : if (dump_enabled_p ())
10614 72 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10615 : "splitting region at dont-vectorize loop %d "
10616 : "entry at bb%d\n",
10617 : bb->loop_father->num, bb->index);
10618 : split = true;
10619 : }
10620 :
/* Flush the accumulated region before starting a new one.  */
10621 11353765 : if (split && !bbs.is_empty ())
10622 : {
10623 674330 : r |= vect_slp_bbs (bbs, NULL);
10624 674330 : bbs.truncate (0);
10625 : }
10626 :
10627 10679435 : if (bbs.is_empty ())
10628 : {
10629 : /* We need to be able to insert at the head of the region which
10630 : we cannot for region starting with a returns-twice call. */
10631 1889513 : if (gcall *first = safe_dyn_cast <gcall *> (first_stmt (bb)))
10632 400750 : if (gimple_call_flags (first) & ECF_RETURNS_TWICE)
10633 : {
10634 301 : if (dump_enabled_p ())
10635 2 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10636 : "skipping bb%d as start of region as it "
10637 : "starts with returns-twice call\n",
10638 : bb->index);
10639 30924 : continue;
10640 : }
10641 : /* If the loop this BB belongs to is marked as not to be vectorized
10642 : honor that also for BB vectorization. */
10643 1889212 : if (bb->loop_father->dont_vectorize)
10644 30623 : continue;
10645 : }
10646 :
10647 10648511 : bbs.safe_push (bb);
10648 :
10649 : /* When we have a stmt ending this block and defining a
10650 : value we have to insert on edges when inserting after it for
10651 : a vector containing its definition. Avoid this for now. */
10652 21297022 : if (gimple *last = *gsi_last_bb (bb))
10653 8613944 : if (gimple_get_lhs (last)
10654 8613944 : && is_ctrl_altering_stmt (last))
10655 : {
10656 275097 : if (dump_enabled_p ())
10657 2 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10658 : "splitting region at control altering "
10659 : "definition %G", last);
10660 275097 : r |= vect_slp_bbs (bbs, NULL);
10661 275097 : bbs.truncate (0);
10662 : }
10663 : }
10664 :
/* Process the final pending region, if any.  */
10665 909169 : if (!bbs.is_empty ())
10666 909162 : r |= vect_slp_bbs (bbs, NULL);
10667 :
10668 909169 : free (rpo);
10669 :
10670 909169 : return r;
10671 909169 : }
10672 :
/* Build a variable-length vector in which the elements in ELTS are repeated
   to fill NRESULTS vectors of type VECTOR_TYPE.  Store the vectors in
   RESULTS and add any new instructions to SEQ.

   The approach we use is:

   (1) Find a vector mode VM with integer elements of mode IM.

   (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
       ELTS' has mode IM.  This involves creating NELTS' VIEW_CONVERT_EXPRs
       from small vectors to IM.

   (3) Duplicate each ELTS'[I] into a vector of mode VM.

   (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
       correct byte contents.

   (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.

   We try to find the largest IM for which this sequence works, in order
   to cut down on the number of interleaves.  */

void
duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
			  const vec<tree> &elts, unsigned int nresults,
			  vec<tree> &results)
{
  unsigned int nelts = elts.length ();
  tree element_type = TREE_TYPE (vector_type);

  /* (1) Find a vector mode VM with integer elements of mode IM.
     Callers are expected to have checked can_duplicate_and_interleave_p
     already, hence the gcc_unreachable on failure.  */
  unsigned int nvectors = 1;
  tree new_vector_type;
  tree permutes[2];
  if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
				       &nvectors, &new_vector_type,
				       permutes))
    gcc_unreachable ();

  /* Get a vector type that holds ELTS[0:NELTS/NELTS'].  */
  unsigned int partial_nelts = nelts / nvectors;
  tree partial_vector_type = build_vector_type (element_type, partial_nelts);

  /* PIECES holds the current working set of vectors; the first NVECTORS
     entries are inputs, the second NVECTORS entries outputs, and the two
     halves are swapped after each interleaving pass.  */
  tree_vector_builder partial_elts;
  auto_vec<tree, 32> pieces (nvectors * 2);
  pieces.quick_grow_cleared (nvectors * 2);
  for (unsigned int i = 0; i < nvectors; ++i)
    {
      /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
	 ELTS' has mode IM.  */
      partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
      for (unsigned int j = 0; j < partial_nelts; ++j)
	partial_elts.quick_push (elts[i * partial_nelts + j]);
      tree t = gimple_build_vector (seq, &partial_elts);
      t = gimple_build (seq, VIEW_CONVERT_EXPR,
			TREE_TYPE (new_vector_type), t);

      /* (3) Duplicate each ELTS'[I] into a vector of mode VM.  */
      pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
    }

  /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
     correct byte contents.

     Conceptually, we need to repeat the following operation log2(nvectors)
     times, where hi_start = nvectors / 2:

	out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
	out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);

     However, if each input repeats every N elements and the VF is
     a multiple of N * 2, the HI result is the same as the LO result.
     This will be true for the first N1 iterations of the outer loop,
     followed by N2 iterations for which both the LO and HI results
     are needed.  I.e.:

	N1 + N2 = log2(nvectors)

     Each "N1 iteration" doubles the number of redundant vectors and the
     effect of the process as a whole is to have a sequence of nvectors/2**N1
     vectors that repeats 2**N1 times.  Rather than generate these redundant
     vectors, we halve the number of vectors for each N1 iteration.  */
  unsigned int in_start = 0;
  unsigned int out_start = nvectors;
  unsigned int new_nvectors = nvectors;
  for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
    {
      unsigned int hi_start = new_nvectors / 2;
      unsigned int out_i = 0;
      for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i)
	{
	  /* Skip the HI half when it would be identical to the LO half
	     (the redundancy described above).  */
	  if ((in_i & 1) != 0
	      && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
			     2 * in_repeat))
	    continue;

	  tree output = make_ssa_name (new_vector_type);
	  tree input1 = pieces[in_start + (in_i / 2)];
	  tree input2 = pieces[in_start + (in_i / 2) + hi_start];
	  gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
					       input1, input2,
					       permutes[in_i & 1]);
	  gimple_seq_add_stmt (seq, stmt);
	  pieces[out_start + out_i] = output;
	  out_i += 1;
	}
      std::swap (in_start, out_start);
      new_nvectors = out_i;
    }

  /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type.
     Results beyond NEW_NVECTORS repeat earlier ones.  */
  results.reserve (nresults);
  for (unsigned int i = 0; i < nresults; ++i)
    if (i < new_nvectors)
      results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
					pieces[in_start + i]));
    else
      results.quick_push (results[i - new_nvectors]);
}
10792 :
10793 :
/* For constant and loop invariant defs in OP_NODE this function creates
   vector defs that will be used in the vectorized stmts and stores them
   to SLP_TREE_VEC_DEFS of OP_NODE.  */

static void
vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
{
  unsigned HOST_WIDE_INT nunits;
  tree vec_cst;
  unsigned j, number_of_places_left_in_vector;
  tree vector_type;
  tree vop;
  int group_size = op_node->ops.length ();
  unsigned int vec_num, i;
  unsigned number_of_copies = 1;
  bool constant_p;
  gimple_seq ctor_seq = NULL;
  auto_vec<tree, 16> permute_results;

  /* We always want SLP_TREE_VECTYPE (op_node) here correctly set.  */
  vector_type = SLP_TREE_VECTYPE (op_node);

  unsigned int number_of_vectors = vect_get_num_copies (vinfo, op_node);
  SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
  auto_vec<tree> voprnds (number_of_vectors);

  /* NUMBER_OF_COPIES is the number of times we need to use the same values in
     created vectors.  It is greater than 1 if unrolling is performed.

     For example, we have two scalar operands, s1 and s2 (e.g., group of
     strided accesses of size two), while NUNITS is four (i.e., four scalars
     of this type can be packed in a vector).  The output vector will contain
     two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
     will be 2).

     If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
     containing the operands.

     For example, NUNITS is four as before, and the group size is 8
     (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
     {s5, s6, s7, s8}.  */

  /* When using duplicate_and_interleave, we just need one element for
     each scalar statement.  */
  if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
    nunits = group_size;

  number_of_copies = nunits * number_of_vectors / group_size;

  number_of_places_left_in_vector = nunits;
  constant_p = true;
  /* UNIFORM_ELT tracks whether all elements pushed into the current vector
     so far are equal; if they are we can emit a cheaper splat below.  */
  tree uniform_elt = NULL_TREE;
  tree_vector_builder elts (vector_type, nunits, 1);
  elts.quick_grow (nunits);
  stmt_vec_info insert_after = NULL;
  for (j = 0; j < number_of_copies; j++)
    {
      tree op;
      /* Iterate the group in reverse so that, with vectors also collected
	 in reverse (into VOPRNDS), lanes end up in source order.  */
      for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
	{
	  /* Create 'vect_ = {op0,op1,...,opn}'.  */
	  tree orig_op = op;
	  if (number_of_places_left_in_vector == nunits)
	    uniform_elt = op;
	  else if (uniform_elt && operand_equal_p (uniform_elt, op))
	    op = elts[number_of_places_left_in_vector];
	  else
	    uniform_elt = NULL_TREE;
	  number_of_places_left_in_vector--;
	  if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
	    {
	      if (CONSTANT_CLASS_P (op))
		{
		  if (VECTOR_BOOLEAN_TYPE_P (vector_type))
		    {
		      /* Can't use VIEW_CONVERT_EXPR for booleans because
			 of possibly different sizes of scalar value and
			 vector element.  */
		      if (integer_zerop (op))
			op = build_int_cst (TREE_TYPE (vector_type), 0);
		      else if (integer_onep (op))
			op = build_all_ones_cst (TREE_TYPE (vector_type));
		      else
			gcc_unreachable ();
		    }
		  else
		    op = fold_unary (VIEW_CONVERT_EXPR,
				     TREE_TYPE (vector_type), op);
		  gcc_assert (op && CONSTANT_CLASS_P (op));
		}
	      else
		{
		  /* Non-constant mismatched operand: emit an explicit
		     conversion statement into CTOR_SEQ.  */
		  tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
		  gimple *init_stmt;
		  if (VECTOR_BOOLEAN_TYPE_P (vector_type))
		    {
		      tree true_val
			= build_all_ones_cst (TREE_TYPE (vector_type));
		      tree false_val
			= build_zero_cst (TREE_TYPE (vector_type));
		      gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
		      init_stmt = gimple_build_assign (new_temp, COND_EXPR,
						       op, true_val,
						       false_val);
		    }
		  else
		    {
		      op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
				   op);
		      init_stmt
			= gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
					       op);
		    }
		  gimple_seq_add_stmt (&ctor_seq, init_stmt);
		  op = new_temp;
		}
	    }
	  elts[number_of_places_left_in_vector] = op;
	  if (!CONSTANT_CLASS_P (op))
	    constant_p = false;
	  /* For BB vectorization we have to compute an insert location
	     when a def is inside the analyzed region since we cannot
	     simply insert at the BB start in this case.  */
	  stmt_vec_info opdef;
	  if (TREE_CODE (orig_op) == SSA_NAME
	      && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
	      && is_a <bb_vec_info> (vinfo)
	      && (opdef = vinfo->lookup_def (orig_op)))
	    {
	      if (!insert_after)
		insert_after = opdef;
	      else
		insert_after = get_later_stmt (insert_after, opdef);
	    }

	  if (number_of_places_left_in_vector == 0)
	    {
	      /* The current vector is full: materialize it.  */
	      auto type_nunits = TYPE_VECTOR_SUBPARTS (vector_type);
	      if (uniform_elt)
		vec_cst = gimple_build_vector_from_val (&ctor_seq, vector_type,
							elts[0]);
	      else if (constant_p
		       ? multiple_p (type_nunits, nunits)
		       : known_eq (type_nunits, nunits))
		vec_cst = gimple_build_vector (&ctor_seq, &elts);
	      else
		{
		  /* Variable-length vector that a single CONSTRUCTOR cannot
		     express: fall back to duplicate_and_interleave, computing
		     all NUMBER_OF_VECTORS results once and indexing them.  */
		  if (permute_results.is_empty ())
		    duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
					      elts, number_of_vectors,
					      permute_results);
		  vec_cst = permute_results[number_of_vectors - j - 1];
		}
	      if (!gimple_seq_empty_p (ctor_seq))
		{
		  if (insert_after)
		    {
		      gimple_stmt_iterator gsi;
		      if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
			{
			  /* Cannot insert after a PHI; insert at the start
			     of its block instead.  */
			  gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
			  gsi_insert_seq_before (&gsi, ctor_seq,
						 GSI_CONTINUE_LINKING);
			}
		      else if (!stmt_ends_bb_p (insert_after->stmt))
			{
			  gsi = gsi_for_stmt (insert_after->stmt);
			  gsi_insert_seq_after (&gsi, ctor_seq,
						GSI_CONTINUE_LINKING);
			}
		      else
			{
			  /* When we want to insert after a def where the
			     defining stmt throws then insert on the fallthru
			     edge.  */
			  edge e = find_fallthru_edge
				     (gimple_bb (insert_after->stmt)->succs);
			  basic_block new_bb
			    = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
			  gcc_assert (!new_bb);
			}
		    }
		  else
		    vinfo->insert_seq_on_entry (NULL, ctor_seq);
		  ctor_seq = NULL;
		}
	      voprnds.quick_push (vec_cst);
	      insert_after = NULL;
	      /* Reset the per-vector state for the next vector.  */
	      number_of_places_left_in_vector = nunits;
	      constant_p = true;
	      elts.new_vector (vector_type, nunits, 1);
	      elts.quick_grow (nunits);
	    }
	}
    }

  /* Since the vectors are created in the reverse order, we should invert
     them.  */
  vec_num = voprnds.length ();
  for (j = vec_num; j != 0; j--)
    {
      vop = voprnds[j - 1];
      SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
    }

  /* In case that VF is greater than the unrolling factor needed for the SLP
     group of stmts, NUMBER_OF_VECTORS to be created is greater than
     NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
     to replicate the vectors.  */
  while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
    for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
	 i++)
      SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
}
11008 :
11009 : /* Get the scalar definition of the Nth lane from SLP_NODE or NULL_TREE
11010 : if there is no definition for it in the scalar IL or it is not known. */
11011 :
11012 : tree
11013 1909 : vect_get_slp_scalar_def (slp_tree slp_node, unsigned n)
11014 : {
11015 1909 : if (SLP_TREE_DEF_TYPE (slp_node) == vect_internal_def)
11016 : {
11017 1899 : if (!SLP_TREE_SCALAR_STMTS (slp_node).exists ())
11018 : return NULL_TREE;
11019 1899 : stmt_vec_info def = SLP_TREE_SCALAR_STMTS (slp_node)[n];
11020 1899 : if (!def)
11021 : return NULL_TREE;
11022 1899 : return gimple_get_lhs (STMT_VINFO_STMT (def));
11023 : }
11024 : else
11025 10 : return SLP_TREE_SCALAR_OPS (slp_node)[n];
11026 : }
11027 :
11028 : /* Get the Ith vectorized definition from SLP_NODE. */
11029 :
11030 : tree
11031 145845 : vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
11032 : {
11033 145845 : return SLP_TREE_VEC_DEFS (slp_node)[i];
11034 : }
11035 :
11036 : /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
11037 :
11038 : void
11039 925642 : vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
11040 : {
11041 1851284 : vec_defs->create (SLP_TREE_VEC_DEFS (slp_node).length ());
11042 925642 : vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
11043 925642 : }
11044 :
11045 : /* Get N vectorized definitions for SLP_NODE. */
11046 :
11047 : void
11048 2955 : vect_get_slp_defs (vec_info *,
11049 : slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
11050 : {
11051 2955 : if (n == -1U)
11052 2955 : n = SLP_TREE_CHILDREN (slp_node).length ();
11053 :
11054 10648 : for (unsigned i = 0; i < n; ++i)
11055 : {
11056 7693 : slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
11057 7693 : vec<tree> vec_defs = vNULL;
11058 7693 : vect_get_slp_defs (child, &vec_defs);
11059 7693 : vec_oprnds->quick_push (vec_defs);
11060 : }
11061 2955 : }
11062 :
/* A subroutine of vect_transform_slp_perm_load with two extra arguments:
   - PERM gives the permutation that the caller wants to use for NODE,
     which might be different from SLP_LOAD_PERMUTATION.
   - DUMP_P controls whether the function dumps information.  */

static bool
vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node,
				load_permutation_t &perm,
				const vec<tree> &dr_chain,
				gimple_stmt_iterator *gsi, poly_uint64 vf,
				bool analyze_only, bool dump_p,
				unsigned *n_perms, unsigned int *n_loads,
				bool dce_chain)
{
  stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
  int vec_index = 0;
  tree vectype = SLP_TREE_VECTYPE (node);
  unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
  unsigned int mask_element;
  unsigned dr_group_size;
  machine_mode mode;

  if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
    {
      /* We have both splats of the same non-grouped load and groups
	 of distinct invariant loads entering here.  Derive the group
	 size from the largest index used by the permutation.  */
      unsigned max_idx = 0;
      for (auto idx : perm)
	max_idx = idx > max_idx ? idx : max_idx;
      dr_group_size = max_idx + 1;
    }
  else
    {
      stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
      dr_group_size = DR_GROUP_SIZE (stmt_info);
    }

  mode = TYPE_MODE (vectype);
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
  unsigned int nstmts = vect_get_num_copies (vinfo, node);

  /* Initialize the vect stmts of NODE to properly insert the generated
     stmts later.  */
  if (! analyze_only)
    for (unsigned i = SLP_TREE_VEC_DEFS (node).length (); i < nstmts; i++)
      SLP_TREE_VEC_DEFS (node).quick_push (NULL_TREE);

  /* Generate permutation masks for every NODE.  Number of masks for each NODE
     is equal to GROUP_SIZE.
     E.g., we have a group of three nodes with three loads from the same
     location in each node, and the vector size is 4.  I.e., we have an
     a0b0c0a1b1c1... sequence and we need to create the following vectors:
     for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
     for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
     ...

     The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
     The last mask is illegal since we assume two operands for permute
     operation, and the mask element values can't be outside that range.
     Hence, the last mask must be converted into {2,5,5,5}.
     For the first two permutations we need the first and the second input
     vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
     we need the second and the third vectors: {b1,c1,a2,b2} and
     {c2,a3,b3,c3}.  */

  int vect_stmts_counter = 0;
  unsigned int index = 0;
  int first_vec_index = -1;
  int second_vec_index = -1;
  bool noop_p = true;
  *n_perms = 0;

  vec_perm_builder mask;
  unsigned int nelts_to_build;
  unsigned int nvectors_per_build;
  unsigned int in_nlanes;
  bool repeating_p = (group_size == dr_group_size
		      && multiple_p (nunits, group_size));
  if (repeating_p)
    {
      /* A single vector contains a whole number of copies of the node, so:
	 (a) all permutes can use the same mask; and
	 (b) the permutes only need a single vector input.  */
      mask.new_vector (nunits, group_size, 3);
      nelts_to_build = mask.encoded_nelts ();
      /* It's possible to obtain zero nstmts during analyze_only, so make
	 it at least one to ensure the later computation for n_perms
	 proceed.  */
      nvectors_per_build = nstmts > 0 ? nstmts : 1;
      in_nlanes = dr_group_size * 3;
    }
  else
    {
      /* We need to construct a separate mask for each vector statement.
	 This requires NUNITS and VF to be compile-time constants.  */
      unsigned HOST_WIDE_INT const_nunits, const_vf;
      if (!nunits.is_constant (&const_nunits)
	  || !vf.is_constant (&const_vf))
	return false;
      mask.new_vector (const_nunits, const_nunits, 1);
      nelts_to_build = const_vf * group_size;
      nvectors_per_build = 1;
      in_nlanes = const_vf * dr_group_size;
    }
  auto_sbitmap used_in_lanes (in_nlanes);
  bitmap_clear (used_in_lanes);
  /* USED_DEFS records which DR_CHAIN entries were consumed, so unused
     loads can be removed when DCE_CHAIN is set.  */
  auto_bitmap used_defs;

  unsigned int count = mask.encoded_nelts ();
  mask.quick_grow (count);
  vec_perm_indices indices;

  for (unsigned int j = 0; j < nelts_to_build; j++)
    {
      unsigned int iter_num = j / group_size;
      unsigned int stmt_num = j % group_size;
      unsigned int i = (iter_num * dr_group_size + perm[stmt_num]);
      bitmap_set_bit (used_in_lanes, i);
      if (repeating_p)
	{
	  first_vec_index = 0;
	  mask_element = i;
	}
      else
	{
	  /* Enforced before the loop when !repeating_p.  */
	  unsigned int const_nunits = nunits.to_constant ();
	  vec_index = i / const_nunits;
	  mask_element = i % const_nunits;
	  if (vec_index == first_vec_index
	      || first_vec_index == -1)
	    {
	      first_vec_index = vec_index;
	    }
	  else if (vec_index == second_vec_index
		   || second_vec_index == -1)
	    {
	      second_vec_index = vec_index;
	      mask_element += const_nunits;
	    }
	  else
	    {
	      /* A VEC_PERM_EXPR has only two inputs; a mask needing a
		 third source vector cannot be handled.  */
	      if (dump_p)
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "permutation requires at "
				 "least three vectors %G",
				 stmt_info->stmt);
	      gcc_assert (analyze_only);
	      return false;
	    }

	  gcc_assert (mask_element < 2 * const_nunits);
	}

      if (mask_element != index)
	noop_p = false;
      mask[index++] = mask_element;

      if (index == count)
	{
	  /* A full mask has been accumulated; emit (or cost) it.  */
	  if (!noop_p)
	    {
	      indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
	      if (!can_vec_perm_const_p (mode, mode, indices))
		{
		  if (dump_p)
		    {
		      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				       "unsupported vect permute { ");
		      for (i = 0; i < count; ++i)
			{
			  dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
			  dump_printf (MSG_MISSED_OPTIMIZATION, " ");
			}
		      dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
		    }
		  gcc_assert (analyze_only);
		  return false;
		}

	      tree mask_vec = NULL_TREE;
	      if (!analyze_only)
		mask_vec = vect_gen_perm_mask_checked (vectype, indices);

	      if (second_vec_index == -1)
		second_vec_index = first_vec_index;

	      for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
		{
		  ++*n_perms;
		  if (analyze_only)
		    continue;
		  /* Generate the permute statement if necessary.  */
		  tree first_vec = dr_chain[first_vec_index + ri];
		  tree second_vec = dr_chain[second_vec_index + ri];
		  gassign *stmt = as_a<gassign *> (stmt_info->stmt);
		  tree perm_dest
		    = vect_create_destination_var (gimple_assign_lhs (stmt),
						   vectype);
		  perm_dest = make_ssa_name (perm_dest);
		  gimple *perm_stmt
		    = gimple_build_assign (perm_dest, VEC_PERM_EXPR, first_vec,
					   second_vec, mask_vec);
		  vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
					       gsi);
		  if (dce_chain)
		    {
		      bitmap_set_bit (used_defs, first_vec_index + ri);
		      bitmap_set_bit (used_defs, second_vec_index + ri);
		    }

		  /* Store the vector statement in NODE.  */
		  SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = perm_dest;
		}
	    }
	  else if (!analyze_only)
	    {
	      for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
		{
		  tree first_vec = dr_chain[first_vec_index + ri];
		  /* If mask was NULL_TREE generate the requested
		     identity transform.  */
		  if (dce_chain)
		    bitmap_set_bit (used_defs, first_vec_index + ri);

		  /* Store the vector statement in NODE.  */
		  SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = first_vec;
		}
	    }

	  /* Reset the per-mask state for the next mask.  */
	  index = 0;
	  first_vec_index = -1;
	  second_vec_index = -1;
	  noop_p = true;
	}
    }

  if (n_loads)
    {
      if (repeating_p)
	*n_loads = nstmts;
      else
	{
	  /* Enforced above when !repeating_p.  */
	  unsigned int const_nunits = nunits.to_constant ();
	  *n_loads = 0;
	  /* Count one load per vector-sized chunk of lanes that has at
	     least one used lane.  */
	  bool load_seen = false;
	  for (unsigned i = 0; i < in_nlanes; ++i)
	    {
	      if (i % const_nunits == 0)
		{
		  if (load_seen)
		    *n_loads += 1;
		  load_seen = false;
		}
	      if (bitmap_bit_p (used_in_lanes, i))
		load_seen = true;
	    }
	  if (load_seen)
	    *n_loads += 1;
	}
    }

  if (dce_chain)
    for (unsigned i = 0; i < dr_chain.length (); ++i)
      if (!bitmap_bit_p (used_defs, i))
	{
	  /* Remove the unused load, walking through view-converts and
	     constructors feeding it.  */
	  tree def = dr_chain[i];
	  do
	    {
	      gimple *stmt = SSA_NAME_DEF_STMT (def);
	      if (is_gimple_assign (stmt)
		  && (gimple_assign_rhs_code (stmt) == VIEW_CONVERT_EXPR
		      || gimple_assign_rhs_code (stmt) == CONSTRUCTOR))
		def = single_ssa_tree_operand (stmt, SSA_OP_USE);
	      else
		def = NULL;
	      gimple_stmt_iterator rgsi = gsi_for_stmt (stmt);
	      gsi_remove (&rgsi, true);
	      release_defs (stmt);
	    }
	  while (def);
	}

  return true;
}
11348 :
/* Generate vector permute statements from a list of loads in DR_CHAIN.
   If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
   permute statements for the SLP node NODE.  Store the number of vector
   permute instructions in *N_PERMS and the number of vector load
   instructions in *N_LOADS.  If DCE_CHAIN is true, remove all definitions
   that were not needed.  */

bool
vect_transform_slp_perm_load (vec_info *vinfo,
			      slp_tree node, const vec<tree> &dr_chain,
			      gimple_stmt_iterator *gsi, poly_uint64 vf,
			      bool analyze_only, unsigned *n_perms,
			      unsigned int *n_loads, bool dce_chain)
{
  /* Thin wrapper: use the node's own load permutation and the current
     dump setting.  */
  return vect_transform_slp_perm_load_1 (vinfo, node,
					 SLP_TREE_LOAD_PERMUTATION (node),
					 dr_chain, gsi, vf, analyze_only,
					 dump_enabled_p (), n_perms, n_loads,
					 dce_chain);
}
11369 :
/* Produce the next vector result for SLP permutation NODE by adding a vector
   statement at GSI.  If MASK_VEC is nonnull, add:

      <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC>

   otherwise add:

      <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF,
				      { N, N+1, N+2, ... }>

   where N == IDENTITY_OFFSET which is either zero or equal to the
   number of elements of the result.  */

static void
vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
			  slp_tree node, tree first_def, tree second_def,
			  tree mask_vec, poly_uint64 identity_offset)
{
  tree vectype = SLP_TREE_VECTYPE (node);

  /* ??? We SLP match existing vector element extracts but
     allow punning which we need to re-instantiate at uses
     but have no good way of explicitly representing.  */
  if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)), TYPE_SIZE (vectype))
      && !types_compatible_p (TREE_TYPE (first_def), vectype))
    {
      gassign *conv_stmt
	= gimple_build_assign (make_ssa_name (vectype),
			       build1 (VIEW_CONVERT_EXPR, vectype, first_def));
      vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
      first_def = gimple_assign_lhs (conv_stmt);
    }
  gassign *perm_stmt;
  tree perm_dest = make_ssa_name (vectype);
  if (mask_vec)
    {
      /* NOTE(review): the size check below inspects FIRST_DEF's type while
	 the conversion is applied to SECOND_DEF — presumably both inputs
	 always have the same size here; confirm against callers.  */
      if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)),
			   TYPE_SIZE (vectype))
	  && !types_compatible_p (TREE_TYPE (second_def), vectype))
	{
	  gassign *conv_stmt
	    = gimple_build_assign (make_ssa_name (vectype),
				   build1 (VIEW_CONVERT_EXPR,
					   vectype, second_def));
	  vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
	  second_def = gimple_assign_lhs (conv_stmt);
	}
      perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
				       first_def, second_def,
				       mask_vec);
    }
  else
    {
      /* Identity permute: pick the input that IDENTITY_OFFSET points into
	 (even input vector -> FIRST_DEF, odd -> SECOND_DEF).  */
      auto def_nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (first_def));
      unsigned HOST_WIDE_INT vecno;
      poly_uint64 eltno;
      if (!can_div_trunc_p (poly_uint64 (identity_offset), def_nunits,
			    &vecno, &eltno))
	gcc_unreachable ();
      tree def = vecno & 1 ? second_def : first_def;
      if (!types_compatible_p (TREE_TYPE (def), vectype))
	{
	  /* For identity permutes we still need to handle the case
	     of offsetted extracts or concats.  */
	  unsigned HOST_WIDE_INT c;
	  if (known_le (TYPE_VECTOR_SUBPARTS (vectype), def_nunits))
	    {
	      /* Extract: take the VECTYPE-sized piece at ELTNO.  */
	      unsigned HOST_WIDE_INT elsz
		= tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (def))));
	      tree lowpart = build3 (BIT_FIELD_REF, vectype, def,
				     TYPE_SIZE (vectype),
				     bitsize_int (eltno * elsz));
	      perm_stmt = gimple_build_assign (perm_dest, lowpart);
	    }
	  else if (constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype),
					def_nunits, &c) && c == 2)
	    {
	      /* Concat: build VECTYPE from both inputs.  */
	      gcc_assert (known_eq (identity_offset, 0U));
	      tree ctor = build_constructor_va (vectype, 2,
						NULL_TREE, first_def,
						NULL_TREE, second_def);
	      perm_stmt = gimple_build_assign (perm_dest, ctor);
	    }
	  else
	    gcc_unreachable ();
	}
      else
	{
	  /* We need a copy here in case the def was external.  */
	  gcc_assert (known_eq (eltno, 0U));
	  perm_stmt = gimple_build_assign (perm_dest, def);
	}
    }
  vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
  /* Store the vector statement in NODE.  */
  node->push_vec_def (perm_stmt);
}
11467 :
11468 : /* Subroutine of vectorizable_slp_permutation. Check whether the target
11469 : can perform permutation PERM on the (1 or 2) input nodes in CHILDREN.
11470 : If GSI is nonnull, emit the permutation there.
11471 :
11472 : When GSI is null, the only purpose of NODE is to give properties
11473 : of the result, such as the vector type and number of SLP lanes.
11474 : The node does not need to be a VEC_PERM_EXPR.
11475 :
11476 : If the target supports the operation, return the number of individual
11477 : VEC_PERM_EXPRs needed, otherwise return -1. Print information to the
11478 : dump file if DUMP_P is true. */
11479 :
11480 : static int
11481 435500 : vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
11482 : slp_tree node, lane_permutation_t &perm,
11483 : vec<slp_tree> &children, bool dump_p)
11484 : {
11485 435500 : tree vectype = SLP_TREE_VECTYPE (node);
11486 :
11487 : /* ??? We currently only support all same vector input types
11488 : while the SLP IL should really do a concat + select and thus accept
11489 : arbitrary mismatches. */
11490 435500 : slp_tree child;
11491 435500 : unsigned i;
11492 435500 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
11493 435500 : bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node));
11494 : /* True if we're permuting a single input of 2N vectors down
11495 : to N vectors. This case doesn't generalize beyond 2 since
11496 : VEC_PERM_EXPR only takes 2 inputs. */
11497 435500 : bool pack_p = false;
11498 : /* If we're permuting inputs of N vectors each into X*N outputs,
11499 : this is the value of X, otherwise it is 1. */
11500 435500 : unsigned int unpack_factor = 1;
 : /* Pick the vector type of the first child that has one as the common
 : input vector type; all children are checked against it below. */
11501 435500 : tree op_vectype = NULL_TREE;
11502 436685 : FOR_EACH_VEC_ELT (children, i, child)
11503 436606 : if (SLP_TREE_VECTYPE (child))
11504 : {
11505 : op_vectype = SLP_TREE_VECTYPE (child);
11506 : break;
11507 : }
11508 435500 : if (!op_vectype)
11509 79 : op_vectype = vectype;
11510 930282 : FOR_EACH_VEC_ELT (children, i, child)
11511 : {
11512 494782 : if ((SLP_TREE_DEF_TYPE (child) != vect_internal_def
11513 10077 : && !vect_maybe_update_slp_op_vectype (child, op_vectype))
11514 494782 : || !types_compatible_p (SLP_TREE_VECTYPE (child), op_vectype)
11515 989564 : || !types_compatible_p (TREE_TYPE (vectype), TREE_TYPE (op_vectype)))
11516 : {
11517 0 : if (dump_p)
11518 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11519 : "Unsupported vector types in lane permutation\n");
11520 0 : return -1;
11521 : }
11522 494782 : auto op_nunits = TYPE_VECTOR_SUBPARTS (op_vectype);
11523 494782 : unsigned int this_unpack_factor;
11524 : /* Detect permutations of external, pre-existing vectors. The external
11525 : node's SLP_TREE_LANES stores the total number of units in the vector,
11526 : or zero if the vector has variable length.
11527 :
11528 : We are expected to keep the original VEC_PERM_EXPR for such cases.
11529 : There is no repetition to model. */
11530 494782 : if (SLP_TREE_DEF_TYPE (child) == vect_external_def
11531 494782 : && SLP_TREE_SCALAR_OPS (child).is_empty ())
11532 : repeating_p = false;
11533 : /* Check whether the input has twice as many lanes per vector. */
11534 486819 : else if (children.length () == 1
11535 486819 : && known_eq (SLP_TREE_LANES (child) * nunits,
11536 : SLP_TREE_LANES (node) * op_nunits * 2))
11537 : pack_p = true;
11538 : /* Check whether the output has N times as many lanes per vector. */
11539 494782 : else if (constant_multiple_p (SLP_TREE_LANES (node) * op_nunits,
11540 444222 : SLP_TREE_LANES (child) * nunits,
11541 : &this_unpack_factor)
11542 409559 : && (i == 0 || unpack_factor == this_unpack_factor))
11543 : unpack_factor = this_unpack_factor;
11544 : else
11545 : repeating_p = false;
11546 : }
11547 :
11548 871000 : gcc_assert (perm.length () == SLP_TREE_LANES (node));
11549 :
11550 : /* Load-lanes permute. This permute only acts as a forwarder to
11551 : select the correct vector def of the load-lanes load which
11552 : has the permuted vectors in its vector defs like
11553 : { v0, w0, r0, v1, w1, r1 ... } for a ld3. All costs are
11554 : accounted for in the costing for the actual load so we
11555 : return zero here. */
11556 435500 : if (node->ldst_lanes)
11557 : {
11558 0 : gcc_assert (children.length () == 1);
11559 0 : if (!gsi)
11560 : /* This is a trivial op always supported. */
11561 : return 0;
11562 0 : slp_tree child = children[0];
11563 0 : unsigned vec_idx = (SLP_TREE_LANE_PERMUTATION (node)[0].second
11564 0 : / SLP_TREE_LANES (node));
11565 0 : unsigned vec_num = SLP_TREE_LANES (child) / SLP_TREE_LANES (node);
11566 0 : unsigned nvectors = vect_get_num_copies (vinfo, node);
11567 0 : for (unsigned i = 0; i < nvectors; ++i)
11568 : {
11569 0 : tree def = SLP_TREE_VEC_DEFS (child)[i * vec_num + vec_idx];
11570 0 : node->push_vec_def (def);
11571 : }
11572 : return 0;
11573 : }
11574 :
11575 : /* Set REPEATING_P to true if the permutations are cyclical wrt UNPACK_FACTOR
11576 : and if we can generate the vectors in a vector-length agnostic way.
11577 : This requires UNPACK_STEP == NUNITS / UNPACK_FACTOR to be known at
11578 : compile time.
11579 :
11580 : The significance of UNPACK_STEP is that, when PACK_P is false,
11581 : output vector I operates on a window of UNPACK_STEP elements from each
11582 : input, starting at lane UNPACK_STEP * (I % UNPACK_FACTOR). For example,
11583 : when UNPACK_FACTOR is 2, the first output vector operates on lanes
11584 : [0, NUNITS / 2 - 1] of each input vector and the second output vector
11585 : operates on lanes [NUNITS / 2, NUNITS - 1] of each input vector.
11586 :
11587 : When REPEATING_P is true, NOUTPUTS holds the total number of outputs
11588 : that we actually need to generate. */
11589 435500 : uint64_t noutputs = 0;
11590 435500 : poly_uint64 unpack_step = 0;
11591 435500 : loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo);
11592 148764 : if (!linfo
11593 473731 : || !multiple_p (nunits, unpack_factor, &unpack_step)
11594 147864 : || !constant_multiple_p (LOOP_VINFO_VECT_FACTOR (linfo)
11595 147864 : * SLP_TREE_LANES (node), nunits, &noutputs))
11596 : repeating_p = false;
11597 :
11598 : /* We can handle the conditions described for REPEATING_P above for
11599 : both variable- and constant-length vectors. The fallback requires
11600 : us to generate every element of every permute vector explicitly,
11601 : which is only possible for constant-length permute vectors.
11602 :
11603 : Set:
11604 :
11605 : - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
11606 : mask vectors that we want to build.
11607 :
11608 : - NCOPIES to the number of copies of PERM that we need in order
11609 : to build the necessary permute mask vectors. */
11610 147864 : uint64_t npatterns;
11611 147864 : unsigned nelts_per_pattern;
11612 147864 : uint64_t ncopies;
11613 147864 : if (repeating_p)
11614 : {
11615 : /* We need permute mask vectors that have the form:
11616 :
11617 : { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }
11618 :
11619 : In other words, the original n-element permute in PERM is
11620 : "unrolled" to fill a full vector. The stepped vector encoding
11621 : that we use for permutes requires 3n elements. */
11622 109633 : npatterns = SLP_TREE_LANES (node);
11623 109633 : nelts_per_pattern = ncopies = 3;
11624 : }
11625 : else
11626 : {
11627 : /* Calculate every element of every permute mask vector explicitly,
11628 : instead of relying on the pattern described above. */
11629 325867 : if (!nunits.is_constant (&npatterns)
11630 325867 : || !TYPE_VECTOR_SUBPARTS (op_vectype).is_constant ())
11631 : {
11632 : if (dump_p)
11633 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11634 : "unsupported permutation %p on variable-length"
11635 : " vectors\n", (void *) node);
11636 : return -1;
11637 : }
11638 325867 : nelts_per_pattern = ncopies = 1;
11639 325867 : if (linfo && !LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
11640 : {
11641 : if (dump_p)
11642 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11643 : "unsupported permutation %p for variable VF\n",
11644 : (void *) node);
11645 : return -1;
11646 : }
11647 : pack_p = false;
11648 : unpack_factor = 1;
11649 : }
11650 435500 : unsigned olanes = unpack_factor * ncopies * SLP_TREE_LANES (node);
11651 435500 : gcc_assert (repeating_p || multiple_p (olanes, nunits));
11652 :
11653 : /* Compute the { { SLP operand, vector index}, lane } permutation sequence
11654 : from the { SLP operand, scalar lane } permutation as recorded in the
11655 : SLP node as intermediate step. This part should already work
11656 : with SLP children with arbitrary number of lanes. */
11657 435500 : auto_vec<std::pair<std::pair<unsigned, unsigned>, poly_uint64>> vperm;
11658 435500 : auto_vec<poly_uint64> active_lane;
11659 435500 : vperm.create (olanes);
11660 435500 : active_lane.safe_grow_cleared (children.length (), true);
11661 877970 : for (unsigned int ui = 0; ui < unpack_factor; ++ui)
11662 : {
11663 1902336 : for (unsigned j = 0; j < children.length (); ++j)
11664 508698 : active_lane[j] = ui * unpack_step;
11665 1217188 : for (unsigned i = 0; i < ncopies; ++i)
11666 : {
11667 4841970 : for (unsigned pi = 0; pi < perm.length (); ++pi)
11668 : {
11669 1646267 : std::pair<unsigned, unsigned> p = perm[pi];
11670 1646267 : tree vtype = SLP_TREE_VECTYPE (children[p.first]);
11671 1646267 : if (repeating_p)
11672 626667 : vperm.quick_push ({{p.first, 0},
11673 626667 : p.second + active_lane[p.first]});
11674 : else
11675 : {
11676 : /* We checked above that the vectors are constant-length. */
11677 1019600 : unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype)
11678 1019600 : .to_constant ();
11679 1019600 : unsigned lane = active_lane[p.first].to_constant ();
11680 1019600 : unsigned vi = (lane + p.second) / vnunits;
11681 1019600 : unsigned vl = (lane + p.second) % vnunits;
11682 1019600 : vperm.quick_push ({{p.first, vi}, vl});
11683 : }
11684 : }
11685 : /* Advance to the next group. */
11686 1668637 : for (unsigned j = 0; j < children.length (); ++j)
11687 893919 : active_lane[j] += SLP_TREE_LANES (children[j]);
11688 : }
11689 : }
11690 :
11691 435500 : if (dump_p)
11692 : {
11693 8827 : dump_printf_loc (MSG_NOTE, vect_location,
11694 : "vectorizing permutation %p", (void *)node);
11695 31996 : for (unsigned i = 0; i < perm.length (); ++i)
11696 23169 : dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
11697 8827 : if (repeating_p)
11698 7427 : dump_printf (MSG_NOTE, " (repeat %d)", SLP_TREE_LANES (node));
11699 8827 : dump_printf (MSG_NOTE, "\n");
11700 8827 : dump_printf_loc (MSG_NOTE, vect_location, "as");
11701 88790 : for (unsigned i = 0; i < vperm.length (); ++i)
11702 : {
11703 79963 : if (i != 0
11704 79963 : && (repeating_p
11705 53986 : ? multiple_p (i, npatterns)
11706 59505 : : multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype))))
11707 23952 : dump_printf (MSG_NOTE, ",");
11708 79963 : dump_printf (MSG_NOTE, " vops%u[%u][",
11709 79963 : vperm[i].first.first, vperm[i].first.second);
11710 79963 : dump_dec (MSG_NOTE, vperm[i].second);
11711 79963 : dump_printf (MSG_NOTE, "]");
11712 : }
11713 8827 : dump_printf (MSG_NOTE, "\n");
11714 : }
11715 :
11716 : /* We can only handle two-vector permutes, everything else should
11717 : be lowered on the SLP level. The following is closely inspired
11718 : by vect_transform_slp_perm_load and is supposed to eventually
11719 : replace it.
11720 : ??? As intermediate step do code-gen in the SLP tree representation
11721 : somehow? */
11722 435500 : std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
11723 435500 : std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
11724 435500 : unsigned int index = 0;
11725 435500 : poly_uint64 mask_element;
11726 435500 : vec_perm_builder mask;
11727 435500 : mask.new_vector (nunits, npatterns, nelts_per_pattern);
11728 435500 : unsigned int count = mask.encoded_nelts ();
11729 435500 : mask.quick_grow (count);
11730 435500 : vec_perm_indices indices;
11731 435500 : unsigned nperms = 0;
11732 : /* When REPEATING_P is true, we only have UNPACK_FACTOR unique permute
11733 : vectors to check during analysis, but we need to generate NOUTPUTS
11734 : vectors during transformation. */
11735 435500 : unsigned total_nelts = olanes;
11736 435500 : unsigned process_nelts = olanes;
11737 435500 : if (repeating_p)
11738 : {
11739 109633 : total_nelts = (total_nelts / unpack_factor) * noutputs;
11740 109633 : if (gsi)
11741 9879 : process_nelts = total_nelts;
11742 : }
 : /* LAST_EI is the within-group position of the very last element;
 : elements past it in a group are not needed by the trailing
 : (partial) repetition and must not be costed for it below. */
11743 435500 : unsigned last_ei = (total_nelts - 1) % process_nelts;
11744 2091053 : for (unsigned i = 0; i < process_nelts; ++i)
11745 : {
11746 : /* VI is the input vector index when generating code for REPEATING_P. */
11747 1663235 : unsigned vi = i / olanes * (pack_p ? 2 : 1);
11748 1663235 : unsigned ei = i % olanes;
11749 1663235 : mask_element = vperm[ei].second;
11750 1663235 : if (pack_p)
11751 : {
11752 : /* In this case, we have N outputs and the single child provides 2N
11753 : inputs. Output X permutes inputs 2X and 2X+1.
11754 :
11755 : The mask indices are taken directly from the SLP permutation node.
11756 : Index X selects from the first vector if (X / NUNITS) % 2 == 0;
11757 : X selects from the second vector otherwise. These conditions
11758 : are only known at compile time for constant-length vectors. */
11759 : first_vec = std::make_pair (0, 0);
11760 : second_vec = std::make_pair (0, 1);
11761 : }
11762 1499669 : else if (first_vec.first == -1U
11763 1499669 : || first_vec == vperm[ei].first)
11764 1304558 : first_vec = vperm[ei].first;
11765 195111 : else if (second_vec.first == -1U
11766 195111 : || second_vec == vperm[ei].first)
11767 : {
11768 194723 : second_vec = vperm[ei].first;
 : /* Indices into the second input are offset by NUNITS per
 : VEC_PERM_EXPR semantics. */
11769 194723 : mask_element += nunits;
11770 : }
11771 : else
11772 : {
11773 388 : if (dump_p)
11774 7 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11775 : "permutation requires at "
11776 : "least three vectors\n");
11777 388 : gcc_assert (!gsi);
11778 : return -1;
11779 : }
11780 :
11781 1662847 : mask[index++] = mask_element;
11782 :
11783 1662847 : if (index == count)
11784 : {
11785 719469 : indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
11786 : TYPE_VECTOR_SUBPARTS (op_vectype));
11787 572952 : bool identity_p = (indices.series_p (0, 1, mask[0], 1)
11788 887723 : && constant_multiple_p (mask[0], nunits));
11789 572952 : machine_mode vmode = TYPE_MODE (vectype);
11790 572952 : machine_mode op_vmode = TYPE_MODE (op_vectype);
11791 572952 : unsigned HOST_WIDE_INT c;
11792 572952 : if ((!identity_p
11793 532880 : && !can_vec_perm_const_p (vmode, op_vmode, indices))
11794 572952 : || (identity_p
11795 40072 : && !known_le (nunits,
11796 : TYPE_VECTOR_SUBPARTS (op_vectype))
11797 7302 : && (!constant_multiple_p (nunits,
11798 8 : TYPE_VECTOR_SUBPARTS (op_vectype),
11799 8 : &c) || c != 2)))
11800 : {
11801 7294 : if (dump_p)
11802 : {
11803 152 : dump_printf_loc (MSG_MISSED_OPTIMIZATION,
11804 : vect_location,
11805 : "unsupported vect permute { ");
11806 1586 : for (i = 0; i < count; ++i)
11807 : {
11808 1434 : dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
11809 1434 : dump_printf (MSG_MISSED_OPTIMIZATION, " ");
11810 : }
11811 152 : dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
11812 : }
11813 7294 : gcc_assert (!gsi);
11814 7682 : return -1;
11815 : }
11816 :
 : /* Identity permutes are free; everything else costs one
 : VEC_PERM_EXPR per needed repetition. */
11817 565658 : if (!identity_p)
11818 525586 : nperms += CEIL (total_nelts, process_nelts) - (ei > last_ei);
11819 565658 : if (gsi)
11820 : {
11821 31376 : if (second_vec.first == -1U)
11822 7001 : second_vec = first_vec;
11823 :
11824 31376 : slp_tree
11825 31376 : first_node = children[first_vec.first],
11826 31376 : second_node = children[second_vec.first];
11827 :
11828 31376 : tree mask_vec = NULL_TREE;
11829 31376 : if (!identity_p)
11830 28100 : mask_vec = vect_gen_perm_mask_checked (vectype, indices);
11831 :
11832 31376 : tree first_def
11833 31376 : = vect_get_slp_vect_def (first_node, first_vec.second + vi);
11834 31376 : tree second_def
11835 31376 : = vect_get_slp_vect_def (second_node, second_vec.second + vi);
11836 31376 : vect_add_slp_permutation (vinfo, gsi, node, first_def,
11837 31376 : second_def, mask_vec, mask[0]);
11838 : }
11839 :
11840 : index = 0;
11841 : first_vec = std::make_pair (-1U, -1U);
11842 : second_vec = std::make_pair (-1U, -1U);
11843 : }
11844 : }
11845 :
11846 427818 : return nperms;
11847 435500 : }
11848 :
11849 : /* Vectorize the SLP permutations in NODE as specified
11850 : in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
11851 : child number and lane number.
11852 : Interleaving of two two-lane two-child SLP subtrees (not supported):
11853 : [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
11854 : A blend of two four-lane two-child SLP subtrees:
11855 : [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
11856 : Highpart of a four-lane one-child SLP subtree (not supported):
11857 : [ { 0, 2 }, { 0, 3 } ]
11858 : Where currently only a subset is supported by code generating below. */
11859 :
11860 : bool
11861 115741 : vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
11862 : slp_tree node, stmt_vector_for_cost *cost_vec)
11863 : {
11864 115741 : tree vectype = SLP_TREE_VECTYPE (node);
11865 115741 : lane_permutation_t &perm = SLP_TREE_LANE_PERMUTATION (node);
 : /* Delegate analysis/code-gen; a negative result means the permute
 : is not supported by the target. */
11866 115741 : int nperms = vectorizable_slp_permutation_1 (vinfo, gsi, node, perm,
11867 115741 : SLP_TREE_CHILDREN (node),
11868 : dump_enabled_p ());
11869 115741 : if (nperms < 0)
11870 : return false;
11871 :
 : /* During analysis (no GSI) record one vec_perm cost for each
 : VEC_PERM_EXPR that code generation will emit. */
11872 114412 : if (!gsi && nperms != 0)
11873 92917 : record_stmt_cost (cost_vec, nperms, vec_perm, node, vectype, 0, vect_body);
11874 :
11875 : return true;
11876 : }
11877 :
11878 : /* Vectorize SLP NODE. */
11879 :
11880 : static void
11881 1461423 : vect_schedule_slp_node (vec_info *vinfo,
11882 : slp_tree node, slp_instance instance)
11883 : {
11884 1461423 : gimple_stmt_iterator si;
11885 1461423 : int i;
11886 1461423 : slp_tree child;
11887 :
11888 : /* Vectorize externals and constants. */
11889 1461423 : if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
11890 1461423 : || SLP_TREE_DEF_TYPE (node) == vect_external_def)
11891 : {
11892 : /* ??? vectorizable_shift can end up using a scalar operand which is
11893 : currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
11894 : node in this case. */
11895 497004 : if (!SLP_TREE_VECTYPE (node))
11896 497004 : return;
11897 :
11898 : /* There are two reasons vector defs might already exist. The first
11899 : is that we are vectorizing an existing vector def. The second is
11900 : when performing BB vectorization shared constant/external nodes
11901 : are not split apart during partitioning so during the code-gen
11902 : DFS walk we can end up visiting them twice. */
11903 490801 : if (! SLP_TREE_VEC_DEFS (node).exists ())
11904 490000 : vect_create_constant_vectors (vinfo, node);
11905 490801 : return;
11906 : }
11907 :
11908 964419 : stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
11909 :
11910 964419 : gcc_assert (SLP_TREE_VEC_DEFS (node).is_empty ());
11911 964419 : if (SLP_TREE_VECTYPE (node))
11912 964413 : SLP_TREE_VEC_DEFS (node).create (vect_get_num_copies (vinfo, node));
11913 :
 : /* The remainder of the function picks the insertion point SI for the
 : vector statements; the actual code generation happens in
 : vect_transform_stmt at the very end. */
11914 964419 : if (!SLP_TREE_PERMUTE_P (node) && STMT_VINFO_DATA_REF (stmt_info))
11915 : {
11916 : /* Vectorized loads go before the first scalar load to make it
11917 : ready early, vectorized stores go before the last scalar
11918 : stmt which is where all uses are ready. */
11919 704721 : stmt_vec_info last_stmt_info = NULL;
11920 704721 : if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
11921 163322 : last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
11922 : else /* DR_IS_WRITE */
11923 541399 : last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
11924 704721 : si = gsi_for_stmt (last_stmt_info->stmt);
11925 : }
11926 259698 : else if (!SLP_TREE_PERMUTE_P (node)
11927 243224 : && (SLP_TREE_TYPE (node) == cycle_phi_info_type
11928 : || SLP_TREE_TYPE (node) == induc_vec_info_type
11929 : || SLP_TREE_TYPE (node) == phi_info_type))
11930 : {
11931 : /* For PHI node vectorization we do not use the insertion iterator. */
11932 53968 : si = gsi_none ();
11933 : }
11934 : else
11935 : {
11936 : /* Emit other stmts after the children vectorized defs which is
11937 : earliest possible. */
11938 : gimple *last_stmt = NULL;
11939 : bool seen_vector_def = false;
11940 573136 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
11941 367406 : if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
11942 : {
11943 : /* For fold-left reductions we are retaining the scalar
11944 : reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
11945 : set so the representation isn't perfect. Resort to the
11946 : last scalar def here. */
11947 294845 : if (SLP_TREE_VEC_DEFS (child).is_empty ())
11948 : {
11949 866 : gcc_assert (SLP_TREE_TYPE (child) == cycle_phi_info_type);
11950 866 : gphi *phi = as_a <gphi *>
11951 866 : (vect_find_last_scalar_stmt_in_slp (child)->stmt);
11952 866 : if (!last_stmt)
11953 : last_stmt = phi;
11954 648 : else if (vect_stmt_dominates_stmt_p (last_stmt, phi))
11955 : last_stmt = phi;
11956 637 : else if (vect_stmt_dominates_stmt_p (phi, last_stmt))
11957 : ;
11958 : else
11959 0 : gcc_unreachable ();
11960 : }
11961 : /* We are emitting all vectorized stmts in the same place and
11962 : the last one is the last.
11963 : ??? Unless we have a load permutation applied and that
11964 : figures to re-use an earlier generated load. */
11965 : unsigned j;
11966 : tree vdef;
11967 696829 : FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
11968 : {
11969 401984 : gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
11970 401984 : if (!last_stmt)
11971 : last_stmt = vstmt;
11972 206850 : else if (vect_stmt_dominates_stmt_p (last_stmt, vstmt))
11973 : last_stmt = vstmt;
11974 45417 : else if (vect_stmt_dominates_stmt_p (vstmt, last_stmt))
11975 : ;
11976 : else
11977 0 : gcc_unreachable ();
11978 : }
11979 : }
11980 72561 : else if (!SLP_TREE_VECTYPE (child))
11981 : {
11982 : /* For externals we use unvectorized at all scalar defs. */
11983 : unsigned j;
11984 : tree def;
11985 12903 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
11986 7334 : if (TREE_CODE (def) == SSA_NAME
11987 7334 : && !SSA_NAME_IS_DEFAULT_DEF (def))
11988 : {
11989 167 : gimple *stmt = SSA_NAME_DEF_STMT (def);
11990 167 : if (gimple_uid (stmt) == -1u)
11991 : /* If the stmt is not inside the region do not
11992 : use it as possible insertion point. */
11993 : ;
11994 159 : else if (!last_stmt)
11995 : last_stmt = stmt;
11996 153 : else if (vect_stmt_dominates_stmt_p (last_stmt, stmt))
11997 : last_stmt = stmt;
11998 153 : else if (vect_stmt_dominates_stmt_p (stmt, last_stmt))
11999 : ;
12000 : else
12001 0 : gcc_unreachable ();
12002 : }
12003 : }
12004 : else
12005 : {
12006 : /* For externals we have to look at all defs since their
12007 : insertion place is decided per vector. But beware
12008 : of pre-existing vectors where we need to make sure
12009 : we do not insert before the region boundary. */
12010 66992 : if (SLP_TREE_SCALAR_OPS (child).is_empty ()
12011 654 : && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
12012 : seen_vector_def = true;
12013 : else
12014 : {
12015 : unsigned j;
12016 : tree vdef;
12017 529058 : FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
12018 94777 : if (TREE_CODE (vdef) == SSA_NAME
12019 94777 : && !SSA_NAME_IS_DEFAULT_DEF (vdef))
12020 : {
12021 19610 : gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
12022 19610 : if (!last_stmt)
12023 : last_stmt = vstmt;
12024 10962 : else if (vect_stmt_dominates_stmt_p (last_stmt, vstmt))
12025 : last_stmt = vstmt;
12026 8709 : else if (vect_stmt_dominates_stmt_p (vstmt, last_stmt))
12027 : ;
12028 : else
12029 0 : gcc_unreachable ();
12030 : }
12031 : }
12032 : }
12033 : /* This can happen when all children are pre-existing vectors or
12034 : constants. */
12035 205730 : if (!last_stmt)
12036 1724 : last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
12037 1724 : if (!last_stmt)
12038 : {
12039 0 : gcc_assert (seen_vector_def);
12040 0 : si = gsi_after_labels (vinfo->bbs[0]);
12041 : }
12042 205730 : else if (is_ctrl_altering_stmt (last_stmt))
12043 : {
12044 : /* We split regions to vectorize at control altering stmts
12045 : with a definition so this must be an external which
12046 : we can insert at the start of the region. */
12047 0 : si = gsi_after_labels (vinfo->bbs[0]);
12048 : }
12049 205730 : else if (is_a <bb_vec_info> (vinfo)
12050 17754 : && !SLP_TREE_PERMUTE_P (node)
12051 16331 : && gimple_bb (last_stmt) != gimple_bb (stmt_info->stmt)
12052 206990 : && gimple_could_trap_p (stmt_info->stmt))
12053 : {
12054 : /* We've constrained possibly trapping operations to all come
12055 : from the same basic-block, if vectorized defs would allow earlier
12056 : scheduling still force vectorized stmts to the original block.
12057 : This is only necessary for BB vectorization since for loop vect
12058 : all operations are in a single BB and scalar stmt based
12059 : placement doesn't play well with epilogue vectorization. */
12060 53 : gcc_assert (dominated_by_p (CDI_DOMINATORS,
12061 : gimple_bb (stmt_info->stmt),
12062 : gimple_bb (last_stmt)));
12063 53 : si = gsi_after_labels (gimple_bb (stmt_info->stmt));
12064 : }
12065 205677 : else if (is_a <gphi *> (last_stmt))
12066 14345 : si = gsi_after_labels (gimple_bb (last_stmt));
12067 : else
12068 : {
12069 191332 : si = gsi_for_stmt (last_stmt);
12070 191332 : gsi_next (&si);
12071 :
12072 : /* Avoid scheduling internal defs outside of the loop when
12073 : we might have only implicitly tracked loop mask/len defs. */
12074 191332 : if (auto loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
12075 74 : if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
12076 173843 : || LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
12077 : {
12078 74 : gimple_stmt_iterator si2
12079 74 : = gsi_after_labels (LOOP_VINFO_LOOP (loop_vinfo)->header);
12080 74 : if ((gsi_end_p (si2)
12081 0 : && (LOOP_VINFO_LOOP (loop_vinfo)->header
12082 0 : != gimple_bb (last_stmt))
12083 0 : && dominated_by_p (CDI_DOMINATORS,
12084 : LOOP_VINFO_LOOP (loop_vinfo)->header,
12085 0 : gimple_bb (last_stmt)))
12086 74 : || (!gsi_end_p (si2)
12087 74 : && last_stmt != *si2
12088 72 : && vect_stmt_dominates_stmt_p (last_stmt, *si2)))
12089 3 : si = si2;
12090 : }
12091 : }
12092 : }
12093 :
12094 964419 : if (dump_enabled_p ())
12095 : {
12096 71845 : if (stmt_info)
12097 71792 : dump_printf_loc (MSG_NOTE, vect_location,
12098 : "------>vectorizing SLP node starting from: %G",
12099 : stmt_info->stmt);
12100 : else
12101 : {
12102 53 : dump_printf_loc (MSG_NOTE, vect_location,
12103 : "------>vectorizing SLP node:\n");
12104 53 : vect_print_slp_tree (MSG_NOTE, vect_location, node);
12105 : }
12106 : }
 : /* Insertion point chosen; emit the vector statements for NODE. */
12107 964419 : vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
12108 : }
12109 :
12110 : /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
12111 : For loop vectorization this is done in vectorizable_call, but for SLP
12112 : it needs to be deferred until end of vect_schedule_slp, because multiple
12113 : SLP instances may refer to the same scalar stmt. */
12114 :
12115 : static void
12116 600677 : vect_remove_slp_scalar_calls (vec_info *vinfo,
12117 : slp_tree node, hash_set<slp_tree> &visited)
12118 : {
12119 600677 : gimple *new_stmt;
12120 600677 : gimple_stmt_iterator gsi;
12121 600677 : int i;
12122 600677 : slp_tree child;
12123 600677 : tree lhs;
12124 600677 : stmt_vec_info stmt_info;
12125 :
 : /* Only internal defs carry scalar stmts worth replacing. */
12126 600677 : if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
12127 188413 : return;
12128 :
 : /* Shared nodes may be reached more than once; process them once. */
12129 456257 : if (visited.add (node))
12130 : return;
12131 :
12132 923159 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
12133 510895 : vect_remove_slp_scalar_calls (vinfo, child, visited);
12134 :
12135 1305688 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
12136 : {
12137 485375 : if (!stmt_info)
12138 3974 : continue;
12139 481401 : stmt_info = vect_orig_stmt (stmt_info);
12140 481401 : gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
 : /* Skip non-calls and calls no longer in the IL. */
12141 5231 : if (!stmt || gimple_bb (stmt) == NULL)
12142 476208 : continue;
12143 5193 : lhs = gimple_call_lhs (stmt);
12144 5193 : if (lhs)
12145 4579 : new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
12146 : else
12147 614 : new_stmt = gimple_build_nop ();
12148 5193 : unlink_stmt_vdef (stmt_info->stmt);
12149 5193 : gsi = gsi_for_stmt (stmt);
12150 5193 : vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
12151 5193 : if (lhs)
12152 4579 : SSA_NAME_DEF_STMT (lhs) = new_stmt;
12153 : }
12154 : }
12155 :
12156 : static void
12157 89782 : vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
12158 : {
 : /* Wrapper that supplies a fresh visited set for the recursive walk. */
12159 89782 : hash_set<slp_tree> visited;
12160 89782 : vect_remove_slp_scalar_calls (vinfo, node, visited);
12161 89782 : }
12162 :
12163 : /* Vectorize the instance root. */
12164 :
12165 : void
12166 10820 : vectorize_slp_instance_root_stmt (vec_info *vinfo, slp_tree node, slp_instance instance)
12167 : {
12168 10820 : gassign *rstmt = NULL;
12169 :
 : /* Constructor roots: replace the scalar CONSTRUCTOR with the (possibly
 : composed) vector def(s) produced for NODE. */
12170 10820 : if (instance->kind == slp_inst_kind_ctor)
12171 : {
12172 4901 : if (SLP_TREE_VEC_DEFS (node).length () == 1)
12173 : {
12174 4864 : tree vect_lhs = SLP_TREE_VEC_DEFS (node)[0];
12175 4864 : tree root_lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
12176 4864 : if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
12177 4864 : TREE_TYPE (vect_lhs)))
12178 0 : vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
12179 : vect_lhs);
12180 4864 : rstmt = gimple_build_assign (root_lhs, vect_lhs);
12181 : }
12182 : else
12183 : {
12184 37 : gcc_assert (SLP_TREE_VEC_DEFS (node).length () > 1);
12185 37 : tree child_def;
12186 37 : int j;
12187 37 : vec<constructor_elt, va_gc> *v;
12188 37 : vec_alloc (v, SLP_TREE_VEC_DEFS (node).length ());
12189 :
12190 : /* A CTOR can handle V16HI composition from VNx8HI so we
12191 : do not need to convert vector elements if the types
12192 : do not match. */
12193 111 : FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (node), j, child_def)
12194 74 : CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, child_def);
12195 37 : tree lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
12196 37 : tree rtype
12197 37 : = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmts[0]->stmt));
12198 37 : tree r_constructor = build_constructor (rtype, v);
12199 37 : rstmt = gimple_build_assign (lhs, r_constructor);
12200 : }
12201 : }
12202 5919 : else if (instance->kind == slp_inst_kind_bb_reduc)
12203 : {
12204 : /* Largely inspired by reduction chain epilogue handling in
12205 : vect_create_epilog_for_reduction. */
12206 4330 : vec<tree> vec_defs = vNULL;
12207 4330 : vect_get_slp_defs (node, &vec_defs);
12208 4330 : enum tree_code reduc_code
12209 4330 : = gimple_assign_rhs_code (instance->root_stmts[0]->stmt);
12210 : /* ??? We actually have to reflect signs somewhere. */
12211 4330 : if (reduc_code == MINUS_EXPR)
12212 0 : reduc_code = PLUS_EXPR;
12213 4330 : gimple_seq epilogue = NULL;
12214 : /* We may end up with more than one vector result, reduce them
12215 : to one vector. */
12216 4330 : tree vec_def = vec_defs[0];
12217 4330 : tree vectype = TREE_TYPE (vec_def);
12218 4330 : tree compute_vectype = vectype;
 : /* Reassociating the reduction could introduce signed overflow, so
 : compute in the corresponding unsigned type in that case. */
12219 4330 : bool pun_for_overflow_p = (ANY_INTEGRAL_TYPE_P (vectype)
12220 4130 : && TYPE_OVERFLOW_UNDEFINED (vectype)
12221 7296 : && operation_can_overflow (reduc_code));
12222 2833 : if (pun_for_overflow_p)
12223 : {
12224 2833 : compute_vectype = unsigned_type_for (vectype);
12225 2833 : vec_def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
12226 : compute_vectype, vec_def);
12227 : }
12228 6708 : for (unsigned i = 1; i < vec_defs.length (); ++i)
12229 : {
12230 2378 : tree def = vec_defs[i];
12231 2378 : if (pun_for_overflow_p)
12232 2273 : def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
12233 : compute_vectype, def);
12234 2378 : vec_def = gimple_build (&epilogue, reduc_code, compute_vectype,
12235 : vec_def, def);
12236 : }
12237 4330 : vec_defs.release ();
12238 : /* ??? Support other schemes than direct internal fn. */
12239 4330 : internal_fn reduc_fn;
12240 4330 : if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
12241 4330 : || reduc_fn == IFN_LAST)
12242 0 : gcc_unreachable ();
12243 4330 : tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
12244 4330 : TREE_TYPE (compute_vectype), vec_def);
 : /* Fold in scalar defs that were left out of the vectorized part. */
12245 4330 : if (!SLP_INSTANCE_REMAIN_DEFS (instance).is_empty ())
12246 : {
12247 2813 : tree rem_def = NULL_TREE;
12248 12403 : for (auto def : SLP_INSTANCE_REMAIN_DEFS (instance))
12249 : {
12250 9590 : def = gimple_convert (&epilogue, TREE_TYPE (scalar_def), def);
12251 9590 : if (!rem_def)
12252 : rem_def = def;
12253 : else
12254 6777 : rem_def = gimple_build (&epilogue, reduc_code,
12255 6777 : TREE_TYPE (scalar_def),
12256 : rem_def, def);
12257 : }
12258 2813 : scalar_def = gimple_build (&epilogue, reduc_code,
12259 2813 : TREE_TYPE (scalar_def),
12260 : scalar_def, rem_def);
12261 : }
12262 4330 : scalar_def = gimple_convert (&epilogue,
12263 4330 : TREE_TYPE (vectype), scalar_def);
12264 4330 : gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
12265 4330 : gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT);
12266 4330 : gimple_assign_set_rhs_from_tree (&rgsi, scalar_def);
12267 4330 : update_stmt (gsi_stmt (rgsi));
12268 4330 : return;
12269 : }
12270 1589 : else if (instance->kind == slp_inst_kind_gcond)
12271 : {
12272 : /* Only support a single root for now as we can't codegen CFG yet and so we
12273 : can't support lane > 1 at this time. */
12274 1589 : gcc_assert (instance->root_stmts.length () == 1);
12275 1589 : auto root_stmt_info = instance->root_stmts[0];
12276 1589 : auto last_stmt = STMT_VINFO_STMT (vect_orig_stmt (root_stmt_info));
12277 1589 : gimple_stmt_iterator rgsi = gsi_for_stmt (last_stmt);
12278 1589 : gcc_assert (!SLP_TREE_VEC_DEFS (node).is_empty ());
12279 1589 : bool res = vectorizable_early_exit (as_a <loop_vec_info> (vinfo),
12280 : root_stmt_info, &rgsi, node, NULL);
12281 1589 : gcc_assert (res);
12282 1589 : return;
12283 : }
12284 : else
12285 0 : gcc_unreachable ();
12286 :
12287 4901 : gcc_assert (rstmt);
12288 :
 : /* Replace the scalar root statement with the composed assignment. */
12289 4901 : gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
12290 4901 : gsi_replace (&rgsi, rstmt, true);
12291 : }
12292 :
          : /* Per-node bookkeeping for Tarjan's strongly-connected-components
          :    discovery as performed by vect_schedule_scc.  One entry exists per
          :    visited slp_tree node in the scc_info hash-map.  */
12293 : struct slp_scc_info
12294 : {
          :   /* True while the node sits on the DFS stack, i.e. its SCC has not
          :      been completed (popped) yet.  */
12296 : bool on_stack;
          :   /* DFS pre-order (discovery) index of the node.  */
12297 : int dfs;
          :   /* Smallest DFS index reachable from this node; equal to DFS exactly
          :      when the node is the root of its SCC.  */
12298 : int lowlink;
12299 : };
12299 :
12300 : /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */
          : /* This is Tarjan's SCC algorithm (dfs/lowlink/on_stack in slp_scc_info):
          :    nodes are code generated post-order so defs exist before uses, and
          :    cycles (SCCs) are broken at PHI nodes whose backedge arguments are
          :    filled in afterwards.  MAXDFS is the running DFS counter and STACK
          :    the shared Tarjan node stack, both threaded through the recursion.  */
12301 :
12302 : static void
12303 1461423 : vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
12304 : hash_map<slp_tree, slp_scc_info> &scc_info,
12305 : int &maxdfs, vec<slp_tree> &stack)
12306 : {
          :   /* Register NODE; callers guarantee it has not been visited before.  */
12307 1461423 : bool existed_p;
12308 1461423 : slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
12309 1461423 : gcc_assert (!existed_p);
12310 1461423 : info->dfs = maxdfs;
12311 1461423 : info->lowlink = maxdfs;
12312 1461423 : maxdfs++;
12313 :
12314 : /* Leaf. */
          :   /* External/constant defs cannot participate in a cycle, so schedule
          :      them immediately without pushing on the Tarjan stack.  */
12315 1461423 : if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
12316 : {
12317 497004 : info->on_stack = false;
12318 497004 : vect_schedule_slp_node (vinfo, node, instance);
12319 1025668 : return;
12320 : }
12321 :
12322 964419 : info->on_stack = true;
12323 964419 : stack.safe_push (node);
12324 :
12325 964419 : unsigned i;
12326 964419 : slp_tree child;
12327 : /* DFS recurse. */
12328 1992970 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
12329 : {
12330 1028551 : if (!child)
12331 55101 : continue;
12332 973450 : slp_scc_info *child_info = scc_info.get (child);
12333 973450 : if (!child_info)
12334 : {
12335 883902 : vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
12336 : /* Recursion might have re-allocated the node. */
          :           /* The hash-map may have grown, invalidating INFO; re-fetch
          :              both pointers before reading/writing through them.  */
12337 883902 : info = scc_info.get (node);
12338 883902 : child_info = scc_info.get (child);
12339 883902 : info->lowlink = MIN (info->lowlink, child_info->lowlink);
12340 : }
12341 89548 : else if (child_info->on_stack)
12342 25492 : info->lowlink = MIN (info->lowlink, child_info->dfs);
12343 : }
          :   /* Not the root of an SCC yet - defer scheduling to the root.  */
12344 964419 : if (info->lowlink != info->dfs)
12345 : return;
12346 :
          :   /* NODE is an SCC root: pop and schedule the whole component.
          :      PHI nodes scheduled here get their backedge arguments fixed up
          :      below once all their children have been code generated.  */
12347 932759 : auto_vec<slp_tree, 4> phis_to_fixup;
12348 :
12349 : /* Singleton. */
12350 932759 : if (stack.last () == node)
12351 : {
12352 908922 : stack.pop ();
12353 908922 : info->on_stack = false;
12354 908922 : vect_schedule_slp_node (vinfo, node, instance);
12355 908922 : if (!SLP_TREE_PERMUTE_P (node)
12356 908922 : && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
12357 30239 : phis_to_fixup.quick_push (node);
12358 : }
12359 : else
12360 : {
12361 : /* SCC. */
12362 23837 : int last_idx = stack.length () - 1;
12363 55497 : while (stack[last_idx] != node)
12364 31660 : last_idx--;
12365 : /* We can break the cycle at PHIs who have at least one child
12366 : code generated. Then we could re-start the DFS walk until
12367 : all nodes in the SCC are covered (we might have new entries
12368 : for only back-reachable nodes). But it's simpler to just
12369 : iterate and schedule those that are ready. */
12370 23837 : unsigned todo = stack.length () - last_idx;
12371 24164 : do
12372 : {
          :           /* Repeatedly sweep the SCC portion of the stack, scheduling
          :              every node that has become ready, until none remain.  */
12373 105555 : for (int idx = stack.length () - 1; idx >= last_idx; --idx)
12374 : {
12375 57227 : slp_tree entry = stack[idx];
12376 57227 : if (!entry)
12377 934 : continue;
12378 56293 : bool phi = (!SLP_TREE_PERMUTE_P (entry)
12379 56293 : && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
          :               /* A non-PHI is ready only when all children are scheduled;
          :                  a PHI is ready as soon as one child is (or is absent),
          :                  since remaining args are filled in by the fixup below.  */
12380 56293 : bool ready = !phi;
12381 142467 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
12382 111213 : if (!child)
12383 : {
12384 22983 : gcc_assert (phi);
12385 : ready = true;
12386 : break;
12387 : }
12388 88230 : else if (scc_info.get (child)->on_stack)
12389 : {
12390 24027 : if (!phi)
12391 : {
12392 : ready = false;
12393 : break;
12394 : }
12395 : }
12396 : else
12397 : {
12398 64203 : if (phi)
12399 : {
12400 : ready = true;
12401 : break;
12402 : }
12403 : }
12404 33310 : if (ready)
12405 : {
12406 55497 : vect_schedule_slp_node (vinfo, entry, instance);
12407 55497 : scc_info.get (entry)->on_stack = false;
          :                   /* Mark the slot done; later sweeps skip NULL entries.  */
12408 55497 : stack[idx] = NULL;
12409 55497 : todo--;
12410 55497 : if (phi)
12411 24273 : phis_to_fixup.safe_push (entry);
12412 : }
12413 : }
12414 : }
12415 24164 : while (todo != 0);
12416 :
12417 : /* Pop the SCC. */
12418 23837 : stack.truncate (last_idx);
12419 : }
12420 :
12421 : /* Now fixup the backedge def of the vectorized PHIs in this SCC. */
12422 : slp_tree phi_node;
12423 1920030 : FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
12424 : {
12425 54512 : gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
12426 54512 : edge_iterator ei;
12427 54512 : edge e;
12428 171804 : FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
12429 : {
          :           /* Children are indexed by the incoming edge's dest_idx; skip
          :              edges with no vectorized (internal) def to fill in.  */
12430 117292 : unsigned dest_idx = e->dest_idx;
12431 117292 : child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
12432 117292 : if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
12433 66013 : continue;
12434 51279 : unsigned n = SLP_TREE_VEC_DEFS (phi_node).length ();
12435 : /* Simply fill all args. */
12436 51279 : if (STMT_VINFO_DEF_TYPE (SLP_TREE_REPRESENTATIVE (phi_node))
12437 : != vect_first_order_recurrence)
12438 110252 : for (unsigned i = 0; i < n; ++i)
12439 : {
12440 59013 : tree phidef = SLP_TREE_VEC_DEFS (phi_node)[i];
          :               /* Note this PHI shadows the scalar one above; the arg is
          :                  added to the vectorized PHI defining PHIDEF.  */
12441 59013 : gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
12442 59013 : add_phi_arg (phi, vect_get_slp_vect_def (child, i),
12443 : e, gimple_phi_arg_location (phi, dest_idx));
12444 : }
12445 : else
12446 : {
12447 : /* Unless it is a first order recurrence which needs
12448 : args filled in for both the PHI node and the permutes. */
12449 40 : gimple *perm
12450 40 : = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[0]);
12451 40 : gimple *rphi = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (perm));
12452 40 : add_phi_arg (as_a <gphi *> (rphi),
12453 : vect_get_slp_vect_def (child, n - 1),
12454 : e, gimple_phi_arg_location (phi, dest_idx));
12455 117 : for (unsigned i = 0; i < n; ++i)
12456 : {
12457 77 : gimple *perm
12458 77 : = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[i]);
12459 77 : if (i > 0)
12460 37 : gimple_assign_set_rhs1 (perm,
12461 : vect_get_slp_vect_def (child, i - 1));
12462 77 : gimple_assign_set_rhs2 (perm,
12463 : vect_get_slp_vect_def (child, i));
12464 77 : update_stmt (perm);
12465 : }
12466 : }
12467 : }
12468 : }
12469 932759 : }
12470 :
12471 : /* Generate vector code for SLP_INSTANCES in the loop/basic block. */
          : /* Works in two passes: first schedule (code generate) every instance's
          :    SLP graph via SCC-aware DFS, sharing SCC_INFO across instances so
          :    shared subgraphs are emitted only once; then remove the now dead
          :    scalar stores (and, for loops, scalar calls) that were vectorized.  */
12472 :
12473 : void
12474 538891 : vect_schedule_slp (vec_info *vinfo, const vec<slp_instance> &slp_instances)
12475 : {
12476 538891 : slp_instance instance;
12477 538891 : unsigned int i;
12478 :
12479 538891 : hash_map<slp_tree, slp_scc_info> scc_info;
12480 538891 : int maxdfs = 0;
12481 1116517 : FOR_EACH_VEC_ELT (slp_instances, i, instance)
12482 : {
12483 577626 : slp_tree node = SLP_INSTANCE_TREE (instance);
12484 577626 : if (dump_enabled_p ())
12485 : {
12486 16008 : dump_printf_loc (MSG_NOTE, vect_location,
12487 : "Vectorizing SLP tree:\n");
12488 : /* ??? Dump all? */
12489 16008 : if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
12490 469 : dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
12491 469 : SLP_INSTANCE_ROOT_STMTS (instance)[0]->stmt);
12492 16008 : vect_print_slp_graph (MSG_NOTE, vect_location,
12493 : SLP_INSTANCE_TREE (instance));
12494 : }
12495 : /* Schedule the tree of INSTANCE, scheduling SCCs in a way to
12496 : have a PHI be the node breaking the cycle. */
12497 577626 : auto_vec<slp_tree> stack;
          :       /* The root may already be scheduled as part of a previous
          :          instance's graph - skip the walk then.  */
12498 577626 : if (!scc_info.get (node))
12499 577521 : vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
12500 :
          :       /* Materialize the root statements (e.g. reduction epilogue or
          :          early-exit condition) on top of the vectorized tree.  */
12501 577626 : if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
12502 10820 : vectorize_slp_instance_root_stmt (vinfo, node, instance);
12503 :
12504 577626 : if (dump_enabled_p ())
12505 16008 : dump_printf_loc (MSG_NOTE, vect_location,
12506 : "vectorizing stmts using SLP.\n");
12507 577626 : }
12508 :
12509 1655408 : FOR_EACH_VEC_ELT (slp_instances, i, instance)
12510 : {
12511 577626 : slp_tree root = SLP_INSTANCE_TREE (instance);
12512 577626 : stmt_vec_info store_info;
12513 577626 : unsigned int j;
12514 :
12515 : /* Remove scalar call stmts. Do not do this for basic-block
12516 : vectorization as not all uses may be vectorized.
12517 : ??? Why should this be necessary? DCE should be able to
12518 : remove the stmts itself.
12519 : ??? For BB vectorization we can as well remove scalar
12520 : stmts starting from the SLP tree root if they have no
12521 : uses. */
12522 577626 : if (is_a <loop_vec_info> (vinfo))
12523 89782 : vect_remove_slp_scalar_calls (vinfo, root);
12524 :
12525 : /* Remove vectorized stores original scalar stmts. */
          :       /* Only the leading run of store stmts is removed; the loop stops
          :          at the first non-store (or missing) scalar stmt.  */
12526 2575575 : for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
12527 : {
12528 1456550 : if (!store_info
12529 1456536 : || !STMT_VINFO_DATA_REF (store_info)
12530 1428964 : || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
12531 : break;
12532 :
12533 1420323 : store_info = vect_orig_stmt (store_info);
12534 : /* Free the attached stmt_vec_info and remove the stmt. */
12535 1420323 : vinfo->remove_stmt (store_info);
12536 :
12537 : /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it
12538 : to not crash in vect_free_slp_tree later. */
12539 1420323 : if (SLP_TREE_REPRESENTATIVE (root) == store_info)
12540 541100 : SLP_TREE_REPRESENTATIVE (root) = NULL;
12541 : }
12542 : }
12543 538891 : }
|