Branch data Line data Source code
1 : : /* SLP - Basic Block Vectorization
2 : : Copyright (C) 2007-2025 Free Software Foundation, Inc.
3 : : Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 : : and Ira Rosen <irar@il.ibm.com>
5 : :
6 : : This file is part of GCC.
7 : :
8 : : GCC is free software; you can redistribute it and/or modify it under
9 : : the terms of the GNU General Public License as published by the Free
10 : : Software Foundation; either version 3, or (at your option) any later
11 : : version.
12 : :
13 : : GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 : : WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 : : FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 : : for more details.
17 : :
18 : : You should have received a copy of the GNU General Public License
19 : : along with GCC; see the file COPYING3. If not see
20 : : <http://www.gnu.org/licenses/>. */
21 : :
22 : : #include "config.h"
23 : : #define INCLUDE_ALGORITHM
24 : : #include "system.h"
25 : : #include "coretypes.h"
26 : : #include "backend.h"
27 : : #include "target.h"
28 : : #include "rtl.h"
29 : : #include "tree.h"
30 : : #include "gimple.h"
31 : : #include "tree-pass.h"
32 : : #include "ssa.h"
33 : : #include "optabs-tree.h"
34 : : #include "insn-config.h"
35 : : #include "recog.h" /* FIXME: for insn_data */
36 : : #include "fold-const.h"
37 : : #include "stor-layout.h"
38 : : #include "gimple-iterator.h"
39 : : #include "cfgloop.h"
40 : : #include "tree-vectorizer.h"
41 : : #include "langhooks.h"
42 : : #include "gimple-walk.h"
43 : : #include "dbgcnt.h"
44 : : #include "tree-vector-builder.h"
45 : : #include "vec-perm-indices.h"
46 : : #include "gimple-fold.h"
47 : : #include "internal-fn.h"
48 : : #include "dump-context.h"
49 : : #include "cfganal.h"
50 : : #include "tree-eh.h"
51 : : #include "tree-cfg.h"
52 : : #include "alloc-pool.h"
53 : : #include "sreal.h"
54 : : #include "predict.h"
55 : :
56 : : #define REDUC_GROUP_FIRST_ELEMENT(S) \
57 : : (gcc_checking_assert (!(S)->dr_aux.dr), (S)->first_element)
58 : :
59 : : static bool vect_transform_slp_perm_load_1 (vec_info *, slp_tree,
60 : : load_permutation_t &,
61 : : const vec<tree> &,
62 : : gimple_stmt_iterator *,
63 : : poly_uint64, bool, bool,
64 : : unsigned *,
65 : : unsigned * = nullptr,
66 : : bool = false);
67 : : static int vectorizable_slp_permutation_1 (vec_info *, gimple_stmt_iterator *,
68 : : slp_tree, lane_permutation_t &,
69 : : vec<slp_tree> &, bool);
70 : : static void vect_print_slp_tree (dump_flags_t, dump_location_t, slp_tree);
71 : : static bool vect_slp_can_convert_to_external (const vec<stmt_vec_info> &);
72 : :
73 : : static object_allocator<_slp_tree> *slp_tree_pool;
74 : : static slp_tree slp_first_node;
75 : :
76 : : void
77 : 1120955 : vect_slp_init (void)
78 : : {
79 : 1120955 : slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
80 : 1120955 : }
81 : :
82 : : void
83 : 1120955 : vect_slp_fini (void)
84 : : {
85 : 1706049 : while (slp_first_node)
86 : 585094 : delete slp_first_node;
87 : 2241910 : delete slp_tree_pool;
88 : 1120955 : slp_tree_pool = NULL;
89 : 1120955 : }
90 : :
91 : : void *
92 : 7921087 : _slp_tree::operator new (size_t n)
93 : : {
94 : 7921087 : gcc_assert (n == sizeof (_slp_tree));
95 : 7921087 : return slp_tree_pool->allocate_raw ();
96 : : }
97 : :
98 : : void
99 : 7921087 : _slp_tree::operator delete (void *node, size_t n)
100 : : {
101 : 7921087 : gcc_assert (n == sizeof (_slp_tree));
102 : 7921087 : slp_tree_pool->remove_raw (node);
103 : 7921087 : }
104 : :
105 : :
106 : : /* Initialize a SLP node. */
107 : :
108 : 7921087 : _slp_tree::_slp_tree ()
109 : : {
110 : 7921087 : this->prev_node = NULL;
111 : 7921087 : if (slp_first_node)
112 : 7013222 : slp_first_node->prev_node = this;
113 : 7921087 : this->next_node = slp_first_node;
114 : 7921087 : slp_first_node = this;
115 : 7921087 : SLP_TREE_SCALAR_STMTS (this) = vNULL;
116 : 7921087 : SLP_TREE_SCALAR_OPS (this) = vNULL;
117 : 7921087 : SLP_TREE_VEC_DEFS (this) = vNULL;
118 : 7921087 : SLP_TREE_CHILDREN (this) = vNULL;
119 : 7921087 : SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
120 : 7921087 : SLP_TREE_LANE_PERMUTATION (this) = vNULL;
121 : 7921087 : SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
122 : 7921087 : SLP_TREE_CODE (this) = ERROR_MARK;
123 : 7921087 : SLP_TREE_GS_SCALE (this) = 0;
124 : 7921087 : SLP_TREE_GS_BASE (this) = NULL_TREE;
125 : 7921087 : this->ldst_lanes = false;
126 : 7921087 : this->avoid_stlf_fail = false;
127 : 7921087 : SLP_TREE_VECTYPE (this) = NULL_TREE;
128 : 7921087 : SLP_TREE_REPRESENTATIVE (this) = NULL;
129 : 7921087 : this->cycle_info.id = -1;
130 : 7921087 : this->cycle_info.reduc_idx = -1;
131 : 7921087 : SLP_TREE_REF_COUNT (this) = 1;
132 : 7921087 : this->failed = NULL;
133 : 7921087 : this->max_nunits = 1;
134 : 7921087 : this->lanes = 0;
135 : 7921087 : SLP_TREE_TYPE (this) = undef_vec_info_type;
136 : 7921087 : this->data = NULL;
137 : 7921087 : }
138 : :
139 : : /* Tear down a SLP node. */
140 : :
141 : 7921087 : _slp_tree::~_slp_tree ()
142 : : {
143 : 7921087 : if (this->prev_node)
144 : 5147392 : this->prev_node->next_node = this->next_node;
145 : : else
146 : 2773695 : slp_first_node = this->next_node;
147 : 7921087 : if (this->next_node)
148 : 5946164 : this->next_node->prev_node = this->prev_node;
149 : 7921087 : SLP_TREE_CHILDREN (this).release ();
150 : 7921087 : SLP_TREE_SCALAR_STMTS (this).release ();
151 : 7921087 : SLP_TREE_SCALAR_OPS (this).release ();
152 : 7921087 : SLP_TREE_VEC_DEFS (this).release ();
153 : 7921087 : SLP_TREE_LOAD_PERMUTATION (this).release ();
154 : 7921087 : SLP_TREE_LANE_PERMUTATION (this).release ();
155 : 7921087 : if (this->failed)
156 : 1951776 : free (failed);
157 : 7921087 : if (this->data)
158 : 1153536 : delete this->data;
159 : 7921087 : }
160 : :
161 : : /* Push the single SSA definition in DEF to the vector of vector defs. */
162 : :
163 : : void
164 : 522718 : _slp_tree::push_vec_def (gimple *def)
165 : : {
166 : 522718 : if (gphi *phi = dyn_cast <gphi *> (def))
167 : 59182 : vec_defs.quick_push (gimple_phi_result (phi));
168 : : else
169 : : {
170 : 463536 : def_operand_p defop = single_ssa_def_operand (def, SSA_OP_ALL_DEFS);
171 : 463536 : vec_defs.quick_push (get_def_from_ptr (defop));
172 : : }
173 : 522718 : }
174 : :
175 : : /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
176 : :
177 : : void
178 : 15140975 : vect_free_slp_tree (slp_tree node)
179 : : {
180 : 15140975 : int i;
181 : 15140975 : slp_tree child;
182 : :
183 : 15140975 : if (--SLP_TREE_REF_COUNT (node) != 0)
184 : 15140975 : return;
185 : :
186 : 11874064 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
187 : 4538071 : if (child)
188 : 3777493 : vect_free_slp_tree (child);
189 : :
190 : : /* If the node defines any SLP only patterns then those patterns are no
191 : : longer valid and should be removed. */
192 : 7335993 : stmt_vec_info rep_stmt_info = SLP_TREE_REPRESENTATIVE (node);
193 : 7335993 : if (rep_stmt_info && STMT_VINFO_SLP_VECT_ONLY_PATTERN (rep_stmt_info))
194 : : {
195 : 964 : stmt_vec_info stmt_info = vect_orig_stmt (rep_stmt_info);
196 : 964 : STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
197 : 964 : STMT_SLP_TYPE (stmt_info) = STMT_SLP_TYPE (rep_stmt_info);
198 : : }
199 : :
200 : 7335993 : delete node;
201 : : }
202 : :
203 : : /* Return a location suitable for dumpings related to the SLP instance. */
204 : :
205 : : dump_user_location_t
206 : 3411929 : _slp_instance::location () const
207 : : {
208 : 3411929 : if (!root_stmts.is_empty ())
209 : 321086 : return root_stmts[0]->stmt;
210 : : else
211 : 3090843 : return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
212 : : }
213 : :
214 : :
215 : : /* Free the memory allocated for the SLP instance. */
216 : :
217 : : void
218 : 1764223 : vect_free_slp_instance (slp_instance instance)
219 : : {
220 : 1764223 : vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
221 : 1764223 : SLP_INSTANCE_LOADS (instance).release ();
222 : 1764223 : SLP_INSTANCE_ROOT_STMTS (instance).release ();
223 : 1764223 : SLP_INSTANCE_REMAIN_DEFS (instance).release ();
224 : 1764223 : instance->subgraph_entries.release ();
225 : 1764223 : instance->cost_vec.release ();
226 : 1764223 : free (instance);
227 : 1764223 : }
228 : :
229 : :
230 : : /* Create an SLP node for SCALAR_STMTS. */
231 : :
232 : : slp_tree
233 : 107498 : vect_create_new_slp_node (unsigned nops, tree_code code)
234 : : {
235 : 107498 : slp_tree node = new _slp_tree;
236 : 107498 : SLP_TREE_SCALAR_STMTS (node) = vNULL;
237 : 107498 : SLP_TREE_CHILDREN (node).create (nops);
238 : 107498 : SLP_TREE_DEF_TYPE (node) = vect_internal_def;
239 : 107498 : SLP_TREE_CODE (node) = code;
240 : 107498 : return node;
241 : : }
242 : : /* Create an SLP node for SCALAR_STMTS. */
243 : :
244 : : static slp_tree
245 : 3826811 : vect_create_new_slp_node (slp_tree node,
246 : : vec<stmt_vec_info> scalar_stmts, unsigned nops)
247 : : {
248 : 3826811 : SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
249 : 3826811 : SLP_TREE_CHILDREN (node).create (nops);
250 : 3826811 : SLP_TREE_DEF_TYPE (node) = vect_internal_def;
251 : 3826811 : SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
252 : 3826811 : SLP_TREE_LANES (node) = scalar_stmts.length ();
253 : 3826811 : return node;
254 : : }
255 : :
256 : : /* Create an SLP node for SCALAR_STMTS. */
257 : :
258 : : static slp_tree
259 : 6206 : vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
260 : : {
261 : 6206 : return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
262 : : }
263 : :
264 : : /* Create an SLP node for OPS. */
265 : :
266 : : static slp_tree
267 : 2025244 : vect_create_new_slp_node (slp_tree node, vec<tree> ops)
268 : : {
269 : 2025244 : SLP_TREE_SCALAR_OPS (node) = ops;
270 : 2025244 : SLP_TREE_DEF_TYPE (node) = vect_external_def;
271 : 0 : SLP_TREE_LANES (node) = ops.length ();
272 : 2025244 : return node;
273 : : }
274 : :
275 : : /* Create an SLP node for OPS. */
276 : :
277 : : static slp_tree
278 : 2025244 : vect_create_new_slp_node (vec<tree> ops)
279 : : {
280 : 2025244 : return vect_create_new_slp_node (new _slp_tree, ops);
281 : : }
282 : :
283 : :
284 : : /* This structure is used in creation of an SLP tree. Each instance
285 : : corresponds to the same operand in a group of scalar stmts in an SLP
286 : : node. */
287 : : typedef struct _slp_oprnd_info
288 : : {
289 : : /* Def-stmts for the operands. */
290 : : vec<stmt_vec_info> def_stmts;
291 : : /* Operands. */
292 : : vec<tree> ops;
293 : : /* Information about the first statement, its vector def-type, type, the
294 : : operand itself in case it's constant, and an indication if it's a pattern
295 : : stmt and gather/scatter info. */
296 : : tree first_op_type;
297 : : enum vect_def_type first_dt;
298 : : bool any_pattern;
299 : : bool first_gs_p;
300 : : gather_scatter_info first_gs_info;
301 : : } *slp_oprnd_info;
302 : :
303 : :
304 : : /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
305 : : operand. */
306 : : static vec<slp_oprnd_info>
307 : 3466828 : vect_create_oprnd_info (int nops, int group_size)
308 : : {
309 : 3466828 : int i;
310 : 3466828 : slp_oprnd_info oprnd_info;
311 : 3466828 : vec<slp_oprnd_info> oprnds_info;
312 : :
313 : 3466828 : oprnds_info.create (nops);
314 : 12580060 : for (i = 0; i < nops; i++)
315 : : {
316 : 5646404 : oprnd_info = XNEW (struct _slp_oprnd_info);
317 : 5646404 : oprnd_info->def_stmts.create (group_size);
318 : 5646404 : oprnd_info->ops.create (group_size);
319 : 5646404 : oprnd_info->first_dt = vect_uninitialized_def;
320 : 5646404 : oprnd_info->first_op_type = NULL_TREE;
321 : 5646404 : oprnd_info->any_pattern = false;
322 : 5646404 : oprnd_info->first_gs_p = false;
323 : 5646404 : oprnds_info.quick_push (oprnd_info);
324 : : }
325 : :
326 : 3466828 : return oprnds_info;
327 : : }
328 : :
329 : :
330 : : /* Free operands info. */
331 : :
332 : : static void
333 : 3466828 : vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
334 : : {
335 : 3466828 : int i;
336 : 3466828 : slp_oprnd_info oprnd_info;
337 : :
338 : 9113232 : FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
339 : : {
340 : 5646404 : oprnd_info->def_stmts.release ();
341 : 5646404 : oprnd_info->ops.release ();
342 : 5646404 : XDELETE (oprnd_info);
343 : : }
344 : :
345 : 3466828 : oprnds_info.release ();
346 : 3466828 : }
347 : :
348 : : /* Return the execution frequency of NODE (so that a higher value indicates
349 : : a "more important" node when optimizing for speed). */
350 : :
351 : : static sreal
352 : 3609633 : vect_slp_node_weight (slp_tree node)
353 : : {
354 : 3609633 : stmt_vec_info stmt_info = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (node));
355 : 3609633 : basic_block bb = gimple_bb (stmt_info->stmt);
356 : 3609633 : return bb->count.to_sreal_scale (ENTRY_BLOCK_PTR_FOR_FN (cfun)->count);
357 : : }
358 : :
359 : : /* Return true if STMTS contains a pattern statement. */
360 : :
361 : : static bool
362 : 22241 : vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
363 : : {
364 : 22241 : stmt_vec_info stmt_info;
365 : 22241 : unsigned int i;
366 : 72561 : FOR_EACH_VEC_ELT (stmts, i, stmt_info)
367 : 52513 : if (stmt_info && is_pattern_stmt_p (stmt_info))
368 : : return true;
369 : : return false;
370 : : }
371 : :
372 : : /* Return true when all lanes in the external or constant NODE have
373 : : the same value. */
374 : :
375 : : static bool
376 : 610225 : vect_slp_tree_uniform_p (slp_tree node)
377 : : {
378 : 610225 : gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
379 : : || SLP_TREE_DEF_TYPE (node) == vect_external_def);
380 : :
381 : : /* Pre-exsting vectors. */
382 : 1076607 : if (SLP_TREE_SCALAR_OPS (node).is_empty ())
383 : : return false;
384 : :
385 : : unsigned i;
386 : : tree op, first = NULL_TREE;
387 : 1392648 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
388 : 1248805 : if (!first)
389 : : first = op;
390 : 638580 : else if (!operand_equal_p (first, op, 0))
391 : : return false;
392 : :
393 : : return true;
394 : : }
395 : :
396 : : /* Find the place of the data-ref in STMT_INFO in the interleaving chain
397 : : that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
398 : : of the chain. */
399 : :
400 : : int
401 : 680614 : vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
402 : : stmt_vec_info first_stmt_info)
403 : : {
404 : 680614 : stmt_vec_info next_stmt_info = first_stmt_info;
405 : 680614 : int result = 0;
406 : :
407 : 680614 : if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
408 : : return -1;
409 : :
410 : 1717418 : do
411 : : {
412 : 1717418 : if (next_stmt_info == stmt_info)
413 : : return result;
414 : 1036804 : next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
415 : 1036804 : if (next_stmt_info)
416 : 1036804 : result += DR_GROUP_GAP (next_stmt_info);
417 : : }
418 : 1036804 : while (next_stmt_info);
419 : :
420 : : return -1;
421 : : }
422 : :
423 : : /* Check whether it is possible to load COUNT elements of type ELT_TYPE
424 : : using the method implemented by duplicate_and_interleave. Return true
425 : : if so, returning the number of intermediate vectors in *NVECTORS_OUT
426 : : (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
427 : : (if nonnull). */
428 : :
429 : : bool
430 : 0 : can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
431 : : tree elt_type, unsigned int *nvectors_out,
432 : : tree *vector_type_out,
433 : : tree *permutes)
434 : : {
435 : 0 : tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
436 : 0 : if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
437 : 0 : return false;
438 : :
439 : 0 : machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
440 : 0 : poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
441 : 0 : unsigned int nvectors = 1;
442 : 0 : for (;;)
443 : : {
444 : 0 : scalar_int_mode int_mode;
445 : 0 : poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
446 : 0 : if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
447 : : {
448 : : /* Get the natural vector type for this SLP group size. */
449 : 0 : tree int_type = build_nonstandard_integer_type
450 : 0 : (GET_MODE_BITSIZE (int_mode), 1);
451 : 0 : tree vector_type
452 : 0 : = get_vectype_for_scalar_type (vinfo, int_type, count);
453 : 0 : poly_int64 half_nelts;
454 : 0 : if (vector_type
455 : 0 : && VECTOR_MODE_P (TYPE_MODE (vector_type))
456 : 0 : && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
457 : : GET_MODE_SIZE (base_vector_mode))
458 : 0 : && multiple_p (GET_MODE_NUNITS (TYPE_MODE (vector_type)),
459 : : 2, &half_nelts))
460 : : {
461 : : /* Try fusing consecutive sequences of COUNT / NVECTORS elements
462 : : together into elements of type INT_TYPE and using the result
463 : : to build NVECTORS vectors. */
464 : 0 : poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
465 : 0 : vec_perm_builder sel1 (nelts, 2, 3);
466 : 0 : vec_perm_builder sel2 (nelts, 2, 3);
467 : :
468 : 0 : for (unsigned int i = 0; i < 3; ++i)
469 : : {
470 : 0 : sel1.quick_push (i);
471 : 0 : sel1.quick_push (i + nelts);
472 : 0 : sel2.quick_push (half_nelts + i);
473 : 0 : sel2.quick_push (half_nelts + i + nelts);
474 : : }
475 : 0 : vec_perm_indices indices1 (sel1, 2, nelts);
476 : 0 : vec_perm_indices indices2 (sel2, 2, nelts);
477 : 0 : machine_mode vmode = TYPE_MODE (vector_type);
478 : 0 : if (can_vec_perm_const_p (vmode, vmode, indices1)
479 : 0 : && can_vec_perm_const_p (vmode, vmode, indices2))
480 : : {
481 : 0 : if (nvectors_out)
482 : 0 : *nvectors_out = nvectors;
483 : 0 : if (vector_type_out)
484 : 0 : *vector_type_out = vector_type;
485 : 0 : if (permutes)
486 : : {
487 : 0 : permutes[0] = vect_gen_perm_mask_checked (vector_type,
488 : : indices1);
489 : 0 : permutes[1] = vect_gen_perm_mask_checked (vector_type,
490 : : indices2);
491 : : }
492 : 0 : return true;
493 : : }
494 : 0 : }
495 : : }
496 : 0 : if (!multiple_p (elt_bytes, 2, &elt_bytes))
497 : : return false;
498 : 0 : nvectors *= 2;
499 : : /* We need to be able to fuse COUNT / NVECTORS elements together. */
500 : 0 : if (!multiple_p (count, nvectors))
501 : : return false;
502 : : }
503 : : }
504 : :
505 : : /* Return true if DTA and DTB match. */
506 : :
507 : : static bool
508 : 16798676 : vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
509 : : {
510 : 16798676 : return (dta == dtb
511 : 333347 : || ((dta == vect_external_def || dta == vect_constant_def)
512 : 209957 : && (dtb == vect_external_def || dtb == vect_constant_def)));
513 : : }
514 : :
515 : : #define GATHER_SCATTER_OFFSET (-3)
516 : :
517 : : static const int no_arg_map[] = { 0 };
518 : : static const int arg0_map[] = { 1, 0 };
519 : : static const int arg2_map[] = { 1, 2 };
520 : : static const int arg2_arg3_map[] = { 2, 2, 3 };
521 : : static const int arg2_arg4_map[] = { 2, 2, 4 };
522 : : static const int arg2_arg5_arg6_map[] = { 3, 2, 5, 6 };
523 : : static const int arg2_arg4_arg5_map[] = { 3, 2, 4, 5 };
524 : : static const int arg3_arg2_map[] = { 2, 3, 2 };
525 : : static const int op1_op0_map[] = { 2, 1, 0 };
526 : : static const int off_map[] = { 1, GATHER_SCATTER_OFFSET };
527 : : static const int off_op0_map[] = { 2, GATHER_SCATTER_OFFSET, 0 };
528 : : static const int off_arg2_arg3_map[] = { 3, GATHER_SCATTER_OFFSET, 2, 3 };
529 : : static const int off_arg3_arg2_map[] = { 3, GATHER_SCATTER_OFFSET, 3, 2 };
530 : : static const int mask_call_maps[6][7] = {
531 : : { 1, 1, },
532 : : { 2, 1, 2, },
533 : : { 3, 1, 2, 3, },
534 : : { 4, 1, 2, 3, 4, },
535 : : { 5, 1, 2, 3, 4, 5, },
536 : : { 6, 1, 2, 3, 4, 5, 6 },
537 : : };
538 : :
539 : : /* For most SLP statements, there is a one-to-one mapping between
540 : : gimple arguments and child nodes. If that is not true for STMT,
541 : : return an array that contains:
542 : :
543 : : - the number of child nodes, followed by
544 : : - for each child node, the index of the argument associated with that node.
545 : : The special index -1 is the first operand of an embedded comparison and
546 : : the special index -2 is the second operand of an embedded comparison.
547 : : The special indes -3 is the offset of a gather as analyzed by
548 : : vect_check_gather_scatter.
549 : :
550 : : SWAP is as for vect_get_and_check_slp_defs. */
551 : :
552 : : static const int *
553 : 19833756 : vect_get_operand_map (const gimple *stmt, bool gather_scatter_p = false,
554 : : unsigned char swap = 0)
555 : : {
556 : 19833756 : if (auto assign = dyn_cast<const gassign *> (stmt))
557 : : {
558 : 18256807 : if (gimple_assign_rhs_code (assign) == COND_EXPR
559 : 18256807 : && COMPARISON_CLASS_P (gimple_assign_rhs1 (assign)))
560 : 0 : gcc_unreachable ();
561 : 18256807 : if ((TREE_CODE_CLASS (gimple_assign_rhs_code (assign)) == tcc_comparison
562 : 16956080 : || commutative_tree_code (gimple_assign_rhs_code (assign)))
563 : 27059075 : && swap)
564 : : return op1_op0_map;
565 : 18213792 : if (gather_scatter_p)
566 : 30606 : return (TREE_CODE (gimple_assign_lhs (assign)) != SSA_NAME
567 : 30606 : ? off_op0_map : off_map);
568 : : }
569 : 19760135 : gcc_assert (!swap);
570 : 19760135 : if (auto call = dyn_cast<const gcall *> (stmt))
571 : : {
572 : 134587 : if (gimple_call_internal_p (call))
573 : 71986 : switch (gimple_call_internal_fn (call))
574 : : {
575 : 11727 : case IFN_MASK_LOAD:
576 : 19234 : return gather_scatter_p ? off_arg2_arg3_map : arg2_arg3_map;
577 : :
578 : 0 : case IFN_GATHER_LOAD:
579 : 0 : return arg2_map;
580 : :
581 : 0 : case IFN_MASK_GATHER_LOAD:
582 : 0 : case IFN_MASK_LEN_GATHER_LOAD:
583 : 0 : return arg2_arg5_arg6_map;
584 : :
585 : 0 : case IFN_SCATTER_STORE:
586 : 0 : return arg2_arg4_map;
587 : :
588 : 0 : case IFN_MASK_SCATTER_STORE:
589 : 0 : case IFN_MASK_LEN_SCATTER_STORE:
590 : 0 : return arg2_arg4_arg5_map;
591 : :
592 : 5824 : case IFN_MASK_STORE:
593 : 10372 : return gather_scatter_p ? off_arg3_arg2_map : arg3_arg2_map;
594 : :
595 : 976 : case IFN_MASK_CALL:
596 : 976 : {
597 : 976 : unsigned nargs = gimple_call_num_args (call);
598 : 976 : if (nargs >= 2 && nargs <= 7)
599 : 976 : return mask_call_maps[nargs-2];
600 : : else
601 : : return nullptr;
602 : : }
603 : :
604 : 140 : case IFN_CLZ:
605 : 140 : case IFN_CTZ:
606 : 140 : return arg0_map;
607 : :
608 : 6306 : case IFN_GOMP_SIMD_LANE:
609 : 6306 : return no_arg_map;
610 : :
611 : : default:
612 : : break;
613 : : }
614 : : }
615 : : return nullptr;
616 : : }
617 : :
618 : : /* Return the SLP node child index for operand OP of STMT. */
619 : :
620 : : int
621 : 1336298 : vect_slp_child_index_for_operand (const gimple *stmt, int op,
622 : : bool gather_scatter_p)
623 : : {
624 : 1336298 : const int *opmap = vect_get_operand_map (stmt, gather_scatter_p);
625 : 1336298 : if (!opmap)
626 : : return op;
627 : 17819 : for (int i = 1; i < 1 + opmap[0]; ++i)
628 : 17819 : if (opmap[i] == op)
629 : 9751 : return i - 1;
630 : 0 : gcc_unreachable ();
631 : : }
632 : :
633 : : /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
634 : : they are of a valid type and that they match the defs of the first stmt of
635 : : the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
636 : : by swapping operands of STMTS[STMT_NUM] when possible. Non-zero SWAP
637 : : indicates swap is required for cond_expr stmts. Specifically, SWAP
638 : : is 1 if STMT is cond and operands of comparison need to be swapped;
639 : : SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
640 : :
641 : : If there was a fatal error return -1; if the error could be corrected by
642 : : swapping operands of father node of this one, return 1; if everything is
643 : : ok return 0. */
644 : : static int
645 : 12711703 : vect_get_and_check_slp_defs (vec_info *vinfo, tree vectype, unsigned char swap,
646 : : bool *skip_args,
647 : : vec<stmt_vec_info> stmts, unsigned stmt_num,
648 : : vec<slp_oprnd_info> *oprnds_info)
649 : : {
650 : 12711703 : stmt_vec_info stmt_info = stmts[stmt_num];
651 : 12711703 : tree oprnd;
652 : 12711703 : unsigned int i, number_of_oprnds;
653 : 12711703 : enum vect_def_type dt = vect_uninitialized_def;
654 : 12711703 : slp_oprnd_info oprnd_info;
655 : 12711703 : gather_scatter_info gs_info;
656 : 12711703 : unsigned int gs_op = -1u;
657 : 12711703 : unsigned int commutative_op = -1U;
658 : 12711703 : bool first = stmt_num == 0;
659 : :
660 : 12711703 : if (!stmt_info)
661 : : {
662 : 0 : for (auto oi : *oprnds_info)
663 : : {
664 : 0 : oi->def_stmts.quick_push (NULL);
665 : 0 : oi->ops.quick_push (NULL_TREE);
666 : : }
667 : : return 0;
668 : : }
669 : :
670 : 12711703 : if (!is_a<gcall *> (stmt_info->stmt)
671 : : && !is_a<gassign *> (stmt_info->stmt)
672 : : && !is_a<gphi *> (stmt_info->stmt))
673 : : return -1;
674 : :
675 : 12711703 : number_of_oprnds = gimple_num_args (stmt_info->stmt);
676 : 12711703 : const int *map
677 : 25423406 : = vect_get_operand_map (stmt_info->stmt,
678 : 12711703 : STMT_VINFO_GATHER_SCATTER_P (stmt_info), swap);
679 : 12711703 : if (map)
680 : 66032 : number_of_oprnds = *map++;
681 : 12711703 : if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
682 : : {
683 : 39808 : if (gimple_call_internal_p (stmt))
684 : : {
685 : 23530 : internal_fn ifn = gimple_call_internal_fn (stmt);
686 : 23530 : commutative_op = first_commutative_argument (ifn);
687 : 23530 : if (internal_gather_scatter_fn_p (ifn))
688 : : {
689 : 0 : vect_describe_gather_scatter_call
690 : 0 : (stmt_info,
691 : 0 : first ? &(*oprnds_info)[0]->first_gs_info : &gs_info);
692 : 0 : if (first)
693 : 0 : (*oprnds_info)[0]->first_gs_p = true;
694 : : gs_op = 0;
695 : : }
696 : : }
697 : : }
698 : 12671895 : else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
699 : : {
700 : 14514933 : if (commutative_tree_code (gimple_assign_rhs_code (stmt)))
701 : 8311819 : commutative_op = 0;
702 : : }
703 : :
704 : 12711703 : bool swapped = (swap != 0);
705 : 12711703 : bool backedge = false;
706 : 12711703 : enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
707 : 35294376 : for (i = 0; i < number_of_oprnds; i++)
708 : : {
709 : 22585145 : oprnd_info = (*oprnds_info)[i];
710 : 22585145 : int opno = map ? map[i] : int (i);
711 : 22585145 : if (opno == GATHER_SCATTER_OFFSET)
712 : : {
713 : 16247 : gcc_assert (STMT_VINFO_GATHER_SCATTER_P (stmt_info));
714 : 16247 : if (!is_a <loop_vec_info> (vinfo)
715 : 16247 : || !vect_check_gather_scatter (stmt_info, vectype,
716 : : as_a <loop_vec_info> (vinfo),
717 : : first ? &oprnd_info->first_gs_info
718 : : : &gs_info))
719 : 2472 : return -1;
720 : :
721 : 16247 : if (first)
722 : : {
723 : 16010 : oprnd_info->first_gs_p = true;
724 : 16010 : oprnd = oprnd_info->first_gs_info.offset;
725 : : }
726 : : else
727 : : {
728 : 237 : gs_op = i;
729 : 237 : oprnd = gs_info.offset;
730 : : }
731 : : }
732 : 22568898 : else if (opno < 0)
733 : 0 : oprnd = TREE_OPERAND (gimple_arg (stmt_info->stmt, 0), -1 - opno);
734 : : else
735 : : {
736 : 22568898 : oprnd = gimple_arg (stmt_info->stmt, opno);
737 : 22568898 : if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
738 : : {
739 : 1510376 : edge e = gimple_phi_arg_edge (stmt, opno);
740 : 3020752 : backedge = (is_a <bb_vec_info> (vinfo)
741 : 2480821 : ? e->flags & EDGE_DFS_BACK
742 : 970445 : : dominated_by_p (CDI_DOMINATORS, e->src,
743 : 970445 : gimple_bb (stmt_info->stmt)));
744 : : }
745 : : }
746 : 22585145 : if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
747 : 2616 : oprnd = TREE_OPERAND (oprnd, 0);
748 : :
749 : 22585145 : stmt_vec_info def_stmt_info;
750 : 22585145 : if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
751 : : {
752 : 1109 : if (dump_enabled_p ())
753 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
754 : : "Build SLP failed: can't analyze def for %T\n",
755 : : oprnd);
756 : :
757 : 1109 : return -1;
758 : : }
759 : :
760 : 22584036 : if (skip_args[i])
761 : : {
762 : 879078 : oprnd_info->def_stmts.quick_push (NULL);
763 : 879078 : oprnd_info->ops.quick_push (NULL_TREE);
764 : 879078 : oprnd_info->first_dt = vect_uninitialized_def;
765 : 879078 : continue;
766 : : }
767 : :
768 : 21704958 : oprnd_info->def_stmts.quick_push (def_stmt_info);
769 : 21704958 : oprnd_info->ops.quick_push (oprnd);
770 : :
771 : 21704958 : if (def_stmt_info
772 : 21704958 : && is_pattern_stmt_p (def_stmt_info))
773 : : {
774 : 345930 : if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
775 : : != def_stmt_info)
776 : 248802 : oprnd_info->any_pattern = true;
777 : : else
778 : : /* If we promote this to external use the original stmt def. */
779 : 97128 : oprnd_info->ops.last ()
780 : 194256 : = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
781 : : }
782 : :
783 : : /* If there's a extern def on a backedge make sure we can
784 : : code-generate at the region start.
785 : : ??? This is another case that could be fixed by adjusting
786 : : how we split the function but at the moment we'd have conflicting
787 : : goals there. */
788 : 21704958 : if (backedge
789 : 126236 : && dts[i] == vect_external_def
790 : 1384 : && is_a <bb_vec_info> (vinfo)
791 : 1384 : && TREE_CODE (oprnd) == SSA_NAME
792 : 1363 : && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
793 : 21706321 : && !dominated_by_p (CDI_DOMINATORS, vinfo->bbs[0],
794 : 1363 : gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
795 : : {
796 : 1363 : if (dump_enabled_p ())
797 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
798 : : "Build SLP failed: extern def %T only defined "
799 : : "on backedge\n", oprnd);
800 : 1363 : return -1;
801 : : }
802 : :
803 : 21703595 : if (first)
804 : : {
805 : 4789438 : tree type = TREE_TYPE (oprnd);
806 : 4789438 : dt = dts[i];
807 : :
808 : : /* For the swapping logic below force vect_reduction_def
809 : : for the reduction op in a SLP reduction group. */
810 : 4789438 : if (!STMT_VINFO_DATA_REF (stmt_info)
811 : 3709080 : && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
812 : 3248 : && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
813 : 4791038 : && def_stmt_info)
814 : 1600 : dts[i] = dt = vect_reduction_def;
815 : :
816 : : /* Check the types of the definition. */
817 : 4789438 : switch (dt)
818 : : {
819 : 4789438 : case vect_external_def:
820 : 4789438 : case vect_constant_def:
821 : 4789438 : case vect_internal_def:
822 : 4789438 : case vect_reduction_def:
823 : 4789438 : case vect_double_reduction_def:
824 : 4789438 : case vect_induction_def:
825 : 4789438 : case vect_nested_cycle:
826 : 4789438 : case vect_first_order_recurrence:
827 : 4789438 : break;
828 : :
829 : 0 : default:
830 : : /* FORNOW: Not supported. */
831 : 0 : if (dump_enabled_p ())
832 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
833 : : "Build SLP failed: illegal type of def %T\n",
834 : : oprnd);
835 : 0 : return -1;
836 : : }
837 : :
838 : 4789438 : oprnd_info->first_dt = dt;
839 : 4789438 : oprnd_info->first_op_type = type;
840 : : }
841 : : }
842 : 12709231 : if (first)
843 : : return 0;
844 : :
845 : : /* Now match the operand definition types to that of the first stmt. */
846 : 25902197 : for (i = 0; i < number_of_oprnds;)
847 : : {
848 : 16910127 : if (skip_args[i])
849 : : {
850 : 27611 : ++i;
851 : 27611 : continue;
852 : : }
853 : :
854 : 16882516 : oprnd_info = (*oprnds_info)[i];
855 : 16882516 : dt = dts[i];
856 : 16882516 : stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
857 : 16882516 : oprnd = oprnd_info->ops[stmt_num];
858 : 16882516 : tree type = TREE_TYPE (oprnd);
859 : :
860 : 16882516 : if (!types_compatible_p (oprnd_info->first_op_type, type))
861 : : {
862 : 89768 : if (dump_enabled_p ())
863 : 107 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
864 : : "Build SLP failed: different operand types\n");
865 : 89768 : return 1;
866 : : }
867 : :
868 : 16792748 : if ((gs_op == i) != oprnd_info->first_gs_p)
869 : : {
870 : 0 : if (dump_enabled_p ())
871 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
872 : : "Build SLP failed: mixed gather and non-gather\n");
873 : 0 : return 1;
874 : : }
875 : 16792748 : else if (gs_op == i)
876 : : {
877 : 207 : if (!operand_equal_p (oprnd_info->first_gs_info.base,
878 : 207 : gs_info.base))
879 : : {
880 : 16 : if (dump_enabled_p ())
881 : 6 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
882 : : "Build SLP failed: different gather base\n");
883 : 16 : return 1;
884 : : }
885 : 191 : if (oprnd_info->first_gs_info.scale != gs_info.scale)
886 : : {
887 : 8 : if (dump_enabled_p ())
888 : 2 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
889 : : "Build SLP failed: different gather scale\n");
890 : 8 : return 1;
891 : : }
892 : : }
893 : :
894 : : /* Not first stmt of the group, check that the def-stmt/s match
895 : : the def-stmt/s of the first stmt. Allow different definition
896 : : types for reduction chains: the first stmt must be a
897 : : vect_reduction_def (a phi node), and the rest
898 : : end in the reduction chain. */
899 : 16792724 : if ((!vect_def_types_match (oprnd_info->first_dt, dt)
900 : 274935 : && !(oprnd_info->first_dt == vect_reduction_def
901 : 2754 : && !STMT_VINFO_DATA_REF (stmt_info)
902 : 2754 : && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
903 : 2744 : && def_stmt_info
904 : 2744 : && !STMT_VINFO_DATA_REF (def_stmt_info)
905 : 2744 : && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
906 : : == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
907 : 16520533 : || (!STMT_VINFO_DATA_REF (stmt_info)
908 : 15232572 : && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
909 : 5768 : && ((!def_stmt_info
910 : 5607 : || STMT_VINFO_DATA_REF (def_stmt_info)
911 : 10289 : || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
912 : : != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
913 : 5768 : != (oprnd_info->first_dt != vect_reduction_def))))
914 : : {
915 : : /* Try swapping operands if we got a mismatch. For BB
916 : : vectorization only in case it will clearly improve things. */
917 : 273953 : if (i == commutative_op && !swapped
918 : 272191 : && (!is_a <bb_vec_info> (vinfo)
919 : 4782 : || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
920 : 4782 : dts[i+1])
921 : 981 : && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
922 : : || vect_def_types_match
923 : 189 : ((*oprnds_info)[i+1]->first_dt, dts[i])))))
924 : : {
925 : 1762 : if (dump_enabled_p ())
926 : 140 : dump_printf_loc (MSG_NOTE, vect_location,
927 : : "trying swapped operands\n");
928 : 1762 : std::swap (dts[i], dts[i+1]);
929 : 1762 : std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
930 : 1762 : (*oprnds_info)[i+1]->def_stmts[stmt_num]);
931 : 1762 : std::swap ((*oprnds_info)[i]->ops[stmt_num],
932 : 1762 : (*oprnds_info)[i+1]->ops[stmt_num]);
933 : : /* After swapping some operands we lost track whether an
934 : : operand has any pattern defs so be conservative here. */
935 : 1762 : if ((*oprnds_info)[i]->any_pattern
936 : 1762 : || (*oprnds_info)[i+1]->any_pattern)
937 : 4 : (*oprnds_info)[i]->any_pattern
938 : 2 : = (*oprnds_info)[i+1]->any_pattern = true;
939 : 1762 : swapped = true;
940 : 1762 : continue;
941 : : }
942 : :
943 : 270429 : if (is_a <bb_vec_info> (vinfo)
944 : 259944 : && !oprnd_info->any_pattern
945 : 530144 : && number_of_oprnds > 1)
946 : : {
947 : : /* Now for commutative ops we should see whether we can
948 : : make the other operand matching. */
949 : 104811 : if (dump_enabled_p ())
950 : 149 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
951 : : "treating operand as external\n");
952 : 104811 : oprnd_info->first_dt = dt = vect_external_def;
953 : : }
954 : : else
955 : : {
956 : 165618 : if (dump_enabled_p ())
957 : 380 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
958 : : "Build SLP failed: different types\n");
959 : 165618 : return 1;
960 : : }
961 : : }
962 : :
963 : : /* Make sure to demote the overall operand to external. */
964 : 16625344 : if (dt == vect_external_def)
965 : 343377 : oprnd_info->first_dt = vect_external_def;
966 : : /* For a SLP reduction chain we want to duplicate the reduction to
967 : : each of the chain members. That gets us a sane SLP graph (still
968 : : the stmts are not 100% correct wrt the initial values). */
969 : 16281967 : else if ((dt == vect_internal_def
970 : 16281967 : || dt == vect_reduction_def)
971 : 15378948 : && oprnd_info->first_dt == vect_reduction_def
972 : 64579 : && !STMT_VINFO_DATA_REF (stmt_info)
973 : 64579 : && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
974 : 2744 : && !STMT_VINFO_DATA_REF (def_stmt_info)
975 : 16284711 : && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
976 : : == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
977 : : {
978 : 2744 : oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
979 : 2744 : oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
980 : : }
981 : :
982 : 16625344 : ++i;
983 : : }
984 : :
985 : : /* Swap operands. */
986 : 8992070 : if (swapped)
987 : : {
988 : 42128 : if (dump_enabled_p ())
989 : 401 : dump_printf_loc (MSG_NOTE, vect_location,
990 : : "swapped operands to match def types in %G",
991 : : stmt_info->stmt);
992 : : }
993 : :
994 : : return 0;
995 : : }
996 : :
997 : : /* Return true if call statements CALL1 and CALL2 are similar enough
998 : : to be combined into the same SLP group. */
999 : :
1000 : : bool
1001 : 20886 : compatible_calls_p (gcall *call1, gcall *call2, bool allow_two_operators)
1002 : : {
1003 : 20886 : unsigned int nargs = gimple_call_num_args (call1);
1004 : 20886 : if (nargs != gimple_call_num_args (call2))
1005 : : return false;
1006 : :
1007 : 19015 : auto cfn1 = gimple_call_combined_fn (call1);
1008 : 19015 : auto cfn2 = gimple_call_combined_fn (call2);
1009 : 19015 : if (cfn1 != cfn2
1010 : 2 : && (!allow_two_operators
1011 : 2 : || !((cfn1 == CFN_FMA || cfn1 == CFN_FMS)
1012 : 2 : && (cfn2 == CFN_FMA || cfn2 == CFN_FMS))))
1013 : : return false;
1014 : :
1015 : 19015 : if (gimple_call_internal_p (call1))
1016 : : {
1017 : 7073 : if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
1018 : 7073 : TREE_TYPE (gimple_call_lhs (call2))))
1019 : : return false;
1020 : 14353 : for (unsigned int i = 0; i < nargs; ++i)
1021 : 7280 : if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
1022 : 7280 : TREE_TYPE (gimple_call_arg (call2, i))))
1023 : : return false;
1024 : : }
1025 : : else
1026 : : {
1027 : 11942 : if (!operand_equal_p (gimple_call_fn (call1),
1028 : 11942 : gimple_call_fn (call2), 0))
1029 : : return false;
1030 : :
1031 : 25842 : if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
1032 : : return false;
1033 : : }
1034 : :
1035 : : /* Check that any unvectorized arguments are equal. */
1036 : 15687 : if (const int *map = vect_get_operand_map (call1))
1037 : : {
1038 : 15 : unsigned int nkept = *map++;
1039 : 15 : unsigned int mapi = 0;
1040 : 57 : for (unsigned int i = 0; i < nargs; ++i)
1041 : 42 : if (mapi < nkept && map[mapi] == int (i))
1042 : 27 : mapi += 1;
1043 : 15 : else if (!operand_equal_p (gimple_call_arg (call1, i),
1044 : 15 : gimple_call_arg (call2, i)))
1045 : : return false;
1046 : : }
1047 : :
1048 : : return true;
1049 : : }
1050 : :
1051 : : /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
1052 : : caller's attempt to find the vector type in STMT_INFO with the narrowest
1053 : : element type. Return true if VECTYPE is nonnull and if it is valid
1054 : : for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
1055 : : number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
1056 : : vect_build_slp_tree. */
1057 : :
1058 : : static bool
1059 : 5675267 : vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
1060 : : unsigned int group_size,
1061 : : tree vectype, poly_uint64 *max_nunits)
1062 : : {
1063 : 5675267 : if (!vectype)
1064 : : {
1065 : 4191 : if (dump_enabled_p ())
1066 : 7 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1067 : : "Build SLP failed: unsupported data-type in %G\n",
1068 : : stmt_info->stmt);
1069 : : /* Fatal mismatch. */
1070 : 4191 : return false;
1071 : : }
1072 : :
1073 : : /* If populating the vector type requires unrolling then fail
1074 : : before adjusting *max_nunits for basic-block vectorization. */
1075 : 5671076 : if (is_a <bb_vec_info> (vinfo)
1076 : 5671076 : && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
1077 : : {
1078 : 141079 : if (dump_enabled_p ())
1079 : 34 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1080 : : "Build SLP failed: unrolling required "
1081 : : "in basic block SLP\n");
1082 : : /* Fatal mismatch. */
1083 : 141079 : return false;
1084 : : }
1085 : :
1086 : : /* In case of multiple types we need to detect the smallest type. */
1087 : 5529997 : vect_update_max_nunits (max_nunits, vectype);
1088 : 5529997 : return true;
1089 : : }
1090 : :
1091 : : /* Verify if the scalar stmts STMTS are isomorphic, require data
1092 : : permutation or are of unsupported types of operation. Return
1093 : : true if they are, otherwise return false and indicate in *MATCHES
1094 : : which stmts are not isomorphic to the first one. If MATCHES[0]
1095 : : is false then this indicates the comparison could not be
1096 : : carried out or the stmts will never be vectorized by SLP.
1097 : :
1098 : : Note COND_EXPR is possibly isomorphic to another one after swapping its
1099 : : operands. Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
1100 : : the first stmt by swapping the two operands of comparison; set SWAP[i]
1101 : : to 2 if stmt I is isormorphic to the first stmt by inverting the code
1102 : : of comparison. Take A1 >= B1 ? X1 : Y1 as an exmple, it can be swapped
1103 : : to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1. */
1104 : :
1105 : : static bool
1106 : 5765875 : vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
1107 : : vec<stmt_vec_info> stmts, unsigned int group_size,
1108 : : poly_uint64 *max_nunits, bool *matches,
1109 : : bool *two_operators, tree *node_vectype)
1110 : : {
1111 : 5765875 : unsigned int i;
1112 : 5765875 : stmt_vec_info first_stmt_info = stmts[0];
1113 : 5765875 : code_helper first_stmt_code = ERROR_MARK;
1114 : 5765875 : code_helper alt_stmt_code = ERROR_MARK;
1115 : 5765875 : code_helper first_cond_code = ERROR_MARK;
1116 : 5765875 : bool need_same_oprnds = false;
1117 : 5765875 : tree first_lhs = NULL_TREE;
1118 : 5765875 : tree first_op1 = NULL_TREE;
1119 : 5765875 : stmt_vec_info first_load = NULL, prev_first_load = NULL;
1120 : 5765875 : bool first_stmt_ldst_p = false, first_stmt_ldst_masklen_p = false;
1121 : 5765875 : bool first_stmt_phi_p = false;
1122 : 5765875 : int first_reduc_idx = -1;
1123 : 5765875 : bool maybe_soft_fail = false;
1124 : 5765875 : tree soft_fail_nunits_vectype = NULL_TREE;
1125 : :
1126 : 5765875 : tree vectype, nunits_vectype;
1127 : 5765875 : if (!vect_get_vector_types_for_stmt (vinfo, first_stmt_info, &vectype,
1128 : : &nunits_vectype, group_size))
1129 : : {
1130 : : /* Fatal mismatch. */
1131 : 204606 : matches[0] = false;
1132 : 204606 : return false;
1133 : : }
1134 : 5561269 : if (is_a <bb_vec_info> (vinfo)
1135 : 5561269 : && known_le (TYPE_VECTOR_SUBPARTS (vectype), 1U))
1136 : : {
1137 : 354100 : if (dump_enabled_p ())
1138 : 282 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1139 : : "Build SLP failed: not using single lane "
1140 : : "vector type %T\n", vectype);
1141 : 354100 : matches[0] = false;
1142 : 354100 : return false;
1143 : : }
1144 : : /* Record nunits required but continue analysis, producing matches[]
1145 : : as if nunits was not an issue. This allows splitting of groups
1146 : : to happen. */
1147 : 5207169 : if (nunits_vectype
1148 : 5207169 : && !vect_record_max_nunits (vinfo, first_stmt_info, group_size,
1149 : : nunits_vectype, max_nunits))
1150 : : {
1151 : 141079 : gcc_assert (is_a <bb_vec_info> (vinfo));
1152 : 141079 : maybe_soft_fail = true;
1153 : 141079 : soft_fail_nunits_vectype = nunits_vectype;
1154 : : }
1155 : :
1156 : 5207169 : gcc_assert (vectype || !gimple_get_lhs (first_stmt_info->stmt));
1157 : 5207169 : *node_vectype = vectype;
1158 : :
1159 : : /* For every stmt in NODE find its def stmt/s. */
1160 : 5207169 : stmt_vec_info stmt_info;
1161 : 22005365 : FOR_EACH_VEC_ELT (stmts, i, stmt_info)
1162 : : {
1163 : 16957544 : bool ldst_p = false;
1164 : 16957544 : bool ldst_masklen_p = false;
1165 : 16957544 : bool phi_p = false;
1166 : 16957544 : code_helper rhs_code = ERROR_MARK;
1167 : :
1168 : 16957544 : swap[i] = 0;
1169 : 16957544 : matches[i] = false;
1170 : 16957544 : if (!stmt_info)
1171 : : {
1172 : 51107 : matches[i] = true;
1173 : 16849303 : continue;
1174 : : }
1175 : :
1176 : 16906437 : gimple *stmt = stmt_info->stmt;
1177 : 16906437 : if (dump_enabled_p ())
1178 : 211751 : dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
1179 : :
1180 : : /* Fail to vectorize statements marked as unvectorizable, throw
1181 : : or are volatile. */
1182 : 16906437 : if (!STMT_VINFO_VECTORIZABLE (stmt_info)
1183 : 16718266 : || stmt_can_throw_internal (cfun, stmt)
1184 : 32679456 : || gimple_has_volatile_ops (stmt))
1185 : : {
1186 : 193621 : if (dump_enabled_p ())
1187 : 195 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1188 : : "Build SLP failed: unvectorizable statement %G",
1189 : : stmt);
1190 : : /* ??? For BB vectorization we want to commutate operands in a way
1191 : : to shuffle all unvectorizable defs into one operand and have
1192 : : the other still vectorized. The following doesn't reliably
1193 : : work for this though but it's the easiest we can do here. */
1194 : 193621 : if (is_a <bb_vec_info> (vinfo) && i != 0)
1195 : 61324 : continue;
1196 : : /* Fatal mismatch. */
1197 : 132297 : matches[0] = false;
1198 : 132297 : return false;
1199 : : }
1200 : :
1201 : 16712816 : gcall *call_stmt = dyn_cast <gcall *> (stmt);
1202 : 16712816 : tree lhs = gimple_get_lhs (stmt);
1203 : 16712816 : if (lhs == NULL_TREE && !call_stmt)
1204 : : {
1205 : 36 : if (dump_enabled_p ())
1206 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1207 : : "Build SLP failed: not GIMPLE_ASSIGN nor "
1208 : : "GIMPLE_CALL %G", stmt);
1209 : 36 : if (is_a <bb_vec_info> (vinfo) && i != 0)
1210 : 36 : continue;
1211 : : /* Fatal mismatch. */
1212 : 0 : matches[0] = false;
1213 : 0 : return false;
1214 : : }
1215 : :
1216 : 16712780 : if (call_stmt)
1217 : : {
1218 : 89053 : combined_fn cfn = gimple_call_combined_fn (call_stmt);
1219 : 89053 : if (cfn != CFN_LAST && cfn != CFN_MASK_CALL)
1220 : 47960 : rhs_code = cfn;
1221 : : else
1222 : : rhs_code = CALL_EXPR;
1223 : :
1224 : 89053 : if (cfn == CFN_GATHER_LOAD
1225 : 89053 : || cfn == CFN_SCATTER_STORE)
1226 : : ldst_p = true;
1227 : : else if (cfn == CFN_MASK_LOAD
1228 : : || cfn == CFN_MASK_GATHER_LOAD
1229 : : || cfn == CFN_MASK_LEN_GATHER_LOAD
1230 : : || cfn == CFN_MASK_SCATTER_STORE
1231 : : || cfn == CFN_MASK_LEN_SCATTER_STORE)
1232 : : {
1233 : : ldst_p = true;
1234 : : ldst_masklen_p = true;
1235 : : }
1236 : : else if (cfn == CFN_MASK_STORE)
1237 : : {
1238 : : ldst_p = true;
1239 : : ldst_masklen_p = true;
1240 : : rhs_code = CFN_MASK_STORE;
1241 : : }
1242 : : else if (cfn == CFN_GOMP_SIMD_LANE)
1243 : : ;
1244 : 80794 : else if ((cfn != CFN_LAST
1245 : : && cfn != CFN_MASK_CALL
1246 : 39701 : && internal_fn_p (cfn)
1247 : 30891 : && !vectorizable_internal_fn_p (as_internal_fn (cfn)))
1248 : 80720 : || gimple_call_tail_p (call_stmt)
1249 : 80720 : || gimple_call_noreturn_p (call_stmt)
1250 : 161514 : || gimple_call_chain (call_stmt))
1251 : : {
1252 : 423 : if (dump_enabled_p ())
1253 : 13 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1254 : : "Build SLP failed: unsupported call type %G",
1255 : : (gimple *) call_stmt);
1256 : 423 : if (is_a <bb_vec_info> (vinfo) && i != 0)
1257 : 66 : continue;
1258 : : /* Fatal mismatch. */
1259 : 357 : matches[0] = false;
1260 : 357 : return false;
1261 : : }
1262 : : }
1263 : 16623727 : else if (gimple_code (stmt) == GIMPLE_PHI)
1264 : : {
1265 : : rhs_code = ERROR_MARK;
1266 : : phi_p = true;
1267 : : }
1268 : : else
1269 : : {
1270 : 15678480 : rhs_code = gimple_assign_rhs_code (stmt);
1271 : 15678480 : ldst_p = STMT_VINFO_DATA_REF (stmt_info) != nullptr;
1272 : : }
1273 : :
1274 : : /* Check the operation. */
1275 : 16712357 : if (i == 0)
1276 : : {
1277 : 5074515 : first_lhs = lhs;
1278 : 5074515 : first_stmt_code = rhs_code;
1279 : 5074515 : first_stmt_ldst_p = ldst_p;
1280 : 5074515 : first_stmt_ldst_masklen_p = ldst_masklen_p;
1281 : 5074515 : first_stmt_phi_p = phi_p;
1282 : 5074515 : first_reduc_idx = STMT_VINFO_REDUC_IDX (stmt_info);
1283 : :
1284 : : /* Shift arguments should be equal in all the packed stmts for a
1285 : : vector shift with scalar shift operand. */
1286 : 5074515 : if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
1287 : 4962396 : || rhs_code == LROTATE_EXPR
1288 : 10036869 : || rhs_code == RROTATE_EXPR)
1289 : : {
1290 : : /* First see if we have a vector/vector shift. */
1291 : 112363 : if (!directly_supported_p (rhs_code, vectype, optab_vector))
1292 : : {
1293 : : /* No vector/vector shift, try for a vector/scalar shift. */
1294 : 105313 : if (!directly_supported_p (rhs_code, vectype, optab_scalar))
1295 : : {
1296 : 7933 : if (dump_enabled_p ())
1297 : 380 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1298 : : "Build SLP failed: "
1299 : : "op not supported by target.\n");
1300 : 7933 : if (is_a <bb_vec_info> (vinfo) && i != 0)
1301 : : continue;
1302 : : /* Fatal mismatch. */
1303 : 7933 : matches[0] = false;
1304 : 7933 : return false;
1305 : : }
1306 : 97380 : need_same_oprnds = true;
1307 : 97380 : first_op1 = gimple_assign_rhs2 (stmt);
1308 : : }
1309 : : }
1310 : 4962152 : else if (rhs_code == WIDEN_LSHIFT_EXPR)
1311 : : {
1312 : 0 : need_same_oprnds = true;
1313 : 0 : first_op1 = gimple_assign_rhs2 (stmt);
1314 : : }
1315 : 4962152 : else if (!ldst_p
1316 : 4962152 : && rhs_code == BIT_FIELD_REF)
1317 : : {
1318 : 5448 : tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
1319 : 5448 : if (!is_a <bb_vec_info> (vinfo)
1320 : 5322 : || TREE_CODE (vec) != SSA_NAME
1321 : : /* When the element types are not compatible we pun the
1322 : : source to the target vectype which requires equal size. */
1323 : 10758 : || ((!VECTOR_TYPE_P (TREE_TYPE (vec))
1324 : 4595 : || !types_compatible_p (TREE_TYPE (vectype),
1325 : 4595 : TREE_TYPE (TREE_TYPE (vec))))
1326 : 1129 : && !operand_equal_p (TYPE_SIZE (vectype),
1327 : 1129 : TYPE_SIZE (TREE_TYPE (vec)))))
1328 : : {
1329 : 861 : if (dump_enabled_p ())
1330 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1331 : : "Build SLP failed: "
1332 : : "BIT_FIELD_REF not supported\n");
1333 : : /* Fatal mismatch. */
1334 : 861 : matches[0] = false;
1335 : 861 : return false;
1336 : : }
1337 : : }
1338 : 4956704 : else if (rhs_code == CFN_DIV_POW2)
1339 : : {
1340 : 0 : need_same_oprnds = true;
1341 : 0 : first_op1 = gimple_call_arg (call_stmt, 1);
1342 : : }
1343 : 4956704 : else if (rhs_code == CFN_GOMP_SIMD_LANE)
1344 : : {
1345 : 3153 : need_same_oprnds = true;
1346 : 3153 : first_op1 = gimple_call_arg (call_stmt, 1);
1347 : : }
1348 : : }
1349 : : else
1350 : : {
1351 : 11638161 : if (first_reduc_idx != STMT_VINFO_REDUC_IDX (stmt_info)
1352 : : /* For SLP reduction groups the index isn't necessarily
1353 : : uniform but only that of the first stmt matters. */
1354 : 1558 : && !(first_reduc_idx != -1
1355 : 1558 : && STMT_VINFO_REDUC_IDX (stmt_info) != -1
1356 : 1558 : && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
1357 : 11637842 : && !(first_reduc_idx != -1
1358 : 868 : && STMT_VINFO_REDUC_IDX (stmt_info) != -1
1359 : 868 : && rhs_code.is_tree_code ()
1360 : 868 : && commutative_tree_code (tree_code (rhs_code))
1361 : 687 : && first_reduc_idx == 1 - STMT_VINFO_REDUC_IDX (stmt_info)))
1362 : : {
1363 : 319 : if (dump_enabled_p ())
1364 : : {
1365 : 12 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1366 : : "Build SLP failed: different reduc_idx "
1367 : : "%d instead of %d in %G",
1368 : : STMT_VINFO_REDUC_IDX (stmt_info),
1369 : : first_reduc_idx, stmt);
1370 : : }
1371 : : /* Mismatch. */
1372 : 319 : continue;
1373 : : }
1374 : 11637523 : if (!ldst_p
1375 : 9191264 : && first_stmt_code != rhs_code
1376 : 13033889 : && alt_stmt_code == ERROR_MARK)
1377 : : alt_stmt_code = rhs_code;
1378 : 13011108 : if ((!ldst_p
1379 : 9191264 : && first_stmt_code != rhs_code
1380 : 1396366 : && (first_stmt_code != IMAGPART_EXPR
1381 : 135 : || rhs_code != REALPART_EXPR)
1382 : 1396338 : && (first_stmt_code != REALPART_EXPR
1383 : 450 : || rhs_code != IMAGPART_EXPR)
1384 : : /* Handle mismatches in plus/minus by computing both
1385 : : and merging the results. */
1386 : 1396335 : && !((((first_stmt_code == PLUS_EXPR
1387 : 1301473 : || first_stmt_code == MINUS_EXPR)
1388 : 114288 : && (alt_stmt_code == PLUS_EXPR
1389 : 105602 : || alt_stmt_code == MINUS_EXPR))
1390 : 1374260 : || ((first_stmt_code == CFN_FMA
1391 : 1374258 : || first_stmt_code == CFN_FMS)
1392 : 2 : && (alt_stmt_code == CFN_FMA
1393 : 2 : || alt_stmt_code == CFN_FMS)))
1394 : 22077 : && rhs_code == alt_stmt_code)
1395 : 1417139 : && !(first_stmt_code.is_tree_code ()
1396 : 1294304 : && rhs_code.is_tree_code ()
1397 : 1201512 : && (TREE_CODE_CLASS (tree_code (first_stmt_code))
1398 : : == tcc_comparison)
1399 : 141812 : && (swap_tree_comparison (tree_code (first_stmt_code))
1400 : 141812 : == tree_code (rhs_code))
1401 : : && (first_reduc_idx == -1
1402 : 0 : || REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
1403 : : || (ldst_p
1404 : 4892518 : && (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1405 : 2446259 : != STMT_VINFO_GROUPED_ACCESS (first_stmt_info)))
1406 : : || (ldst_p
1407 : 2404396 : && (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1408 : 2404396 : != STMT_VINFO_GATHER_SCATTER_P (first_stmt_info)))
1409 : 10264080 : || first_stmt_ldst_p != ldst_p
1410 : 10263946 : || (ldst_p && first_stmt_ldst_masklen_p != ldst_masklen_p)
1411 : 21901461 : || first_stmt_phi_p != phi_p)
1412 : : {
1413 : 1373585 : if (dump_enabled_p ())
1414 : : {
1415 : 2777 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1416 : : "Build SLP failed: different operation "
1417 : : "in stmt %G", stmt);
1418 : 2777 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1419 : : "original stmt %G", first_stmt_info->stmt);
1420 : : }
1421 : : /* Mismatch. */
1422 : 1373585 : continue;
1423 : : }
1424 : :
1425 : 10265921 : if (!ldst_p
1426 : 7859672 : && first_stmt_code == BIT_FIELD_REF
1427 : 10269012 : && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
1428 : 5074 : != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
1429 : : {
1430 : 1983 : if (dump_enabled_p ())
1431 : 40 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1432 : : "Build SLP failed: different BIT_FIELD_REF "
1433 : : "arguments in %G", stmt);
1434 : : /* Mismatch. */
1435 : 1983 : continue;
1436 : : }
1437 : :
1438 : 10261955 : if (call_stmt
1439 : 21013 : && first_stmt_code != CFN_MASK_LOAD
1440 : 10282903 : && first_stmt_code != CFN_MASK_STORE)
1441 : : {
1442 : 20886 : if (!is_a <gcall *> (stmts[0]->stmt)
1443 : 20886 : || !compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
1444 : : call_stmt, true))
1445 : : {
1446 : 5199 : if (dump_enabled_p ())
1447 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1448 : : "Build SLP failed: different calls in %G",
1449 : : stmt);
1450 : : /* Mismatch. */
1451 : 5199 : continue;
1452 : : }
1453 : : }
1454 : :
1455 : 10085147 : if ((phi_p || gimple_could_trap_p (stmt_info->stmt))
1456 : 10952846 : && (gimple_bb (first_stmt_info->stmt)
1457 : 867699 : != gimple_bb (stmt_info->stmt)))
1458 : : {
1459 : 27315 : if (dump_enabled_p ())
1460 : 8 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1461 : : "Build SLP failed: different BB for PHI "
1462 : : "or possibly trapping operation in %G", stmt);
1463 : : /* Mismatch. */
1464 : 27315 : continue;
1465 : : }
1466 : :
1467 : 10229441 : if (need_same_oprnds)
1468 : : {
1469 : 53061 : tree other_op1 = gimple_arg (stmt, 1);
1470 : 53061 : if (!operand_equal_p (first_op1, other_op1, 0))
1471 : : {
1472 : 7171 : if (dump_enabled_p ())
1473 : 123 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1474 : : "Build SLP failed: different shift "
1475 : : "arguments in %G", stmt);
1476 : : /* Mismatch. */
1477 : 7171 : continue;
1478 : : }
1479 : : }
1480 : :
1481 : 10222983 : if (first_lhs
1482 : 10222270 : && lhs
1483 : 10222270 : && !types_compatible_p (TREE_TYPE (lhs), TREE_TYPE (first_lhs)))
1484 : : {
1485 : 713 : if (dump_enabled_p ())
1486 : 6 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1487 : : "Build SLP failed: different vector type "
1488 : : "in %G", stmt);
1489 : : /* Mismatch. */
1490 : 713 : continue;
1491 : : }
1492 : : }
1493 : :
1494 : : /* Grouped store or load. */
1495 : 15287278 : if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1496 : : {
1497 : 3767591 : gcc_assert (ldst_p);
1498 : 3767591 : if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
1499 : : {
1500 : : /* Store. */
1501 : 2970065 : gcc_assert (rhs_code == CFN_MASK_STORE
1502 : : || REFERENCE_CLASS_P (lhs)
1503 : : || DECL_P (lhs));
1504 : : }
1505 : : else
1506 : : {
1507 : : /* Load. */
1508 : 797526 : first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
1509 : 797526 : if (prev_first_load)
1510 : : {
1511 : : /* Check that there are no loads from different interleaving
1512 : : chains in the same node. */
1513 : 354286 : if (prev_first_load != first_load)
1514 : : {
1515 : 42915 : if (dump_enabled_p ())
1516 : 1904 : dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1517 : : vect_location,
1518 : : "Build SLP failed: different "
1519 : : "interleaving chains in one node %G",
1520 : : stmt);
1521 : : /* Mismatch. */
1522 : 42915 : continue;
1523 : : }
1524 : : }
1525 : : else
1526 : : prev_first_load = first_load;
1527 : : }
1528 : : }
1529 : : /* Non-grouped store or load. */
1530 : 11519687 : else if (ldst_p)
1531 : : {
1532 : 690633 : if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))
1533 : 476989 : && rhs_code != CFN_GATHER_LOAD
1534 : : && rhs_code != CFN_MASK_GATHER_LOAD
1535 : : && rhs_code != CFN_MASK_LEN_GATHER_LOAD
1536 : : && rhs_code != CFN_SCATTER_STORE
1537 : : && rhs_code != CFN_MASK_SCATTER_STORE
1538 : : && rhs_code != CFN_MASK_LEN_SCATTER_STORE
1539 : 476989 : && !STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1540 : : /* Not grouped loads are handled as externals for BB
1541 : : vectorization. For loop vectorization we can handle
1542 : : splats the same we handle single element interleaving.
1543 : : Likewise we can handle a collection of invariant refs. */
1544 : 1155042 : && (is_a <bb_vec_info> (vinfo)
1545 : 464409 : || (stmt_info != first_stmt_info
1546 : 44572 : && !(integer_zerop (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)))
1547 : 289 : && integer_zerop (DR_STEP (STMT_VINFO_DATA_REF
1548 : : (first_stmt_info)))))))
1549 : : {
1550 : : /* Not grouped load. */
1551 : 43994 : if (dump_enabled_p ())
1552 : 121 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1553 : : "Build SLP failed: not grouped load %G", stmt);
1554 : :
1555 : 43994 : if (i != 0)
1556 : 43994 : continue;
1557 : : /* Fatal mismatch. */
1558 : 0 : matches[0] = false;
1559 : 0 : return false;
1560 : : }
1561 : : }
1562 : : /* Not memory operation. */
1563 : : else
1564 : : {
1565 : 10829054 : if (!phi_p
1566 : 10005958 : && rhs_code.is_tree_code ()
1567 : 9964850 : && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_binary
1568 : 1451717 : && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_unary
1569 : 917910 : && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_expression
1570 : 868958 : && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_comparison
1571 : 60465 : && rhs_code != VIEW_CONVERT_EXPR
1572 : : && rhs_code != CALL_EXPR
1573 : : && rhs_code != BIT_FIELD_REF
1574 : 10829054 : && rhs_code != SSA_NAME)
1575 : : {
1576 : 17900 : if (dump_enabled_p ())
1577 : 17 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1578 : : "Build SLP failed: operation unsupported %G",
1579 : : stmt);
1580 : 17900 : if (is_a <bb_vec_info> (vinfo) && i != 0)
1581 : 0 : continue;
1582 : : /* Fatal mismatch. */
1583 : 17900 : matches[0] = false;
1584 : 17900 : return false;
1585 : : }
1586 : :
1587 : 10811154 : if (rhs_code == COND_EXPR)
1588 : : {
1589 : 46381 : tree cond_expr = gimple_assign_rhs1 (stmt);
1590 : 46381 : enum tree_code cond_code = TREE_CODE (cond_expr);
1591 : 46381 : enum tree_code swap_code = ERROR_MARK;
1592 : 46381 : enum tree_code invert_code = ERROR_MARK;
1593 : :
1594 : 46381 : if (i == 0)
1595 : 37748 : first_cond_code = TREE_CODE (cond_expr);
1596 : 8633 : else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1597 : : {
1598 : 0 : bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
1599 : 0 : swap_code = swap_tree_comparison (cond_code);
1600 : 0 : invert_code = invert_tree_comparison (cond_code, honor_nans);
1601 : : }
1602 : :
1603 : 46381 : if (first_cond_code == cond_code)
1604 : : ;
1605 : : /* Isomorphic can be achieved by swapping. */
1606 : 0 : else if (first_cond_code == swap_code)
1607 : 0 : swap[i] = 1;
1608 : : /* Isomorphic can be achieved by inverting. */
1609 : 0 : else if (first_cond_code == invert_code)
1610 : 0 : swap[i] = 2;
1611 : : else
1612 : : {
1613 : 0 : if (dump_enabled_p ())
1614 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1615 : : "Build SLP failed: different"
1616 : : " operation %G", stmt);
1617 : : /* Mismatch. */
1618 : 0 : continue;
1619 : : }
1620 : : }
1621 : :
1622 : 10811154 : if (i != 0
1623 : 7818510 : && first_stmt_code != rhs_code
1624 : 64774 : && first_stmt_code.is_tree_code ()
1625 : 64772 : && rhs_code.is_tree_code ()
1626 : 64772 : && TREE_CODE_CLASS ((tree_code)first_stmt_code) == tcc_comparison
1627 : 10853943 : && (swap_tree_comparison ((tree_code)first_stmt_code)
1628 : 42789 : == (tree_code)rhs_code))
1629 : 42789 : swap[i] = 1;
1630 : :
1631 : 10811154 : if (i != 0
1632 : 7818510 : && first_reduc_idx != STMT_VINFO_REDUC_IDX (stmt_info)
1633 : 1016 : && first_reduc_idx != -1
1634 : 1016 : && STMT_VINFO_REDUC_IDX (stmt_info) != -1
1635 : 1016 : && rhs_code.is_tree_code ()
1636 : 1016 : && commutative_tree_code (tree_code (rhs_code))
1637 : 10812170 : && first_reduc_idx == 1 - STMT_VINFO_REDUC_IDX (stmt_info))
1638 : 1016 : swap[i] = 1;
1639 : : }
1640 : :
1641 : 15182469 : matches[i] = true;
1642 : : }
1643 : :
1644 : 20256287 : for (i = 0; i < group_size; ++i)
1645 : 15872426 : if (!matches[i])
1646 : : return false;
1647 : :
1648 : : /* If we allowed a two-operation SLP node verify the target can cope
1649 : : with the permute we are going to use. */
1650 : 4383861 : if (alt_stmt_code != ERROR_MARK
1651 : 4383861 : && (!alt_stmt_code.is_tree_code ()
1652 : 54039 : || (TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_reference
1653 : 54039 : && TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_comparison)))
1654 : : {
1655 : 12121 : *two_operators = true;
1656 : : }
1657 : :
1658 : 4383861 : if (maybe_soft_fail)
1659 : : {
1660 : 140690 : unsigned HOST_WIDE_INT const_nunits;
1661 : 140690 : if (!TYPE_VECTOR_SUBPARTS
1662 : 140690 : (soft_fail_nunits_vectype).is_constant (&const_nunits)
1663 : 140690 : || const_nunits > group_size)
1664 : 0 : matches[0] = false;
1665 : : else
1666 : : {
1667 : : /* With constant vector elements simulate a mismatch at the
1668 : : point we need to split. */
1669 : 140690 : unsigned tail = group_size & (const_nunits - 1);
1670 : 140690 : memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
1671 : : }
1672 : 140690 : return false;
1673 : : }
1674 : :
1675 : : return true;
1676 : : }
1677 : :
1678 : : /* Traits for the hash_set to record failed SLP builds for a stmt set.
1679 : : Note we never remove apart from at destruction time so we do not
1680 : : need a special value for deleted that differs from empty. */
1681 : : struct bst_traits
1682 : : {
1683 : : typedef vec <stmt_vec_info> value_type;
1684 : : typedef vec <stmt_vec_info> compare_type;
1685 : : static inline hashval_t hash (value_type);
1686 : : static inline bool equal (value_type existing, value_type candidate);
1687 : 496050263 : static inline bool is_empty (value_type x) { return !x.exists (); }
1688 : 116024915 : static inline bool is_deleted (value_type x) { return !x.exists (); }
1689 : : static const bool empty_zero_p = true;
1690 : 0 : static inline void mark_empty (value_type &x) { x.release (); }
1691 : : static inline void mark_deleted (value_type &x) { x.release (); }
1692 : 9271993 : static inline void remove (value_type &x) { x.release (); }
1693 : : };
1694 : : inline hashval_t
1695 : 100005917 : bst_traits::hash (value_type x)
1696 : : {
1697 : 100005917 : inchash::hash h;
1698 : 431450373 : for (unsigned i = 0; i < x.length (); ++i)
1699 : 331444456 : h.add_int (x[i] ? gimple_uid (x[i]->stmt) : -1);
1700 : 100005917 : return h.end ();
1701 : : }
1702 : : inline bool
1703 : 90021840 : bst_traits::equal (value_type existing, value_type candidate)
1704 : : {
1705 : 270065520 : if (existing.length () != candidate.length ())
1706 : : return false;
1707 : 91841615 : for (unsigned i = 0; i < existing.length (); ++i)
1708 : 87011424 : if (existing[i] != candidate[i])
1709 : : return false;
1710 : : return true;
1711 : : }
1712 : :
1713 : : typedef hash_map <vec <stmt_vec_info>, slp_tree,
1714 : : simple_hashmap_traits <bst_traits, slp_tree> >
1715 : : scalar_stmts_to_slp_tree_map_t;
1716 : :
1717 : : /* Release BST_MAP. */
1718 : :
1719 : : static void
1720 : 1644049 : release_scalar_stmts_to_slp_tree_map (scalar_stmts_to_slp_tree_map_t *bst_map)
1721 : : {
1722 : : /* The map keeps a reference on SLP nodes built, release that. */
1723 : 10916042 : for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
1724 : 20188035 : it != bst_map->end (); ++it)
1725 : 9271993 : if ((*it).second)
1726 : 9271993 : vect_free_slp_tree ((*it).second);
1727 : 1644049 : delete bst_map;
1728 : 1644049 : }
1729 : :
1730 : : /* ??? This was std::pair<std::pair<tree_code, vect_def_type>, tree>
1731 : : but then vec::insert does memmove and that's not compatible with
1732 : : std::pair. */
1733 : : struct chain_op_t
1734 : : {
1735 : 3699081 : chain_op_t (tree_code code_, vect_def_type dt_, tree op_)
1736 : 3699081 : : code (code_), dt (dt_), op (op_) {}
1737 : : tree_code code;
1738 : : vect_def_type dt;
1739 : : tree op;
1740 : : };
1741 : :
1742 : : /* Comparator for sorting associatable chains. */
1743 : :
1744 : : static int
1745 : 8535414 : dt_sort_cmp (const void *op1_, const void *op2_, void *)
1746 : : {
1747 : 8535414 : auto *op1 = (const chain_op_t *) op1_;
1748 : 8535414 : auto *op2 = (const chain_op_t *) op2_;
1749 : 8535414 : if (op1->dt != op2->dt)
1750 : 1040411 : return (int)op1->dt - (int)op2->dt;
1751 : 7495003 : return (int)op1->code - (int)op2->code;
1752 : : }
1753 : :
1754 : : /* Linearize the associatable expression chain at START with the
1755 : : associatable operation CODE (where PLUS_EXPR also allows MINUS_EXPR),
1756 : : filling CHAIN with the result and using WORKLIST as intermediate storage.
1757 : : CODE_STMT and ALT_CODE_STMT are filled with the first stmt using CODE
1758 : : or MINUS_EXPR. *CHAIN_STMTS if not NULL is filled with all computation
1759 : : stmts, starting with START. */
1760 : :
1761 : : static void
1762 : 1660911 : vect_slp_linearize_chain (vec_info *vinfo,
1763 : : vec<std::pair<tree_code, gimple *> > &worklist,
1764 : : vec<chain_op_t> &chain,
1765 : : enum tree_code code, gimple *start,
1766 : : gimple *&code_stmt, gimple *&alt_code_stmt,
1767 : : vec<gimple *> *chain_stmts)
1768 : : {
1769 : : /* For each lane linearize the addition/subtraction (or other
1770 : : uniform associatable operation) expression tree. */
1771 : 1660911 : worklist.safe_push (std::make_pair (code, start));
1772 : 3699081 : while (!worklist.is_empty ())
1773 : : {
1774 : 2038170 : auto entry = worklist.pop ();
1775 : 2038170 : gassign *stmt = as_a <gassign *> (entry.second);
1776 : 2038170 : enum tree_code in_code = entry.first;
1777 : 4076340 : enum tree_code this_code = gimple_assign_rhs_code (stmt);
1778 : : /* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE. */
1779 : 2038170 : if (!code_stmt
1780 : 2038170 : && gimple_assign_rhs_code (stmt) == code)
1781 : 1396814 : code_stmt = stmt;
1782 : 641356 : else if (!alt_code_stmt
1783 : 641356 : && gimple_assign_rhs_code (stmt) == MINUS_EXPR)
1784 : 338688 : alt_code_stmt = stmt;
1785 : 2038170 : if (chain_stmts)
1786 : 1990551 : chain_stmts->safe_push (stmt);
1787 : 6114510 : for (unsigned opnum = 1; opnum <= 2; ++opnum)
1788 : : {
1789 : 4076340 : tree op = gimple_op (stmt, opnum);
1790 : 4076340 : vect_def_type dt;
1791 : 4076340 : stmt_vec_info def_stmt_info;
1792 : 4076340 : bool res = vect_is_simple_use (op, vinfo, &dt, &def_stmt_info);
1793 : 4076340 : gcc_assert (res);
1794 : 4076340 : if (dt == vect_internal_def
1795 : 4076340 : && is_pattern_stmt_p (def_stmt_info))
1796 : 6797 : op = gimple_get_lhs (def_stmt_info->stmt);
1797 : 4076340 : gimple *use_stmt;
1798 : 4076340 : use_operand_p use_p;
1799 : 4076340 : if (dt == vect_internal_def
1800 : 3785853 : && single_imm_use (op, &use_p, &use_stmt)
1801 : 2367069 : && is_gimple_assign (def_stmt_info->stmt)
1802 : 6250418 : && (gimple_assign_rhs_code (def_stmt_info->stmt) == code
1803 : 1822990 : || (code == PLUS_EXPR
1804 : 898602 : && (gimple_assign_rhs_code (def_stmt_info->stmt)
1805 : : == MINUS_EXPR))))
1806 : : {
1807 : 377259 : tree_code op_def_code = this_code;
1808 : 377259 : if (op_def_code == MINUS_EXPR && opnum == 1)
1809 : 55757 : op_def_code = PLUS_EXPR;
1810 : 377259 : if (in_code == MINUS_EXPR)
1811 : 193 : op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1812 : 377259 : worklist.safe_push (std::make_pair (op_def_code,
1813 : 377259 : def_stmt_info->stmt));
1814 : : }
1815 : : else
1816 : : {
1817 : 3699081 : tree_code op_def_code = this_code;
1818 : 3699081 : if (op_def_code == MINUS_EXPR && opnum == 1)
1819 : 287230 : op_def_code = PLUS_EXPR;
1820 : 3699081 : if (in_code == MINUS_EXPR)
1821 : 4283 : op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1822 : 3699081 : chain.safe_push (chain_op_t (op_def_code, dt, op));
1823 : : }
1824 : : }
1825 : : }
1826 : 1660911 : }
1827 : :
1828 : : static slp_tree
1829 : : vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1830 : : vec<stmt_vec_info> stmts, unsigned int group_size,
1831 : : poly_uint64 *max_nunits,
1832 : : bool *matches, unsigned *limit, unsigned *tree_size,
1833 : : scalar_stmts_to_slp_tree_map_t *bst_map);
1834 : :
1835 : : static slp_tree
1836 : 6267731 : vect_build_slp_tree (vec_info *vinfo,
1837 : : vec<stmt_vec_info> stmts, unsigned int group_size,
1838 : : poly_uint64 *max_nunits,
1839 : : bool *matches, unsigned *limit, unsigned *tree_size,
1840 : : scalar_stmts_to_slp_tree_map_t *bst_map)
1841 : : {
1842 : 6267731 : if (slp_tree *leader = bst_map->get (stmts))
1843 : : {
1844 : 496091 : if (dump_enabled_p ())
1845 : 16994 : dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
1846 : 16994 : !(*leader)->failed ? "" : "failed ",
1847 : : (void *) *leader);
1848 : 496091 : if (!(*leader)->failed)
1849 : : {
1850 : 448356 : SLP_TREE_REF_COUNT (*leader)++;
1851 : 448356 : vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
1852 : 448356 : stmts.release ();
1853 : 448356 : return *leader;
1854 : : }
1855 : 47735 : memcpy (matches, (*leader)->failed, sizeof (bool) * group_size);
1856 : 47735 : return NULL;
1857 : : }
1858 : :
1859 : : /* Single-lane SLP doesn't have the chance of run-away, do not account
1860 : : it to the limit. */
1861 : 5771640 : if (stmts.length () > 1)
1862 : : {
1863 : 3115999 : if (*limit == 0)
1864 : : {
1865 : 1501 : if (dump_enabled_p ())
1866 : 12 : dump_printf_loc (MSG_NOTE, vect_location,
1867 : : "SLP discovery limit exceeded\n");
1868 : 1501 : memset (matches, 0, sizeof (bool) * group_size);
1869 : 1501 : return NULL;
1870 : : }
1871 : 3114498 : --*limit;
1872 : : }
1873 : :
1874 : : /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
1875 : : so we can pick up backedge destinations during discovery. */
1876 : 5770139 : slp_tree res = new _slp_tree;
1877 : 5770139 : SLP_TREE_DEF_TYPE (res) = vect_internal_def;
1878 : 5770139 : SLP_TREE_SCALAR_STMTS (res) = stmts;
1879 : 5770139 : bst_map->put (stmts.copy (), res);
1880 : :
1881 : 5770139 : if (dump_enabled_p ())
1882 : 144108 : dump_printf_loc (MSG_NOTE, vect_location,
1883 : : "starting SLP discovery for node %p\n", (void *) res);
1884 : :
1885 : 5770139 : poly_uint64 this_max_nunits = 1;
1886 : 5770139 : slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
1887 : : &this_max_nunits,
1888 : : matches, limit, tree_size, bst_map);
1889 : 5770139 : if (!res_)
1890 : : {
1891 : 1951776 : if (dump_enabled_p ())
1892 : 7813 : dump_printf_loc (MSG_NOTE, vect_location,
1893 : : "SLP discovery for node %p failed\n", (void *) res);
1894 : : /* Mark the node invalid so we can detect those when still in use
1895 : : as backedge destinations. */
1896 : 1951776 : SLP_TREE_SCALAR_STMTS (res) = vNULL;
1897 : 1951776 : SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1898 : 1951776 : res->failed = XNEWVEC (bool, group_size);
1899 : 1951776 : if (flag_checking)
1900 : : {
1901 : : unsigned i;
1902 : 3451016 : for (i = 0; i < group_size; ++i)
1903 : 3451016 : if (!matches[i])
1904 : : break;
1905 : 1951776 : gcc_assert (i < group_size);
1906 : : }
1907 : 1951776 : memcpy (res->failed, matches, sizeof (bool) * group_size);
1908 : : }
1909 : : else
1910 : : {
1911 : 3818363 : if (dump_enabled_p ())
1912 : 136295 : dump_printf_loc (MSG_NOTE, vect_location,
1913 : : "SLP discovery for node %p succeeded\n",
1914 : : (void *) res);
1915 : 3818363 : gcc_assert (res_ == res);
1916 : 3818363 : res->max_nunits = this_max_nunits;
1917 : 3818363 : vect_update_max_nunits (max_nunits, this_max_nunits);
1918 : : /* Keep a reference for the bst_map use. */
1919 : 3818363 : SLP_TREE_REF_COUNT (res)++;
1920 : : }
1921 : : return res_;
1922 : : }
1923 : :
1924 : : /* Helper for building an associated SLP node chain. */
1925 : :
1926 : : static void
1927 : 122 : vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype,
1928 : : slp_tree op0, slp_tree op1,
1929 : : stmt_vec_info oper1, stmt_vec_info oper2,
1930 : : vec<std::pair<unsigned, unsigned> > lperm)
1931 : : {
1932 : 122 : unsigned group_size = SLP_TREE_LANES (op1);
1933 : :
1934 : 122 : slp_tree child1 = new _slp_tree;
1935 : 122 : SLP_TREE_DEF_TYPE (child1) = vect_internal_def;
1936 : 122 : SLP_TREE_VECTYPE (child1) = vectype;
1937 : 122 : SLP_TREE_LANES (child1) = group_size;
1938 : 122 : SLP_TREE_CHILDREN (child1).create (2);
1939 : 122 : SLP_TREE_CHILDREN (child1).quick_push (op0);
1940 : 122 : SLP_TREE_CHILDREN (child1).quick_push (op1);
1941 : 122 : SLP_TREE_REPRESENTATIVE (child1) = oper1;
1942 : :
1943 : 122 : slp_tree child2 = new _slp_tree;
1944 : 122 : SLP_TREE_DEF_TYPE (child2) = vect_internal_def;
1945 : 122 : SLP_TREE_VECTYPE (child2) = vectype;
1946 : 122 : SLP_TREE_LANES (child2) = group_size;
1947 : 122 : SLP_TREE_CHILDREN (child2).create (2);
1948 : 122 : SLP_TREE_CHILDREN (child2).quick_push (op0);
1949 : 122 : SLP_TREE_REF_COUNT (op0)++;
1950 : 122 : SLP_TREE_CHILDREN (child2).quick_push (op1);
1951 : 122 : SLP_TREE_REF_COUNT (op1)++;
1952 : 122 : SLP_TREE_REPRESENTATIVE (child2) = oper2;
1953 : :
1954 : 122 : SLP_TREE_DEF_TYPE (perm) = vect_internal_def;
1955 : 122 : SLP_TREE_CODE (perm) = VEC_PERM_EXPR;
1956 : 122 : SLP_TREE_VECTYPE (perm) = vectype;
1957 : 122 : SLP_TREE_LANES (perm) = group_size;
1958 : : /* ??? We should set this NULL but that's not expected. */
1959 : 122 : SLP_TREE_REPRESENTATIVE (perm) = oper1;
1960 : 122 : SLP_TREE_LANE_PERMUTATION (perm) = lperm;
1961 : 122 : SLP_TREE_CHILDREN (perm).quick_push (child1);
1962 : 122 : SLP_TREE_CHILDREN (perm).quick_push (child2);
1963 : 122 : }
1964 : :
1965 : : /* Recursively build an SLP tree starting from NODE.
1966 : : Fail (and return a value not equal to zero) if def-stmts are not
1967 : : isomorphic, require data permutation or are of unsupported types of
1968 : : operation. Otherwise, return 0.
1969 : : The value returned is the depth in the SLP tree where a mismatch
1970 : : was found. */
1971 : :
1972 : : static slp_tree
1973 : 5770139 : vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1974 : : vec<stmt_vec_info> stmts, unsigned int group_size,
1975 : : poly_uint64 *max_nunits,
1976 : : bool *matches, unsigned *limit, unsigned *tree_size,
1977 : : scalar_stmts_to_slp_tree_map_t *bst_map)
1978 : : {
1979 : 5770139 : unsigned nops, i, this_tree_size = 0;
1980 : 5770139 : poly_uint64 this_max_nunits = *max_nunits;
1981 : :
1982 : 5770139 : matches[0] = false;
1983 : :
1984 : 5770139 : stmt_vec_info stmt_info = stmts[0];
1985 : 5770139 : if (!is_a<gcall *> (stmt_info->stmt)
1986 : : && !is_a<gassign *> (stmt_info->stmt)
1987 : : && !is_a<gphi *> (stmt_info->stmt))
1988 : : return NULL;
1989 : :
1990 : 5770068 : nops = gimple_num_args (stmt_info->stmt);
1991 : 5770068 : if (const int *map = vect_get_operand_map (stmt_info->stmt,
1992 : 5770068 : STMT_VINFO_GATHER_SCATTER_P
1993 : : (stmt_info)))
1994 : 22796 : nops = map[0];
1995 : :
1996 : : /* If the SLP node is a PHI (induction or reduction), terminate
1997 : : the recursion. */
1998 : 5770068 : bool *skip_args = XALLOCAVEC (bool, nops);
1999 : 5770068 : memset (skip_args, 0, sizeof (bool) * nops);
2000 : 5770068 : if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
2001 : 2808678 : if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
2002 : : {
2003 : 468118 : tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
2004 : 468118 : tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
2005 : : group_size);
2006 : 468118 : if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
2007 : : max_nunits))
2008 : : return NULL;
2009 : :
2010 : 463927 : vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
2011 : 463927 : if (def_type == vect_induction_def)
2012 : : {
2013 : : /* Induction PHIs are not cycles but walk the initial
2014 : : value. Only for inner loops through, for outer loops
2015 : : we need to pick up the value from the actual PHIs
2016 : : to more easily support peeling and epilogue vectorization. */
2017 : 390593 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2018 : 390593 : if (!nested_in_vect_loop_p (loop, stmt_info))
2019 : 389949 : skip_args[loop_preheader_edge (loop)->dest_idx] = true;
2020 : : else
2021 : : loop = loop->inner;
2022 : 390593 : skip_args[loop_latch_edge (loop)->dest_idx] = true;
2023 : : }
2024 : 73334 : else if (def_type == vect_reduction_def
2025 : : || def_type == vect_double_reduction_def
2026 : : || def_type == vect_nested_cycle
2027 : 73334 : || def_type == vect_first_order_recurrence)
2028 : : {
2029 : : /* Else def types have to match. */
2030 : : stmt_vec_info other_info;
2031 : : bool all_same = true;
2032 : 162817 : FOR_EACH_VEC_ELT (stmts, i, other_info)
2033 : : {
2034 : 90534 : if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
2035 : 1731061 : return NULL;
2036 : 90532 : if (other_info != stmt_info)
2037 : 15628 : all_same = false;
2038 : : }
2039 : 72283 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2040 : : /* Reduction initial values are not explicitely represented. */
2041 : 72283 : if (def_type != vect_first_order_recurrence
2042 : 72283 : && gimple_bb (stmt_info->stmt) == loop->header)
2043 : 69528 : skip_args[loop_preheader_edge (loop)->dest_idx] = true;
2044 : : /* Reduction chain backedge defs are filled manually.
2045 : : ??? Need a better way to identify a SLP reduction chain PHI.
2046 : : Or a better overall way to SLP match those. */
2047 : 72283 : if (stmts.length () > 1
2048 : 72283 : && all_same && def_type == vect_reduction_def)
2049 : 1397 : skip_args[loop_latch_edge (loop)->dest_idx] = true;
2050 : : }
2051 : 1049 : else if (def_type != vect_internal_def)
2052 : : return NULL;
2053 : : }
2054 : :
2055 : :
2056 : 5765875 : bool two_operators = false;
2057 : 5765875 : unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
2058 : 5765875 : tree vectype = NULL_TREE;
2059 : 5765875 : if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
2060 : : &this_max_nunits, matches, &two_operators,
2061 : : &vectype))
2062 : : return NULL;
2063 : :
2064 : : /* If the SLP node is a load, terminate the recursion unless masked. */
2065 : 4243171 : if (STMT_VINFO_DATA_REF (stmt_info)
2066 : 1847146 : && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2067 : : {
2068 : 791057 : if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2069 : : gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
2070 : : else
2071 : : {
2072 : 778714 : *max_nunits = this_max_nunits;
2073 : 778714 : (*tree_size)++;
2074 : 778714 : node = vect_create_new_slp_node (node, stmts, 0);
2075 : 778714 : SLP_TREE_VECTYPE (node) = vectype;
2076 : : /* And compute the load permutation. Whether it is actually
2077 : : a permutation depends on the unrolling factor which is
2078 : : decided later. */
2079 : 778714 : vec<unsigned> load_permutation;
2080 : 778714 : int j;
2081 : 778714 : stmt_vec_info load_info;
2082 : 778714 : load_permutation.create (group_size);
2083 : 778714 : stmt_vec_info first_stmt_info
2084 : 778714 : = STMT_VINFO_GROUPED_ACCESS (stmt_info)
2085 : 778714 : ? DR_GROUP_FIRST_ELEMENT (stmt_info) : stmt_info;
2086 : 778714 : bool any_permute = false;
2087 : 1913515 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
2088 : : {
2089 : 1134801 : int load_place;
2090 : 1134801 : if (! load_info)
2091 : : {
2092 : 51107 : if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2093 : : load_place = j;
2094 : : else
2095 : : load_place = 0;
2096 : : }
2097 : 1083694 : else if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2098 : 680614 : load_place = vect_get_place_in_interleaving_chain
2099 : 680614 : (load_info, first_stmt_info);
2100 : : else
2101 : : /* Recognize the splat case as { 0, 0, ... } but make
2102 : : sure to use the appropriate refs for collections
2103 : : of invariant refs. */
2104 : 403080 : load_place = (load_info == stmt_info) ? 0 : j;
2105 : 732010 : gcc_assert (load_place != -1);
2106 : 1134801 : any_permute |= load_place != j;
2107 : 1134801 : load_permutation.quick_push (load_place);
2108 : : }
2109 : :
2110 : 778714 : if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
2111 : : {
2112 : 2258 : gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD));
2113 : 2258 : bool has_gaps = false;
2114 : 2258 : if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2115 : 119 : for (stmt_vec_info si = DR_GROUP_NEXT_ELEMENT (first_stmt_info);
2116 : 196 : si; si = DR_GROUP_NEXT_ELEMENT (si))
2117 : 77 : if (DR_GROUP_GAP (si) != 1)
2118 : 20 : has_gaps = true;
2119 : : /* We cannot handle permuted masked loads directly, see
2120 : : PR114375. We cannot handle strided masked loads or masked
2121 : : loads with gaps unless the mask is uniform. */
2122 : 2258 : if ((STMT_VINFO_GROUPED_ACCESS (stmt_info)
2123 : 119 : && (DR_GROUP_GAP (first_stmt_info) != 0
2124 : 59 : || (has_gaps
2125 : 20 : && STMT_VINFO_SLP_VECT_ONLY (first_stmt_info))))
2126 : 4436 : || STMT_VINFO_STRIDED_P (stmt_info))
2127 : : {
2128 : 93 : load_permutation.release ();
2129 : 93 : matches[0] = false;
2130 : 776573 : return NULL;
2131 : : }
2132 : :
2133 : : /* For permuted masked loads do an unpermuted masked load of
2134 : : the whole group followed by a SLP permute node. */
2135 : 2165 : if (any_permute
2136 : 2165 : || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2137 : 25 : && DR_GROUP_SIZE (first_stmt_info) != group_size))
2138 : : {
2139 : : /* Discover the whole unpermuted load. */
2140 : 24 : vec<stmt_vec_info> stmts2;
2141 : 24 : unsigned dr_group_size = STMT_VINFO_GROUPED_ACCESS (stmt_info)
2142 : 38 : ? DR_GROUP_SIZE (first_stmt_info) : 1;
2143 : 24 : stmts2.create (dr_group_size);
2144 : 24 : stmts2.quick_grow_cleared (dr_group_size);
2145 : 24 : unsigned i = 0;
2146 : 24 : for (stmt_vec_info si = first_stmt_info;
2147 : 74 : si; si = DR_GROUP_NEXT_ELEMENT (si))
2148 : : {
2149 : 50 : if (si != first_stmt_info)
2150 : 26 : for (unsigned k = 1; k < DR_GROUP_GAP (si); ++k)
2151 : 0 : stmts2[i++] = NULL;
2152 : 50 : stmts2[i++] = si;
2153 : : }
2154 : 24 : bool *matches2 = XALLOCAVEC (bool, dr_group_size);
2155 : 24 : slp_tree unperm_load
2156 : 24 : = vect_build_slp_tree (vinfo, stmts2, dr_group_size,
2157 : : &this_max_nunits, matches2, limit,
2158 : 24 : &this_tree_size, bst_map);
2159 : : /* When we are able to do the full masked load emit that
2160 : : followed by 'node' being the desired final permutation. */
2161 : 24 : if (unperm_load)
2162 : : {
2163 : 16 : gcc_assert
2164 : : (!SLP_TREE_LOAD_PERMUTATION (unperm_load).exists ());
2165 : 16 : lane_permutation_t lperm;
2166 : 16 : lperm.create (group_size);
2167 : 56 : for (unsigned j = 0; j < load_permutation.length (); ++j)
2168 : 40 : lperm.quick_push
2169 : 40 : (std::make_pair (0, load_permutation[j]));
2170 : 16 : SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2171 : 16 : SLP_TREE_CHILDREN (node).safe_push (unperm_load);
2172 : 16 : SLP_TREE_LANE_PERMUTATION (node) = lperm;
2173 : 16 : load_permutation.release ();
2174 : 16 : return node;
2175 : : }
2176 : 8 : stmts2.release ();
2177 : 8 : load_permutation.release ();
2178 : 8 : matches[0] = false;
2179 : 8 : return NULL;
2180 : : }
2181 : 2141 : load_permutation.release ();
2182 : : }
2183 : : else
2184 : : {
2185 : 776456 : if (!any_permute
2186 : 668054 : && STMT_VINFO_GROUPED_ACCESS (stmt_info)
2187 : 1057935 : && group_size == DR_GROUP_SIZE (first_stmt_info))
2188 : 132826 : load_permutation.release ();
2189 : 776456 : SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
2190 : 776456 : return node;
2191 : : }
2192 : : }
2193 : : }
2194 : 3452114 : else if (gimple_assign_single_p (stmt_info->stmt)
2195 : 2118320 : && !gimple_vuse (stmt_info->stmt)
2196 : 3459584 : && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
2197 : : {
2198 : : /* vect_build_slp_tree_2 determined all BIT_FIELD_REFs reference
2199 : : the same SSA name vector of a compatible type to vectype. */
2200 : 2124 : vec<std::pair<unsigned, unsigned> > lperm = vNULL;
2201 : 2124 : tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
2202 : 2124 : stmt_vec_info estmt_info;
2203 : 6674 : FOR_EACH_VEC_ELT (stmts, i, estmt_info)
2204 : : {
2205 : 4697 : gassign *estmt = as_a <gassign *> (estmt_info->stmt);
2206 : 4697 : tree bfref = gimple_assign_rhs1 (estmt);
2207 : 4697 : HOST_WIDE_INT lane;
2208 : 4697 : if (!known_eq (bit_field_size (bfref),
2209 : : tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
2210 : 9247 : || !constant_multiple_p (bit_field_offset (bfref),
2211 : 4550 : bit_field_size (bfref), &lane))
2212 : : {
2213 : 147 : lperm.release ();
2214 : 147 : matches[0] = false;
2215 : 147 : return NULL;
2216 : : }
2217 : 4550 : lperm.safe_push (std::make_pair (0, (unsigned)lane));
2218 : : }
2219 : 1977 : slp_tree vnode = vect_create_new_slp_node (vNULL);
2220 : 1977 : if (operand_equal_p (TYPE_SIZE (vectype), TYPE_SIZE (TREE_TYPE (vec))))
2221 : : /* ??? We record vectype here but we hide eventually necessary
2222 : : punning and instead rely on code generation to materialize
2223 : : VIEW_CONVERT_EXPRs as necessary. We instead should make
2224 : : this explicit somehow. */
2225 : 684 : SLP_TREE_VECTYPE (vnode) = vectype;
2226 : : else
2227 : : {
2228 : : /* For different size but compatible elements we can still
2229 : : use VEC_PERM_EXPR without punning. */
2230 : 1293 : gcc_assert (VECTOR_TYPE_P (TREE_TYPE (vec))
2231 : : && types_compatible_p (TREE_TYPE (vectype),
2232 : : TREE_TYPE (TREE_TYPE (vec))));
2233 : 1293 : SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
2234 : : }
2235 : 1977 : auto nunits = TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (vnode));
2236 : 1977 : unsigned HOST_WIDE_INT const_nunits;
2237 : 1977 : if (nunits.is_constant (&const_nunits))
2238 : 1977 : SLP_TREE_LANES (vnode) = const_nunits;
2239 : 1977 : SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
2240 : : /* We are always building a permutation node even if it is an identity
2241 : : permute to shield the rest of the vectorizer from the odd node
2242 : : representing an actual vector without any scalar ops.
2243 : : ??? We could hide it completely with making the permute node
2244 : : external? */
2245 : 1977 : node = vect_create_new_slp_node (node, stmts, 1);
2246 : 1977 : SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2247 : 1977 : SLP_TREE_LANE_PERMUTATION (node) = lperm;
2248 : 1977 : SLP_TREE_VECTYPE (node) = vectype;
2249 : 1977 : SLP_TREE_CHILDREN (node).quick_push (vnode);
2250 : 1977 : return node;
2251 : : }
2252 : : /* When discovery reaches an associatable operation see whether we can
2253 : : improve that to match up lanes in a way superior to the operand
2254 : : swapping code which at most looks at two defs.
2255 : : ??? For BB vectorization we cannot do the brute-force search
2256 : : for matching as we can succeed by means of builds from scalars
2257 : : and have no good way to "cost" one build against another. */
2258 : 3449990 : else if (is_a <loop_vec_info> (vinfo)
2259 : : /* Do not bother for single-lane SLP. */
2260 : 2108415 : && group_size > 1
2261 : : /* ??? We don't handle !vect_internal_def defs below. */
2262 : 77907 : && STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2263 : : /* ??? Do not associate a reduction, this will wreck REDUC_IDX
2264 : : mapping as long as that exists on the stmt_info level. */
2265 : 61467 : && STMT_VINFO_REDUC_IDX (stmt_info) == -1
2266 : 56356 : && is_gimple_assign (stmt_info->stmt)
2267 : 56131 : && (associative_tree_code (gimple_assign_rhs_code (stmt_info->stmt))
2268 : 38758 : || gimple_assign_rhs_code (stmt_info->stmt) == MINUS_EXPR)
2269 : 3468965 : && ((FLOAT_TYPE_P (vectype) && flag_associative_math)
2270 : 11489 : || (INTEGRAL_TYPE_P (TREE_TYPE (vectype))
2271 : 9611 : && TYPE_OVERFLOW_WRAPS (TREE_TYPE (vectype)))))
2272 : : {
2273 : : /* See if we have a chain of (mixed) adds or subtracts or other
2274 : : associatable ops. */
2275 : 13608 : enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
2276 : 13608 : if (code == MINUS_EXPR)
2277 : 677 : code = PLUS_EXPR;
2278 : 13608 : stmt_vec_info other_op_stmt_info = NULL;
2279 : 13608 : stmt_vec_info op_stmt_info = NULL;
2280 : 13608 : unsigned chain_len = 0;
2281 : 13608 : auto_vec<chain_op_t> chain;
2282 : 13608 : auto_vec<std::pair<tree_code, gimple *> > worklist;
2283 : 13608 : auto_vec<vec<chain_op_t> > chains (group_size);
2284 : 13608 : auto_vec<slp_tree, 4> children;
2285 : 13608 : bool hard_fail = true;
2286 : 14403 : for (unsigned lane = 0; lane < group_size; ++lane)
2287 : : {
2288 : 14152 : if (!stmts[lane])
2289 : : {
2290 : : /* ??? Below we require lane zero is present. */
2291 : 0 : if (lane == 0)
2292 : : {
2293 : : hard_fail = false;
2294 : 13357 : break;
2295 : : }
2296 : 0 : chains.quick_push (vNULL);
2297 : 0 : continue;
2298 : : }
2299 : : /* For each lane linearize the addition/subtraction (or other
2300 : : uniform associatable operation) expression tree. */
2301 : 14152 : gimple *op_stmt = NULL, *other_op_stmt = NULL;
2302 : 14152 : vect_slp_linearize_chain (vinfo, worklist, chain, code,
2303 : 14152 : stmts[lane]->stmt, op_stmt, other_op_stmt,
2304 : : NULL);
2305 : 14152 : if (!op_stmt_info && op_stmt)
2306 : 13078 : op_stmt_info = vinfo->lookup_stmt (op_stmt);
2307 : 14152 : if (!other_op_stmt_info && other_op_stmt)
2308 : 713 : other_op_stmt_info = vinfo->lookup_stmt (other_op_stmt);
2309 : 14152 : if (chain.length () == 2)
2310 : : {
2311 : : /* In a chain of just two elements resort to the regular
2312 : : operand swapping scheme. Likewise if we run into a
2313 : : length mismatch process regularly as well as we did not
2314 : : process the other lanes we cannot report a good hint what
2315 : : lanes to try swapping in the parent. */
2316 : : hard_fail = false;
2317 : : break;
2318 : : }
2319 : 798 : else if (chain_len == 0)
2320 : 291 : chain_len = chain.length ();
2321 : 1014 : else if (chain.length () != chain_len)
2322 : : {
2323 : : /* ??? Here we could slip in magic to compensate with
2324 : : neutral operands. */
2325 : 3 : matches[lane] = false;
2326 : 3 : if (lane != group_size - 1)
2327 : 3 : matches[0] = false;
2328 : : break;
2329 : : }
2330 : 795 : chains.quick_push (chain.copy ());
2331 : 795 : chain.truncate (0);
2332 : : }
2333 : 27216 : if (chains.length () == group_size)
2334 : : {
2335 : : /* We cannot yet use SLP_TREE_CODE to communicate the operation. */
2336 : 251 : if (!op_stmt_info)
2337 : : {
2338 : 2 : hard_fail = false;
2339 : 2 : goto out;
2340 : : }
2341 : : /* Now we have a set of chains with the same length. */
2342 : : /* 1. pre-sort according to def_type and operation. */
2343 : 934 : for (unsigned lane = 0; lane < group_size; ++lane)
2344 : 1370 : chains[lane].stablesort (dt_sort_cmp, vinfo);
2345 : 249 : if (dump_enabled_p ())
2346 : : {
2347 : 127 : dump_printf_loc (MSG_NOTE, vect_location,
2348 : : "pre-sorted chains of %s\n",
2349 : : get_tree_code_name (code));
2350 : 541 : for (unsigned lane = 0; lane < group_size; ++lane)
2351 : : {
2352 : 414 : if (!stmts[lane])
2353 : 0 : dump_printf (MSG_NOTE, "--");
2354 : : else
2355 : 1870 : for (unsigned opnum = 0; opnum < chain_len; ++opnum)
2356 : 2912 : dump_printf (MSG_NOTE, "%s %T ",
2357 : 1456 : get_tree_code_name (chains[lane][opnum].code),
2358 : 1456 : chains[lane][opnum].op);
2359 : 414 : dump_printf (MSG_NOTE, "\n");
2360 : : }
2361 : : }
2362 : : /* 2. try to build children nodes, associating as necessary. */
2363 : : /* 2a. prepare and perform early checks to avoid eating into
2364 : : discovery limit unnecessarily. */
2365 : 249 : vect_def_type *dts = XALLOCAVEC (vect_def_type, chain_len);
2366 : 1051 : for (unsigned n = 0; n < chain_len; ++n)
2367 : : {
2368 : 802 : vect_def_type dt = chains[0][n].dt;
2369 : 802 : unsigned lane;
2370 : 3103 : for (lane = 0; lane < group_size; ++lane)
2371 : 4602 : if (stmts[lane] && chains[lane][n].dt != dt)
2372 : : {
2373 : 0 : if (dt == vect_constant_def
2374 : 0 : && chains[lane][n].dt == vect_external_def)
2375 : : dt = vect_external_def;
2376 : 0 : else if (dt == vect_external_def
2377 : 0 : && chains[lane][n].dt == vect_constant_def)
2378 : : ;
2379 : : else
2380 : : break;
2381 : : }
2382 : 802 : if (lane != group_size)
2383 : : {
2384 : 0 : if (dump_enabled_p ())
2385 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
2386 : : "giving up on chain due to mismatched "
2387 : : "def types\n");
2388 : 0 : matches[lane] = false;
2389 : 0 : if (lane != group_size - 1)
2390 : 0 : matches[0] = false;
2391 : 0 : goto out;
2392 : : }
2393 : 802 : dts[n] = dt;
2394 : 802 : if (dt == vect_constant_def
2395 : 802 : || dt == vect_external_def)
2396 : : {
2397 : : /* Check whether we can build the invariant. If we can't
2398 : : we never will be able to. */
2399 : 71 : tree type = TREE_TYPE (chains[0][n].op);
2400 : 802 : if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
2401 : : && (TREE_CODE (type) == BOOLEAN_TYPE
2402 : : || !can_duplicate_and_interleave_p (vinfo, group_size,
2403 : : type)))
2404 : : {
2405 : : matches[0] = false;
2406 : : goto out;
2407 : : }
2408 : : }
2409 : 731 : else if (dt != vect_internal_def)
2410 : : {
2411 : : /* Not sure, we might need sth special.
2412 : : gcc.dg/vect/pr96854.c,
2413 : : gfortran.dg/vect/fast-math-pr37021.f90
2414 : : and gfortran.dg/vect/pr61171.f trigger. */
2415 : : /* Soft-fail for now. */
2416 : 0 : hard_fail = false;
2417 : 0 : goto out;
2418 : : }
2419 : : }
2420 : : /* 2b. do the actual build. */
2421 : 997 : for (unsigned n = 0; n < chain_len; ++n)
2422 : : {
2423 : 767 : vect_def_type dt = dts[n];
2424 : 767 : unsigned lane;
2425 : 767 : if (dt == vect_constant_def
2426 : 767 : || dt == vect_external_def)
2427 : : {
2428 : 71 : vec<tree> ops;
2429 : 71 : ops.create (group_size);
2430 : 355 : for (lane = 0; lane < group_size; ++lane)
2431 : 213 : if (stmts[lane])
2432 : 213 : ops.quick_push (chains[lane][n].op);
2433 : : else
2434 : 0 : ops.quick_push (NULL_TREE);
2435 : 71 : slp_tree child = vect_create_new_slp_node (ops);
2436 : 71 : SLP_TREE_DEF_TYPE (child) = dt;
2437 : 71 : children.safe_push (child);
2438 : : }
2439 : : else
2440 : : {
2441 : 696 : vec<stmt_vec_info> op_stmts;
2442 : 696 : op_stmts.create (group_size);
2443 : 696 : slp_tree child = NULL;
2444 : : /* Brute-force our way. We have to consider a lane
2445 : : failing after fixing an earlier fail up in the
2446 : : SLP discovery recursion. So track the current
2447 : : permute per lane. */
2448 : 696 : unsigned *perms = XALLOCAVEC (unsigned, group_size);
2449 : 696 : memset (perms, 0, sizeof (unsigned) * group_size);
2450 : 775 : do
2451 : : {
2452 : 775 : op_stmts.truncate (0);
2453 : 3792 : for (lane = 0; lane < group_size; ++lane)
2454 : 2242 : if (stmts[lane])
2455 : 2242 : op_stmts.quick_push
2456 : 2242 : (vinfo->lookup_def (chains[lane][n].op));
2457 : : else
2458 : 0 : op_stmts.quick_push (NULL);
2459 : 775 : child = vect_build_slp_tree (vinfo, op_stmts,
2460 : : group_size, &this_max_nunits,
2461 : : matches, limit,
2462 : : &this_tree_size, bst_map);
2463 : : /* ??? We're likely getting too many fatal mismatches
2464 : : here so maybe we want to ignore them (but then we
2465 : : have no idea which lanes fatally mismatched). */
2466 : 775 : if (child || !matches[0])
2467 : : break;
2468 : : /* Swap another lane we have not yet matched up into
2469 : : lanes that did not match. If we run out of
2470 : : permute possibilities for a lane terminate the
2471 : : search. */
2472 : 257 : bool term = false;
2473 : 257 : for (lane = 1; lane < group_size; ++lane)
2474 : 178 : if (!matches[lane])
2475 : : {
2476 : 150 : if (n + perms[lane] + 1 == chain_len)
2477 : : {
2478 : : term = true;
2479 : : break;
2480 : : }
2481 : 131 : if (dump_enabled_p ())
2482 : 113 : dump_printf_loc (MSG_NOTE, vect_location,
2483 : : "swapping operand %d and %d "
2484 : : "of lane %d\n",
2485 : : n, n + perms[lane] + 1, lane);
2486 : 262 : std::swap (chains[lane][n],
2487 : 131 : chains[lane][n + perms[lane] + 1]);
2488 : 131 : perms[lane]++;
2489 : : }
2490 : 98 : if (term)
2491 : : break;
2492 : : }
2493 : : while (1);
2494 : 696 : if (!child)
2495 : : {
2496 : 19 : if (dump_enabled_p ())
2497 : 18 : dump_printf_loc (MSG_NOTE, vect_location,
2498 : : "failed to match up op %d\n", n);
2499 : 19 : op_stmts.release ();
2500 : 19 : if (lane != group_size - 1)
2501 : 9 : matches[0] = false;
2502 : : else
2503 : 10 : matches[lane] = false;
2504 : 19 : goto out;
2505 : : }
2506 : 677 : if (dump_enabled_p ())
2507 : : {
2508 : 337 : dump_printf_loc (MSG_NOTE, vect_location,
2509 : : "matched up op %d to\n", n);
2510 : 337 : vect_print_slp_tree (MSG_NOTE, vect_location, child);
2511 : : }
2512 : 677 : children.safe_push (child);
2513 : : }
2514 : : }
2515 : : /* 3. build SLP nodes to combine the chain. */
2516 : 842 : for (unsigned lane = 0; lane < group_size; ++lane)
2517 : 1236 : if (stmts[lane] && chains[lane][0].code != code)
2518 : : {
2519 : : /* See if there's any alternate all-PLUS entry. */
2520 : : unsigned n;
2521 : 6 : for (n = 1; n < chain_len; ++n)
2522 : : {
2523 : 30 : for (lane = 0; lane < group_size; ++lane)
2524 : 48 : if (stmts[lane] && chains[lane][n].code != code)
2525 : : break;
2526 : 6 : if (lane == group_size)
2527 : : break;
2528 : : }
2529 : 6 : if (n != chain_len)
2530 : : {
2531 : : /* Swap that in at first position. */
2532 : 6 : std::swap (children[0], children[n]);
2533 : 30 : for (lane = 0; lane < group_size; ++lane)
2534 : 24 : if (stmts[lane])
2535 : 24 : std::swap (chains[lane][0], chains[lane][n]);
2536 : : }
2537 : : else
2538 : : {
2539 : : /* ??? When this triggers and we end up with two
2540 : : vect_constant/external_def up-front things break (ICE)
2541 : : spectacularly finding an insertion place for the
2542 : : all-constant op. We should have a fully
2543 : : vect_internal_def operand though(?) so we can swap
2544 : : that into first place and then prepend the all-zero
2545 : : constant. */
2546 : 0 : if (dump_enabled_p ())
2547 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
2548 : : "inserting constant zero to compensate "
2549 : : "for (partially) negated first "
2550 : : "operand\n");
2551 : 0 : chain_len++;
2552 : 0 : for (lane = 0; lane < group_size; ++lane)
2553 : 0 : if (stmts[lane])
2554 : 0 : chains[lane].safe_insert
2555 : 0 : (0, chain_op_t (code, vect_constant_def, NULL_TREE));
2556 : 0 : vec<tree> zero_ops;
2557 : 0 : zero_ops.create (group_size);
2558 : 0 : zero_ops.quick_push (build_zero_cst (TREE_TYPE (vectype)));
2559 : 0 : for (lane = 1; lane < group_size; ++lane)
2560 : 0 : if (stmts[lane])
2561 : 0 : zero_ops.quick_push (zero_ops[0]);
2562 : : else
2563 : 0 : zero_ops.quick_push (NULL_TREE);
2564 : 0 : slp_tree zero = vect_create_new_slp_node (zero_ops);
2565 : 0 : SLP_TREE_DEF_TYPE (zero) = vect_constant_def;
2566 : 0 : children.safe_insert (0, zero);
2567 : : }
2568 : : break;
2569 : : }
2570 : 743 : for (unsigned i = 1; i < children.length (); ++i)
2571 : : {
2572 : 513 : slp_tree op0 = children[i - 1];
2573 : 513 : slp_tree op1 = children[i];
2574 : 513 : bool this_two_op = false;
2575 : 1845 : for (unsigned lane = 0; lane < group_size; ++lane)
2576 : 2908 : if (stmts[lane] && chains[lane][i].code != chains[0][i].code)
2577 : : {
2578 : : this_two_op = true;
2579 : : break;
2580 : : }
2581 : 513 : slp_tree child;
2582 : 513 : if (i == children.length () - 1)
2583 : 230 : child = vect_create_new_slp_node (node, stmts, 2);
2584 : : else
2585 : 283 : child = vect_create_new_slp_node (2, ERROR_MARK);
2586 : 513 : if (this_two_op)
2587 : : {
2588 : 122 : vec<std::pair<unsigned, unsigned> > lperm;
2589 : 122 : lperm.create (group_size);
2590 : 462 : for (unsigned lane = 0; lane < group_size; ++lane)
2591 : 680 : lperm.quick_push (std::make_pair
2592 : 340 : (chains[lane][i].code != chains[0][i].code, lane));
2593 : 244 : vect_slp_build_two_operator_nodes (child, vectype, op0, op1,
2594 : 122 : (chains[0][i].code == code
2595 : : ? op_stmt_info
2596 : : : other_op_stmt_info),
2597 : 122 : (chains[0][i].code == code
2598 : : ? other_op_stmt_info
2599 : : : op_stmt_info),
2600 : : lperm);
2601 : : }
2602 : : else
2603 : : {
2604 : 391 : SLP_TREE_DEF_TYPE (child) = vect_internal_def;
2605 : 391 : SLP_TREE_VECTYPE (child) = vectype;
2606 : 391 : SLP_TREE_LANES (child) = group_size;
2607 : 391 : SLP_TREE_CHILDREN (child).quick_push (op0);
2608 : 391 : SLP_TREE_CHILDREN (child).quick_push (op1);
2609 : 391 : SLP_TREE_REPRESENTATIVE (child)
2610 : 782 : = (chains[0][i].code == code
2611 : 391 : ? op_stmt_info : other_op_stmt_info);
2612 : : }
2613 : 513 : children[i] = child;
2614 : : }
2615 : 230 : *tree_size += this_tree_size + 1;
2616 : 230 : *max_nunits = this_max_nunits;
2617 : 1118 : while (!chains.is_empty ())
2618 : 636 : chains.pop ().release ();
2619 : : return node;
2620 : : }
2621 : 13357 : out:
2622 : 13378 : if (dump_enabled_p ())
2623 : 2721 : dump_printf_loc (MSG_NOTE, vect_location,
2624 : : "failed to line up SLP graph by re-associating "
2625 : : "operations in lanes%s\n",
2626 : : !hard_fail ? " trying regular discovery" : "");
2627 : 13383 : while (!children.is_empty ())
2628 : 5 : vect_free_slp_tree (children.pop ());
2629 : 13537 : while (!chains.is_empty ())
2630 : 159 : chains.pop ().release ();
2631 : : /* Hard-fail, otherwise we might run into quadratic processing of the
2632 : : chains starting one stmt into the chain again. */
2633 : 13378 : if (hard_fail)
2634 : : return NULL;
2635 : : /* Fall thru to normal processing. */
2636 : 13608 : }
2637 : :
2638 : : /* Get at the operands, verifying they are compatible. */
2639 : 3464222 : vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
2640 : 3464222 : slp_oprnd_info oprnd_info;
2641 : 16173453 : FOR_EACH_VEC_ELT (stmts, i, stmt_info)
2642 : : {
2643 : 25423406 : int res = vect_get_and_check_slp_defs (vinfo, vectype,
2644 : 12711703 : swap[i], skip_args,
2645 : : stmts, i, &oprnds_info);
2646 : 12711703 : if (res != 0)
2647 : 513292 : matches[(res == -1) ? 0 : i] = false;
2648 : 12711703 : if (!matches[0])
2649 : : break;
2650 : : }
2651 : 15882237 : for (i = 0; i < group_size; ++i)
2652 : 12622108 : if (!matches[i])
2653 : : {
2654 : 204093 : vect_free_oprnd_info (oprnds_info);
2655 : 204093 : return NULL;
2656 : : }
2657 : 9780387 : swap = NULL;
2658 : :
2659 : 9780387 : bool has_two_operators_perm = false;
2660 : 19560774 : auto_vec<unsigned> two_op_perm_indices[2];
2661 : 3260129 : vec<stmt_vec_info> two_op_scalar_stmts[2] = {vNULL, vNULL};
2662 : :
2663 : 3272084 : if (two_operators && oprnds_info.length () == 2 && group_size > 2)
2664 : : {
2665 : 2606 : unsigned idx = 0;
2666 : 2606 : hash_map<gimple *, unsigned> seen;
2667 : 2606 : vec<slp_oprnd_info> new_oprnds_info
2668 : 2606 : = vect_create_oprnd_info (1, group_size);
2669 : 2606 : bool success = true;
2670 : :
2671 : 2606 : enum tree_code code = ERROR_MARK;
2672 : 2606 : if (oprnds_info[0]->def_stmts[0]
2673 : 2606 : && is_a<gassign *> (oprnds_info[0]->def_stmts[0]->stmt))
2674 : 2548 : code = gimple_assign_rhs_code (oprnds_info[0]->def_stmts[0]->stmt);
2675 : 2606 : basic_block bb = nullptr;
2676 : :
2677 : 5675 : for (unsigned j = 0; j < group_size; ++j)
2678 : : {
2679 : 13523 : FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2680 : : {
2681 : 10454 : stmt_vec_info stmt_info = oprnd_info->def_stmts[j];
2682 : 10454 : if (!stmt_info
2683 : 10323 : || !is_a<gassign *> (stmt_info->stmt)
2684 : 10320 : || gimple_assign_rhs_code (stmt_info->stmt) != code
2685 : 18683 : || skip_args[i])
2686 : : {
2687 : : success = false;
2688 : 2229 : break;
2689 : : }
2690 : : /* Avoid mixing lanes with defs in different basic-blocks. */
2691 : 8229 : if (!bb)
2692 : 2706 : bb = gimple_bb (vect_orig_stmt (stmt_info)->stmt);
2693 : 7045 : else if (gimple_bb (vect_orig_stmt (stmt_info)->stmt) != bb)
2694 : : {
2695 : : success = false;
2696 : : break;
2697 : : }
2698 : :
2699 : 8225 : bool exists;
2700 : 8225 : unsigned &stmt_idx
2701 : 8225 : = seen.get_or_insert (stmt_info->stmt, &exists);
2702 : :
2703 : 8225 : if (!exists)
2704 : : {
2705 : 7176 : new_oprnds_info[0]->def_stmts.safe_push (stmt_info);
2706 : 7176 : new_oprnds_info[0]->ops.safe_push (oprnd_info->ops[j]);
2707 : 7176 : stmt_idx = idx;
2708 : 7176 : idx++;
2709 : : }
2710 : :
2711 : 8225 : two_op_perm_indices[i].safe_push (stmt_idx);
2712 : : }
2713 : :
2714 : 5298 : if (!success)
2715 : : break;
2716 : : }
2717 : :
2718 : 2606 : if (success && idx == group_size)
2719 : : {
2720 : 56 : if (dump_enabled_p ())
2721 : : {
2722 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
2723 : : "Replace two_operators operands:\n");
2724 : :
2725 : 0 : FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2726 : : {
2727 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
2728 : : "Operand %u:\n", i);
2729 : 0 : for (unsigned j = 0; j < group_size; j++)
2730 : 0 : dump_printf_loc (MSG_NOTE, vect_location, "\tstmt %u %G",
2731 : 0 : j, oprnd_info->def_stmts[j]->stmt);
2732 : : }
2733 : :
2734 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
2735 : : "With a single operand:\n");
2736 : 0 : for (unsigned j = 0; j < group_size; j++)
2737 : 0 : dump_printf_loc (MSG_NOTE, vect_location, "\tstmt %u %G",
2738 : 0 : j, new_oprnds_info[0]->def_stmts[j]->stmt);
2739 : : }
2740 : :
2741 : 56 : two_op_scalar_stmts[0].safe_splice (oprnds_info[0]->def_stmts);
2742 : 56 : two_op_scalar_stmts[1].safe_splice (oprnds_info[1]->def_stmts);
2743 : :
2744 : 56 : new_oprnds_info[0]->first_op_type = oprnds_info[0]->first_op_type;
2745 : 56 : new_oprnds_info[0]->first_dt = oprnds_info[0]->first_dt;
2746 : 56 : new_oprnds_info[0]->any_pattern = oprnds_info[0]->any_pattern;
2747 : 56 : new_oprnds_info[0]->first_gs_p = oprnds_info[0]->first_gs_p;
2748 : 56 : new_oprnds_info[0]->first_gs_info = oprnds_info[0]->first_gs_info;
2749 : :
2750 : 56 : vect_free_oprnd_info (oprnds_info);
2751 : 56 : oprnds_info = new_oprnds_info;
2752 : 56 : nops = 1;
2753 : 56 : has_two_operators_perm = true;
2754 : : }
2755 : : else
2756 : 2550 : vect_free_oprnd_info (new_oprnds_info);
2757 : 2606 : }
2758 : :
2759 : 6520258 : auto_vec<slp_tree, 4> children;
2760 : :
2761 : 3260129 : stmt_info = stmts[0];
2762 : :
2763 : 3260129 : int reduc_idx = -1;
2764 : 3260129 : int gs_scale = 0;
2765 : 3260129 : tree gs_base = NULL_TREE;
2766 : :
2767 : : /* Create SLP_TREE nodes for the definition node/s. */
2768 : 8531741 : FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2769 : : {
2770 : 5357273 : slp_tree child = nullptr;
2771 : 5357273 : unsigned int j;
2772 : :
2773 : : /* We're skipping certain operands from processing, for example
2774 : : outer loop reduction initial defs. */
2775 : 5357273 : if (skip_args[i])
2776 : : {
2777 : 851467 : children.safe_push (NULL);
2778 : 6123079 : continue;
2779 : : }
2780 : :
2781 : 4505806 : if (oprnd_info->first_dt == vect_uninitialized_def)
2782 : : {
2783 : : /* COND_EXPR have one too many eventually if the condition
2784 : : is a SSA name. */
2785 : 0 : gcc_assert (i == 3 && nops == 4);
2786 : 0 : continue;
2787 : : }
2788 : :
2789 : 4505806 : if (oprnd_info->first_gs_p)
2790 : : {
2791 : 15962 : gs_scale = oprnd_info->first_gs_info.scale;
2792 : 15962 : gs_base = oprnd_info->first_gs_info.base;
2793 : : }
2794 : :
2795 : 4505806 : if (is_a <bb_vec_info> (vinfo)
2796 : 1587876 : && oprnd_info->first_dt == vect_internal_def
2797 : 5329154 : && !oprnd_info->any_pattern)
2798 : : {
2799 : : /* For BB vectorization, if all defs are the same do not
2800 : : bother to continue the build along the single-lane
2801 : : graph but use a splat of the scalar value. */
2802 : 780823 : stmt_vec_info first_def = oprnd_info->def_stmts[0];
2803 : 840493 : for (j = 1; j < group_size; ++j)
2804 : 796727 : if (oprnd_info->def_stmts[j] != first_def)
2805 : : break;
2806 : 780823 : if (j == group_size
2807 : : /* But avoid doing this for loads where we may be
2808 : : able to CSE things, unless the stmt is not
2809 : : vectorizable. */
2810 : 780823 : && (!STMT_VINFO_VECTORIZABLE (first_def)
2811 : 49975 : || !gimple_vuse (first_def->stmt)))
2812 : : {
2813 : 34539 : if (dump_enabled_p ())
2814 : 94 : dump_printf_loc (MSG_NOTE, vect_location,
2815 : : "Using a splat of the uniform operand %G",
2816 : : first_def->stmt);
2817 : 34539 : oprnd_info->first_dt = vect_external_def;
2818 : : }
2819 : : }
2820 : :
2821 : 4505806 : if (oprnd_info->first_dt == vect_external_def
2822 : 4505806 : || oprnd_info->first_dt == vect_constant_def)
2823 : : {
2824 : 1650211 : if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ())
2825 : : {
2826 : : tree op0;
2827 : : tree uniform_val = op0 = oprnd_info->ops[0];
2828 : : for (j = 1; j < oprnd_info->ops.length (); ++j)
2829 : : if (oprnd_info->ops[j]
2830 : : && !operand_equal_p (uniform_val, oprnd_info->ops[j]))
2831 : : {
2832 : : uniform_val = NULL_TREE;
2833 : : break;
2834 : : }
2835 : : if (!uniform_val
2836 : : && !can_duplicate_and_interleave_p (vinfo,
2837 : : oprnd_info->ops.length (),
2838 : : TREE_TYPE (op0)))
2839 : : {
2840 : : matches[j] = false;
2841 : : if (dump_enabled_p ())
2842 : : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2843 : : "Build SLP failed: invalid type of def "
2844 : : "for variable-length SLP %T\n", op0);
2845 : : goto fail;
2846 : : }
2847 : : }
2848 : 1650211 : slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
2849 : 1650211 : SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
2850 : 1650211 : oprnd_info->ops = vNULL;
2851 : 1650211 : children.safe_push (invnode);
2852 : 1650211 : continue;
2853 : 1650211 : }
2854 : :
2855 : : /* See which SLP operand a reduction chain continues on. We want
2856 : : to chain even PHIs but not backedges. */
2857 : 2855595 : if (STMT_VINFO_REDUC_DEF (oprnd_info->def_stmts[0])
2858 : 2855595 : || STMT_VINFO_REDUC_IDX (oprnd_info->def_stmts[0]) != -1)
2859 : : {
2860 : 156834 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
2861 : : {
2862 : 596 : if (oprnd_info->first_dt == vect_double_reduction_def)
2863 : 298 : reduc_idx = i;
2864 : : }
2865 : 156238 : else if (is_a <gphi *> (stmt_info->stmt)
2866 : 156238 : && gimple_phi_num_args
2867 : 68433 : (as_a <gphi *> (stmt_info->stmt)) != 1)
2868 : : ;
2869 : 88107 : else if (STMT_VINFO_REDUC_IDX (stmt_info) == -1
2870 : 302 : && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2871 : : ;
2872 : 88107 : else if (reduc_idx == -1)
2873 : 83801 : reduc_idx = i;
2874 : : else
2875 : : /* For .COND_* reduction operations the else value can be the
2876 : : same as one of the operation operands. The other def
2877 : : stmts have been moved, so we can't check easily. Check
2878 : : it's a call at least. */
2879 : 4306 : gcc_assert (is_a <gcall *> (stmt_info->stmt));
2880 : : }
2881 : :
2882 : : /* When we have a masked load with uniform mask discover this
2883 : : as a single-lane mask with a splat permute. This way we can
2884 : : recognize this as a masked load-lane by stripping the splat. */
2885 : 2855595 : if (is_a <gcall *> (STMT_VINFO_STMT (stmt_info))
2886 : 34487 : && gimple_call_internal_p (STMT_VINFO_STMT (stmt_info),
2887 : : IFN_MASK_LOAD)
2888 : 4676 : && STMT_VINFO_GROUPED_ACCESS (stmt_info)
2889 : 2855617 : && ! STMT_VINFO_SLP_VECT_ONLY (DR_GROUP_FIRST_ELEMENT (stmt_info)))
2890 : : {
2891 : 0 : vec<stmt_vec_info> def_stmts2;
2892 : 0 : def_stmts2.create (1);
2893 : 0 : def_stmts2.quick_push (oprnd_info->def_stmts[0]);
2894 : 0 : child = vect_build_slp_tree (vinfo, def_stmts2, 1,
2895 : : &this_max_nunits,
2896 : : matches, limit,
2897 : : &this_tree_size, bst_map);
2898 : 0 : if (child)
2899 : : {
2900 : 0 : slp_tree pnode = vect_create_new_slp_node (1, VEC_PERM_EXPR);
2901 : 0 : SLP_TREE_VECTYPE (pnode) = SLP_TREE_VECTYPE (child);
2902 : 0 : SLP_TREE_LANES (pnode) = group_size;
2903 : 0 : SLP_TREE_SCALAR_STMTS (pnode).create (group_size);
2904 : 0 : SLP_TREE_LANE_PERMUTATION (pnode).create (group_size);
2905 : 0 : for (unsigned k = 0; k < group_size; ++k)
2906 : : {
2907 : 0 : SLP_TREE_SCALAR_STMTS (pnode)
2908 : 0 : .quick_push (oprnd_info->def_stmts[0]);
2909 : 0 : SLP_TREE_LANE_PERMUTATION (pnode)
2910 : 0 : .quick_push (std::make_pair (0u, 0u));
2911 : : }
2912 : 0 : SLP_TREE_CHILDREN (pnode).quick_push (child);
2913 : 0 : pnode->max_nunits = child->max_nunits;
2914 : 0 : children.safe_push (pnode);
2915 : 0 : oprnd_info->def_stmts = vNULL;
2916 : 0 : continue;
2917 : 0 : }
2918 : : else
2919 : 0 : def_stmts2.release ();
2920 : : }
2921 : :
2922 : 2855595 : if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2923 : : group_size, &this_max_nunits,
2924 : : matches, limit,
2925 : : &this_tree_size, bst_map)) != NULL)
2926 : : {
2927 : 2391630 : oprnd_info->def_stmts = vNULL;
2928 : 2391630 : children.safe_push (child);
2929 : 2391630 : continue;
2930 : : }
2931 : :
2932 : : /* If the SLP build for operand zero failed and operand zero
2933 : : and one can be commutated try that for the scalar stmts
2934 : : that failed the match. */
2935 : 463965 : if (i == 0
2936 : : /* A first scalar stmt mismatch signals a fatal mismatch. */
2937 : 364169 : && matches[0]
2938 : : /* ??? For COND_EXPRs we can swap the comparison operands
2939 : : as well as the arms under some constraints. */
2940 : 169693 : && (nops == 2 || nops == 3)
2941 : 104914 : && oprnds_info[1]->first_dt == vect_internal_def
2942 : 59001 : && (is_gimple_assign (stmt_info->stmt)
2943 : 12744 : || is_gimple_call (stmt_info->stmt))
2944 : : /* Swapping operands for reductions breaks assumptions later on. */
2945 : 510235 : && STMT_VINFO_REDUC_IDX (stmt_info) == -1)
2946 : : {
2947 : : /* See whether we can swap the matching or the non-matching
2948 : : stmt operands. */
2949 : : bool swap_not_matching = true;
2950 : 52533 : do
2951 : : {
2952 : 7040992 : for (j = 0; j < group_size; ++j)
2953 : : {
2954 : 7003795 : if (matches[j] != !swap_not_matching)
2955 : 66499 : continue;
2956 : 6937296 : stmt_vec_info stmt_info = stmts[j];
2957 : : /* Verify if we can swap operands of this stmt. */
2958 : 6937296 : if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
2959 : : {
2960 : 6937270 : tree_code code = gimple_assign_rhs_code (stmt);
2961 : 6937270 : if (! commutative_tree_code (code)
2962 : 6937270 : && ! commutative_ternary_tree_code (code))
2963 : : {
2964 : 15312 : if (!swap_not_matching)
2965 : 7080 : goto fail;
2966 : : swap_not_matching = false;
2967 : : break;
2968 : : }
2969 : : }
2970 : 6988485 : else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
2971 : : {
2972 : 26 : internal_fn fn = (gimple_call_internal_p (call)
2973 : 26 : ? gimple_call_internal_fn (call)
2974 : : : IFN_LAST);
2975 : 26 : if ((! commutative_binary_fn_p (fn)
2976 : 26 : && ! commutative_ternary_fn_p (fn))
2977 : 28 : || first_commutative_argument (fn) != 0)
2978 : : {
2979 : 24 : if (!swap_not_matching)
2980 : 12 : goto fail;
2981 : : swap_not_matching = false;
2982 : : break;
2983 : : }
2984 : : }
2985 : : }
2986 : : }
2987 : 45441 : while (j != group_size);
2988 : :
2989 : : /* Swap mismatched definition stmts. */
2990 : 37197 : if (dump_enabled_p ())
2991 : 330 : dump_printf_loc (MSG_NOTE, vect_location,
2992 : : "Re-trying with swapped operands of stmts ");
2993 : 7016652 : for (j = 0; j < group_size; ++j)
2994 : 6979455 : if (matches[j] == !swap_not_matching)
2995 : : {
2996 : 13843656 : std::swap (oprnds_info[0]->def_stmts[j],
2997 : 6921828 : oprnds_info[1]->def_stmts[j]);
2998 : 13843656 : std::swap (oprnds_info[0]->ops[j],
2999 : 6921828 : oprnds_info[1]->ops[j]);
3000 : 6921828 : if (dump_enabled_p ())
3001 : 899 : dump_printf (MSG_NOTE, "%d ", j);
3002 : : }
3003 : 37197 : if (dump_enabled_p ())
3004 : 330 : dump_printf (MSG_NOTE, "\n");
3005 : : /* After swapping some operands we lost track whether an
3006 : : operand has any pattern defs so be conservative here. */
3007 : 71184 : if (oprnds_info[0]->any_pattern || oprnds_info[1]->any_pattern)
3008 : 3249 : oprnds_info[0]->any_pattern = oprnds_info[1]->any_pattern = true;
3009 : : /* And try again with scratch 'matches' ... */
3010 : 37197 : bool *tem = XALLOCAVEC (bool, group_size);
3011 : 37197 : if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
3012 : : group_size, &this_max_nunits,
3013 : : tem, limit,
3014 : : &this_tree_size, bst_map)) != NULL)
3015 : : {
3016 : 7247 : oprnd_info->def_stmts = vNULL;
3017 : 7247 : children.safe_push (child);
3018 : 7247 : continue;
3019 : : }
3020 : : }
3021 : 456718 : fail:
3022 : :
3023 : : /* If the SLP build failed and we analyze a basic-block
3024 : : simply treat nodes we fail to build as externally defined
3025 : : (and thus build vectors from the scalar defs).
3026 : : The cost model will reject outright expensive cases.
3027 : : ??? This doesn't treat cases where permutation ultimatively
3028 : : fails (or we don't try permutation below). Ideally we'd
3029 : : even compute a permutation that will end up with the maximum
3030 : : SLP tree size... */
3031 : 456718 : if (is_a <bb_vec_info> (vinfo)
3032 : : /* ??? Rejecting patterns this way doesn't work. We'd have to
3033 : : do extra work to cancel the pattern so the uses see the
3034 : : scalar version. */
3035 : 405443 : && !is_pattern_stmt_p (stmt_info)
3036 : 838408 : && !oprnd_info->any_pattern)
3037 : : {
3038 : : /* But if there's a leading vector sized set of matching stmts
3039 : : fail here so we can split the group. This matches the condition
3040 : : vect_analyze_slp_instance uses. */
3041 : : /* ??? We might want to split here and combine the results to support
3042 : : multiple vector sizes better. */
3043 : 591501 : for (j = 0; j < group_size; ++j)
3044 : 591501 : if (!matches[j])
3045 : : break;
3046 : 381480 : if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype))
3047 : 381415 : && vect_slp_can_convert_to_external (oprnd_info->def_stmts))
3048 : : {
3049 : 371057 : if (dump_enabled_p ())
3050 : 498 : dump_printf_loc (MSG_NOTE, vect_location,
3051 : : "Building vector operands from scalars\n");
3052 : 371057 : this_tree_size++;
3053 : 371057 : child = vect_create_new_slp_node (oprnd_info->ops);
3054 : 371057 : children.safe_push (child);
3055 : 371057 : oprnd_info->ops = vNULL;
3056 : 371057 : continue;
3057 : : }
3058 : : }
3059 : :
3060 : 85661 : gcc_assert (child == NULL);
3061 : 96523 : FOR_EACH_VEC_ELT (children, j, child)
3062 : 10862 : if (child)
3063 : 10862 : vect_free_slp_tree (child);
3064 : 85661 : vect_free_oprnd_info (oprnds_info);
3065 : 85661 : return NULL;
3066 : : }
3067 : :
3068 : 3174468 : vect_free_oprnd_info (oprnds_info);
3069 : :
3070 : : /* If we have all children of a child built up from uniform scalars
3071 : : or does more than one possibly expensive vector construction then
3072 : : just throw that away, causing it built up from scalars.
3073 : : The exception is the SLP node for the vector store. */
3074 : 3174468 : if (is_a <bb_vec_info> (vinfo)
3075 : 1107290 : && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
3076 : : /* ??? Rejecting patterns this way doesn't work. We'd have to
3077 : : do extra work to cancel the pattern so the uses see the
3078 : : scalar version. */
3079 : 3618627 : && !is_pattern_stmt_p (stmt_info))
3080 : : {
3081 : : slp_tree child;
3082 : : unsigned j;
3083 : : bool all_uniform_p = true;
3084 : : unsigned n_vector_builds = 0;
3085 : 1260244 : FOR_EACH_VEC_ELT (children, j, child)
3086 : : {
3087 : 841033 : if (!child)
3088 : : ;
3089 : 841033 : else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3090 : : all_uniform_p = false;
3091 : 607057 : else if (!vect_slp_tree_uniform_p (child))
3092 : : {
3093 : 464454 : all_uniform_p = false;
3094 : 464454 : if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
3095 : 429044 : n_vector_builds++;
3096 : : }
3097 : : }
3098 : 419211 : if (all_uniform_p
3099 : 419211 : || n_vector_builds > 1
3100 : 708946 : || (n_vector_builds == children.length ()
3101 : 30826 : && is_a <gphi *> (stmt_info->stmt)))
3102 : : {
3103 : : /* Roll back. */
3104 : 134784 : matches[0] = false;
3105 : 425731 : FOR_EACH_VEC_ELT (children, j, child)
3106 : 290947 : if (child)
3107 : 290947 : vect_free_slp_tree (child);
3108 : :
3109 : 134784 : if (dump_enabled_p ())
3110 : 129 : dump_printf_loc (MSG_NOTE, vect_location,
3111 : : "Building parent vector operands from "
3112 : : "scalars instead\n");
3113 : 134784 : return NULL;
3114 : : }
3115 : : }
3116 : :
3117 : 3039684 : *tree_size += this_tree_size + 1;
3118 : 3039684 : *max_nunits = this_max_nunits;
3119 : :
3120 : 3039684 : if (two_operators)
3121 : : {
3122 : : /* ??? We'd likely want to either cache in bst_map sth like
3123 : : { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
3124 : : the true { a+b, a+b, a+b, a+b } ... but there we don't have
3125 : : explicit stmts to put in so the keying on 'stmts' doesn't
3126 : : work (but we have the same issue with nodes that use 'ops'). */
3127 : :
3128 : 5878 : if (has_two_operators_perm)
3129 : : {
3130 : 22 : slp_tree child = children[0];
3131 : 22 : children.truncate (0);
3132 : 66 : for (i = 0; i < 2; i++)
3133 : : {
3134 : 44 : slp_tree pnode
3135 : 44 : = vect_create_new_slp_node (two_op_scalar_stmts[i], 2);
3136 : 44 : SLP_TREE_CODE (pnode) = VEC_PERM_EXPR;
3137 : 44 : SLP_TREE_VECTYPE (pnode) = vectype;
3138 : 44 : SLP_TREE_CHILDREN (pnode).quick_push (child);
3139 : 44 : SLP_TREE_CHILDREN (pnode).quick_push (child);
3140 : 44 : lane_permutation_t& perm = SLP_TREE_LANE_PERMUTATION (pnode);
3141 : 44 : children.safe_push (pnode);
3142 : :
3143 : 476 : for (unsigned j = 0; j < stmts.length (); j++)
3144 : 432 : perm.safe_push (std::make_pair (0, two_op_perm_indices[i][j]));
3145 : : }
3146 : :
3147 : 22 : SLP_TREE_REF_COUNT (child) += 4;
3148 : : }
3149 : :
3150 : 5878 : slp_tree one = new _slp_tree;
3151 : 5878 : slp_tree two = new _slp_tree;
3152 : 5878 : SLP_TREE_DEF_TYPE (one) = vect_internal_def;
3153 : 5878 : SLP_TREE_DEF_TYPE (two) = vect_internal_def;
3154 : 5878 : SLP_TREE_VECTYPE (one) = vectype;
3155 : 5878 : SLP_TREE_VECTYPE (two) = vectype;
3156 : 5878 : SLP_TREE_CHILDREN (one).safe_splice (children);
3157 : 5878 : SLP_TREE_CHILDREN (two).safe_splice (children);
3158 : 5878 : slp_tree child;
3159 : 23514 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
3160 : 11758 : SLP_TREE_REF_COUNT (child)++;
3161 : :
3162 : : /* Here we record the original defs since this
3163 : : node represents the final lane configuration. */
3164 : 5878 : node = vect_create_new_slp_node (node, stmts, 2);
3165 : 5878 : SLP_TREE_VECTYPE (node) = vectype;
3166 : 5878 : SLP_TREE_CODE (node) = VEC_PERM_EXPR;
3167 : 5878 : SLP_TREE_CHILDREN (node).quick_push (one);
3168 : 5878 : SLP_TREE_CHILDREN (node).quick_push (two);
3169 : 5878 : enum tree_code code0 = ERROR_MARK;
3170 : 5878 : enum tree_code ocode = ERROR_MARK;
3171 : 5878 : if (gassign *stmt = dyn_cast <gassign *> (stmts[0]->stmt))
3172 : 5876 : code0 = gimple_assign_rhs_code (stmt);
3173 : 5878 : stmt_vec_info ostmt_info;
3174 : 5878 : unsigned j = 0;
3175 : 21779 : FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
3176 : : {
3177 : 15901 : int op = 0;
3178 : 15901 : if (gassign *ostmt = dyn_cast <gassign *> (ostmt_info->stmt))
3179 : : {
3180 : 15897 : if (gimple_assign_rhs_code (ostmt) != code0)
3181 : : {
3182 : 7983 : ocode = gimple_assign_rhs_code (ostmt);
3183 : : op = 1;
3184 : : j = i;
3185 : : }
3186 : : }
3187 : : else
3188 : : {
3189 : 8 : if (gimple_call_combined_fn (stmts[0]->stmt)
3190 : 4 : != gimple_call_combined_fn (ostmt_info->stmt))
3191 : : {
3192 : 2 : op = 1;
3193 : 2 : j = i;
3194 : : }
3195 : : }
3196 : 15901 : SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (op, i));
3197 : : }
3198 : 5878 : SLP_TREE_CODE (one) = code0;
3199 : 5878 : SLP_TREE_CODE (two) = ocode;
3200 : 5878 : SLP_TREE_LANES (one) = stmts.length ();
3201 : 5878 : SLP_TREE_LANES (two) = stmts.length ();
3202 : 5878 : SLP_TREE_REPRESENTATIVE (one) = stmts[0];
3203 : 5878 : SLP_TREE_REPRESENTATIVE (two) = stmts[j];
3204 : :
3205 : 5878 : return node;
3206 : : }
3207 : :
3208 : 3033806 : node = vect_create_new_slp_node (node, stmts, nops);
3209 : 3033806 : SLP_TREE_VECTYPE (node) = vectype;
3210 : 3033806 : SLP_TREE_CHILDREN (node).splice (children);
3211 : 3033806 : SLP_TREE_GS_SCALE (node) = gs_scale;
3212 : 3033806 : SLP_TREE_GS_BASE (node) = gs_base;
3213 : 3033806 : if (reduc_idx != -1)
3214 : : {
3215 : 78867 : gcc_assert (STMT_VINFO_REDUC_IDX (stmt_info) != -1
3216 : : || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
3217 : : || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def);
3218 : 78867 : SLP_TREE_REDUC_IDX (node) = reduc_idx;
3219 : 78867 : node->cycle_info.id = SLP_TREE_CHILDREN (node)[reduc_idx]->cycle_info.id;
3220 : : }
3221 : : /* When reaching the reduction PHI, create a vect_reduc_info. */
3222 : 2954939 : else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
3223 : 2954939 : || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
3224 : 2954939 : && is_a <gphi *> (STMT_VINFO_STMT (stmt_info)))
3225 : : {
3226 : 69528 : loop_vec_info loop_vinfo = as_a <loop_vec_info> (vinfo);
3227 : 69528 : gcc_assert (STMT_VINFO_REDUC_IDX (stmt_info) == -1);
3228 : 69528 : node->cycle_info.id = loop_vinfo->reduc_infos.length ();
3229 : 69528 : vect_reduc_info reduc_info = new vect_reduc_info_s ();
3230 : 69528 : loop_vinfo->reduc_infos.safe_push (reduc_info);
3231 : 69528 : stmt_vec_info reduc_phi = stmt_info;
3232 : : /* ??? For double reductions vect_is_simple_reduction stores the
3233 : : reduction type and code on the inner loop header PHI. */
3234 : 69528 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
3235 : : {
3236 : 298 : use_operand_p use_p;
3237 : 298 : gimple *use_stmt;
3238 : 298 : bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
3239 : : &use_p, &use_stmt);
3240 : 298 : gcc_assert (res);
3241 : 298 : reduc_phi = loop_vinfo->lookup_stmt (use_stmt);
3242 : : }
3243 : 69528 : VECT_REDUC_INFO_DEF_TYPE (reduc_info) = STMT_VINFO_DEF_TYPE (stmt_info);
3244 : 69528 : VECT_REDUC_INFO_TYPE (reduc_info) = STMT_VINFO_REDUC_TYPE (reduc_phi);
3245 : 69528 : VECT_REDUC_INFO_CODE (reduc_info) = STMT_VINFO_REDUC_CODE (reduc_phi);
3246 : 69528 : VECT_REDUC_INFO_FN (reduc_info) = IFN_LAST;
3247 : : }
3248 : : return node;
3249 : 9780387 : }
3250 : :
3251 : : /* Dump a single SLP tree NODE. */
3252 : :
3253 : : static void
3254 : 446148 : vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
3255 : : slp_tree node)
3256 : : {
3257 : 446148 : unsigned i, j;
3258 : 446148 : slp_tree child;
3259 : 446148 : stmt_vec_info stmt_info;
3260 : 446148 : tree op;
3261 : :
3262 : 446148 : dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
3263 : 446148 : dump_user_location_t user_loc = loc.get_user_location ();
3264 : 446148 : dump_printf_loc (metadata, user_loc,
3265 : : "node%s %p (max_nunits=" HOST_WIDE_INT_PRINT_UNSIGNED
3266 : : ", refcnt=%u)",
3267 : 446148 : SLP_TREE_DEF_TYPE (node) == vect_external_def
3268 : : ? " (external)"
3269 : : : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
3270 : 431334 : ? " (constant)"
3271 : : : ""), (void *) node,
3272 : 446148 : estimated_poly_value (node->max_nunits),
3273 : : SLP_TREE_REF_COUNT (node));
3274 : 446148 : if (SLP_TREE_VECTYPE (node))
3275 : 374715 : dump_printf (metadata, " %T", SLP_TREE_VECTYPE (node));
3276 : 446148 : dump_printf (metadata, "%s",
3277 : 446148 : node->avoid_stlf_fail ? " (avoid-stlf-fail)" : "");
3278 : 446148 : if (node->cycle_info.id != -1 || node->cycle_info.reduc_idx != -1)
3279 : 21979 : dump_printf (metadata, " cycle %d, link %d", node->cycle_info.id,
3280 : : node->cycle_info.reduc_idx);
3281 : 446148 : dump_printf (metadata, "\n");
3282 : 446148 : if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
3283 : : {
3284 : 359271 : if (SLP_TREE_PERMUTE_P (node))
3285 : 12535 : dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
3286 : : else
3287 : 346736 : dump_printf_loc (metadata, user_loc, "op template: %G",
3288 : 346736 : SLP_TREE_REPRESENTATIVE (node)->stmt);
3289 : : }
3290 : 446148 : if (SLP_TREE_SCALAR_STMTS (node).exists ())
3291 : 848884 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3292 : 497279 : if (stmt_info)
3293 : 492376 : dump_printf_loc (metadata, user_loc, "\t%sstmt %u %G",
3294 : 492376 : STMT_VINFO_LIVE_P (stmt_info) ? "[l] " : "",
3295 : : i, stmt_info->stmt);
3296 : : else
3297 : 4903 : dump_printf_loc (metadata, user_loc, "\tstmt %u ---\n", i);
3298 : : else
3299 : : {
3300 : 94543 : dump_printf_loc (metadata, user_loc, "\t{ ");
3301 : 299896 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
3302 : 110810 : dump_printf (metadata, "%T%s ", op,
3303 : 110810 : i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
3304 : 94543 : dump_printf (metadata, "}\n");
3305 : : }
3306 : 446148 : if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
3307 : : {
3308 : 60543 : dump_printf_loc (metadata, user_loc, "\tload permutation {");
3309 : 197578 : FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
3310 : 76492 : dump_printf (dump_kind, " %u", j);
3311 : 60543 : dump_printf (dump_kind, " }\n");
3312 : : }
3313 : 446148 : if (SLP_TREE_LANE_PERMUTATION (node).exists ())
3314 : : {
3315 : 12543 : dump_printf_loc (metadata, user_loc, "\tlane permutation {");
3316 : 59458 : for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
3317 : 34372 : dump_printf (dump_kind, " %u[%u]",
3318 : 34372 : SLP_TREE_LANE_PERMUTATION (node)[i].first,
3319 : 34372 : SLP_TREE_LANE_PERMUTATION (node)[i].second);
3320 : 12543 : dump_printf (dump_kind, " }%s\n",
3321 : 12543 : node->ldst_lanes ? " (load-lanes)" : "");
3322 : : }
3323 : 446148 : if (SLP_TREE_CHILDREN (node).is_empty ())
3324 : 169266 : return;
3325 : 276882 : dump_printf_loc (metadata, user_loc, "\tchildren");
3326 : 1012318 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3327 : 458554 : dump_printf (dump_kind, " %p", (void *)child);
3328 : 276882 : dump_printf (dump_kind, "%s\n",
3329 : 276882 : node->ldst_lanes && !SLP_TREE_LANE_PERMUTATION (node).exists ()
3330 : : ? " (store-lanes)" : "");
3331 : : }
3332 : :
3333 : : DEBUG_FUNCTION void
3334 : 0 : debug (slp_tree node)
3335 : : {
3336 : 0 : debug_dump_context ctx;
3337 : 0 : vect_print_slp_tree (MSG_NOTE,
3338 : 0 : dump_location_t::from_location_t (UNKNOWN_LOCATION),
3339 : : node);
3340 : 0 : }
3341 : :
3342 : : /* Recursive helper for the dot producer below. */
3343 : :
3344 : : static void
3345 : 0 : dot_slp_tree (FILE *f, slp_tree node, hash_set<slp_tree> &visited)
3346 : : {
3347 : 0 : if (visited.add (node))
3348 : : return;
3349 : :
3350 : 0 : fprintf (f, "\"%p\" [label=\"", (void *)node);
3351 : 0 : vect_print_slp_tree (MSG_NOTE,
3352 : 0 : dump_location_t::from_location_t (UNKNOWN_LOCATION),
3353 : : node);
3354 : 0 : fprintf (f, "\"];\n");
3355 : :
3356 : :
3357 : 0 : for (slp_tree child : SLP_TREE_CHILDREN (node))
3358 : 0 : fprintf (f, "\"%p\" -> \"%p\";", (void *)node, (void *)child);
3359 : :
3360 : 0 : for (slp_tree child : SLP_TREE_CHILDREN (node))
3361 : 0 : if (child)
3362 : 0 : dot_slp_tree (f, child, visited);
3363 : : }
3364 : :
3365 : : DEBUG_FUNCTION void
3366 : 0 : dot_slp_tree (const char *fname, slp_tree node)
3367 : : {
3368 : 0 : FILE *f = fopen (fname, "w");
3369 : 0 : fprintf (f, "digraph {\n");
3370 : 0 : fflush (f);
3371 : 0 : {
3372 : 0 : debug_dump_context ctx (f);
3373 : 0 : hash_set<slp_tree> visited;
3374 : 0 : dot_slp_tree (f, node, visited);
3375 : 0 : }
3376 : 0 : fflush (f);
3377 : 0 : fprintf (f, "}\n");
3378 : 0 : fclose (f);
3379 : 0 : }
3380 : :
3381 : : DEBUG_FUNCTION void
3382 : 0 : dot_slp_tree (const char *fname, const vec<slp_instance> &slp_instances)
3383 : : {
3384 : 0 : FILE *f = fopen (fname, "w");
3385 : 0 : fprintf (f, "digraph {\n");
3386 : 0 : fflush (f);
3387 : 0 : {
3388 : 0 : debug_dump_context ctx (f);
3389 : 0 : hash_set<slp_tree> visited;
3390 : 0 : for (auto inst : slp_instances)
3391 : 0 : dot_slp_tree (f, SLP_INSTANCE_TREE (inst), visited);
3392 : 0 : }
3393 : 0 : fflush (f);
3394 : 0 : fprintf (f, "}\n");
3395 : 0 : fclose (f);
3396 : 0 : }
3397 : :
3398 : : /* Dump a slp tree NODE using flags specified in DUMP_KIND. */
3399 : :
3400 : : static void
3401 : 484390 : vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
3402 : : slp_tree node, hash_set<slp_tree> &visited)
3403 : : {
3404 : 484390 : unsigned i;
3405 : 484390 : slp_tree child;
3406 : :
3407 : 484390 : if (visited.add (node))
3408 : 484390 : return;
3409 : :
3410 : 445760 : vect_print_slp_tree (dump_kind, loc, node);
3411 : :
3412 : 1349613 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3413 : 458093 : if (child)
3414 : 405269 : vect_print_slp_graph (dump_kind, loc, child, visited);
3415 : : }
3416 : :
3417 : : static void
3418 : 48119 : vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
3419 : : slp_tree entry)
3420 : : {
3421 : 48119 : hash_set<slp_tree> visited;
3422 : 48119 : vect_print_slp_graph (dump_kind, loc, entry, visited);
3423 : 48119 : }
3424 : :
3425 : : DEBUG_FUNCTION void
3426 : 0 : debug (slp_instance instance)
3427 : : {
3428 : 0 : debug_dump_context ctx;
3429 : 0 : vect_print_slp_graph (MSG_NOTE,
3430 : 0 : dump_location_t::from_location_t (UNKNOWN_LOCATION),
3431 : : SLP_INSTANCE_TREE (instance));
3432 : 0 : }
3433 : :
3434 : : /* Mark the tree rooted at NODE with PURE_SLP. */
3435 : :
3436 : : static void
3437 : 6518083 : vect_mark_slp_stmts (vec_info *vinfo, slp_tree node,
3438 : : hash_set<slp_tree> &visited)
3439 : : {
3440 : 6518083 : int i;
3441 : 6518083 : stmt_vec_info stmt_info;
3442 : 6518083 : slp_tree child;
3443 : :
3444 : 6518083 : if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
3445 : : return;
3446 : :
3447 : 4633279 : if (visited.add (node))
3448 : : return;
3449 : :
3450 : 10833065 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3451 : 6478165 : if (stmt_info)
3452 : : {
3453 : 6404446 : STMT_SLP_TYPE (stmt_info) = pure_slp;
3454 : : /* ??? For .MASK_LOAD and .MASK_STORE detected as load/store-lanes
3455 : : when there is the mask_conversion pattern applied we have lost the
3456 : : alternate lanes of the uniform mask which nevertheless
3457 : : have separate pattern defs. To not confuse hybrid
3458 : : analysis we mark those as covered as well here. */
3459 : 6404446 : if (node->ldst_lanes)
3460 : 6478165 : if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
3461 : 0 : if (gimple_call_internal_p (call, IFN_MASK_LOAD)
3462 : 0 : || gimple_call_internal_p (call, IFN_MASK_STORE))
3463 : : {
3464 : 0 : tree mask = gimple_call_arg (call,
3465 : : internal_fn_mask_index
3466 : 0 : (gimple_call_internal_fn (call)));
3467 : 0 : if (TREE_CODE (mask) == SSA_NAME)
3468 : 0 : if (stmt_vec_info mask_info = vinfo->lookup_def (mask))
3469 : : {
3470 : 0 : mask_info = vect_stmt_to_vectorize (mask_info);
3471 : 0 : STMT_SLP_TYPE (mask_info) = pure_slp;
3472 : : }
3473 : : }
3474 : : }
3475 : :
3476 : 10287672 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3477 : 5932772 : if (child)
3478 : 4775133 : vect_mark_slp_stmts (vinfo, child, visited);
3479 : : }
3480 : :
3481 : : static void
3482 : 1742950 : vect_mark_slp_stmts (vec_info *vinfo, slp_tree node)
3483 : : {
3484 : 1742950 : hash_set<slp_tree> visited;
3485 : 1742950 : vect_mark_slp_stmts (vinfo, node, visited);
3486 : 1742950 : }
3487 : :
3488 : : /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
3489 : :
3490 : : static void
3491 : 2345559 : vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
3492 : : {
3493 : 2345559 : int i;
3494 : 2345559 : stmt_vec_info stmt_info;
3495 : 2345559 : slp_tree child;
3496 : :
3497 : 2345559 : if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
3498 : : return;
3499 : :
3500 : 1374598 : if (visited.add (node))
3501 : : return;
3502 : :
3503 : 4290582 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3504 : 3020318 : if (stmt_info)
3505 : : {
3506 : 3020318 : gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
3507 : : || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
3508 : 3020318 : STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
3509 : : }
3510 : :
3511 : 2828549 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3512 : 1558285 : if (child)
3513 : 1558285 : vect_mark_slp_stmts_relevant (child, visited);
3514 : : }
3515 : :
3516 : : static void
3517 : 787274 : vect_mark_slp_stmts_relevant (slp_tree node)
3518 : : {
3519 : 787274 : hash_set<slp_tree> visited;
3520 : 787274 : vect_mark_slp_stmts_relevant (node, visited);
3521 : 787274 : }
3522 : :
3523 : :
3524 : : /* Gather loads in the SLP graph NODE and populate the INST loads array. */
3525 : :
3526 : : static void
3527 : 11900688 : vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
3528 : : hash_set<slp_tree> &visited)
3529 : : {
3530 : 11900688 : if (!node || visited.add (node))
3531 : 2540308 : return;
3532 : :
3533 : 9360380 : if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
3534 : : return;
3535 : :
3536 : 6722890 : if (!SLP_TREE_PERMUTE_P (node))
3537 : : {
3538 : 6505384 : stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
3539 : 6505384 : if (STMT_VINFO_DATA_REF (stmt_info)
3540 : 2390198 : && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
3541 : 1294989 : loads.safe_push (node);
3542 : : }
3543 : :
3544 : : unsigned i;
3545 : : slp_tree child;
3546 : 15998085 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3547 : 9275195 : vect_gather_slp_loads (loads, child, visited);
3548 : : }
3549 : :
3550 : :
3551 : : /* Find the last store in SLP INSTANCE. */
3552 : :
3553 : : stmt_vec_info
3554 : 2734978 : vect_find_last_scalar_stmt_in_slp (slp_tree node)
3555 : : {
3556 : 2734978 : stmt_vec_info last = NULL;
3557 : 2734978 : stmt_vec_info stmt_vinfo;
3558 : :
3559 : 9974175 : for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
3560 : 7239197 : if (stmt_vinfo)
3561 : : {
3562 : 7239197 : stmt_vinfo = vect_orig_stmt (stmt_vinfo);
3563 : 7239197 : last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
3564 : : }
3565 : :
3566 : 2734978 : return last;
3567 : : }
3568 : :
3569 : : /* Find the first stmt in NODE. */
3570 : :
3571 : : stmt_vec_info
3572 : 525836 : vect_find_first_scalar_stmt_in_slp (slp_tree node)
3573 : : {
3574 : 525836 : stmt_vec_info first = NULL;
3575 : 525836 : stmt_vec_info stmt_vinfo;
3576 : :
3577 : 1770873 : for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
3578 : 1245037 : if (stmt_vinfo)
3579 : : {
3580 : 1242404 : stmt_vinfo = vect_orig_stmt (stmt_vinfo);
3581 : 1242404 : if (!first
3582 : 1242404 : || get_later_stmt (stmt_vinfo, first) == first)
3583 : : first = stmt_vinfo;
3584 : : }
3585 : :
3586 : 525836 : return first;
3587 : : }
3588 : :
3589 : : /* Splits a group of stores, currently beginning at FIRST_VINFO, into
3590 : : two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
3591 : : (also containing the first GROUP1_SIZE stmts, since stores are
3592 : : consecutive), the second containing the remainder.
3593 : : Return the first stmt in the second group. */
3594 : :
3595 : : static stmt_vec_info
3596 : 156359 : vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
3597 : : {
3598 : 156359 : gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
3599 : 156359 : gcc_assert (group1_size > 0);
3600 : 156359 : int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
3601 : 156359 : gcc_assert (group2_size > 0);
3602 : 156359 : DR_GROUP_SIZE (first_vinfo) = group1_size;
3603 : :
3604 : 156359 : stmt_vec_info stmt_info = first_vinfo;
3605 : 525340 : for (unsigned i = group1_size; i > 1; i--)
3606 : : {
3607 : 368981 : stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
3608 : 368981 : gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
3609 : : }
3610 : : /* STMT is now the last element of the first group. */
3611 : 156359 : stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
3612 : 156359 : DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
3613 : :
3614 : 156359 : DR_GROUP_SIZE (group2) = group2_size;
3615 : 435561 : for (stmt_info = group2; stmt_info;
3616 : 279202 : stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
3617 : : {
3618 : 279202 : DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
3619 : 279202 : gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
3620 : : }
3621 : :
3622 : : /* For the second group, the DR_GROUP_GAP is that before the original group,
3623 : : plus skipping over the first vector. */
3624 : 156359 : DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
3625 : :
3626 : : /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
3627 : 156359 : DR_GROUP_GAP (first_vinfo) += group2_size;
3628 : :
3629 : 156359 : if (dump_enabled_p ())
3630 : 61 : dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
3631 : : group1_size, group2_size);
3632 : :
3633 : 156359 : return group2;
3634 : : }
3635 : :
3636 : : /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
3637 : : statements and a vector of NUNITS elements. */
3638 : :
3639 : : static poly_uint64
3640 : 4454549 : calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
3641 : : {
3642 : 4454549 : return exact_div (common_multiple (nunits, group_size), group_size);
3643 : : }
3644 : :
3645 : : /* Helper that checks to see if a node is a load node. */
3646 : :
3647 : : static inline bool
3648 : 54 : vect_is_slp_load_node (slp_tree root)
3649 : : {
3650 : 54 : return (!SLP_TREE_PERMUTE_P (root)
3651 : 54 : && SLP_TREE_DEF_TYPE (root) == vect_internal_def
3652 : 48 : && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
3653 : 94 : && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root))));
3654 : : }
3655 : :
3656 : :
3657 : : /* Helper function of optimize_load_redistribution that performs the operation
3658 : : recursively. */
3659 : :
3660 : : static slp_tree
3661 : 20114 : optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
3662 : : vec_info *vinfo, unsigned int group_size,
3663 : : hash_map<slp_tree, slp_tree> *load_map,
3664 : : slp_tree root)
3665 : : {
3666 : 20114 : if (slp_tree *leader = load_map->get (root))
3667 : 3585 : return *leader;
3668 : :
3669 : 16529 : slp_tree node;
3670 : 16529 : unsigned i;
3671 : :
3672 : : /* For now, we don't know anything about externals so do not do anything. */
3673 : 16529 : if (!root || SLP_TREE_DEF_TYPE (root) != vect_internal_def)
3674 : : return NULL;
3675 : 11981 : else if (SLP_TREE_PERMUTE_P (root))
3676 : : {
3677 : : /* First convert this node into a load node and add it to the leaves
3678 : : list and flatten the permute from a lane to a load one. If it's
3679 : : unneeded it will be elided later. */
3680 : 34 : vec<stmt_vec_info> stmts;
3681 : 34 : stmts.create (SLP_TREE_LANES (root));
3682 : 34 : lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
3683 : 74 : for (unsigned j = 0; j < lane_perm.length (); j++)
3684 : : {
3685 : 54 : std::pair<unsigned, unsigned> perm = lane_perm[j];
3686 : 54 : node = SLP_TREE_CHILDREN (root)[perm.first];
3687 : :
3688 : 54 : if (!vect_is_slp_load_node (node)
3689 : 54 : || SLP_TREE_CHILDREN (node).exists ())
3690 : : {
3691 : 14 : stmts.release ();
3692 : 14 : goto next;
3693 : : }
3694 : :
3695 : 40 : stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
3696 : : }
3697 : :
3698 : 20 : if (dump_enabled_p ())
3699 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
3700 : : "converting stmts on permute node %p\n",
3701 : : (void *) root);
3702 : :
3703 : 20 : bool *matches = XALLOCAVEC (bool, group_size);
3704 : 20 : poly_uint64 max_nunits = 1;
3705 : 20 : unsigned tree_size = 0, limit = 1;
3706 : 20 : node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
3707 : : matches, &limit, &tree_size, bst_map);
3708 : 20 : if (!node)
3709 : 0 : stmts.release ();
3710 : :
3711 : 20 : load_map->put (root, node);
3712 : 20 : return node;
3713 : : }
3714 : :
3715 : 11947 : next:
3716 : 11961 : load_map->put (root, NULL);
3717 : :
3718 : 28297 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
3719 : : {
3720 : 16336 : slp_tree value
3721 : 16336 : = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
3722 : : node);
3723 : 16336 : if (value)
3724 : : {
3725 : 20 : SLP_TREE_REF_COUNT (value)++;
3726 : 20 : SLP_TREE_CHILDREN (root)[i] = value;
3727 : : /* ??? We know the original leafs of the replaced nodes will
3728 : : be referenced by bst_map, only the permutes created by
3729 : : pattern matching are not. */
3730 : 20 : if (SLP_TREE_REF_COUNT (node) == 1)
3731 : 20 : load_map->remove (node);
3732 : 20 : vect_free_slp_tree (node);
3733 : : }
3734 : : }
3735 : :
3736 : : return NULL;
3737 : : }
3738 : :
3739 : : /* Temporary workaround for loads not being CSEd during SLP build. This
3740 : : function will traverse the SLP tree rooted in ROOT for INSTANCE and find
3741 : : VEC_PERM nodes that blend vectors from multiple nodes that all read from the
3742 : : same DR such that the final operation is equal to a permuted load. Such
3743 : : NODES are then directly converted into LOADS themselves. The nodes are
3744 : : CSEd using BST_MAP. */
3745 : :
3746 : : static void
3747 : 2850 : optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
3748 : : vec_info *vinfo, unsigned int group_size,
3749 : : hash_map<slp_tree, slp_tree> *load_map,
3750 : : slp_tree root)
3751 : : {
3752 : 2850 : slp_tree node;
3753 : 2850 : unsigned i;
3754 : :
3755 : 6628 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
3756 : : {
3757 : 3778 : slp_tree value
3758 : 3778 : = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
3759 : : node);
3760 : 3778 : if (value)
3761 : : {
3762 : 0 : SLP_TREE_REF_COUNT (value)++;
3763 : 0 : SLP_TREE_CHILDREN (root)[i] = value;
3764 : : /* ??? We know the original leafs of the replaced nodes will
3765 : : be referenced by bst_map, only the permutes created by
3766 : : pattern matching are not. */
3767 : 0 : if (SLP_TREE_REF_COUNT (node) == 1)
3768 : 0 : load_map->remove (node);
3769 : 0 : vect_free_slp_tree (node);
3770 : : }
3771 : : }
3772 : 2850 : }
3773 : :
3774 : : /* Helper function of vect_match_slp_patterns.
3775 : :
3776 : : Attempts to match patterns against the slp tree rooted in REF_NODE using
3777 : : VINFO. Patterns are matched in post-order traversal.
3778 : :
3779 : : If matching is successful the value in REF_NODE is updated and returned, if
3780 : : not then it is returned unchanged. */
3781 : :
3782 : : static bool
3783 : 6697018 : vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
3784 : : slp_tree_to_load_perm_map_t *perm_cache,
3785 : : slp_compat_nodes_map_t *compat_cache,
3786 : : hash_set<slp_tree> *visited)
3787 : : {
3788 : 6697018 : unsigned i;
3789 : 6697018 : slp_tree node = *ref_node;
3790 : 6697018 : bool found_p = false;
3791 : 6697018 : if (!node || visited->add (node))
3792 : 1250803 : return false;
3793 : :
3794 : : slp_tree child;
3795 : 10392640 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3796 : 4946425 : found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
3797 : : vinfo, perm_cache, compat_cache,
3798 : : visited);
3799 : :
3800 : 16338645 : for (unsigned x = 0; x < num__slp_patterns; x++)
3801 : : {
3802 : 10892430 : vect_pattern *pattern
3803 : 10892430 : = slp_patterns[x] (perm_cache, compat_cache, ref_node);
3804 : 10892430 : if (pattern)
3805 : : {
3806 : 1072 : pattern->build (vinfo);
3807 : 1072 : delete pattern;
3808 : 1072 : found_p = true;
3809 : : }
3810 : : }
3811 : :
3812 : : return found_p;
3813 : : }
3814 : :
3815 : : /* Applies pattern matching to the given SLP tree rooted in REF_NODE using
3816 : : vec_info VINFO.
3817 : :
3818 : : The modified tree is returned. Patterns are tried in order and multiple
3819 : : patterns may match. */
3820 : :
3821 : : static bool
3822 : 1750593 : vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
3823 : : hash_set<slp_tree> *visited,
3824 : : slp_tree_to_load_perm_map_t *perm_cache,
3825 : : slp_compat_nodes_map_t *compat_cache)
3826 : : {
3827 : 1750593 : DUMP_VECT_SCOPE ("vect_match_slp_patterns");
3828 : 1750593 : slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
3829 : :
3830 : 1750593 : if (dump_enabled_p ())
3831 : 32135 : dump_printf_loc (MSG_NOTE, vect_location,
3832 : : "Analyzing SLP tree %p for patterns\n",
3833 : 32135 : (void *) SLP_INSTANCE_TREE (instance));
3834 : :
3835 : 1750593 : return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, compat_cache,
3836 : 1750593 : visited);
3837 : : }
3838 : :
3839 : : /* STMT_INFO is a store group of size GROUP_SIZE that we are considering
3840 : : vectorizing with VECTYPE that might be NULL. MASKED_P indicates whether
3841 : : the stores are masked.
3842 : : Return true if we could use IFN_STORE_LANES instead and if that appears
3843 : : to be the better approach. */
3844 : :
3845 : : static bool
3846 : 4873 : vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
3847 : : tree vectype, bool masked_p,
3848 : : unsigned int group_size,
3849 : : unsigned int new_group_size)
3850 : : {
3851 : 4873 : if (!vectype)
3852 : : {
3853 : 4873 : tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3854 : 4873 : vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
3855 : : }
3856 : 4873 : if (!vectype)
3857 : : return false;
3858 : : /* Allow the split if one of the two new groups would operate on full
3859 : : vectors *within* rather than across one scalar loop iteration.
3860 : : This is purely a heuristic, but it should work well for group
3861 : : sizes of 3 and 4, where the possible splits are:
3862 : :
3863 : : 3->2+1: OK if the vector has exactly two elements
3864 : : 4->2+2: Likewise
3865 : : 4->3+1: Less clear-cut. */
3866 : 4873 : if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
3867 : 2531 : || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
3868 : 2361 : return false;
3869 : 2512 : return vect_store_lanes_supported (vectype, group_size, masked_p) != IFN_LAST;
3870 : : }
3871 : :
3872 : : /* Analyze an SLP instance starting from a group of grouped stores. Call
3873 : : vect_build_slp_tree to build a tree of packed stmts if possible.
3874 : : Return FALSE if it's impossible to SLP any stmt in the loop. */
3875 : :
3876 : : static bool
3877 : : vect_analyze_slp_instance (vec_info *vinfo,
3878 : : scalar_stmts_to_slp_tree_map_t *bst_map,
3879 : : stmt_vec_info stmt_info, slp_instance_kind kind,
3880 : : unsigned max_tree_size, unsigned *limit,
3881 : : bool force_single_lane);
3882 : :
3883 : : /* Build an interleaving scheme for the store sources RHS_NODES from
3884 : : SCALAR_STMTS. */
3885 : :
3886 : : static slp_tree
3887 : 6144 : vect_build_slp_store_interleaving (vec<slp_tree> &rhs_nodes,
3888 : : vec<stmt_vec_info> &scalar_stmts,
3889 : : poly_uint64 max_nunits)
3890 : : {
3891 : 6144 : unsigned int group_size = scalar_stmts.length ();
3892 : 12288 : slp_tree node = vect_create_new_slp_node (scalar_stmts,
3893 : 6144 : SLP_TREE_CHILDREN
3894 : : (rhs_nodes[0]).length ());
3895 : 6144 : SLP_TREE_VECTYPE (node) = SLP_TREE_VECTYPE (rhs_nodes[0]);
3896 : 6144 : node->max_nunits = max_nunits;
3897 : 6144 : for (unsigned l = 0;
3898 : 12315 : l < SLP_TREE_CHILDREN (rhs_nodes[0]).length (); ++l)
3899 : : {
3900 : : /* And a permute merging all RHS SLP trees. */
3901 : 6171 : slp_tree perm = vect_create_new_slp_node (rhs_nodes.length (),
3902 : 6171 : VEC_PERM_EXPR);
3903 : 6171 : SLP_TREE_CHILDREN (node).quick_push (perm);
3904 : 6171 : SLP_TREE_LANE_PERMUTATION (perm).create (group_size);
3905 : 6171 : SLP_TREE_VECTYPE (perm) = SLP_TREE_VECTYPE (node);
3906 : 6171 : perm->max_nunits = max_nunits;
3907 : 6171 : SLP_TREE_LANES (perm) = group_size;
3908 : : /* ??? We should set this NULL but that's not expected. */
3909 : 6171 : SLP_TREE_REPRESENTATIVE (perm)
3910 : 6171 : = SLP_TREE_REPRESENTATIVE (SLP_TREE_CHILDREN (rhs_nodes[0])[l]);
3911 : 24393 : for (unsigned j = 0; j < rhs_nodes.length (); ++j)
3912 : : {
3913 : 18222 : SLP_TREE_CHILDREN (perm)
3914 : 18222 : .quick_push (SLP_TREE_CHILDREN (rhs_nodes[j])[l]);
3915 : 18222 : SLP_TREE_CHILDREN (rhs_nodes[j])[l]->refcnt++;
3916 : 18222 : for (unsigned k = 0;
3917 : 38453 : k < SLP_TREE_SCALAR_STMTS (rhs_nodes[j]).length (); ++k)
3918 : : {
3919 : : /* ??? We should populate SLP_TREE_SCALAR_STMTS
3920 : : or SLP_TREE_SCALAR_OPS but then we might have
3921 : : a mix of both in our children. */
3922 : 20231 : SLP_TREE_LANE_PERMUTATION (perm)
3923 : 20231 : .quick_push (std::make_pair (j, k));
3924 : : }
3925 : : }
3926 : :
3927 : : /* Now we have a single permute node but we cannot code-generate
3928 : : the case with more than two inputs.
3929 : : Perform pairwise reduction, reducing the two inputs
3930 : : with the least number of lanes to one and then repeat until
3931 : : we end up with two inputs. That scheme makes sure we end
3932 : : up with permutes satisfying the restriction of requiring at
3933 : : most two vector inputs to produce a single vector output
3934 : : when the number of lanes is even. */
3935 : 12051 : while (SLP_TREE_CHILDREN (perm).length () > 2)
3936 : : {
3937 : : /* When we have three equal sized groups left the pairwise
3938 : : reduction does not result in a scheme that avoids using
3939 : : three vectors. Instead merge the first two groups
3940 : : to the final size with do-not-care elements (chosen
3941 : : from the first group) and then merge with the third.
3942 : : { A0, B0, x, A1, B1, x, ... }
3943 : : -> { A0, B0, C0, A1, B1, C1, ... }
3944 : : This handles group size of three (and at least
3945 : : power-of-two multiples of that). */
3946 : 5880 : if (SLP_TREE_CHILDREN (perm).length () == 3
3947 : 3037 : && (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[0])
3948 : 3037 : == SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[1]))
3949 : 5880 : && (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[0])
3950 : 2294 : == SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[2])))
3951 : : {
3952 : 2098 : int ai = 0;
3953 : 2098 : int bi = 1;
3954 : 2098 : slp_tree a = SLP_TREE_CHILDREN (perm)[ai];
3955 : 2098 : slp_tree b = SLP_TREE_CHILDREN (perm)[bi];
3956 : 2098 : unsigned n = SLP_TREE_LANES (perm);
3957 : :
3958 : 2098 : slp_tree permab = vect_create_new_slp_node (2, VEC_PERM_EXPR);
3959 : 2098 : SLP_TREE_LANES (permab) = n;
3960 : 2098 : SLP_TREE_LANE_PERMUTATION (permab).create (n);
3961 : 2098 : SLP_TREE_VECTYPE (permab) = SLP_TREE_VECTYPE (perm);
3962 : 2098 : permab->max_nunits = max_nunits;
3963 : : /* ??? Should be NULL but that's not expected. */
3964 : 2098 : SLP_TREE_REPRESENTATIVE (permab) = SLP_TREE_REPRESENTATIVE (perm);
3965 : 2098 : SLP_TREE_CHILDREN (permab).quick_push (a);
3966 : 4205 : for (unsigned k = 0; k < SLP_TREE_LANES (a); ++k)
3967 : 2107 : SLP_TREE_LANE_PERMUTATION (permab)
3968 : 2107 : .quick_push (std::make_pair (0, k));
3969 : 2098 : SLP_TREE_CHILDREN (permab).quick_push (b);
3970 : 4205 : for (unsigned k = 0; k < SLP_TREE_LANES (b); ++k)
3971 : 2107 : SLP_TREE_LANE_PERMUTATION (permab)
3972 : 2107 : .quick_push (std::make_pair (1, k));
3973 : : /* Push the do-not-care lanes. */
3974 : 4205 : for (unsigned k = 0; k < SLP_TREE_LANES (a); ++k)
3975 : 2107 : SLP_TREE_LANE_PERMUTATION (permab)
3976 : 2107 : .quick_push (std::make_pair (0, k));
3977 : :
3978 : : /* Put the merged node into 'perm', in place of a. */
3979 : 2098 : SLP_TREE_CHILDREN (perm)[ai] = permab;
3980 : : /* Adjust the references to b in the permutation
3981 : : of perm and to the later children which we'll
3982 : : remove. */
3983 : 8419 : for (unsigned k = 0; k < SLP_TREE_LANES (perm); ++k)
3984 : : {
3985 : 6321 : std::pair<unsigned, unsigned> &p
3986 : 6321 : = SLP_TREE_LANE_PERMUTATION (perm)[k];
3987 : 6321 : if (p.first == (unsigned) bi)
3988 : : {
3989 : 2107 : p.first = ai;
3990 : 2107 : p.second += SLP_TREE_LANES (a);
3991 : : }
3992 : 4214 : else if (p.first > (unsigned) bi)
3993 : 2107 : p.first--;
3994 : : }
3995 : 2098 : SLP_TREE_CHILDREN (perm).ordered_remove (bi);
3996 : 2098 : break;
3997 : : }
3998 : :
3999 : : /* Pick the two nodes with the least number of lanes,
4000 : : prefer the earliest candidate and maintain ai < bi. */
4001 : : int ai = -1;
4002 : : int bi = -1;
4003 : 33073 : for (unsigned ci = 0; ci < SLP_TREE_CHILDREN (perm).length (); ++ci)
4004 : : {
4005 : 29291 : if (ai == -1)
4006 : 3782 : ai = ci;
4007 : 25509 : else if (bi == -1)
4008 : 3782 : bi = ci;
4009 : 21727 : else if ((SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ci])
4010 : 21727 : < SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ai]))
4011 : 21727 : || (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ci])
4012 : 17813 : < SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[bi])))
4013 : : {
4014 : 8716 : if (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ai])
4015 : 4358 : <= SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[bi]))
4016 : 2075 : bi = ci;
4017 : : else
4018 : : {
4019 : 2283 : ai = bi;
4020 : 2283 : bi = ci;
4021 : : }
4022 : : }
4023 : : }
4024 : :
4025 : : /* Produce a merge of nodes ai and bi. */
4026 : 3782 : slp_tree a = SLP_TREE_CHILDREN (perm)[ai];
4027 : 3782 : slp_tree b = SLP_TREE_CHILDREN (perm)[bi];
4028 : 3782 : unsigned n = SLP_TREE_LANES (a) + SLP_TREE_LANES (b);
4029 : 3782 : slp_tree permab = vect_create_new_slp_node (2, VEC_PERM_EXPR);
4030 : 3782 : SLP_TREE_LANES (permab) = n;
4031 : 3782 : SLP_TREE_LANE_PERMUTATION (permab).create (n);
4032 : 3782 : SLP_TREE_VECTYPE (permab) = SLP_TREE_VECTYPE (perm);
4033 : 3782 : permab->max_nunits = max_nunits;
4034 : : /* ??? Should be NULL but that's not expected. */
4035 : 3782 : SLP_TREE_REPRESENTATIVE (permab) = SLP_TREE_REPRESENTATIVE (perm);
4036 : 3782 : SLP_TREE_CHILDREN (permab).quick_push (a);
4037 : 9888 : for (unsigned k = 0; k < SLP_TREE_LANES (a); ++k)
4038 : 6106 : SLP_TREE_LANE_PERMUTATION (permab)
4039 : 6106 : .quick_push (std::make_pair (0, k));
4040 : 3782 : SLP_TREE_CHILDREN (permab).quick_push (b);
4041 : 9400 : for (unsigned k = 0; k < SLP_TREE_LANES (b); ++k)
4042 : 5618 : SLP_TREE_LANE_PERMUTATION (permab)
4043 : 5618 : .quick_push (std::make_pair (1, k));
4044 : :
4045 : : /* Put the merged node into 'perm', in place of a. */
4046 : 3782 : SLP_TREE_CHILDREN (perm)[ai] = permab;
4047 : : /* Adjust the references to b in the permutation
4048 : : of perm and to the later children which we'll
4049 : : remove. */
4050 : 52699 : for (unsigned k = 0; k < SLP_TREE_LANES (perm); ++k)
4051 : : {
4052 : 48917 : std::pair<unsigned, unsigned> &p
4053 : 48917 : = SLP_TREE_LANE_PERMUTATION (perm)[k];
4054 : 48917 : if (p.first == (unsigned) bi)
4055 : : {
4056 : 5618 : p.first = ai;
4057 : 5618 : p.second += SLP_TREE_LANES (a);
4058 : : }
4059 : 43299 : else if (p.first > (unsigned) bi)
4060 : 17862 : p.first--;
4061 : : }
4062 : 3782 : SLP_TREE_CHILDREN (perm).ordered_remove (bi);
4063 : : }
4064 : : }
4065 : :
4066 : 6144 : return node;
4067 : : }
4068 : :
4069 : : /* Analyze an SLP instance starting from SCALAR_STMTS which are a group
4070 : : of KIND. Return true if successful. SCALAR_STMTS is owned by this
4071 : : function, REMAIN and ROOT_STMT_INFOS ownership is transfered back to
4072 : : the caller upon failure. */
4073 : :
4074 : : static bool
4075 : 2122047 : vect_build_slp_instance (vec_info *vinfo,
4076 : : slp_instance_kind kind,
4077 : : vec<stmt_vec_info> &scalar_stmts,
4078 : : vec<stmt_vec_info> &root_stmt_infos,
4079 : : vec<tree> &remain,
4080 : : unsigned max_tree_size, unsigned *limit,
4081 : : scalar_stmts_to_slp_tree_map_t *bst_map,
4082 : : bool force_single_lane)
4083 : : {
4084 : : /* If there's no budget left bail out early. */
4085 : 2122047 : if (*limit == 0)
4086 : : {
4087 : 27205 : scalar_stmts.release ();
4088 : 27205 : return false;
4089 : : }
4090 : :
4091 : 2094842 : if (kind == slp_inst_kind_ctor)
4092 : : {
4093 : 11917 : if (dump_enabled_p ())
4094 : 86 : dump_printf_loc (MSG_NOTE, vect_location,
4095 : : "Analyzing vectorizable constructor: %G\n",
4096 : 43 : root_stmt_infos[0]->stmt);
4097 : : }
4098 : 2082925 : else if (kind == slp_inst_kind_gcond)
4099 : : {
4100 : 249447 : if (dump_enabled_p ())
4101 : 5190 : dump_printf_loc (MSG_NOTE, vect_location,
4102 : : "Analyzing vectorizable control flow: %G",
4103 : 2595 : root_stmt_infos[0]->stmt);
4104 : : }
4105 : :
4106 : 2094842 : if (dump_enabled_p ())
4107 : : {
4108 : 27577 : dump_printf_loc (MSG_NOTE, vect_location,
4109 : : "Starting SLP discovery for\n");
4110 : 58396 : for (unsigned i = 0; i < scalar_stmts.length (); ++i)
4111 : 61638 : dump_printf_loc (MSG_NOTE, vect_location,
4112 : 30819 : " %G", scalar_stmts[i]->stmt);
4113 : : }
4114 : :
4115 : : /* Build the tree for the SLP instance. */
4116 : 2094842 : unsigned int group_size = scalar_stmts.length ();
4117 : 2094842 : bool *matches = XALLOCAVEC (bool, group_size);
4118 : 2094842 : poly_uint64 max_nunits = 1;
4119 : 2094842 : unsigned tree_size = 0;
4120 : :
4121 : 2094842 : slp_tree node = NULL;
4122 : 2094842 : if (group_size > 1 && force_single_lane)
4123 : : {
4124 : 0 : matches[0] = true;
4125 : 0 : matches[1] = false;
4126 : : }
4127 : : else
4128 : 2094842 : node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
4129 : : &max_nunits, matches, limit,
4130 : : &tree_size, bst_map);
4131 : 2094842 : if (node != NULL)
4132 : : {
4133 : : /* Calculate the unrolling factor based on the smallest type. */
4134 : 1011737 : poly_uint64 unrolling_factor
4135 : 1011737 : = calculate_unrolling_factor (max_nunits, group_size);
4136 : :
4137 : 1011737 : if (maybe_ne (unrolling_factor, 1U)
4138 : 1011737 : && is_a <bb_vec_info> (vinfo))
4139 : : {
4140 : 0 : unsigned HOST_WIDE_INT const_max_nunits;
4141 : 0 : if (!max_nunits.is_constant (&const_max_nunits)
4142 : 0 : || const_max_nunits > group_size)
4143 : : {
4144 : 0 : if (dump_enabled_p ())
4145 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4146 : : "Build SLP failed: store group "
4147 : : "size not a multiple of the vector size "
4148 : : "in basic block SLP\n");
4149 : 0 : vect_free_slp_tree (node);
4150 : 0 : return false;
4151 : : }
4152 : : /* Fatal mismatch. */
4153 : 0 : if (dump_enabled_p ())
4154 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
4155 : : "SLP discovery succeeded but node needs "
4156 : : "splitting\n");
4157 : 0 : memset (matches, true, group_size);
4158 : 0 : matches[group_size / const_max_nunits * const_max_nunits] = false;
4159 : 0 : vect_free_slp_tree (node);
4160 : : }
4161 : : else
4162 : : {
4163 : : /* Create a new SLP instance. */
4164 : 1011737 : slp_instance new_instance = XNEW (class _slp_instance);
4165 : 1011737 : SLP_INSTANCE_TREE (new_instance) = node;
4166 : 1011737 : SLP_INSTANCE_LOADS (new_instance) = vNULL;
4167 : 1011737 : SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
4168 : 1011737 : SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
4169 : 1011737 : SLP_INSTANCE_KIND (new_instance) = kind;
4170 : 1011737 : new_instance->reduc_phis = NULL;
4171 : 1011737 : new_instance->cost_vec = vNULL;
4172 : 1011737 : new_instance->subgraph_entries = vNULL;
4173 : :
4174 : 1011737 : if (dump_enabled_p ())
4175 : 24695 : dump_printf_loc (MSG_NOTE, vect_location,
4176 : : "SLP size %u vs. limit %u.\n",
4177 : : tree_size, max_tree_size);
4178 : :
4179 : 1011737 : vinfo->slp_instances.safe_push (new_instance);
4180 : :
4181 : : /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
4182 : : the number of scalar stmts in the root in a few places.
4183 : : Verify that assumption holds. */
4184 : 2023474 : gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
4185 : : .length () == group_size);
4186 : :
4187 : 1011737 : if (dump_enabled_p ())
4188 : : {
4189 : 24695 : if (kind == slp_inst_kind_reduc_group)
4190 : 4840 : dump_printf_loc (MSG_NOTE, vect_location,
4191 : : "SLP discovery of size %d reduction group "
4192 : : "succeeded\n", group_size);
4193 : 24695 : dump_printf_loc (MSG_NOTE, vect_location,
4194 : : "Final SLP tree for instance %p:\n",
4195 : : (void *) new_instance);
4196 : 24695 : vect_print_slp_graph (MSG_NOTE, vect_location,
4197 : : SLP_INSTANCE_TREE (new_instance));
4198 : : }
4199 : :
4200 : 1011737 : return true;
4201 : : }
4202 : : }
4203 : : /* Failed to SLP. */
4204 : :
4205 : : /* While we arrive here even with slp_inst_kind_store we should only
4206 : : for group_size == 1. The code to split store groups is only in
4207 : : vect_analyze_slp_instance now. */
4208 : 1083105 : gcc_assert (kind != slp_inst_kind_store || group_size == 1);
4209 : :
4210 : : /* Free the allocated memory. */
4211 : 1083105 : scalar_stmts.release ();
4212 : :
4213 : : /* Failed to SLP. */
4214 : 1083105 : if (dump_enabled_p ())
4215 : 2882 : dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
4216 : : return false;
4217 : : }
4218 : :
4219 : : /* Analyze an SLP instance starting from a the start of a reduction chain.
4220 : : Call vect_build_slp_tree to build a tree of packed stmts if possible.
4221 : : Return FALSE if SLP build fails. */
4222 : :
4223 : : static bool
4224 : 41386 : vect_analyze_slp_reduc_chain (loop_vec_info vinfo,
4225 : : scalar_stmts_to_slp_tree_map_t *bst_map,
4226 : : stmt_vec_info scalar_stmt,
4227 : : unsigned max_tree_size, unsigned *limit)
4228 : : {
4229 : 41386 : vec<stmt_vec_info> scalar_stmts = vNULL;
4230 : :
4231 : 41386 : bool fail = false;
4232 : : /* ??? We could leave operation code checking to SLP discovery. */
4233 : 41386 : code_helper code = STMT_VINFO_REDUC_CODE (STMT_VINFO_REDUC_DEF
4234 : : (vect_orig_stmt (scalar_stmt)));
4235 : 41386 : bool first = true;
4236 : 41386 : stmt_vec_info next_stmt = scalar_stmt;
4237 : 46481 : do
4238 : : {
4239 : 46481 : stmt_vec_info stmt = next_stmt;
4240 : 46481 : gimple_match_op op;
4241 : 46481 : if (!gimple_extract_op (STMT_VINFO_STMT (stmt), &op))
4242 : 0 : gcc_unreachable ();
4243 : 92962 : tree reduc_def = gimple_arg (STMT_VINFO_STMT (stmt),
4244 : 46481 : STMT_VINFO_REDUC_IDX (stmt));
4245 : 46481 : next_stmt = vect_stmt_to_vectorize (vinfo->lookup_def (reduc_def));
4246 : 46481 : gcc_assert (is_a <gphi *> (STMT_VINFO_STMT (next_stmt))
4247 : : || STMT_VINFO_REDUC_IDX (next_stmt) != -1);
4248 : 49891 : if (!gimple_extract_op (STMT_VINFO_STMT (vect_orig_stmt (stmt)), &op))
4249 : 0 : gcc_unreachable ();
4250 : 46481 : if (CONVERT_EXPR_CODE_P (op.code)
4251 : 2125 : && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0]))
4252 : 48594 : && (first
4253 : 1046 : || is_a <gphi *> (STMT_VINFO_STMT (next_stmt))))
4254 : : ;
4255 : 44370 : else if (code != op.code)
4256 : : {
4257 : 1704 : fail = true;
4258 : 1704 : break;
4259 : : }
4260 : : else
4261 : 42666 : scalar_stmts.safe_push (stmt);
4262 : 44777 : first = false;
4263 : : }
4264 : 44777 : while (!is_a <gphi *> (STMT_VINFO_STMT (next_stmt)));
4265 : 41386 : if (fail)
4266 : 1704 : return false;
4267 : :
4268 : : /* Remember a stmt with the actual reduction operation. */
4269 : 39682 : stmt_vec_info reduc_scalar_stmt = scalar_stmts[0];
4270 : :
4271 : : /* When the SSA def chain through reduc-idx does not form a natural
4272 : : reduction chain try to linearize an associative operation manually. */
4273 : 39682 : if (scalar_stmts.length () == 1
4274 : 38047 : && code.is_tree_code ()
4275 : 34661 : && associative_tree_code ((tree_code)code)
4276 : : /* We may not associate if a fold-left reduction is required. */
4277 : 73470 : && !needs_fold_left_reduction_p (TREE_TYPE (gimple_get_lhs
4278 : : (reduc_scalar_stmt->stmt)),
4279 : : code))
4280 : : {
4281 : 31967 : auto_vec<chain_op_t> chain;
4282 : 31967 : auto_vec<std::pair<tree_code, gimple *> > worklist;
4283 : 31967 : gimple *op_stmt = NULL, *other_op_stmt = NULL;
4284 : 31967 : vect_slp_linearize_chain (vinfo, worklist, chain, (tree_code)code,
4285 : 31967 : scalar_stmts[0]->stmt, op_stmt, other_op_stmt,
4286 : : NULL);
4287 : :
4288 : 31967 : scalar_stmts.truncate (0);
4289 : 31967 : stmt_vec_info tail = NULL;
4290 : 159078 : for (auto el : chain)
4291 : : {
4292 : 63847 : if (el.dt == vect_external_def
4293 : 63847 : || el.dt == vect_constant_def
4294 : 63847 : || el.code != (tree_code) code)
4295 : : {
4296 : 670 : scalar_stmts.release ();
4297 : 670 : return false;
4298 : : }
4299 : 63177 : stmt_vec_info stmt = vinfo->lookup_def (el.op);
4300 : 63177 : if (STMT_VINFO_REDUC_IDX (stmt) != -1
4301 : 62222 : || STMT_VINFO_REDUC_DEF (stmt))
4302 : : {
4303 : 31457 : gcc_assert (tail == NULL);
4304 : 31457 : tail = stmt;
4305 : 31457 : continue;
4306 : : }
4307 : 31720 : scalar_stmts.safe_push (stmt);
4308 : : }
4309 : 31297 : gcc_assert (tail);
4310 : :
4311 : : /* When this linearization didn't produce a chain see if stripping
4312 : : a wrapping sign conversion produces one. */
4313 : 31297 : if (scalar_stmts.length () == 1)
4314 : : {
4315 : 31218 : gimple *stmt = scalar_stmts[0]->stmt;
4316 : 31218 : if (!is_gimple_assign (stmt)
4317 : 30337 : || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt))
4318 : 4007 : || TREE_CODE (gimple_assign_rhs1 (stmt)) != SSA_NAME
4319 : 35225 : || !tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
4320 : 4007 : TREE_TYPE (gimple_assign_rhs1 (stmt))))
4321 : : {
4322 : 29294 : scalar_stmts.release ();
4323 : 29294 : return false;
4324 : : }
4325 : 1924 : stmt = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (stmt));
4326 : 1924 : if (!is_gimple_assign (stmt)
4327 : 1924 : || gimple_assign_rhs_code (stmt) != (tree_code)code)
4328 : : {
4329 : 1911 : scalar_stmts.release ();
4330 : 1911 : return false;
4331 : : }
4332 : 13 : chain.truncate (0);
4333 : 13 : vect_slp_linearize_chain (vinfo, worklist, chain, (tree_code)code,
4334 : : stmt, op_stmt, other_op_stmt, NULL);
4335 : :
4336 : 13 : scalar_stmts.truncate (0);
4337 : 13 : tail = NULL;
4338 : 67 : for (auto el : chain)
4339 : : {
4340 : 32 : if (el.dt == vect_external_def
4341 : 32 : || el.dt == vect_constant_def
4342 : 32 : || el.code != (tree_code) code)
4343 : : {
4344 : 4 : scalar_stmts.release ();
4345 : 4 : return false;
4346 : : }
4347 : 28 : stmt_vec_info stmt = vinfo->lookup_def (el.op);
4348 : 28 : if (STMT_VINFO_REDUC_IDX (stmt) != -1
4349 : 28 : || STMT_VINFO_REDUC_DEF (stmt))
4350 : : {
4351 : 0 : gcc_assert (tail == NULL);
4352 : 0 : tail = stmt;
4353 : 0 : continue;
4354 : : }
4355 : 28 : scalar_stmts.safe_push (stmt);
4356 : : }
4357 : : /* Unlike the above this does not include the reduction SSA
4358 : : cycle. */
4359 : 9 : gcc_assert (!tail);
4360 : : }
4361 : :
4362 : 88 : if (scalar_stmts.length () < 2)
4363 : : {
4364 : 0 : scalar_stmts.release ();
4365 : 0 : return false;
4366 : : }
4367 : :
4368 : 88 : if (dump_enabled_p ())
4369 : : {
4370 : 30 : dump_printf_loc (MSG_NOTE, vect_location,
4371 : : "Starting SLP discovery of reduction chain for\n");
4372 : 128 : for (unsigned i = 0; i < scalar_stmts.length (); ++i)
4373 : 196 : dump_printf_loc (MSG_NOTE, vect_location,
4374 : 98 : " %G", scalar_stmts[i]->stmt);
4375 : : }
4376 : :
4377 : 88 : unsigned int group_size = scalar_stmts.length ();
4378 : 88 : bool *matches = XALLOCAVEC (bool, group_size);
4379 : 88 : poly_uint64 max_nunits = 1;
4380 : 88 : unsigned tree_size = 0;
4381 : 88 : slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
4382 : : &max_nunits, matches, limit,
4383 : 88 : &tree_size, bst_map);
4384 : 88 : if (!node)
4385 : : {
4386 : 36 : scalar_stmts.release ();
4387 : 36 : return false;
4388 : : }
4389 : :
4390 : 52 : unsigned cycle_id = vinfo->reduc_infos.length ();
4391 : 52 : vect_reduc_info reduc_info = new vect_reduc_info_s ();
4392 : 52 : vinfo->reduc_infos.safe_push (reduc_info);
4393 : 52 : VECT_REDUC_INFO_DEF_TYPE (reduc_info) = STMT_VINFO_DEF_TYPE (next_stmt);
4394 : 52 : VECT_REDUC_INFO_TYPE (reduc_info) = STMT_VINFO_REDUC_TYPE (next_stmt);
4395 : 52 : VECT_REDUC_INFO_CODE (reduc_info) = STMT_VINFO_REDUC_CODE (next_stmt);
4396 : 52 : VECT_REDUC_INFO_FN (reduc_info) = IFN_LAST;
4397 : 52 : reduc_info->is_reduc_chain = true;
4398 : :
4399 : : /* Build the node for the PHI and possibly the conversions. */
4400 : 52 : slp_tree phis = vect_create_new_slp_node (2, ERROR_MARK);
4401 : 52 : SLP_TREE_REPRESENTATIVE (phis) = next_stmt;
4402 : 52 : phis->cycle_info.id = cycle_id;
4403 : 52 : SLP_TREE_LANES (phis) = group_size;
4404 : 52 : if (reduc_scalar_stmt == scalar_stmt)
4405 : 48 : SLP_TREE_VECTYPE (phis) = SLP_TREE_VECTYPE (node);
4406 : : else
4407 : 4 : SLP_TREE_VECTYPE (phis)
4408 : 4 : = signed_or_unsigned_type_for (TYPE_UNSIGNED
4409 : : (TREE_TYPE (gimple_get_lhs
4410 : : (scalar_stmt->stmt))),
4411 : : SLP_TREE_VECTYPE (node));
4412 : : /* ??? vect_cse_slp_nodes cannot cope with cycles without any
4413 : : SLP_TREE_SCALAR_STMTS. */
4414 : 52 : SLP_TREE_SCALAR_STMTS (phis).create (group_size);
4415 : 223 : for (unsigned i = 0; i < group_size; ++i)
4416 : 171 : SLP_TREE_SCALAR_STMTS (phis).quick_push (next_stmt);
4417 : :
4418 : 52 : slp_tree op_input = phis;
4419 : 52 : if (reduc_scalar_stmt != scalar_stmt)
4420 : : {
4421 : 4 : slp_tree conv = vect_create_new_slp_node (1, ERROR_MARK);
4422 : 4 : SLP_TREE_REPRESENTATIVE (conv)
4423 : 4 : = vinfo->lookup_def (gimple_arg (reduc_scalar_stmt->stmt,
4424 : 4 : STMT_VINFO_REDUC_IDX
4425 : : (reduc_scalar_stmt)));
4426 : 4 : SLP_TREE_CHILDREN (conv).quick_push (phis);
4427 : 4 : conv->cycle_info.id = cycle_id;
4428 : 4 : SLP_TREE_REDUC_IDX (conv) = 0;
4429 : 4 : SLP_TREE_LANES (conv) = group_size;
4430 : 4 : SLP_TREE_VECTYPE (conv) = SLP_TREE_VECTYPE (node);
4431 : 4 : SLP_TREE_SCALAR_STMTS (conv) = vNULL;
4432 : 4 : op_input = conv;
4433 : : }
4434 : :
4435 : 52 : slp_tree reduc = vect_create_new_slp_node (2, ERROR_MARK);
4436 : 52 : SLP_TREE_REPRESENTATIVE (reduc) = reduc_scalar_stmt;
4437 : 52 : SLP_TREE_CHILDREN (reduc).quick_push (op_input);
4438 : 52 : SLP_TREE_CHILDREN (reduc).quick_push (node);
4439 : 52 : reduc->cycle_info.id = cycle_id;
4440 : 52 : SLP_TREE_REDUC_IDX (reduc) = 0;
4441 : 52 : SLP_TREE_LANES (reduc) = group_size;
4442 : 52 : SLP_TREE_VECTYPE (reduc) = SLP_TREE_VECTYPE (node);
4443 : : /* ??? For the reduction epilogue we need a live lane. */
4444 : 52 : SLP_TREE_SCALAR_STMTS (reduc).create (group_size);
4445 : 52 : SLP_TREE_SCALAR_STMTS (reduc).quick_push (reduc_scalar_stmt);
4446 : 171 : for (unsigned i = 1; i < group_size; ++i)
4447 : 119 : SLP_TREE_SCALAR_STMTS (reduc).quick_push (NULL);
4448 : :
4449 : 52 : if (reduc_scalar_stmt != scalar_stmt)
4450 : : {
4451 : 4 : slp_tree conv = vect_create_new_slp_node (1, ERROR_MARK);
4452 : 4 : SLP_TREE_REPRESENTATIVE (conv) = scalar_stmt;
4453 : 4 : SLP_TREE_CHILDREN (conv).quick_push (reduc);
4454 : 4 : conv->cycle_info.id = cycle_id;
4455 : 4 : SLP_TREE_REDUC_IDX (conv) = 0;
4456 : 4 : SLP_TREE_LANES (conv) = group_size;
4457 : 4 : SLP_TREE_VECTYPE (conv) = SLP_TREE_VECTYPE (phis);
4458 : : /* ??? For the reduction epilogue we need a live lane. */
4459 : 4 : SLP_TREE_SCALAR_STMTS (conv).create (group_size);
4460 : 4 : SLP_TREE_SCALAR_STMTS (conv).quick_push (scalar_stmt);
4461 : 8 : for (unsigned i = 1; i < group_size; ++i)
4462 : 4 : SLP_TREE_SCALAR_STMTS (conv).quick_push (NULL);
4463 : 4 : reduc = conv;
4464 : : }
4465 : :
4466 : 52 : edge le = loop_latch_edge (LOOP_VINFO_LOOP (vinfo));
4467 : 52 : SLP_TREE_CHILDREN (phis).quick_push (NULL);
4468 : 52 : SLP_TREE_CHILDREN (phis).quick_push (NULL);
4469 : 52 : SLP_TREE_CHILDREN (phis)[le->dest_idx] = reduc;
4470 : 52 : SLP_TREE_REF_COUNT (reduc)++;
4471 : :
4472 : : /* Create a new SLP instance. */
4473 : 52 : slp_instance new_instance = XNEW (class _slp_instance);
4474 : 52 : SLP_INSTANCE_TREE (new_instance) = reduc;
4475 : 52 : SLP_INSTANCE_LOADS (new_instance) = vNULL;
4476 : 52 : SLP_INSTANCE_ROOT_STMTS (new_instance) = vNULL;
4477 : 52 : SLP_INSTANCE_REMAIN_DEFS (new_instance) = vNULL;
4478 : 52 : SLP_INSTANCE_KIND (new_instance) = slp_inst_kind_reduc_chain;
4479 : 52 : new_instance->reduc_phis = NULL;
4480 : 52 : new_instance->cost_vec = vNULL;
4481 : 52 : new_instance->subgraph_entries = vNULL;
4482 : :
4483 : 52 : vinfo->slp_instances.safe_push (new_instance);
4484 : :
4485 : 52 : if (dump_enabled_p ())
4486 : : {
4487 : 20 : dump_printf_loc (MSG_NOTE, vect_location,
4488 : : "Final SLP tree for instance %p:\n",
4489 : : (void *) new_instance);
4490 : 20 : vect_print_slp_graph (MSG_NOTE, vect_location,
4491 : : SLP_INSTANCE_TREE (new_instance));
4492 : : }
4493 : :
4494 : 52 : return true;
4495 : 31967 : }
4496 : :
4497 : 7715 : if (scalar_stmts.length () <= 1)
4498 : : {
4499 : 6080 : scalar_stmts.release ();
4500 : 6080 : return false;
4501 : : }
4502 : :
4503 : 1635 : scalar_stmts.reverse ();
4504 : 1635 : stmt_vec_info reduc_phi_info = next_stmt;
4505 : :
4506 : : /* Build the tree for the SLP instance. */
4507 : 1635 : vec<stmt_vec_info> root_stmt_infos = vNULL;
4508 : 1635 : vec<tree> remain = vNULL;
4509 : :
4510 : 1635 : if (dump_enabled_p ())
4511 : : {
4512 : 173 : dump_printf_loc (MSG_NOTE, vect_location,
4513 : : "Starting SLP discovery of reduction chain for\n");
4514 : 939 : for (unsigned i = 0; i < scalar_stmts.length (); ++i)
4515 : 1532 : dump_printf_loc (MSG_NOTE, vect_location,
4516 : 766 : " %G", scalar_stmts[i]->stmt);
4517 : : }
4518 : :
4519 : : /* Build the tree for the SLP instance. */
4520 : 1635 : unsigned int group_size = scalar_stmts.length ();
4521 : 1635 : bool *matches = XALLOCAVEC (bool, group_size);
4522 : 1635 : poly_uint64 max_nunits = 1;
4523 : 1635 : unsigned tree_size = 0;
4524 : :
4525 : : /* ??? We need this only for SLP discovery. */
4526 : 6250 : for (unsigned i = 0; i < scalar_stmts.length (); ++i)
4527 : 4615 : REDUC_GROUP_FIRST_ELEMENT (scalar_stmts[i]) = scalar_stmts[0];
4528 : :
4529 : 1635 : slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
4530 : : &max_nunits, matches, limit,
4531 : 1635 : &tree_size, bst_map);
4532 : :
4533 : 6250 : for (unsigned i = 0; i < scalar_stmts.length (); ++i)
4534 : 4615 : REDUC_GROUP_FIRST_ELEMENT (scalar_stmts[i]) = NULL;
4535 : :
4536 : 1635 : if (node != NULL)
4537 : : {
4538 : : /* Create a new SLP instance. */
4539 : 1378 : slp_instance new_instance = XNEW (class _slp_instance);
4540 : 1378 : SLP_INSTANCE_TREE (new_instance) = node;
4541 : 1378 : SLP_INSTANCE_LOADS (new_instance) = vNULL;
4542 : 1378 : SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
4543 : 1378 : SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
4544 : 1378 : SLP_INSTANCE_KIND (new_instance) = slp_inst_kind_reduc_chain;
4545 : 1378 : new_instance->reduc_phis = NULL;
4546 : 1378 : new_instance->cost_vec = vNULL;
4547 : 1378 : new_instance->subgraph_entries = vNULL;
4548 : :
4549 : 1378 : vect_reduc_info reduc_info = info_for_reduction (vinfo, node);
4550 : 1378 : reduc_info->is_reduc_chain = true;
4551 : :
4552 : 1378 : if (dump_enabled_p ())
4553 : 128 : dump_printf_loc (MSG_NOTE, vect_location,
4554 : : "SLP size %u vs. limit %u.\n",
4555 : : tree_size, max_tree_size);
4556 : :
4557 : : /* Fixup SLP reduction chains. If this is a reduction chain with
4558 : : a conversion in front amend the SLP tree with a node for that. */
4559 : 1378 : gimple *scalar_def = STMT_VINFO_REDUC_DEF (reduc_phi_info)->stmt;
4560 : 1378 : if (is_gimple_assign (scalar_def)
4561 : 1378 : && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (scalar_def)))
4562 : : {
4563 : 18 : stmt_vec_info conv_info = vect_stmt_to_vectorize
4564 : 18 : (STMT_VINFO_REDUC_DEF (reduc_phi_info));
4565 : 18 : scalar_stmts = vNULL;
4566 : 18 : scalar_stmts.create (group_size);
4567 : 60 : for (unsigned i = 0; i < group_size; ++i)
4568 : 42 : scalar_stmts.quick_push (conv_info);
4569 : 18 : slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
4570 : 18 : SLP_TREE_VECTYPE (conv)
4571 : 18 : = get_vectype_for_scalar_type (vinfo,
4572 : 18 : TREE_TYPE
4573 : : (gimple_assign_lhs (scalar_def)),
4574 : : group_size);
4575 : 18 : SLP_TREE_REDUC_IDX (conv) = 0;
4576 : 18 : conv->cycle_info.id = node->cycle_info.id;
4577 : 18 : SLP_TREE_CHILDREN (conv).quick_push (node);
4578 : 18 : SLP_INSTANCE_TREE (new_instance) = conv;
4579 : : }
4580 : : /* Fill the backedge child of the PHI SLP node. The
4581 : : general matching code cannot find it because the
4582 : : scalar code does not reflect how we vectorize the
4583 : : reduction. */
4584 : 1378 : use_operand_p use_p;
4585 : 1378 : imm_use_iterator imm_iter;
4586 : 1378 : class loop *loop = LOOP_VINFO_LOOP (vinfo);
4587 : 6598 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
4588 : : gimple_get_lhs (scalar_def))
4589 : : /* There are exactly two non-debug uses, the reduction
4590 : : PHI and the loop-closed PHI node. */
4591 : 3842 : if (!is_gimple_debug (USE_STMT (use_p))
4592 : 3842 : && gimple_bb (USE_STMT (use_p)) == loop->header)
4593 : : {
4594 : 1378 : auto_vec<stmt_vec_info, 64> phis (group_size);
4595 : 1378 : stmt_vec_info phi_info = vinfo->lookup_stmt (USE_STMT (use_p));
4596 : 5329 : for (unsigned i = 0; i < group_size; ++i)
4597 : 3951 : phis.quick_push (phi_info);
4598 : 1378 : slp_tree *phi_node = bst_map->get (phis);
4599 : 1378 : unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
4600 : 2756 : SLP_TREE_CHILDREN (*phi_node)[dest_idx]
4601 : 1378 : = SLP_INSTANCE_TREE (new_instance);
4602 : 1378 : SLP_INSTANCE_TREE (new_instance)->refcnt++;
4603 : 1378 : }
4604 : :
4605 : 1378 : vinfo->slp_instances.safe_push (new_instance);
4606 : :
4607 : : /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
4608 : : the number of scalar stmts in the root in a few places.
4609 : : Verify that assumption holds. */
4610 : 2756 : gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
4611 : : .length () == group_size);
4612 : :
4613 : 1378 : if (dump_enabled_p ())
4614 : : {
4615 : 128 : dump_printf_loc (MSG_NOTE, vect_location,
4616 : : "Final SLP tree for instance %p:\n",
4617 : : (void *) new_instance);
4618 : 128 : vect_print_slp_graph (MSG_NOTE, vect_location,
4619 : : SLP_INSTANCE_TREE (new_instance));
4620 : : }
4621 : :
4622 : 1378 : return true;
4623 : : }
4624 : :
4625 : : /* Failed to SLP. */
4626 : 257 : scalar_stmts.release ();
4627 : 257 : if (dump_enabled_p ())
4628 : 45 : dump_printf_loc (MSG_NOTE, vect_location,
4629 : : "SLP discovery of reduction chain failed\n");
4630 : : return false;
4631 : : }
4632 : :
4633 : : /* Analyze an SLP instance starting from SCALAR_STMTS which are a group
4634 : : of KIND. Return true if successful. */
4635 : :
4636 : : static bool
4637 : 61453 : vect_analyze_slp_reduction (loop_vec_info vinfo,
4638 : : stmt_vec_info scalar_stmt,
4639 : : unsigned max_tree_size, unsigned *limit,
4640 : : scalar_stmts_to_slp_tree_map_t *bst_map,
4641 : : bool force_single_lane)
4642 : : {
4643 : 61453 : slp_instance_kind kind = slp_inst_kind_reduc_group;
4644 : :
4645 : : /* If there's no budget left bail out early. */
4646 : 61453 : if (*limit == 0)
4647 : : return false;
4648 : :
4649 : : /* Try to gather a reduction chain. */
4650 : 61453 : if (! force_single_lane
4651 : 41593 : && STMT_VINFO_DEF_TYPE (scalar_stmt) == vect_reduction_def
4652 : 102839 : && vect_analyze_slp_reduc_chain (vinfo, bst_map, scalar_stmt,
4653 : : max_tree_size, limit))
4654 : : return true;
4655 : :
4656 : 60023 : vec<stmt_vec_info> scalar_stmts;
4657 : 60023 : scalar_stmts.create (1);
4658 : 60023 : scalar_stmts.quick_push (scalar_stmt);
4659 : :
4660 : 60023 : if (dump_enabled_p ())
4661 : : {
4662 : 3156 : dump_printf_loc (MSG_NOTE, vect_location,
4663 : : "Starting SLP discovery for\n");
4664 : 6312 : for (unsigned i = 0; i < scalar_stmts.length (); ++i)
4665 : 6312 : dump_printf_loc (MSG_NOTE, vect_location,
4666 : 3156 : " %G", scalar_stmts[i]->stmt);
4667 : : }
4668 : :
4669 : : /* Build the tree for the SLP instance. */
4670 : 60023 : unsigned int group_size = scalar_stmts.length ();
4671 : 60023 : bool *matches = XALLOCAVEC (bool, group_size);
4672 : 60023 : poly_uint64 max_nunits = 1;
4673 : 60023 : unsigned tree_size = 0;
4674 : :
4675 : 60023 : slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
4676 : : &max_nunits, matches, limit,
4677 : : &tree_size, bst_map);
4678 : 60023 : if (node != NULL)
4679 : : {
4680 : : /* Create a new SLP instance. */
4681 : 57812 : slp_instance new_instance = XNEW (class _slp_instance);
4682 : 57812 : SLP_INSTANCE_TREE (new_instance) = node;
4683 : 57812 : SLP_INSTANCE_LOADS (new_instance) = vNULL;
4684 : 57812 : SLP_INSTANCE_ROOT_STMTS (new_instance) = vNULL;
4685 : 57812 : SLP_INSTANCE_REMAIN_DEFS (new_instance) = vNULL;
4686 : 57812 : SLP_INSTANCE_KIND (new_instance) = kind;
4687 : 57812 : new_instance->reduc_phis = NULL;
4688 : 57812 : new_instance->cost_vec = vNULL;
4689 : 57812 : new_instance->subgraph_entries = vNULL;
4690 : :
4691 : 57812 : if (dump_enabled_p ())
4692 : 3052 : dump_printf_loc (MSG_NOTE, vect_location,
4693 : : "SLP size %u vs. limit %u.\n",
4694 : : tree_size, max_tree_size);
4695 : :
4696 : 57812 : vinfo->slp_instances.safe_push (new_instance);
4697 : :
4698 : : /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
4699 : : the number of scalar stmts in the root in a few places.
4700 : : Verify that assumption holds. */
4701 : 115624 : gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
4702 : : .length () == group_size);
4703 : :
4704 : 57812 : if (dump_enabled_p ())
4705 : : {
4706 : 3052 : dump_printf_loc (MSG_NOTE, vect_location,
4707 : : "Final SLP tree for instance %p:\n",
4708 : : (void *) new_instance);
4709 : 3052 : vect_print_slp_graph (MSG_NOTE, vect_location,
4710 : : SLP_INSTANCE_TREE (new_instance));
4711 : : }
4712 : :
4713 : 57812 : return true;
4714 : : }
4715 : : /* Failed to SLP. */
4716 : :
4717 : : /* Free the allocated memory. */
4718 : 2211 : scalar_stmts.release ();
4719 : :
4720 : : /* Failed to SLP. */
4721 : 2211 : if (dump_enabled_p ())
4722 : 104 : dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
4723 : : return false;
4724 : : }
4725 : :
4726 : : /* Analyze a single SLP reduction group. If successful add a SLP instance
4727 : : for it and return true, otherwise return false and have *MATCHES
4728 : : populated. */
4729 : :
4730 : : static bool
4731 : 17939 : vect_analyze_slp_reduction_group (loop_vec_info loop_vinfo,
4732 : : vec<stmt_vec_info> scalar_stmts,
4733 : : scalar_stmts_to_slp_tree_map_t *bst_map,
4734 : : unsigned max_tree_size, unsigned *limit,
4735 : : bool *matches)
4736 : : {
4737 : : /* Try to form a reduction group. */
4738 : 17939 : unsigned int group_size = scalar_stmts.length ();
4739 : 17939 : if (!matches)
4740 : 7362 : matches = XALLOCAVEC (bool, group_size);
4741 : 17939 : poly_uint64 max_nunits = 1;
4742 : 17939 : unsigned tree_size = 0;
4743 : 17939 : slp_tree node = vect_build_slp_tree (loop_vinfo, scalar_stmts,
4744 : : group_size,
4745 : : &max_nunits, matches, limit,
4746 : : &tree_size, bst_map);
4747 : 17939 : if (!node)
4748 : : return false;
4749 : :
4750 : : /* Create a new SLP instance. */
4751 : 8477 : slp_instance new_instance = XNEW (class _slp_instance);
4752 : 8477 : SLP_INSTANCE_TREE (new_instance) = node;
4753 : 8477 : SLP_INSTANCE_LOADS (new_instance) = vNULL;
4754 : 8477 : SLP_INSTANCE_ROOT_STMTS (new_instance) = vNULL;
4755 : 8477 : SLP_INSTANCE_REMAIN_DEFS (new_instance) = vNULL;
4756 : 8477 : SLP_INSTANCE_KIND (new_instance) = slp_inst_kind_reduc_group;
4757 : 8477 : new_instance->reduc_phis = NULL;
4758 : 8477 : new_instance->cost_vec = vNULL;
4759 : 8477 : new_instance->subgraph_entries = vNULL;
4760 : :
4761 : 8477 : if (dump_enabled_p ())
4762 : 519 : dump_printf_loc (MSG_NOTE, vect_location,
4763 : : "SLP size %u vs. limit %u.\n",
4764 : : tree_size, max_tree_size);
4765 : :
4766 : 8477 : loop_vinfo->slp_instances.safe_push (new_instance);
4767 : :
4768 : : /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
4769 : : the number of scalar stmts in the root in a few places.
4770 : : Verify that assumption holds. */
4771 : 16954 : gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
4772 : : .length () == group_size);
4773 : :
4774 : 8477 : if (dump_enabled_p ())
4775 : : {
4776 : 519 : dump_printf_loc (MSG_NOTE, vect_location,
4777 : : "SLP discovery of size %d reduction group "
4778 : : "succeeded\n", group_size);
4779 : 519 : dump_printf_loc (MSG_NOTE, vect_location,
4780 : : "Final SLP tree for instance %p:\n",
4781 : : (void *) new_instance);
4782 : 519 : vect_print_slp_graph (MSG_NOTE, vect_location,
4783 : : SLP_INSTANCE_TREE (new_instance));
4784 : : }
4785 : :
4786 : : return true;
4787 : : }
4788 : :
4789 : : /* Analyze reductions in LOOP_VINFO and populate SLP instances
4790 : : accordingly. Returns false if something fails. */
4791 : :
4792 : : static bool
4793 : 402248 : vect_analyze_slp_reductions (loop_vec_info loop_vinfo,
4794 : : unsigned max_tree_size, unsigned *limit,
4795 : : scalar_stmts_to_slp_tree_map_t *bst_map,
4796 : : bool force_single_lane)
4797 : : {
4798 : 447885 : if (loop_vinfo->reductions.is_empty ())
4799 : : return true;
4800 : :
4801 : : /* Collect reduction statements we can combine into
4802 : : a SLP reduction. */
4803 : 50977 : vec<stmt_vec_info> scalar_stmts;
4804 : 50977 : scalar_stmts.create (loop_vinfo->reductions.length ());
4805 : 225456 : for (auto next_info : loop_vinfo->reductions)
4806 : : {
4807 : 72525 : next_info = vect_stmt_to_vectorize (next_info);
4808 : 72525 : if ((STMT_VINFO_RELEVANT_P (next_info)
4809 : 4 : || STMT_VINFO_LIVE_P (next_info))
4810 : : /* ??? Make sure we didn't skip a conversion around a
4811 : : reduction path. In that case we'd have to reverse
4812 : : engineer that conversion stmt following the chain using
4813 : : reduc_idx and from the PHI using reduc_def. */
4814 : 72521 : && (STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def
4815 : 72521 : || (STMT_VINFO_DEF_TYPE (next_info)
4816 : : == vect_double_reduction_def)))
4817 : : {
4818 : : /* Do not discover SLP reductions combining lane-reducing
4819 : : ops, that will fail later. */
4820 : 72521 : if (!force_single_lane
4821 : 72521 : && !lane_reducing_stmt_p (STMT_VINFO_STMT (next_info)))
4822 : 52220 : scalar_stmts.quick_push (next_info);
4823 : : /* Do SLP discovery for single-lane reductions. */
4824 : 20301 : else if (! vect_analyze_slp_reduction (loop_vinfo, next_info,
4825 : : max_tree_size, limit,
4826 : : bst_map,
4827 : : force_single_lane))
4828 : : {
4829 : 0 : scalar_stmts.release ();
4830 : 0 : return false;
4831 : : }
4832 : : }
4833 : : }
4834 : :
4835 : 50977 : if (scalar_stmts.length () > 1)
4836 : : {
4837 : : /* Try to form a reduction group. */
4838 : 3235 : unsigned int group_size = scalar_stmts.length ();
4839 : 3235 : bool *matches = XALLOCAVEC (bool, group_size);
4840 : 3235 : if (vect_analyze_slp_reduction_group (loop_vinfo, scalar_stmts, bst_map,
4841 : : max_tree_size, limit, matches))
4842 : 3149 : return true;
4843 : :
4844 : : /* When analysis as a single SLP reduction group failed try to
4845 : : form sub-groups by collecting matching lanes. Do not recurse
4846 : : that on failure (to limit compile-time costs), but recurse
4847 : : for the initial non-matching parts. Everything not covered
4848 : : by a sub-group gets single-reduction treatment. */
4849 : 2347 : vec<stmt_vec_info> cands = vNULL;
4850 : 7448 : while (matches[0])
4851 : : {
4852 : 7362 : cands.truncate (0);
4853 : 7362 : cands.reserve (group_size, true);
4854 : 57900 : for (unsigned i = 0; i < group_size; ++i)
4855 : 50538 : if (matches[i])
4856 : 12333 : cands.quick_push (scalar_stmts[i]);
4857 : :
4858 : : /* Try to form a reduction group. */
4859 : 7362 : if (vect_analyze_slp_reduction_group (loop_vinfo, cands, bst_map,
4860 : : max_tree_size, limit, NULL))
4861 : 5348 : cands = vNULL;
4862 : : else
4863 : : {
4864 : : /* Do SLP discovery for single-lane reductions. */
4865 : 12241 : for (auto stmt_info : cands)
4866 : 6219 : if (! vect_analyze_slp_reduction (loop_vinfo,
4867 : : vect_stmt_to_vectorize
4868 : : (stmt_info),
4869 : : max_tree_size, limit,
4870 : : bst_map, force_single_lane))
4871 : : {
4872 : 20 : scalar_stmts.release ();
4873 : 20 : cands.release ();
4874 : 20 : return false;
4875 : : }
4876 : : }
4877 : : /* Remove the handled stmts from scalar_stmts and try again,
4878 : : possibly repeating the above with updated matches[]. */
4879 : : unsigned j = 0;
4880 : 57822 : for (unsigned i = 0; i < group_size; ++i)
4881 : 50480 : if (!matches[i])
4882 : : {
4883 : 38180 : scalar_stmts[j] = scalar_stmts[i];
4884 : 38180 : ++j;
4885 : : }
4886 : 7342 : scalar_stmts.truncate (j);
4887 : 7342 : group_size = scalar_stmts.length ();
4888 : 7342 : if (vect_analyze_slp_reduction_group (loop_vinfo, scalar_stmts,
4889 : : bst_map, max_tree_size, limit,
4890 : : matches))
4891 : : return true;
4892 : : }
4893 : : }
4894 : : /* Do SLP discovery for single-lane reductions. */
4895 : 176226 : for (auto stmt_info : scalar_stmts)
4896 : 34933 : if (! vect_analyze_slp_reduction (loop_vinfo,
4897 : : vect_stmt_to_vectorize (stmt_info),
4898 : : max_tree_size, limit,
4899 : : bst_map, force_single_lane))
4900 : : {
4901 : 2191 : scalar_stmts.release ();
4902 : 2191 : return false;
4903 : : }
4904 : :
4905 : 45637 : scalar_stmts.release ();
4906 : 45637 : return true;
4907 : : }
4908 : :
4909 : : /* Analyze an SLP instance starting from a group of grouped stores. Call
4910 : : vect_build_slp_tree to build a tree of packed stmts if possible.
4911 : : Return FALSE if it's impossible to SLP any stmt in the group. */
4912 : :
4913 : : static bool
4914 : 1087310 : vect_analyze_slp_instance (vec_info *vinfo,
4915 : : scalar_stmts_to_slp_tree_map_t *bst_map,
4916 : : stmt_vec_info stmt_info,
4917 : : slp_instance_kind kind,
4918 : : unsigned max_tree_size, unsigned *limit,
4919 : : bool force_single_lane)
4920 : : {
4921 : 1087310 : vec<stmt_vec_info> scalar_stmts;
4922 : :
4923 : 1087310 : if (is_a <bb_vec_info> (vinfo))
4924 : 1065414 : vect_location = stmt_info->stmt;
4925 : :
4926 : 1087310 : gcc_assert (kind == slp_inst_kind_store);
4927 : :
4928 : : /* Collect the stores and store them in scalar_stmts. */
4929 : 1087310 : scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
4930 : 1087310 : stmt_vec_info next_info = stmt_info;
4931 : 5401448 : while (next_info)
4932 : : {
4933 : 3226828 : scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
4934 : 3226828 : next_info = DR_GROUP_NEXT_ELEMENT (next_info);
4935 : : }
4936 : :
4937 : 1087310 : vec<stmt_vec_info> root_stmt_infos = vNULL;
4938 : 1087310 : vec<tree> remain = vNULL;
4939 : :
4940 : : /* Build the tree for the SLP instance. */
4941 : :
4942 : : /* If there's no budget left bail out early. */
4943 : 1087310 : if (*limit == 0)
4944 : : return false;
4945 : :
4946 : 1087287 : if (dump_enabled_p ())
4947 : : {
4948 : 3945 : dump_printf_loc (MSG_NOTE, vect_location,
4949 : : "Starting SLP discovery for\n");
4950 : 22619 : for (unsigned i = 0; i < scalar_stmts.length (); ++i)
4951 : 37348 : dump_printf_loc (MSG_NOTE, vect_location,
4952 : 18674 : " %G", scalar_stmts[i]->stmt);
4953 : : }
4954 : :
4955 : : /* Build the tree for the SLP instance. */
4956 : 1087287 : unsigned int group_size = scalar_stmts.length ();
4957 : 1087287 : bool *matches = XALLOCAVEC (bool, group_size);
4958 : 1087287 : poly_uint64 max_nunits = 1;
4959 : 1087287 : unsigned tree_size = 0;
4960 : 1087287 : unsigned i;
4961 : :
4962 : 1087287 : slp_tree node = NULL;
4963 : 1087287 : if (group_size > 1 && force_single_lane)
4964 : : {
4965 : 1456 : matches[0] = true;
4966 : 1456 : matches[1] = false;
4967 : : }
4968 : : else
4969 : 1085831 : node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
4970 : : &max_nunits, matches, limit,
4971 : : &tree_size, bst_map);
4972 : 1087287 : if (node != NULL)
4973 : : {
4974 : : /* Calculate the unrolling factor based on the smallest type. */
4975 : 678623 : poly_uint64 unrolling_factor
4976 : 678623 : = calculate_unrolling_factor (max_nunits, group_size);
4977 : :
4978 : 678623 : if (maybe_ne (unrolling_factor, 1U)
4979 : 678623 : && is_a <bb_vec_info> (vinfo))
4980 : : {
4981 : 0 : unsigned HOST_WIDE_INT const_max_nunits;
4982 : 0 : if (!max_nunits.is_constant (&const_max_nunits)
4983 : 0 : || const_max_nunits > group_size)
4984 : : {
4985 : 0 : if (dump_enabled_p ())
4986 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4987 : : "Build SLP failed: store group "
4988 : : "size not a multiple of the vector size "
4989 : : "in basic block SLP\n");
4990 : 0 : vect_free_slp_tree (node);
4991 : 0 : return false;
4992 : : }
4993 : : /* Fatal mismatch. */
4994 : 0 : if (dump_enabled_p ())
4995 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
4996 : : "SLP discovery succeeded but node needs "
4997 : : "splitting\n");
4998 : 0 : memset (matches, true, group_size);
4999 : 0 : matches[group_size / const_max_nunits * const_max_nunits] = false;
5000 : 0 : vect_free_slp_tree (node);
5001 : : }
5002 : : else
5003 : : {
5004 : : /* Create a new SLP instance. */
5005 : 678623 : slp_instance new_instance = XNEW (class _slp_instance);
5006 : 678623 : SLP_INSTANCE_TREE (new_instance) = node;
5007 : 678623 : SLP_INSTANCE_LOADS (new_instance) = vNULL;
5008 : 678623 : SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
5009 : 678623 : SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
5010 : 678623 : SLP_INSTANCE_KIND (new_instance) = kind;
5011 : 678623 : new_instance->reduc_phis = NULL;
5012 : 678623 : new_instance->cost_vec = vNULL;
5013 : 678623 : new_instance->subgraph_entries = vNULL;
5014 : :
5015 : 678623 : if (dump_enabled_p ())
5016 : 2996 : dump_printf_loc (MSG_NOTE, vect_location,
5017 : : "SLP size %u vs. limit %u.\n",
5018 : : tree_size, max_tree_size);
5019 : :
5020 : 678623 : vinfo->slp_instances.safe_push (new_instance);
5021 : :
5022 : : /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
5023 : : the number of scalar stmts in the root in a few places.
5024 : : Verify that assumption holds. */
5025 : 1357246 : gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
5026 : : .length () == group_size);
5027 : :
5028 : 678623 : if (dump_enabled_p ())
5029 : : {
5030 : 2996 : dump_printf_loc (MSG_NOTE, vect_location,
5031 : : "Final SLP tree for instance %p:\n",
5032 : : (void *) new_instance);
5033 : 2996 : vect_print_slp_graph (MSG_NOTE, vect_location,
5034 : : SLP_INSTANCE_TREE (new_instance));
5035 : : }
5036 : :
5037 : 678623 : return true;
5038 : : }
5039 : : }
5040 : : /* Failed to SLP. */
5041 : :
5042 : : /* Try to break the group up into pieces. */
5043 : 408664 : if (*limit > 0 && kind == slp_inst_kind_store)
5044 : : {
5045 : : /* ??? We could delay all the actual splitting of store-groups
5046 : : until after SLP discovery of the original group completed.
5047 : : Then we can recurse to vect_build_slp_instance directly. */
5048 : 1066047 : for (i = 0; i < group_size; i++)
5049 : 1066047 : if (!matches[i])
5050 : : break;
5051 : :
5052 : : /* For basic block SLP, try to break the group up into multiples of
5053 : : a vector size. */
5054 : 408663 : if (is_a <bb_vec_info> (vinfo)
5055 : 408663 : && (i > 1 && i < group_size))
5056 : : {
5057 : : /* Free the allocated memory. */
5058 : 154174 : scalar_stmts.release ();
5059 : :
5060 : 154174 : tree scalar_type
5061 : 154174 : = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
5062 : 308348 : tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
5063 : 154174 : 1 << floor_log2 (i));
5064 : 154174 : unsigned HOST_WIDE_INT const_nunits;
5065 : 154174 : if (vectype
5066 : 154174 : && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
5067 : : {
5068 : : /* Split into two groups at the first vector boundary. */
5069 : 154174 : gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
5070 : 154174 : unsigned group1_size = i & ~(const_nunits - 1);
5071 : :
5072 : 154174 : if (dump_enabled_p ())
5073 : 59 : dump_printf_loc (MSG_NOTE, vect_location,
5074 : : "Splitting SLP group at stmt %u\n", i);
5075 : 154174 : stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
5076 : : group1_size);
5077 : 154174 : bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
5078 : : kind, max_tree_size,
5079 : : limit, false);
5080 : : /* Split the rest at the failure point and possibly
5081 : : re-analyze the remaining matching part if it has
5082 : : at least two lanes. */
5083 : 154174 : if (group1_size < i
5084 : 5241 : && (i + 1 < group_size
5085 : 3086 : || i - group1_size > 1))
5086 : : {
5087 : 2185 : stmt_vec_info rest2 = rest;
5088 : 2185 : rest = vect_split_slp_store_group (rest, i - group1_size);
5089 : 2185 : if (i - group1_size > 1)
5090 : 59 : res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
5091 : : kind, max_tree_size,
5092 : : limit, false);
5093 : : }
5094 : : /* Re-analyze the non-matching tail if it has at least
5095 : : two lanes. */
5096 : 154174 : if (i + 1 < group_size)
5097 : 22122 : res |= vect_analyze_slp_instance (vinfo, bst_map,
5098 : : rest, kind, max_tree_size,
5099 : : limit, false);
5100 : 154174 : return res;
5101 : : }
5102 : : }
5103 : :
5104 : : /* For loop vectorization split the RHS into arbitrary pieces of
5105 : : size >= 1. */
5106 : 254489 : else if (is_a <loop_vec_info> (vinfo)
5107 : 254489 : && (group_size != 1 && i < group_size))
5108 : : {
5109 : 6393 : gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
5110 : 27 : bool masked_p = call
5111 : 27 : && gimple_call_internal_p (call)
5112 : 27 : && internal_fn_mask_index (gimple_call_internal_fn (call)) != -1;
5113 : : /* There are targets that cannot do even/odd interleaving schemes
5114 : : so they absolutely need to use load/store-lanes. For now
5115 : : force single-lane SLP for them - they would be happy with
5116 : : uniform power-of-two lanes (but depending on element size),
5117 : : but even if we can use 'i' as indicator we would need to
5118 : : backtrack when later lanes fail to discover with the same
5119 : : granularity. We cannot turn any of strided or scatter store
5120 : : into store-lanes. */
5121 : : /* ??? If this is not in sync with what get_load_store_type
5122 : : later decides the SLP representation is not good for other
5123 : : store vectorization methods. */
5124 : 6393 : bool want_store_lanes
5125 : 6393 : = (! STMT_VINFO_GATHER_SCATTER_P (stmt_info)
5126 : 6393 : && ! STMT_VINFO_STRIDED_P (stmt_info)
5127 : 4899 : && ! STMT_VINFO_SLP_VECT_ONLY (stmt_info)
5128 : 4896 : && compare_step_with_zero (vinfo, stmt_info) > 0
5129 : 11266 : && vect_slp_prefer_store_lanes_p (vinfo, stmt_info, NULL_TREE,
5130 : 12786 : masked_p, group_size, i));
5131 : 6393 : if (want_store_lanes || force_single_lane)
5132 : : i = 1;
5133 : :
5134 : : /* A fatal discovery fail doesn't always mean single-lane SLP
5135 : : isn't a possibility, so try. */
5136 : 4937 : if (i == 0)
5137 : : i = 1;
5138 : :
5139 : 6393 : if (dump_enabled_p ())
5140 : 848 : dump_printf_loc (MSG_NOTE, vect_location,
5141 : : "Splitting SLP group at stmt %u\n", i);
5142 : :
5143 : : /* Analyze the stored values and pinch them together with
5144 : : a permute node so we can preserve the whole store group. */
5145 : 6393 : auto_vec<slp_tree> rhs_nodes;
5146 : 6393 : poly_uint64 max_nunits = 1;
5147 : :
5148 : 6393 : unsigned int rhs_common_nlanes = 0;
5149 : 6393 : unsigned int start = 0, end = i;
5150 : 29021 : while (start < group_size)
5151 : : {
5152 : 22877 : gcc_assert (end - start >= 1);
5153 : 22877 : vec<stmt_vec_info> substmts;
5154 : 22877 : substmts.create (end - start);
5155 : 69287 : for (unsigned j = start; j < end; ++j)
5156 : 46410 : substmts.quick_push (scalar_stmts[j]);
5157 : 22877 : max_nunits = 1;
5158 : 22877 : node = vect_build_slp_tree (vinfo, substmts, end - start,
5159 : : &max_nunits,
5160 : : matches, limit, &tree_size, bst_map);
5161 : 22877 : if (node)
5162 : : {
5163 : 18165 : rhs_nodes.safe_push (node);
5164 : 18165 : vect_update_max_nunits (&max_nunits, node->max_nunits);
5165 : 18165 : if (start == 0)
5166 : 6148 : rhs_common_nlanes = SLP_TREE_LANES (node);
5167 : 12017 : else if (rhs_common_nlanes != SLP_TREE_LANES (node))
5168 : 1271 : rhs_common_nlanes = 0;
5169 : 18165 : start = end;
5170 : 18165 : if (want_store_lanes || force_single_lane)
5171 : 4463 : end = start + 1;
5172 : : else
5173 : : end = group_size;
5174 : : }
5175 : : else
5176 : : {
5177 : 4712 : substmts.release ();
5178 : 4712 : if (end - start == 1)
5179 : : {
5180 : : /* Single-lane discovery failed. Free ressources. */
5181 : 263 : for (auto node : rhs_nodes)
5182 : 6 : vect_free_slp_tree (node);
5183 : 249 : scalar_stmts.release ();
5184 : 249 : if (dump_enabled_p ())
5185 : 41 : dump_printf_loc (MSG_NOTE, vect_location,
5186 : : "SLP discovery failed\n");
5187 : 249 : return false;
5188 : : }
5189 : :
5190 : : /* ??? It really happens that we soft-fail SLP
5191 : : build at a mismatch but the matching part hard-fails
5192 : : later. As we know we arrived here with a group
5193 : : larger than one try a group of size one! */
5194 : 4463 : if (!matches[0])
5195 : 42 : end = start + 1;
5196 : : else
5197 : 9934 : for (unsigned j = start; j < end; j++)
5198 : 9934 : if (!matches[j - start])
5199 : : {
5200 : : end = j;
5201 : : break;
5202 : : }
5203 : : }
5204 : : }
5205 : :
5206 : : /* Now re-assess whether we want store lanes in case the
5207 : : discovery ended up producing all single-lane RHSs. */
5208 : 6144 : if (! want_store_lanes
5209 : 6144 : && rhs_common_nlanes == 1
5210 : 5278 : && ! STMT_VINFO_GATHER_SCATTER_P (stmt_info)
5211 : 5278 : && ! STMT_VINFO_STRIDED_P (stmt_info)
5212 : 4050 : && ! STMT_VINFO_SLP_VECT_ONLY (stmt_info)
5213 : 4047 : && compare_step_with_zero (vinfo, stmt_info) > 0
5214 : 10180 : && (vect_store_lanes_supported (SLP_TREE_VECTYPE (rhs_nodes[0]),
5215 : : group_size, masked_p)
5216 : : != IFN_LAST))
5217 : : want_store_lanes = true;
5218 : :
5219 : : /* Now we assume we can build the root SLP node from all stores. */
5220 : 6144 : if (want_store_lanes)
5221 : : {
5222 : : /* For store-lanes feed the store node with all RHS nodes
5223 : : in order. */
5224 : 0 : node = vect_create_new_slp_node (scalar_stmts,
5225 : 0 : SLP_TREE_CHILDREN
5226 : : (rhs_nodes[0]).length ());
5227 : 0 : SLP_TREE_VECTYPE (node) = SLP_TREE_VECTYPE (rhs_nodes[0]);
5228 : 0 : node->max_nunits = max_nunits;
5229 : 0 : node->ldst_lanes = true;
5230 : 0 : SLP_TREE_CHILDREN (node)
5231 : 0 : .reserve_exact (SLP_TREE_CHILDREN (rhs_nodes[0]).length ()
5232 : 0 : + rhs_nodes.length () - 1);
5233 : : /* First store value and possibly mask. */
5234 : 0 : SLP_TREE_CHILDREN (node)
5235 : 0 : .splice (SLP_TREE_CHILDREN (rhs_nodes[0]));
5236 : : /* Rest of the store values. All mask nodes are the same,
5237 : : this should be guaranteed by dataref group discovery. */
5238 : 0 : for (unsigned j = 1; j < rhs_nodes.length (); ++j)
5239 : 0 : SLP_TREE_CHILDREN (node)
5240 : 0 : .quick_push (SLP_TREE_CHILDREN (rhs_nodes[j])[0]);
5241 : 0 : for (slp_tree child : SLP_TREE_CHILDREN (node))
5242 : 0 : child->refcnt++;
5243 : : }
5244 : : else
5245 : 6144 : node = vect_build_slp_store_interleaving (rhs_nodes, scalar_stmts,
5246 : : max_nunits);
5247 : :
5248 : 24303 : while (!rhs_nodes.is_empty ())
5249 : 18159 : vect_free_slp_tree (rhs_nodes.pop ());
5250 : :
5251 : : /* Create a new SLP instance. */
5252 : 6144 : slp_instance new_instance = XNEW (class _slp_instance);
5253 : 6144 : SLP_INSTANCE_TREE (new_instance) = node;
5254 : 6144 : SLP_INSTANCE_LOADS (new_instance) = vNULL;
5255 : 6144 : SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
5256 : 6144 : SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
5257 : 6144 : SLP_INSTANCE_KIND (new_instance) = kind;
5258 : 6144 : new_instance->reduc_phis = NULL;
5259 : 6144 : new_instance->cost_vec = vNULL;
5260 : 6144 : new_instance->subgraph_entries = vNULL;
5261 : :
5262 : 6144 : if (dump_enabled_p ())
5263 : 807 : dump_printf_loc (MSG_NOTE, vect_location,
5264 : : "SLP size %u vs. limit %u.\n",
5265 : : tree_size, max_tree_size);
5266 : :
5267 : 6144 : vinfo->slp_instances.safe_push (new_instance);
5268 : :
5269 : : /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
5270 : : the number of scalar stmts in the root in a few places.
5271 : : Verify that assumption holds. */
5272 : 12288 : gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
5273 : : .length () == group_size);
5274 : :
5275 : 6144 : if (dump_enabled_p ())
5276 : : {
5277 : 807 : dump_printf_loc (MSG_NOTE, vect_location,
5278 : : "Final SLP tree for instance %p:\n",
5279 : : (void *) new_instance);
5280 : 807 : vect_print_slp_graph (MSG_NOTE, vect_location,
5281 : : SLP_INSTANCE_TREE (new_instance));
5282 : : }
5283 : 6144 : return true;
5284 : 6393 : }
5285 : : else
5286 : : /* Free the allocated memory. */
5287 : 248096 : scalar_stmts.release ();
5288 : :
5289 : : /* Even though the first vector did not all match, we might be able to SLP
5290 : : (some) of the remainder. FORNOW ignore this possibility. */
5291 : : }
5292 : : else
5293 : : /* Free the allocated memory. */
5294 : 1 : scalar_stmts.release ();
5295 : :
5296 : : /* Failed to SLP. */
5297 : 248097 : if (dump_enabled_p ())
5298 : 42 : dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
5299 : : return false;
5300 : : }
5301 : :
5302 : : /* qsort comparator ordering SLP load nodes. */
5303 : :
5304 : : static int
5305 : 2198950 : vllp_cmp (const void *a_, const void *b_)
5306 : : {
5307 : 2198950 : const slp_tree a = *(const slp_tree *)a_;
5308 : 2198950 : const slp_tree b = *(const slp_tree *)b_;
5309 : 2198950 : stmt_vec_info a0 = SLP_TREE_SCALAR_STMTS (a)[0];
5310 : 2198950 : stmt_vec_info b0 = SLP_TREE_SCALAR_STMTS (b)[0];
5311 : 2198950 : if (STMT_VINFO_GROUPED_ACCESS (a0)
5312 : 1364644 : && STMT_VINFO_GROUPED_ACCESS (b0)
5313 : 3509474 : && DR_GROUP_FIRST_ELEMENT (a0) == DR_GROUP_FIRST_ELEMENT (b0))
5314 : : {
5315 : : /* Same group, order after lanes used. */
5316 : 333302 : if (SLP_TREE_LANES (a) < SLP_TREE_LANES (b))
5317 : : return 1;
5318 : 327332 : else if (SLP_TREE_LANES (a) > SLP_TREE_LANES (b))
5319 : : return -1;
5320 : : else
5321 : : {
5322 : : /* Try to order loads using the same lanes together, breaking
5323 : : the tie with the lane number that first differs. */
5324 : 320717 : if (!SLP_TREE_LOAD_PERMUTATION (a).exists ()
5325 : 320717 : && !SLP_TREE_LOAD_PERMUTATION (b).exists ())
5326 : : return 0;
5327 : 320717 : else if (SLP_TREE_LOAD_PERMUTATION (a).exists ()
5328 : 320717 : && !SLP_TREE_LOAD_PERMUTATION (b).exists ())
5329 : : return 1;
5330 : 318121 : else if (!SLP_TREE_LOAD_PERMUTATION (a).exists ()
5331 : 318121 : && SLP_TREE_LOAD_PERMUTATION (b).exists ())
5332 : : return -1;
5333 : : else
5334 : : {
5335 : 313339 : for (unsigned i = 0; i < SLP_TREE_LANES (a); ++i)
5336 : 313339 : if (SLP_TREE_LOAD_PERMUTATION (a)[i]
5337 : 313339 : != SLP_TREE_LOAD_PERMUTATION (b)[i])
5338 : : {
5339 : : /* In-order lane first, that's what the above case for
5340 : : no permutation does. */
5341 : 312507 : if (SLP_TREE_LOAD_PERMUTATION (a)[i] == i)
5342 : : return -1;
5343 : 185953 : else if (SLP_TREE_LOAD_PERMUTATION (b)[i] == i)
5344 : : return 1;
5345 : 86956 : else if (SLP_TREE_LOAD_PERMUTATION (a)[i]
5346 : 86956 : < SLP_TREE_LOAD_PERMUTATION (b)[i])
5347 : : return -1;
5348 : : else
5349 : : return 1;
5350 : : }
5351 : : return 0;
5352 : : }
5353 : : }
5354 : : }
5355 : : else /* Different groups or non-groups. */
5356 : : {
5357 : : /* Order groups as their first element to keep them together. */
5358 : 1865648 : if (STMT_VINFO_GROUPED_ACCESS (a0))
5359 : 1865648 : a0 = DR_GROUP_FIRST_ELEMENT (a0);
5360 : 1865648 : if (STMT_VINFO_GROUPED_ACCESS (b0))
5361 : 1865648 : b0 = DR_GROUP_FIRST_ELEMENT (b0);
5362 : 1865648 : if (a0 == b0)
5363 : : return 0;
5364 : : /* Tie using UID. */
5365 : 1865528 : else if (gimple_uid (STMT_VINFO_STMT (a0))
5366 : 1865528 : < gimple_uid (STMT_VINFO_STMT (b0)))
5367 : : return -1;
5368 : : else
5369 : : {
5370 : 833004 : gcc_assert (gimple_uid (STMT_VINFO_STMT (a0))
5371 : : != gimple_uid (STMT_VINFO_STMT (b0)));
5372 : : return 1;
5373 : : }
5374 : : }
5375 : : }
5376 : :
5377 : : /* Return whether if the load permutation of NODE is consecutive starting
5378 : : from index START_IDX. */
5379 : :
5380 : : bool
5381 : 531236 : vect_load_perm_consecutive_p (slp_tree node, unsigned start_idx)
5382 : : {
5383 : 531236 : load_permutation_t perm = SLP_TREE_LOAD_PERMUTATION (node);
5384 : :
5385 : 531236 : if (!perm.exists () || perm.length () < start_idx)
5386 : : return false;
5387 : :
5388 : 531236 : unsigned int start = perm[start_idx];
5389 : 535455 : for (unsigned int i = start_idx + 1; i < perm.length (); i++)
5390 : 10822 : if (perm[i] != start + (unsigned int)i)
5391 : : return false;
5392 : :
5393 : : return true;
5394 : : }
5395 : :
5396 : : /* Process the set of LOADS that are all from the same dataref group. */
5397 : :
5398 : : static void
5399 : 138564 : vect_lower_load_permutations (loop_vec_info loop_vinfo,
5400 : : scalar_stmts_to_slp_tree_map_t *bst_map,
5401 : : const array_slice<slp_tree> &loads,
5402 : : bool force_single_lane)
5403 : : {
5404 : : /* We at this point want to lower without a fixed VF or vector
5405 : : size in mind which means we cannot actually compute whether we
5406 : : need three or more vectors for a load permutation yet. So always
5407 : : lower. */
5408 : 138564 : stmt_vec_info first
5409 : 138564 : = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (loads[0])[0]);
5410 : 138564 : unsigned group_lanes = DR_GROUP_SIZE (first);
5411 : :
5412 : : /* Verify if all load permutations can be implemented with a suitably
5413 : : large element load-lanes operation. */
5414 : 138564 : unsigned ld_lanes_lanes = SLP_TREE_LANES (loads[0]);
5415 : 138564 : if (STMT_VINFO_STRIDED_P (first)
5416 : 136424 : || compare_step_with_zero (loop_vinfo, first) <= 0
5417 : 134277 : || exact_log2 (ld_lanes_lanes) == -1
5418 : : /* ??? For now only support the single-lane case as there is
5419 : : missing support on the store-lane side and code generation
5420 : : isn't up to the task yet. */
5421 : 132377 : || ld_lanes_lanes != 1
5422 : 263782 : || vect_load_lanes_supported (SLP_TREE_VECTYPE (loads[0]),
5423 : : group_lanes / ld_lanes_lanes,
5424 : : false) == IFN_LAST)
5425 : : ld_lanes_lanes = 0;
5426 : : else
5427 : : /* Verify the loads access the same number of lanes aligned to
5428 : : ld_lanes_lanes. */
5429 : 0 : for (slp_tree load : loads)
5430 : : {
5431 : 0 : if (SLP_TREE_LANES (load) != ld_lanes_lanes)
5432 : : {
5433 : : ld_lanes_lanes = 0;
5434 : : break;
5435 : : }
5436 : 0 : unsigned first = SLP_TREE_LOAD_PERMUTATION (load)[0];
5437 : 0 : if (first % ld_lanes_lanes != 0)
5438 : : {
5439 : : ld_lanes_lanes = 0;
5440 : : break;
5441 : : }
5442 : 0 : if (!vect_load_perm_consecutive_p (load))
5443 : : {
5444 : : ld_lanes_lanes = 0;
5445 : : break;
5446 : : }
5447 : : }
5448 : :
5449 : : /* Only a power-of-two number of lanes matches interleaving with N levels.
5450 : : ??? An even number of lanes could be reduced to 1<<ceil_log2(N)-1 lanes
5451 : : at each step. */
5452 : 223184 : if (ld_lanes_lanes == 0 && exact_log2 (group_lanes) == -1 && group_lanes != 3)
5453 : : return;
5454 : :
5455 : 246741 : for (slp_tree load : loads)
5456 : : {
5457 : : /* Leave masked or gather loads alone for now. */
5458 : 167353 : if (!SLP_TREE_CHILDREN (load).is_empty ())
5459 : 47515 : continue;
5460 : :
5461 : : /* For single-element interleaving spanning multiple vectors avoid
5462 : : lowering, we want to use VMAT_ELEMENTWISE later. */
5463 : 167347 : if (ld_lanes_lanes == 0
5464 : 167347 : && SLP_TREE_LANES (load) == 1
5465 : 154384 : && !DR_GROUP_NEXT_ELEMENT (first)
5466 : 224736 : && maybe_gt (group_lanes,
5467 : : TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (load))))
5468 : 28953 : return;
5469 : :
5470 : : /* We want to pattern-match special cases here and keep those
5471 : : alone. Candidates are splats and load-lane. */
5472 : :
5473 : : /* We need to lower only loads of less than half of the groups
5474 : : lanes, including duplicate lanes. Note this leaves nodes
5475 : : with a non-1:1 load permutation around instead of canonicalizing
5476 : : those into a load and a permute node. Removing this early
5477 : : check would do such canonicalization. */
5478 : 138394 : if (SLP_TREE_LANES (load) >= (group_lanes + 1) / 2
5479 : 44074 : && ld_lanes_lanes == 0)
5480 : 44074 : continue;
5481 : :
5482 : : /* Build the permute to get the original load permutation order. */
5483 : 94320 : bool contiguous = vect_load_perm_consecutive_p (load);
5484 : 94320 : lane_permutation_t final_perm;
5485 : 94320 : final_perm.create (SLP_TREE_LANES (load));
5486 : 189226 : for (unsigned i = 0; i < SLP_TREE_LANES (load); ++i)
5487 : 189812 : final_perm.quick_push (
5488 : 94906 : std::make_pair (0, SLP_TREE_LOAD_PERMUTATION (load)[i]));
5489 : :
5490 : : /* When the load permutation accesses a contiguous unpermuted,
5491 : : power-of-two aligned and sized chunk leave the load alone.
5492 : : We can likely (re-)load it more efficiently rather than
5493 : : extracting it from the larger load.
5494 : : ??? Long-term some of the lowering should move to where
5495 : : the vector types involved are fixed. */
5496 : 97755 : if (!force_single_lane
5497 : 94320 : && ld_lanes_lanes == 0
5498 : 59068 : && contiguous
5499 : 58831 : && (SLP_TREE_LANES (load) > 1 || loads.size () == 1)
5500 : 6564 : && pow2p_hwi (SLP_TREE_LANES (load))
5501 : 6564 : && pow2p_hwi (group_lanes)
5502 : 3435 : && SLP_TREE_LOAD_PERMUTATION (load)[0] % SLP_TREE_LANES (load) == 0
5503 : 97755 : && group_lanes % SLP_TREE_LANES (load) == 0)
5504 : : {
5505 : 3435 : final_perm.release ();
5506 : 3435 : continue;
5507 : : }
5508 : :
5509 : : /* First build (and possibly re-use) a load node for the
5510 : : unpermuted group. Gaps in the middle and on the end are
5511 : : represented with NULL stmts. */
5512 : 90885 : vec<stmt_vec_info> stmts;
5513 : 90885 : stmts.create (group_lanes);
5514 : 306262 : for (stmt_vec_info s = first; s; s = DR_GROUP_NEXT_ELEMENT (s))
5515 : : {
5516 : 215377 : if (s != first)
5517 : 128974 : for (unsigned i = 1; i < DR_GROUP_GAP (s); ++i)
5518 : 4482 : stmts.quick_push (NULL);
5519 : 215377 : stmts.quick_push (s);
5520 : : }
5521 : 174364 : for (unsigned i = 0; i < DR_GROUP_GAP (first); ++i)
5522 : 83479 : stmts.quick_push (NULL);
5523 : 90885 : poly_uint64 max_nunits = 1;
5524 : 90885 : bool *matches = XALLOCAVEC (bool, group_lanes);
5525 : 90885 : unsigned limit = 1;
5526 : 90885 : unsigned tree_size = 0;
5527 : 90885 : slp_tree l0 = vect_build_slp_tree (loop_vinfo, stmts,
5528 : : group_lanes,
5529 : : &max_nunits, matches, &limit,
5530 : 90885 : &tree_size, bst_map);
5531 : 90885 : gcc_assert (!SLP_TREE_LOAD_PERMUTATION (l0).exists ());
5532 : :
5533 : 90885 : if (ld_lanes_lanes != 0)
5534 : : {
5535 : : /* ??? If this is not in sync with what get_load_store_type
5536 : : later decides the SLP representation is not good for other
5537 : : store vectorization methods. */
5538 : 0 : l0->ldst_lanes = true;
5539 : 0 : load->ldst_lanes = true;
5540 : : }
5541 : :
5542 : 279773 : while (1)
5543 : : {
5544 : 185329 : unsigned group_lanes = SLP_TREE_LANES (l0);
5545 : 185329 : if (ld_lanes_lanes != 0
5546 : 185329 : || SLP_TREE_LANES (load) >= (group_lanes + 1) / 2)
5547 : : break;
5548 : :
5549 : : /* Try to lower by reducing the group to half its size using an
5550 : : interleaving scheme. For this try to compute whether all
5551 : : elements needed for this load are in even or odd elements of
5552 : : an even/odd decomposition with N consecutive elements.
5553 : : Thus { e, e, o, o, e, e, o, o } woud be an even/odd decomposition
5554 : : with N == 2. */
5555 : : /* ??? Only an even number of lanes can be handed this way, but the
5556 : : fallback below could work for any number. We have to make sure
5557 : : to round up in that case. */
5558 : 94444 : gcc_assert ((group_lanes & 1) == 0 || group_lanes == 3);
5559 : 10046 : unsigned even = 0, odd = 0;
5560 : 10046 : if ((group_lanes & 1) == 0)
5561 : : {
5562 : 10046 : even = (1 << ceil_log2 (group_lanes)) - 1;
5563 : 10046 : odd = even;
5564 : 40711 : for (auto l : final_perm)
5565 : : {
5566 : 10573 : even &= ~l.second;
5567 : 10573 : odd &= l.second;
5568 : : }
5569 : : }
5570 : :
5571 : : /* Now build an even or odd extraction from the unpermuted load. */
5572 : 94444 : lane_permutation_t perm;
5573 : 94444 : perm.create ((group_lanes + 1) / 2);
5574 : 94444 : unsigned even_level = even ? 1 << ctz_hwi (even) : 0;
5575 : 94444 : unsigned odd_level = odd ? 1 << ctz_hwi (odd) : 0;
5576 : 94444 : if (even_level
5577 : 9332 : && group_lanes % (2 * even_level) == 0
5578 : : /* ??? When code generating permutes we do not try to pun
5579 : : to larger component modes so level != 1 isn't a natural
5580 : : even/odd extract. Prefer one if possible. */
5581 : 9332 : && (even_level == 1 || !odd_level || odd_level != 1))
5582 : : {
5583 : : /* { 0, 1, ... 4, 5 ..., } */
5584 : 34865 : for (unsigned i = 0; i < group_lanes / 2 / even_level; ++i)
5585 : 55086 : for (unsigned j = 0; j < even_level; ++j)
5586 : 27632 : perm.quick_push (std::make_pair (0, 2 * i * even_level + j));
5587 : : }
5588 : 85112 : else if (odd_level)
5589 : : {
5590 : : /* { ..., 2, 3, ... 6, 7 } */
5591 : 2623 : gcc_assert (group_lanes % (2 * odd_level) == 0);
5592 : 11591 : for (unsigned i = 0; i < group_lanes / 2 / odd_level; ++i)
5593 : 17936 : for (unsigned j = 0; j < odd_level; ++j)
5594 : 8968 : perm.quick_push
5595 : 8968 : (std::make_pair (0, (2 * i + 1) * odd_level + j));
5596 : : }
5597 : : else
5598 : : {
5599 : : /* As fallback extract all used lanes and fill to half the
5600 : : group size by repeating the last element.
5601 : : ??? This is quite a bad strathegy for re-use - we could
5602 : : brute force our way to find more optimal filling lanes to
5603 : : maximize re-use when looking at all loads from the group. */
5604 : 84410 : auto_bitmap l;
5605 : 337660 : for (auto p : final_perm)
5606 : 84430 : bitmap_set_bit (l, p.second);
5607 : 84410 : unsigned i = 0;
5608 : 84410 : bitmap_iterator bi;
5609 : 168840 : EXECUTE_IF_SET_IN_BITMAP (l, 0, i, bi)
5610 : 84430 : perm.quick_push (std::make_pair (0, i));
5611 : 337648 : while (perm.length () < (group_lanes + 1) / 2)
5612 : 84414 : perm.quick_push (perm.last ());
5613 : 84410 : }
5614 : :
5615 : : /* Update final_perm with the intermediate permute. */
5616 : 189415 : for (unsigned i = 0; i < final_perm.length (); ++i)
5617 : : {
5618 : 94971 : unsigned l = final_perm[i].second;
5619 : 94971 : unsigned j;
5620 : 101897 : for (j = 0; j < perm.length (); ++j)
5621 : 101897 : if (perm[j].second == l)
5622 : : {
5623 : 94971 : final_perm[i].second = j;
5624 : 94971 : break;
5625 : : }
5626 : 94971 : gcc_assert (j < perm.length ());
5627 : : }
5628 : :
5629 : : /* And create scalar stmts. */
5630 : 94444 : vec<stmt_vec_info> perm_stmts;
5631 : 94444 : perm_stmts.create (perm.length ());
5632 : 299888 : for (unsigned i = 0; i < perm.length (); ++i)
5633 : 205444 : perm_stmts.quick_push (SLP_TREE_SCALAR_STMTS (l0)[perm[i].second]);
5634 : :
5635 : 94444 : slp_tree p = vect_create_new_slp_node (1, VEC_PERM_EXPR);
5636 : 94444 : SLP_TREE_CHILDREN (p).quick_push (l0);
5637 : 94444 : SLP_TREE_LANE_PERMUTATION (p) = perm;
5638 : 94444 : SLP_TREE_VECTYPE (p) = SLP_TREE_VECTYPE (load);
5639 : 94444 : SLP_TREE_LANES (p) = perm.length ();
5640 : 94444 : SLP_TREE_REPRESENTATIVE (p) = SLP_TREE_REPRESENTATIVE (load);
5641 : : /* ??? As we have scalar stmts for this intermediate permute we
5642 : : could CSE it via bst_map but we do not want to pick up
5643 : : another SLP node with a load permutation. We instead should
5644 : : have a "local" CSE map here. */
5645 : 94444 : SLP_TREE_SCALAR_STMTS (p) = perm_stmts;
5646 : :
5647 : : /* We now have a node for (group_lanes + 1) / 2 lanes. */
5648 : 94444 : l0 = p;
5649 : 94444 : }
5650 : :
5651 : : /* And finally from the ordered reduction node create the
5652 : : permute to shuffle the lanes into the original load-permutation
5653 : : order. We replace the original load node with this. */
5654 : 90885 : SLP_TREE_CODE (load) = VEC_PERM_EXPR;
5655 : 90885 : SLP_TREE_LOAD_PERMUTATION (load).release ();
5656 : 90885 : SLP_TREE_LANE_PERMUTATION (load) = final_perm;
5657 : 90885 : SLP_TREE_CHILDREN (load).create (1);
5658 : 90885 : SLP_TREE_CHILDREN (load).quick_push (l0);
5659 : : }
5660 : : }
5661 : :
5662 : : /* Transform SLP loads in the SLP graph created by SLP discovery to
5663 : : group loads from the same group and lower load permutations that
5664 : : are unlikely to be supported into a series of permutes.
5665 : : In the degenerate case of having only single-lane SLP instances
5666 : : this should result in a series of permute nodes emulating an
5667 : : interleaving scheme. */
5668 : :
5669 : : static void
5670 : 379705 : vect_lower_load_permutations (loop_vec_info loop_vinfo,
5671 : : scalar_stmts_to_slp_tree_map_t *bst_map,
5672 : : bool force_single_lane)
5673 : : {
5674 : : /* Gather and sort loads across all instances. */
5675 : 379705 : hash_set<slp_tree> visited;
5676 : 379705 : auto_vec<slp_tree> loads;
5677 : 2092711 : for (auto inst : loop_vinfo->slp_instances)
5678 : 955676 : vect_gather_slp_loads (loads, SLP_INSTANCE_TREE (inst), visited);
5679 : 379705 : if (loads.is_empty ())
5680 : 70023 : return;
5681 : 309682 : loads.qsort (vllp_cmp);
5682 : :
5683 : : /* Now process each dataref group separately. */
5684 : 309682 : unsigned firsti = 0;
5685 : 588200 : for (unsigned i = 1; i < loads.length (); ++i)
5686 : : {
5687 : 278518 : slp_tree first = loads[firsti];
5688 : 278518 : slp_tree next = loads[i];
5689 : 278518 : stmt_vec_info a0 = SLP_TREE_SCALAR_STMTS (first)[0];
5690 : 278518 : stmt_vec_info b0 = SLP_TREE_SCALAR_STMTS (next)[0];
5691 : 278518 : if (STMT_VINFO_GROUPED_ACCESS (a0)
5692 : 143174 : && STMT_VINFO_GROUPED_ACCESS (b0)
5693 : 409710 : && DR_GROUP_FIRST_ELEMENT (a0) == DR_GROUP_FIRST_ELEMENT (b0))
5694 : 63834 : continue;
5695 : : /* Now we have one or multiple SLP loads of the same group from
5696 : : firsti to i - 1. */
5697 : 214684 : if (STMT_VINFO_GROUPED_ACCESS (a0))
5698 : 79340 : vect_lower_load_permutations (loop_vinfo, bst_map,
5699 : 79340 : make_array_slice (&loads[firsti],
5700 : : i - firsti),
5701 : : force_single_lane);
5702 : : firsti = i;
5703 : : }
5704 : 619364 : if (firsti < loads.length ()
5705 : 619364 : && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (loads[firsti])[0]))
5706 : 59224 : vect_lower_load_permutations (loop_vinfo, bst_map,
5707 : 59224 : make_array_slice (&loads[firsti],
5708 : 59224 : loads.length () - firsti),
5709 : : force_single_lane);
5710 : 379705 : }
5711 : :
5712 : : /* Check if there are stmts in the loop can be vectorized using SLP. Build SLP
5713 : : trees of packed scalar stmts if SLP is possible. */
5714 : :
5715 : : opt_result
5716 : 1045037 : vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size,
5717 : : bool force_single_lane)
5718 : : {
5719 : 1045037 : loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5720 : 1045037 : unsigned int i;
5721 : 1045037 : stmt_vec_info first_element;
5722 : 1045037 : slp_instance instance;
5723 : :
5724 : 1045037 : DUMP_VECT_SCOPE ("vect_analyze_slp");
5725 : :
5726 : 1045037 : unsigned limit = max_tree_size;
5727 : :
5728 : 1045037 : scalar_stmts_to_slp_tree_map_t *bst_map
5729 : 1045037 : = new scalar_stmts_to_slp_tree_map_t ();
5730 : :
5731 : : /* Find SLP sequences starting from groups of grouped stores. */
5732 : 3000772 : FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
5733 : 910955 : if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
5734 : : slp_inst_kind_store, max_tree_size, &limit,
5735 : : force_single_lane)
5736 : 910955 : && loop_vinfo)
5737 : 257 : return opt_result::failure_at (vect_location, "SLP build failed.\n");
5738 : :
5739 : : /* For loops also start SLP discovery from non-grouped stores. */
5740 : 1044780 : if (loop_vinfo)
5741 : : {
5742 : : data_reference_p dr;
5743 : 1330341 : FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr)
5744 : 928093 : if (DR_IS_WRITE (dr))
5745 : : {
5746 : 286517 : stmt_vec_info stmt_info = vinfo->lookup_dr (dr)->stmt;
5747 : : /* Grouped stores are already handled above. */
5748 : 286517 : if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
5749 : 72873 : continue;
5750 : 213644 : vec<stmt_vec_info> stmts;
5751 : 213644 : vec<stmt_vec_info> roots = vNULL;
5752 : 213644 : vec<tree> remain = vNULL;
5753 : 213644 : stmts.create (1);
5754 : 213644 : stmts.quick_push (stmt_info);
5755 : 213644 : if (! vect_build_slp_instance (vinfo, slp_inst_kind_store,
5756 : : stmts, roots, remain, max_tree_size,
5757 : : &limit, bst_map, force_single_lane))
5758 : 3934 : return opt_result::failure_at (vect_location,
5759 : : "SLP build failed.\n");
5760 : : }
5761 : :
5762 : : stmt_vec_info stmt_info;
5763 : 402288 : FOR_EACH_VEC_ELT (LOOP_VINFO_ALTERNATE_DEFS (loop_vinfo), i, stmt_info)
5764 : : {
5765 : 20 : vec<stmt_vec_info> stmts;
5766 : 20 : vec<stmt_vec_info> roots = vNULL;
5767 : 20 : vec<tree> remain = vNULL;
5768 : 20 : stmts.create (1);
5769 : 20 : stmts.quick_push (stmt_info);
5770 : 20 : if (! vect_build_slp_instance (vinfo, slp_inst_kind_store,
5771 : : stmts, roots, remain, max_tree_size,
5772 : : &limit, bst_map, force_single_lane))
5773 : 0 : return opt_result::failure_at (vect_location,
5774 : : "SLP build failed.\n");
5775 : : }
5776 : : }
5777 : :
5778 : 1040846 : if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
5779 : : {
5780 : 1858800 : for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
5781 : : {
5782 : 1220202 : vect_location = bb_vinfo->roots[i].roots[0]->stmt;
5783 : : /* Apply patterns. */
5784 : 3812534 : for (unsigned j = 0; j < bb_vinfo->roots[i].stmts.length (); ++j)
5785 : 5184664 : bb_vinfo->roots[i].stmts[j]
5786 : 2666081 : = vect_stmt_to_vectorize (bb_vinfo->roots[i].stmts[j]);
5787 : 1220202 : if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
5788 : 1220202 : bb_vinfo->roots[i].stmts,
5789 : 1220202 : bb_vinfo->roots[i].roots,
5790 : 1220202 : bb_vinfo->roots[i].remain,
5791 : : max_tree_size, &limit, bst_map, false))
5792 : : {
5793 : 131789 : bb_vinfo->roots[i].roots = vNULL;
5794 : 131789 : bb_vinfo->roots[i].remain = vNULL;
5795 : : }
5796 : 1220202 : bb_vinfo->roots[i].stmts = vNULL;
5797 : : }
5798 : : }
5799 : :
5800 : 1040846 : if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
5801 : : {
5802 : : /* Find SLP sequences starting from groups of reductions. */
5803 : 402248 : if (!vect_analyze_slp_reductions (loop_vinfo, max_tree_size, &limit,
5804 : : bst_map, force_single_lane))
5805 : 2211 : return opt_result::failure_at (vect_location, "SLP build failed.\n");
5806 : :
5807 : : /* Make sure to vectorize only-live stmts, usually inductions. */
5808 : 1849395 : for (edge e : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
5809 : 1223827 : for (auto gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi);
5810 : 567368 : gsi_next (&gsi))
5811 : : {
5812 : 574543 : gphi *lc_phi = *gsi;
5813 : 574543 : tree def = gimple_phi_arg_def_from_edge (lc_phi, e);
5814 : 574543 : stmt_vec_info stmt_info;
5815 : 574543 : if (TREE_CODE (def) == SSA_NAME
5816 : 463258 : && !virtual_operand_p (def)
5817 : 229118 : && (stmt_info = loop_vinfo->lookup_def (def))
5818 : 198225 : && ((stmt_info = vect_stmt_to_vectorize (stmt_info)), true)
5819 : 198225 : && STMT_VINFO_RELEVANT (stmt_info) == vect_used_only_live
5820 : 152984 : && STMT_VINFO_LIVE_P (stmt_info)
5821 : 152984 : && !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info))
5822 : 657770 : && STMT_VINFO_REDUC_IDX (stmt_info) == -1)
5823 : : {
5824 : 83148 : vec<stmt_vec_info> stmts;
5825 : 83148 : vec<stmt_vec_info> roots = vNULL;
5826 : 83148 : vec<tree> remain = vNULL;
5827 : 83148 : stmts.create (1);
5828 : 83148 : stmts.quick_push (vect_stmt_to_vectorize (stmt_info));
5829 : 83148 : if (! vect_build_slp_instance (vinfo,
5830 : : slp_inst_kind_reduc_group,
5831 : : stmts, roots, remain,
5832 : : max_tree_size, &limit,
5833 : : bst_map, force_single_lane))
5834 : 7175 : return opt_result::failure_at (vect_location,
5835 : : "SLP build failed.\n");
5836 : : }
5837 : 7175 : }
5838 : :
5839 : : /* Find SLP sequences starting from gconds. */
5840 : 1028033 : for (auto cond : LOOP_VINFO_LOOP_CONDS (loop_vinfo))
5841 : : {
5842 : 251816 : auto cond_info = loop_vinfo->lookup_stmt (cond);
5843 : :
5844 : 251816 : cond_info = vect_stmt_to_vectorize (cond_info);
5845 : 251816 : vec<stmt_vec_info> roots = vNULL;
5846 : 251816 : roots.safe_push (cond_info);
5847 : 251816 : gimple *stmt = STMT_VINFO_STMT (cond_info);
5848 : 251816 : tree args0 = gimple_cond_lhs (stmt);
5849 : 251816 : tree args1 = gimple_cond_rhs (stmt);
5850 : :
5851 : : /* These should be enforced by cond lowering, but if it failed
5852 : : bail. */
5853 : 251816 : if (gimple_cond_code (stmt) != NE_EXPR
5854 : 250534 : || TREE_TYPE (args0) != boolean_type_node
5855 : 501263 : || !integer_zerop (args1))
5856 : : {
5857 : 2369 : roots.release ();
5858 : 2369 : return opt_result::failure_at (vect_location,
5859 : : "SLP build failed.\n");
5860 : : }
5861 : :
5862 : : /* An argument without a loop def will be codegened from vectorizing the
5863 : : root gcond itself. As such we don't need to try to build an SLP tree
5864 : : from them. It's highly likely that the resulting SLP tree here if both
5865 : : arguments have a def will be incompatible, but we rely on it being split
5866 : : later on. */
5867 : 249447 : auto varg = loop_vinfo->lookup_def (args0);
5868 : 249447 : vec<stmt_vec_info> stmts;
5869 : 249447 : vec<tree> remain = vNULL;
5870 : 249447 : stmts.create (1);
5871 : 249447 : stmts.quick_push (vect_stmt_to_vectorize (varg));
5872 : :
5873 : 249447 : if (! vect_build_slp_instance (vinfo, slp_inst_kind_gcond,
5874 : : stmts, roots, remain,
5875 : : max_tree_size, &limit,
5876 : : bst_map, force_single_lane))
5877 : : {
5878 : 2362 : roots.release ();
5879 : 2362 : return opt_result::failure_at (vect_location,
5880 : : "SLP build failed.\n");
5881 : : }
5882 : : }
5883 : :
5884 : : /* Find and create slp instances for inductions that have been forced
5885 : : live due to early break. */
5886 : 388131 : edge latch_e = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
5887 : 1103647 : for (auto stmt_info : LOOP_VINFO_EARLY_BREAKS_LIVE_IVS (loop_vinfo))
5888 : : {
5889 : 345318 : vec<stmt_vec_info> stmts;
5890 : 345318 : vec<stmt_vec_info> roots = vNULL;
5891 : 345318 : vec<tree> remain = vNULL;
5892 : 345318 : gphi *phi = as_a<gphi *> (STMT_VINFO_STMT (stmt_info));
5893 : 345318 : tree def = gimple_phi_arg_def_from_edge (phi, latch_e);
5894 : 345318 : stmt_vec_info lc_info = loop_vinfo->lookup_def (def);
5895 : 345318 : if (lc_info)
5896 : : {
5897 : 345318 : stmts.create (1);
5898 : 345348 : stmts.quick_push (vect_stmt_to_vectorize (lc_info));
5899 : 345318 : if (! vect_build_slp_instance (vinfo, slp_inst_kind_reduc_group,
5900 : : stmts, roots, remain,
5901 : : max_tree_size, &limit,
5902 : : bst_map, force_single_lane))
5903 : 8426 : return opt_result::failure_at (vect_location,
5904 : : "SLP build failed.\n");
5905 : : }
5906 : : /* When the latch def is from a different cycle this can only
5907 : : be a induction. Build a simple instance for this.
5908 : : ??? We should be able to start discovery from the PHI
5909 : : for all inductions, but then there will be stray
5910 : : non-SLP stmts we choke on as needing non-SLP handling. */
5911 : 336892 : auto_vec<stmt_vec_info, 1> tem;
5912 : 336892 : tem.quick_push (stmt_info);
5913 : 336892 : if (!bst_map->get (tem))
5914 : : {
5915 : 10268 : stmts.create (1);
5916 : 10268 : stmts.quick_push (stmt_info);
5917 : 10268 : if (! vect_build_slp_instance (vinfo, slp_inst_kind_reduc_group,
5918 : : stmts, roots, remain,
5919 : : max_tree_size, &limit,
5920 : : bst_map, force_single_lane))
5921 : 0 : return opt_result::failure_at (vect_location,
5922 : : "SLP build failed.\n");
5923 : : }
5924 : 336892 : }
5925 : : }
5926 : :
5927 : 1018303 : hash_set<slp_tree> visited_patterns;
5928 : 1018303 : slp_tree_to_load_perm_map_t perm_cache;
5929 : 1018303 : slp_compat_nodes_map_t compat_cache;
5930 : :
5931 : : /* See if any patterns can be found in the SLP tree. */
5932 : 1018303 : bool pattern_found = false;
5933 : 3787199 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
5934 : 1750593 : pattern_found |= vect_match_slp_patterns (instance, vinfo,
5935 : : &visited_patterns, &perm_cache,
5936 : : &compat_cache);
5937 : :
5938 : : /* If any were found optimize permutations of loads. */
5939 : 1018303 : if (pattern_found)
5940 : : {
5941 : 192 : hash_map<slp_tree, slp_tree> load_map;
5942 : 3234 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
5943 : : {
5944 : 2850 : slp_tree root = SLP_INSTANCE_TREE (instance);
5945 : 2850 : optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
5946 : : &load_map, root);
5947 : : }
5948 : 192 : }
5949 : :
5950 : : /* Check whether we should force some SLP instances to use load/store-lanes
5951 : : and do so by forcing SLP re-discovery with single lanes. We used
5952 : : to cancel SLP when this applied to all instances in a loop but now
5953 : : we decide this per SLP instance. It's important to do this only
5954 : : after SLP pattern recognition. */
5955 : 1018303 : if (is_a <loop_vec_info> (vinfo))
5956 : 1335381 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
5957 : 955676 : if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
5958 : 229392 : && !SLP_INSTANCE_TREE (instance)->ldst_lanes)
5959 : : {
5960 : 229392 : slp_tree slp_root = SLP_INSTANCE_TREE (instance);
5961 : 229392 : unsigned int group_size = SLP_TREE_LANES (slp_root);
5962 : 229392 : tree vectype = SLP_TREE_VECTYPE (slp_root);
5963 : :
5964 : 229392 : stmt_vec_info rep_info = SLP_TREE_REPRESENTATIVE (slp_root);
5965 : 229392 : gimple *rep = STMT_VINFO_STMT (rep_info);
5966 : 229392 : bool masked = (is_gimple_call (rep)
5967 : 1327 : && gimple_call_internal_p (rep)
5968 : 230699 : && internal_fn_mask_index
5969 : 1307 : (gimple_call_internal_fn (rep)) != -1);
5970 : 229372 : if (!STMT_VINFO_GROUPED_ACCESS (rep_info)
5971 : 21623 : || slp_root->ldst_lanes
5972 : 251015 : || (vect_store_lanes_supported (vectype, group_size, masked)
5973 : : == IFN_LAST))
5974 : 229392 : continue;
5975 : :
5976 : 0 : auto_vec<slp_tree> loads;
5977 : 0 : hash_set<slp_tree> visited;
5978 : 0 : vect_gather_slp_loads (loads, slp_root, visited);
5979 : :
5980 : : /* Check whether any load in the SLP instance is possibly
5981 : : permuted. */
5982 : 0 : bool loads_permuted = false;
5983 : 0 : slp_tree load_node;
5984 : 0 : unsigned j;
5985 : 0 : FOR_EACH_VEC_ELT (loads, j, load_node)
5986 : : {
5987 : 0 : if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
5988 : 0 : continue;
5989 : : unsigned k;
5990 : : stmt_vec_info load_info;
5991 : 0 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), k, load_info)
5992 : 0 : if (SLP_TREE_LOAD_PERMUTATION (load_node)[k] != k)
5993 : : {
5994 : : loads_permuted = true;
5995 : : break;
5996 : : }
5997 : : }
5998 : :
5999 : : /* If the loads and stores can use load/store-lanes force re-discovery
6000 : : with single lanes. */
6001 : 0 : if (loads_permuted)
6002 : : {
6003 : 0 : bool can_use_lanes = true;
6004 : : bool prefer_load_lanes = false;
6005 : 0 : FOR_EACH_VEC_ELT (loads, j, load_node)
6006 : 0 : if (STMT_VINFO_GROUPED_ACCESS
6007 : : (SLP_TREE_REPRESENTATIVE (load_node)))
6008 : : {
6009 : 0 : stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
6010 : : (SLP_TREE_REPRESENTATIVE (load_node));
6011 : 0 : rep = STMT_VINFO_STMT (stmt_vinfo);
6012 : 0 : masked = (is_gimple_call (rep)
6013 : 0 : && gimple_call_internal_p (rep)
6014 : 0 : && internal_fn_mask_index
6015 : 0 : (gimple_call_internal_fn (rep)));
6016 : : /* Use SLP for strided accesses (or if we can't
6017 : : load-lanes). */
6018 : 0 : if (STMT_VINFO_STRIDED_P (stmt_vinfo)
6019 : 0 : || compare_step_with_zero (vinfo, stmt_vinfo) <= 0
6020 : 0 : || vect_load_lanes_supported
6021 : 0 : (SLP_TREE_VECTYPE (load_node),
6022 : 0 : DR_GROUP_SIZE (stmt_vinfo), masked) == IFN_LAST
6023 : : /* ??? During SLP re-discovery with a single lane
6024 : : a masked grouped load will appear permuted and
6025 : : discovery will fail. We have to rework this
6026 : : on the discovery side - for now avoid ICEing. */
6027 : 0 : || masked)
6028 : : {
6029 : : can_use_lanes = false;
6030 : : break;
6031 : : }
6032 : : /* Make sure that the target would prefer store-lanes
6033 : : for at least one of the loads.
6034 : :
6035 : : ??? Perhaps we should instead require this for
6036 : : all loads? */
6037 : 0 : prefer_load_lanes
6038 : : = (prefer_load_lanes
6039 : 0 : || SLP_TREE_LANES (load_node) == group_size
6040 : 0 : || (vect_slp_prefer_store_lanes_p
6041 : 0 : (vinfo, stmt_vinfo,
6042 : : SLP_TREE_VECTYPE (load_node), masked,
6043 : : group_size, SLP_TREE_LANES (load_node))));
6044 : : }
6045 : :
6046 : 0 : if (can_use_lanes && prefer_load_lanes)
6047 : : {
6048 : 0 : if (dump_enabled_p ())
6049 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
6050 : : "SLP instance %p can use load/store-lanes,"
6051 : : " re-discovering with single-lanes\n",
6052 : : (void *) instance);
6053 : :
6054 : 0 : stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (slp_root);
6055 : :
6056 : 0 : vect_free_slp_instance (instance);
6057 : 0 : limit = max_tree_size;
6058 : 0 : bool res = vect_analyze_slp_instance (vinfo, bst_map,
6059 : : stmt_info,
6060 : : slp_inst_kind_store,
6061 : : max_tree_size, &limit,
6062 : : true);
6063 : 0 : gcc_assert (res);
6064 : 0 : auto new_inst = LOOP_VINFO_SLP_INSTANCES (vinfo).pop ();
6065 : 0 : LOOP_VINFO_SLP_INSTANCES (vinfo)[i] = new_inst;
6066 : : }
6067 : : }
6068 : 0 : }
6069 : :
6070 : : /* When we end up with load permutations that we cannot possibly handle,
6071 : : like those requiring three vector inputs, lower them using interleaving
6072 : : like schemes. */
6073 : 1018303 : if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
6074 : : {
6075 : 379705 : vect_lower_load_permutations (loop_vinfo, bst_map, force_single_lane);
6076 : 379705 : if (dump_enabled_p ())
6077 : : {
6078 : 18545 : dump_printf_loc (MSG_NOTE, vect_location,
6079 : : "SLP graph after lowering permutations:\n");
6080 : 18545 : hash_set<slp_tree> visited;
6081 : 86470 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
6082 : 30859 : vect_print_slp_graph (MSG_NOTE, vect_location,
6083 : : SLP_INSTANCE_TREE (instance), visited);
6084 : 18545 : }
6085 : : }
6086 : :
6087 : 1018303 : release_scalar_stmts_to_slp_tree_map (bst_map);
6088 : :
6089 : 1018303 : if (pattern_found && dump_enabled_p ())
6090 : : {
6091 : 14 : dump_printf_loc (MSG_NOTE, vect_location,
6092 : : "Pattern matched SLP tree\n");
6093 : 14 : hash_set<slp_tree> visited;
6094 : 86 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
6095 : 44 : vect_print_slp_graph (MSG_NOTE, vect_location,
6096 : : SLP_INSTANCE_TREE (instance), visited);
6097 : 14 : }
6098 : :
6099 : 1018303 : return opt_result::success ();
6100 : 1018303 : }
6101 : :
6102 : : /* Estimates the cost of inserting layout changes into the SLP graph.
6103 : : It can also say that the insertion is impossible. */
6104 : :
6105 : : struct slpg_layout_cost
6106 : : {
6107 : 11046683 : slpg_layout_cost () = default;
6108 : : slpg_layout_cost (sreal, bool);
6109 : :
6110 : 446230 : static slpg_layout_cost impossible () { return { sreal::max (), 0 }; }
6111 : 4933508 : bool is_possible () const { return depth != sreal::max (); }
6112 : :
6113 : : bool operator== (const slpg_layout_cost &) const;
6114 : : bool operator!= (const slpg_layout_cost &) const;
6115 : :
6116 : : bool is_better_than (const slpg_layout_cost &, bool) const;
6117 : :
6118 : : void add_parallel_cost (const slpg_layout_cost &);
6119 : : void add_serial_cost (const slpg_layout_cost &);
6120 : : void split (unsigned int);
6121 : :
6122 : : /* The longest sequence of layout changes needed during any traversal
6123 : : of the partition dag, weighted by execution frequency.
6124 : :
6125 : : This is the most important metric when optimizing for speed, since
6126 : : it helps to ensure that we keep the number of operations on
6127 : : critical paths to a minimum. */
6128 : : sreal depth = 0;
6129 : :
6130 : : /* An estimate of the total number of operations needed. It is weighted by
6131 : : execution frequency when optimizing for speed but not when optimizing for
6132 : : size. In order to avoid double-counting, a node with a fanout of N will
6133 : : distribute 1/N of its total cost to each successor.
6134 : :
6135 : : This is the most important metric when optimizing for size, since
6136 : : it helps to keep the total number of operations to a minimum, */
6137 : : sreal total = 0;
6138 : : };
6139 : :
6140 : : /* Construct costs for a node with weight WEIGHT. A higher weight
6141 : : indicates more frequent execution. IS_FOR_SIZE is true if we are
6142 : : optimizing for size rather than speed. */
6143 : :
6144 : 1158811 : slpg_layout_cost::slpg_layout_cost (sreal weight, bool is_for_size)
6145 : 1159679 : : depth (weight), total (is_for_size && weight > 0 ? 1 : weight)
6146 : : {
6147 : 1158811 : }
6148 : :
6149 : : bool
6150 : 0 : slpg_layout_cost::operator== (const slpg_layout_cost &other) const
6151 : : {
6152 : 0 : return depth == other.depth && total == other.total;
6153 : : }
6154 : :
6155 : : bool
6156 : 0 : slpg_layout_cost::operator!= (const slpg_layout_cost &other) const
6157 : : {
6158 : 0 : return !operator== (other);
6159 : : }
6160 : :
6161 : : /* Return true if these costs are better than OTHER. IS_FOR_SIZE is
6162 : : true if we are optimizing for size rather than speed. */
6163 : :
6164 : : bool
6165 : 291124 : slpg_layout_cost::is_better_than (const slpg_layout_cost &other,
6166 : : bool is_for_size) const
6167 : : {
6168 : 291124 : if (is_for_size)
6169 : : {
6170 : 382 : if (total != other.total)
6171 : 159 : return total < other.total;
6172 : 223 : return depth < other.depth;
6173 : : }
6174 : : else
6175 : : {
6176 : 290742 : if (depth != other.depth)
6177 : 124348 : return depth < other.depth;
6178 : 166394 : return total < other.total;
6179 : : }
6180 : : }
6181 : :
6182 : : /* Increase the costs to account for something with cost INPUT_COST
6183 : : happening in parallel with the current costs. */
6184 : :
6185 : : void
6186 : 343253 : slpg_layout_cost::add_parallel_cost (const slpg_layout_cost &input_cost)
6187 : : {
6188 : 343253 : depth = std::max (depth, input_cost.depth);
6189 : 343253 : total += input_cost.total;
6190 : 343253 : }
6191 : :
6192 : : /* Increase the costs to account for something with cost INPUT_COST
6193 : : happening in series with the current costs. */
6194 : :
6195 : : void
6196 : 1396876 : slpg_layout_cost::add_serial_cost (const slpg_layout_cost &other)
6197 : : {
6198 : 1396876 : depth += other.depth;
6199 : 1396876 : total += other.total;
6200 : 1396876 : }
6201 : :
6202 : : /* Split the total cost among TIMES successors or predecessors. */
6203 : :
6204 : : void
6205 : 1151384 : slpg_layout_cost::split (unsigned int times)
6206 : : {
6207 : 1151384 : if (times > 1)
6208 : 480598 : total /= times;
6209 : 1151384 : }
6210 : :
6211 : : /* Information about one node in the SLP graph, for use during
6212 : : vect_optimize_slp_pass. */
6213 : :
6214 : : struct slpg_vertex
6215 : : {
6216 : 10523412 : slpg_vertex (slp_tree node_) : node (node_) {}
6217 : :
6218 : : /* The node itself. */
6219 : : slp_tree node;
6220 : :
6221 : : /* Which partition the node belongs to, or -1 if none. Nodes outside of
6222 : : partitions are flexible; they can have whichever layout consumers
6223 : : want them to have. */
6224 : : int partition = -1;
6225 : :
6226 : : /* The number of nodes that directly use the result of this one
6227 : : (i.e. the number of nodes that count this one as a child). */
6228 : : unsigned int out_degree = 0;
6229 : :
6230 : : /* The execution frequency of the node. */
6231 : : sreal weight = 0;
6232 : :
6233 : : /* The total execution frequency of all nodes that directly use the
6234 : : result of this one. */
6235 : : sreal out_weight = 0;
6236 : : };
6237 : :
6238 : : /* Information about one partition of the SLP graph, for use during
6239 : : vect_optimize_slp_pass. */
6240 : :
6241 : : struct slpg_partition_info
6242 : : {
6243 : : /* The nodes in the partition occupy indices [NODE_BEGIN, NODE_END)
6244 : : of m_partitioned_nodes. */
6245 : : unsigned int node_begin = 0;
6246 : : unsigned int node_end = 0;
6247 : :
6248 : : /* Which layout we've chosen to use for this partition, or -1 if
6249 : : we haven't picked one yet. */
6250 : : int layout = -1;
6251 : :
6252 : : /* The number of predecessors and successors in the partition dag.
6253 : : The predecessors always have lower partition numbers and the
6254 : : successors always have higher partition numbers.
6255 : :
6256 : : Note that the directions of these edges are not necessarily the
6257 : : same as in the data flow graph. For example, if an SCC has separate
6258 : : partitions for an inner loop and an outer loop, the inner loop's
6259 : : partition will have at least two incoming edges from the outer loop's
6260 : : partition: one for a live-in value and one for a live-out value.
6261 : : In data flow terms, one of these edges would also be from the outer loop
6262 : : to the inner loop, but the other would be in the opposite direction. */
6263 : : unsigned int in_degree = 0;
6264 : : unsigned int out_degree = 0;
6265 : : };
6266 : :
6267 : : /* Information about the costs of using a particular layout for a
6268 : : particular partition. It can also say that the combination is
6269 : : impossible. */
6270 : :
6271 : : struct slpg_partition_layout_costs
6272 : : {
6273 : 1408863 : bool is_possible () const { return internal_cost.is_possible (); }
6274 : 49237 : void mark_impossible () { internal_cost = slpg_layout_cost::impossible (); }
6275 : :
6276 : : /* The costs inherited from predecessor partitions. */
6277 : : slpg_layout_cost in_cost;
6278 : :
6279 : : /* The inherent cost of the layout within the node itself. For example,
6280 : : this is nonzero for a load if choosing a particular layout would require
6281 : : the load to permute the loaded elements. It is nonzero for a
6282 : : VEC_PERM_EXPR if the permutation cannot be eliminated or converted
6283 : : to full-vector moves. */
6284 : : slpg_layout_cost internal_cost;
6285 : :
6286 : : /* The costs inherited from successor partitions. */
6287 : : slpg_layout_cost out_cost;
6288 : : };
6289 : :
6290 : : /* This class tries to optimize the layout of vectors in order to avoid
6291 : : unnecessary shuffling. At the moment, the set of possible layouts are
6292 : : restricted to bijective permutations.
6293 : :
6294 : : The goal of the pass depends on whether we're optimizing for size or
6295 : : for speed. When optimizing for size, the goal is to reduce the overall
6296 : : number of layout changes (including layout changes implied by things
6297 : : like load permutations). When optimizing for speed, the goal is to
6298 : : reduce the maximum latency attributable to layout changes on any
6299 : : non-cyclical path through the data flow graph.
6300 : :
6301 : : For example, when optimizing a loop nest for speed, we will prefer
6302 : : to make layout changes outside of a loop rather than inside of a loop,
6303 : : and will prefer to make layout changes in parallel rather than serially,
6304 : : even if that increases the overall number of layout changes.
6305 : :
6306 : : The high-level procedure is:
6307 : :
6308 : : (1) Build a graph in which edges go from uses (parents) to definitions
6309 : : (children).
6310 : :
6311 : : (2) Divide the graph into a dag of strongly-connected components (SCCs).
6312 : :
6313 : : (3) When optimizing for speed, partition the nodes in each SCC based
6314 : : on their containing cfg loop. When optimizing for size, treat
6315 : : each SCC as a single partition.
6316 : :
6317 : : This gives us a dag of partitions. The goal is now to assign a
6318 : : layout to each partition.
6319 : :
6320 : : (4) Construct a set of vector layouts that are worth considering.
6321 : : Record which nodes must keep their current layout.
6322 : :
6323 : : (5) Perform a forward walk over the partition dag (from loads to stores)
6324 : : accumulating the "forward" cost of using each layout. When visiting
6325 : : each partition, assign a tentative choice of layout to the partition
6326 : : and use that choice when calculating the cost of using a different
6327 : : layout in successor partitions.
6328 : :
6329 : : (6) Perform a backward walk over the partition dag (from stores to loads),
6330 : : accumulating the "backward" cost of using each layout. When visiting
6331 : : each partition, make a final choice of layout for that partition based
6332 : : on the accumulated forward costs (from (5)) and backward costs
6333 : : (from (6)).
6334 : :
6335 : : (7) Apply the chosen layouts to the SLP graph.
6336 : :
6337 : : For example, consider the SLP statements:
6338 : :
6339 : : S1: a_1 = load
6340 : : loop:
6341 : : S2: a_2 = PHI<a_1, a_3>
6342 : : S3: b_1 = load
6343 : : S4: a_3 = a_2 + b_1
6344 : : exit:
6345 : : S5: a_4 = PHI<a_3>
6346 : : S6: store a_4
6347 : :
6348 : : S2 and S4 form an SCC and are part of the same loop. Every other
6349 : : statement is in a singleton SCC. In this example there is a one-to-one
6350 : : mapping between SCCs and partitions and the partition dag looks like this;
6351 : :
6352 : : S1 S3
6353 : : \ /
6354 : : S2+S4
6355 : : |
6356 : : S5
6357 : : |
6358 : : S6
6359 : :
6360 : : S2, S3 and S4 will have a higher execution frequency than the other
6361 : : statements, so when optimizing for speed, the goal is to avoid any
6362 : : layout changes:
6363 : :
6364 : : - within S3
6365 : : - within S2+S4
6366 : : - on the S3->S2+S4 edge
6367 : :
6368 : : For example, if S3 was originally a reversing load, the goal of the
6369 : : pass is to make it an unreversed load and change the layout on the
6370 : : S1->S2+S4 and S2+S4->S5 edges to compensate. (Changing the layout
6371 : : on S1->S2+S4 and S5->S6 would also be acceptable.)
6372 : :
6373 : : The difference between SCCs and partitions becomes important if we
6374 : : add an outer loop:
6375 : :
6376 : : S1: a_1 = ...
6377 : : loop1:
6378 : : S2: a_2 = PHI<a_1, a_6>
6379 : : S3: b_1 = load
6380 : : S4: a_3 = a_2 + b_1
6381 : : loop2:
6382 : : S5: a_4 = PHI<a_3, a_5>
6383 : : S6: c_1 = load
6384 : : S7: a_5 = a_4 + c_1
6385 : : exit2:
6386 : : S8: a_6 = PHI<a_5>
6387 : : S9: store a_6
6388 : : exit1:
6389 : :
6390 : : Here, S2, S4, S5, S7 and S8 form a single SCC. However, when optimizing
6391 : : for speed, we usually do not want restrictions in the outer loop to "infect"
6392 : : the decision for the inner loop. For example, if an outer-loop node
6393 : : in the SCC contains a statement with a fixed layout, that should not
6394 : : prevent the inner loop from using a different layout. Conversely,
6395 : : the inner loop should not dictate a layout to the outer loop: if the
6396 : : outer loop does a lot of computation, then it may not be efficient to
6397 : : do all of that computation in the inner loop's preferred layout.
6398 : :
6399 : : So when optimizing for speed, we partition the SCC into S2+S4+S8 (outer)
6400 : : and S5+S7 (inner). We also try to arrange partitions so that:
6401 : :
6402 : : - the partition for an outer loop comes before the partition for
6403 : : an inner loop
6404 : :
6405 : : - if a sibling loop A dominates a sibling loop B, A's partition
6406 : : comes before B's
6407 : :
6408 : : This gives the following partition dag for the example above:
6409 : :
6410 : : S1 S3
6411 : : \ /
6412 : : S2+S4+S8 S6
6413 : : | \\ /
6414 : : | S5+S7
6415 : : |
6416 : : S9
6417 : :
6418 : : There are two edges from S2+S4+S8 to S5+S7: one for the edge S4->S5 and
6419 : : one for a reversal of the edge S7->S8.
6420 : :
6421 : : The backward walk picks a layout for S5+S7 before S2+S4+S8. The choice
6422 : : for S2+S4+S8 therefore has to balance the cost of using the outer loop's
6423 : : preferred layout against the cost of changing the layout on entry to the
6424 : : inner loop (S4->S5) and on exit from the inner loop (S7->S8 reversed).
6425 : :
6426 : : Although this works well when optimizing for speed, it has the downside
6427 : : when optimizing for size that the choice of layout for S5+S7 is completely
6428 : : independent of S9, which lessens the chance of reducing the overall number
6429 : : of permutations. We therefore do not partition SCCs when optimizing
6430 : : for size.
6431 : :
6432 : : To give a concrete example of the difference between optimizing
6433 : : for size and speed, consider:
6434 : :
6435 : : a[0] = (b[1] << c[3]) - d[1];
6436 : : a[1] = (b[0] << c[2]) - d[0];
6437 : : a[2] = (b[3] << c[1]) - d[3];
6438 : : a[3] = (b[2] << c[0]) - d[2];
6439 : :
6440 : : There are three different layouts here: one for a, one for b and d,
6441 : : and one for c. When optimizing for speed it is better to permute each
6442 : : of b, c and d into the order required by a, since those permutations
6443 : : happen in parallel. But when optimizing for size, it is better to:
6444 : :
6445 : : - permute c into the same order as b
6446 : : - do the arithmetic
6447 : : - permute the result into the order required by a
6448 : :
6449 : : This gives 2 permutations rather than 3. */
6450 : :
6451 : : class vect_optimize_slp_pass
6452 : : {
6453 : : public:
6454 : 625746 : vect_optimize_slp_pass (vec_info *vinfo) : m_vinfo (vinfo) {}
6455 : : void run ();
6456 : :
6457 : : private:
6458 : : /* Graph building. */
6459 : : struct loop *containing_loop (slp_tree);
6460 : : bool is_cfg_latch_edge (graph_edge *);
6461 : : void build_vertices (hash_set<slp_tree> &, slp_tree);
6462 : : void build_vertices ();
6463 : : void build_graph ();
6464 : :
6465 : : /* Partitioning. */
6466 : : void create_partitions ();
6467 : : template<typename T> void for_each_partition_edge (unsigned int, T);
6468 : :
6469 : : /* Layout selection. */
6470 : : bool is_compatible_layout (slp_tree, unsigned int);
6471 : : bool is_compatible_layout (const slpg_partition_info &, unsigned int);
6472 : : int change_layout_cost (slp_tree, unsigned int, unsigned int);
6473 : : slpg_partition_layout_costs &partition_layout_costs (unsigned int,
6474 : : unsigned int);
6475 : : void change_vec_perm_layout (slp_tree, lane_permutation_t &,
6476 : : int, unsigned int);
6477 : : int internal_node_cost (slp_tree, int, unsigned int);
6478 : : void start_choosing_layouts ();
6479 : : bool legitimize ();
6480 : :
6481 : : /* Cost propagation. */
6482 : : slpg_layout_cost edge_layout_cost (graph_edge *, unsigned int,
6483 : : unsigned int, unsigned int);
6484 : : slpg_layout_cost total_in_cost (unsigned int);
6485 : : slpg_layout_cost forward_cost (graph_edge *, unsigned int, unsigned int);
6486 : : slpg_layout_cost backward_cost (graph_edge *, unsigned int, unsigned int);
6487 : : void forward_pass ();
6488 : : void backward_pass ();
6489 : :
6490 : : /* Rematerialization. */
6491 : : slp_tree get_result_with_layout (slp_tree, unsigned int);
6492 : : void materialize ();
6493 : :
6494 : : /* Clean-up. */
6495 : : void remove_redundant_permutations ();
6496 : :
6497 : : /* Masked load lanes discovery. */
6498 : : void decide_masked_load_lanes ();
6499 : :
6500 : : void dump ();
6501 : :
6502 : : vec_info *m_vinfo;
6503 : :
6504 : : /* True if we should optimize the graph for size, false if we should
6505 : : optimize it for speed. (It wouldn't be easy to make this decision
6506 : : more locally.) */
6507 : : bool m_optimize_size;
6508 : :
6509 : : /* A graph of all SLP nodes, with edges leading from uses to definitions.
6510 : : In other words, a node's predecessors are its slp_tree parents and
6511 : : a node's successors are its slp_tree children. */
6512 : : graph *m_slpg = nullptr;
6513 : :
6514 : : /* The vertices of M_SLPG, indexed by slp_tree::vertex. */
6515 : : auto_vec<slpg_vertex> m_vertices;
6516 : :
6517 : : /* The list of all leaves of M_SLPG. such as external definitions, constants,
6518 : : and loads. */
6519 : : auto_vec<int> m_leafs;
6520 : :
6521 : : /* This array has one entry for every vector layout that we're considering.
6522 : : Element 0 is null and indicates "no change". Other entries describe
6523 : : permutations that are inherent in the current graph and that we would
6524 : : like to reverse if possible.
6525 : :
6526 : : For example, a permutation { 1, 2, 3, 0 } means that something has
6527 : : effectively been permuted in that way, such as a load group
6528 : : { a[1], a[2], a[3], a[0] } (viewed as a permutation of a[0:3]).
6529 : : We'd then like to apply the reverse permutation { 3, 0, 1, 2 }
6530 : : in order to put things "back" in order. */
6531 : : auto_vec<vec<unsigned> > m_perms;
6532 : :
6533 : : /* A partitioning of the nodes for which a layout must be chosen.
6534 : : Each partition represents an <SCC, cfg loop> pair; that is,
6535 : : nodes in different SCCs belong to different partitions, and nodes
6536 : : within an SCC can be further partitioned according to a containing
6537 : : cfg loop. Partition <SCC1, L1> comes before <SCC2, L2> if:
6538 : :
6539 : : - SCC1 != SCC2 and SCC1 is a predecessor of SCC2 in a forward walk
6540 : : from leaves (such as loads) to roots (such as stores).
6541 : :
6542 : : - SCC1 == SCC2 and L1's header strictly dominates L2's header. */
6543 : : auto_vec<slpg_partition_info> m_partitions;
6544 : :
6545 : : /* The list of all nodes for which a layout must be chosen. Nodes for
6546 : : partition P come before the nodes for partition P+1. Nodes within a
6547 : : partition are in reverse postorder. */
6548 : : auto_vec<unsigned int> m_partitioned_nodes;
6549 : :
6550 : : /* Index P * num-layouts + L contains the cost of using layout L
6551 : : for partition P. */
6552 : : auto_vec<slpg_partition_layout_costs> m_partition_layout_costs;
6553 : :
6554 : : /* Index N * num-layouts + L, if nonnull, is a node that provides the
6555 : : original output of node N adjusted to have layout L. */
6556 : : auto_vec<slp_tree> m_node_layouts;
6557 : : };
6558 : :
6559 : : /* Fill the vertices and leafs vector with all nodes in the SLP graph.
6560 : : Also record whether we should optimize anything for speed rather
6561 : : than size. */
6562 : :
6563 : : void
6564 : 11360136 : vect_optimize_slp_pass::build_vertices (hash_set<slp_tree> &visited,
6565 : : slp_tree node)
6566 : : {
6567 : 11360136 : unsigned i;
6568 : 11360136 : slp_tree child;
6569 : :
6570 : 11360136 : if (visited.add (node))
6571 : 11360136 : return;
6572 : :
6573 : 10523412 : if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
6574 : : {
6575 : 7960015 : basic_block bb = gimple_bb (vect_orig_stmt (rep)->stmt);
6576 : 7219851 : if (optimize_bb_for_speed_p (bb))
6577 : 7099509 : m_optimize_size = false;
6578 : : }
6579 : :
6580 : 10523412 : node->vertex = m_vertices.length ();
6581 : 10523412 : m_vertices.safe_push (slpg_vertex (node));
6582 : :
6583 : 10523412 : bool leaf = true;
6584 : 10523412 : bool force_leaf = false;
6585 : 20107510 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6586 : 9584098 : if (child)
6587 : : {
6588 : 8020502 : leaf = false;
6589 : 8020502 : build_vertices (visited, child);
6590 : : }
6591 : : else
6592 : : force_leaf = true;
6593 : : /* Since SLP discovery works along use-def edges all cycles have an
6594 : : entry - but there's the exception of cycles where we do not handle
6595 : : the entry explicitely (but with a NULL SLP node), like some reductions
6596 : : and inductions. Force those SLP PHIs to act as leafs to make them
6597 : : backwards reachable. */
6598 : 10523412 : if (leaf || force_leaf)
6599 : 5371200 : m_leafs.safe_push (node->vertex);
6600 : : }
6601 : :
6602 : : /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
6603 : :
6604 : : void
6605 : 1251492 : vect_optimize_slp_pass::build_vertices ()
6606 : : {
6607 : 1251492 : hash_set<slp_tree> visited;
6608 : 1251492 : unsigned i;
6609 : 1251492 : slp_instance instance;
6610 : 1251492 : m_vertices.truncate (0);
6611 : 1251492 : m_leafs.truncate (0);
6612 : 7094110 : FOR_EACH_VEC_ELT (m_vinfo->slp_instances, i, instance)
6613 : 3339634 : build_vertices (visited, SLP_INSTANCE_TREE (instance));
6614 : 1251492 : }
6615 : :
6616 : : /* Apply (reverse) bijectite PERM to VEC. */
6617 : :
6618 : : template <class T>
6619 : : static void
6620 : 189648 : vect_slp_permute (vec<unsigned> perm,
6621 : : vec<T> &vec, bool reverse)
6622 : : {
6623 : 189648 : auto_vec<T, 64> saved;
6624 : 189648 : saved.create (vec.length ());
6625 : 618084 : for (unsigned i = 0; i < vec.length (); ++i)
6626 : 428436 : saved.quick_push (vec[i]);
6627 : :
6628 : 189648 : if (reverse)
6629 : : {
6630 : 1225789 : for (unsigned i = 0; i < vec.length (); ++i)
6631 : 427016 : vec[perm[i]] = saved[i];
6632 : 616022 : for (unsigned i = 0; i < vec.length (); ++i)
6633 : 753807 : gcc_assert (vec[perm[i]] == saved[i]);
6634 : : }
6635 : : else
6636 : : {
6637 : 4124 : for (unsigned i = 0; i < vec.length (); ++i)
6638 : 1420 : vec[i] = saved[perm[i]];
6639 : 191068 : for (unsigned i = 0; i < vec.length (); ++i)
6640 : 2130 : gcc_assert (vec[i] == saved[perm[i]]);
6641 : : }
6642 : 189648 : }
6643 : :
6644 : : /* Return the cfg loop that contains NODE. */
6645 : :
6646 : : struct loop *
6647 : 3895663 : vect_optimize_slp_pass::containing_loop (slp_tree node)
6648 : : {
6649 : 3895663 : stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
6650 : 3895663 : if (!rep)
6651 : 4271 : return ENTRY_BLOCK_PTR_FOR_FN (cfun)->loop_father;
6652 : 4270487 : return gimple_bb (vect_orig_stmt (rep)->stmt)->loop_father;
6653 : : }
6654 : :
6655 : : /* Return true if UD (an edge from a use to a definition) is associated
6656 : : with a loop latch edge in the cfg. */
6657 : :
6658 : : bool
6659 : 8020502 : vect_optimize_slp_pass::is_cfg_latch_edge (graph_edge *ud)
6660 : : {
6661 : 8020502 : slp_tree use = m_vertices[ud->src].node;
6662 : 8020502 : slp_tree def = m_vertices[ud->dest].node;
6663 : 8020502 : if ((SLP_TREE_DEF_TYPE (use) != vect_internal_def
6664 : 8020502 : || SLP_TREE_PERMUTE_P (use))
6665 : 7646256 : || SLP_TREE_DEF_TYPE (def) != vect_internal_def)
6666 : : return false;
6667 : :
6668 : 4352696 : stmt_vec_info use_rep = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (use));
6669 : 4352696 : return (is_a<gphi *> (use_rep->stmt)
6670 : 306264 : && bb_loop_header_p (gimple_bb (use_rep->stmt))
6671 : 4506128 : && containing_loop (def) == containing_loop (use));
6672 : : }
6673 : :
6674 : : /* Build the graph. Mark edges that correspond to cfg loop latch edges with
6675 : : a nonnull data field. */
6676 : :
6677 : : void
6678 : 1251492 : vect_optimize_slp_pass::build_graph ()
6679 : : {
6680 : 1251492 : m_optimize_size = true;
6681 : 1251492 : build_vertices ();
6682 : :
6683 : 2502984 : m_slpg = new_graph (m_vertices.length ());
6684 : 14277888 : for (slpg_vertex &v : m_vertices)
6685 : 32099630 : for (slp_tree child : SLP_TREE_CHILDREN (v.node))
6686 : 9584098 : if (child)
6687 : : {
6688 : 8020502 : graph_edge *ud = add_edge (m_slpg, v.node->vertex, child->vertex);
6689 : 8020502 : if (is_cfg_latch_edge (ud))
6690 : 145288 : ud->data = this;
6691 : : }
6692 : 1251492 : }
6693 : :
6694 : : /* Return true if E corresponds to a loop latch edge in the cfg. */
6695 : :
6696 : : static bool
6697 : 4082600 : skip_cfg_latch_edges (graph_edge *e)
6698 : : {
6699 : 4082600 : return e->data;
6700 : : }
6701 : :
6702 : : /* Create the node partitions. */
6703 : :
6704 : : void
6705 : 625746 : vect_optimize_slp_pass::create_partitions ()
6706 : : {
6707 : : /* Calculate a postorder of the graph, ignoring edges that correspond
6708 : : to natural latch edges in the cfg. Reading the vector from the end
6709 : : to the beginning gives the reverse postorder. */
6710 : 625746 : auto_vec<int> initial_rpo;
6711 : 1251492 : graphds_dfs (m_slpg, &m_leafs[0], m_leafs.length (), &initial_rpo,
6712 : : false, NULL, skip_cfg_latch_edges);
6713 : 1877238 : gcc_assert (initial_rpo.length () == m_vertices.length ());
6714 : :
6715 : : /* Calculate the strongly connected components of the graph. */
6716 : 625746 : auto_vec<int> scc_grouping;
6717 : 625746 : unsigned int num_sccs = graphds_scc (m_slpg, NULL, NULL, &scc_grouping);
6718 : :
6719 : : /* Create a new index order in which all nodes from the same SCC are
6720 : : consecutive. Use scc_pos to record the index of the first node in
6721 : : each SCC. */
6722 : 625746 : auto_vec<unsigned int> scc_pos (num_sccs);
6723 : 625746 : int last_component = -1;
6724 : 625746 : unsigned int node_count = 0;
6725 : 7138651 : for (unsigned int node_i : scc_grouping)
6726 : : {
6727 : 5261413 : if (last_component != m_slpg->vertices[node_i].component)
6728 : : {
6729 : 5171573 : last_component = m_slpg->vertices[node_i].component;
6730 : 10343146 : gcc_assert (last_component == int (scc_pos.length ()));
6731 : 5171573 : scc_pos.quick_push (node_count);
6732 : : }
6733 : 5261413 : node_count += 1;
6734 : : }
6735 : 1251492 : gcc_assert (node_count == initial_rpo.length ()
6736 : : && last_component + 1 == int (num_sccs));
6737 : :
6738 : : /* Use m_partitioned_nodes to group nodes into SCC order, with the nodes
6739 : : inside each SCC following the RPO we calculated above. The fact that
6740 : : we ignored natural latch edges when calculating the RPO should ensure
6741 : : that, for natural loop nests:
6742 : :
6743 : : - the first node that we encounter in a cfg loop is the loop header phi
6744 : : - the loop header phis are in dominance order
6745 : :
6746 : : Arranging for this is an optimization (see below) rather than a
6747 : : correctness issue. Unnatural loops with a tangled mess of backedges
6748 : : will still work correctly, but might give poorer results.
6749 : :
6750 : : Also update scc_pos so that it gives 1 + the index of the last node
6751 : : in the SCC. */
6752 : 625746 : m_partitioned_nodes.safe_grow (node_count);
6753 : 6512905 : for (unsigned int old_i = initial_rpo.length (); old_i-- > 0;)
6754 : : {
6755 : 5261413 : unsigned int node_i = initial_rpo[old_i];
6756 : 5261413 : unsigned int new_i = scc_pos[m_slpg->vertices[node_i].component]++;
6757 : 5261413 : m_partitioned_nodes[new_i] = node_i;
6758 : : }
6759 : :
6760 : : /* When optimizing for speed, partition each SCC based on the containing
6761 : : cfg loop. The order we constructed above should ensure that, for natural
6762 : : cfg loops, we'll create sub-SCC partitions for outer loops before
6763 : : the corresponding sub-SCC partitions for inner loops. Similarly,
6764 : : when one sibling loop A dominates another sibling loop B, we should
6765 : : create a sub-SCC partition for A before a sub-SCC partition for B.
6766 : :
6767 : : As above, nothing depends for correctness on whether this achieves
6768 : : a natural nesting, but we should get better results when it does. */
6769 : 1251492 : m_partitions.reserve (m_vertices.length ());
6770 : 625746 : unsigned int next_partition_i = 0;
6771 : 625746 : hash_map<struct loop *, int> loop_partitions;
6772 : 625746 : unsigned int rpo_begin = 0;
6773 : 625746 : unsigned int num_partitioned_nodes = 0;
6774 : 7048811 : for (unsigned int rpo_end : scc_pos)
6775 : : {
6776 : 5171573 : loop_partitions.empty ();
6777 : : unsigned int partition_i = next_partition_i;
6778 : 10432986 : for (unsigned int rpo_i = rpo_begin; rpo_i < rpo_end; ++rpo_i)
6779 : : {
6780 : : /* Handle externals and constants optimistically throughout.
6781 : : But treat existing vectors as fixed since we do not handle
6782 : : permuting them. */
6783 : 5261413 : unsigned int node_i = m_partitioned_nodes[rpo_i];
6784 : 5261413 : auto &vertex = m_vertices[node_i];
6785 : 5261413 : if ((SLP_TREE_DEF_TYPE (vertex.node) == vect_external_def
6786 : 503870 : && !SLP_TREE_VEC_DEFS (vertex.node).exists ())
6787 : 5263390 : || SLP_TREE_DEF_TYPE (vertex.node) == vect_constant_def)
6788 : 1647509 : vertex.partition = -1;
6789 : : else
6790 : : {
6791 : 3613904 : bool existed;
6792 : 3613904 : if (m_optimize_size)
6793 : 25105 : existed = next_partition_i > partition_i;
6794 : : else
6795 : : {
6796 : 3588799 : struct loop *loop = containing_loop (vertex.node);
6797 : 3588799 : auto &entry = loop_partitions.get_or_insert (loop, &existed);
6798 : 3588799 : if (!existed)
6799 : 3499879 : entry = next_partition_i;
6800 : 3588799 : partition_i = entry;
6801 : : }
6802 : 3613904 : if (!existed)
6803 : : {
6804 : 3524906 : m_partitions.quick_push (slpg_partition_info ());
6805 : 3524906 : next_partition_i += 1;
6806 : : }
6807 : 3613904 : vertex.partition = partition_i;
6808 : 3613904 : num_partitioned_nodes += 1;
6809 : 3613904 : m_partitions[partition_i].node_end += 1;
6810 : : }
6811 : : }
6812 : 5171573 : rpo_begin = rpo_end;
6813 : : }
6814 : :
6815 : : /* Assign ranges of consecutive node indices to each partition,
6816 : : in partition order. Start with node_end being the same as
6817 : : node_begin so that the next loop can use it as a counter. */
6818 : 625746 : unsigned int node_begin = 0;
6819 : 5402144 : for (auto &partition : m_partitions)
6820 : : {
6821 : 3524906 : partition.node_begin = node_begin;
6822 : 3524906 : node_begin += partition.node_end;
6823 : 3524906 : partition.node_end = partition.node_begin;
6824 : : }
6825 : 625746 : gcc_assert (node_begin == num_partitioned_nodes);
6826 : :
6827 : : /* Finally build the list of nodes in partition order. */
6828 : 625746 : m_partitioned_nodes.truncate (num_partitioned_nodes);
6829 : 5887159 : for (unsigned int node_i = 0; node_i < m_vertices.length (); ++node_i)
6830 : : {
6831 : 5261413 : int partition_i = m_vertices[node_i].partition;
6832 : 5261413 : if (partition_i >= 0)
6833 : : {
6834 : 3613904 : unsigned int order_i = m_partitions[partition_i].node_end++;
6835 : 3613904 : m_partitioned_nodes[order_i] = node_i;
6836 : : }
6837 : : }
6838 : 625746 : }
6839 : :
6840 : : /* Look for edges from earlier partitions into node NODE_I and edges from
6841 : : node NODE_I into later partitions. Call:
6842 : :
6843 : : FN (ud, other_node_i)
6844 : :
6845 : : for each such use-to-def edge ud, where other_node_i is the node at the
6846 : : other end of the edge. */
6847 : :
6848 : : template<typename T>
6849 : : void
6850 : 4002245 : vect_optimize_slp_pass::for_each_partition_edge (unsigned int node_i, T fn)
6851 : : {
6852 : 4002245 : int partition_i = m_vertices[node_i].partition;
6853 : 4002245 : for (graph_edge *pred = m_slpg->vertices[node_i].pred;
6854 : 6755176 : pred; pred = pred->pred_next)
6855 : : {
6856 : 2752931 : int src_partition_i = m_vertices[pred->src].partition;
6857 : 2752931 : if (src_partition_i >= 0 && src_partition_i != partition_i)
6858 : 2530315 : fn (pred, pred->src);
6859 : : }
6860 : 4002245 : for (graph_edge *succ = m_slpg->vertices[node_i].succ;
6861 : 8551381 : succ; succ = succ->succ_next)
6862 : : {
6863 : 4549136 : int dest_partition_i = m_vertices[succ->dest].partition;
6864 : 4549136 : if (dest_partition_i >= 0 && dest_partition_i != partition_i)
6865 : 2552059 : fn (succ, succ->dest);
6866 : : }
6867 : 4002245 : }
6868 : :
6869 : : /* Return true if layout LAYOUT_I is compatible with the number of SLP lanes
6870 : : that NODE would operate on. This test is independent of NODE's actual
6871 : : operation. */
6872 : :
6873 : : bool
6874 : 1570183 : vect_optimize_slp_pass::is_compatible_layout (slp_tree node,
6875 : : unsigned int layout_i)
6876 : : {
6877 : 1570183 : if (layout_i == 0)
6878 : : return true;
6879 : :
6880 : 907626 : if (SLP_TREE_LANES (node) != m_perms[layout_i].length ())
6881 : 11525 : return false;
6882 : :
6883 : : return true;
6884 : : }
6885 : :
6886 : : /* Return true if layout LAYOUT_I is compatible with the number of SLP lanes
6887 : : that NODE would operate on for each NODE in PARTITION.
6888 : : This test is independent of NODE's actual operations. */
6889 : :
6890 : : bool
6891 : 16486 : vect_optimize_slp_pass::is_compatible_layout (const slpg_partition_info
6892 : : &partition,
6893 : : unsigned int layout_i)
6894 : : {
6895 : 33208 : for (unsigned int order_i = partition.node_begin;
6896 : 33208 : order_i < partition.node_end; ++order_i)
6897 : : {
6898 : 16788 : unsigned int node_i = m_partitioned_nodes[order_i];
6899 : 16788 : auto &vertex = m_vertices[node_i];
6900 : :
6901 : : /* The layout is incompatible if it is individually incompatible
6902 : : with any node in the partition. */
6903 : 16788 : if (!is_compatible_layout (vertex.node, layout_i))
6904 : : return false;
6905 : : }
6906 : : return true;
6907 : : }
6908 : :
6909 : : /* Return the cost (in arbtirary units) of going from layout FROM_LAYOUT_I
6910 : : to layout TO_LAYOUT_I for a node like NODE. Return -1 if either of the
6911 : : layouts is incompatible with NODE or if the change is not possible for
6912 : : some other reason.
6913 : :
6914 : : The properties taken from NODE include the number of lanes and the
6915 : : vector type. The actual operation doesn't matter. */
6916 : :
6917 : : int
6918 : 673385 : vect_optimize_slp_pass::change_layout_cost (slp_tree node,
6919 : : unsigned int from_layout_i,
6920 : : unsigned int to_layout_i)
6921 : : {
6922 : 673385 : if (!is_compatible_layout (node, from_layout_i)
6923 : 673385 : || !is_compatible_layout (node, to_layout_i))
6924 : 531 : return -1;
6925 : :
6926 : 672854 : if (from_layout_i == to_layout_i)
6927 : : return 0;
6928 : :
6929 : 291682 : auto_vec<slp_tree, 1> children (1);
6930 : 291682 : children.quick_push (node);
6931 : 291682 : auto_lane_permutation_t perm (SLP_TREE_LANES (node));
6932 : 291682 : if (from_layout_i > 0)
6933 : 822763 : for (unsigned int i : m_perms[from_layout_i])
6934 : 361717 : perm.quick_push ({ 0, i });
6935 : : else
6936 : 445767 : for (unsigned int i = 0; i < SLP_TREE_LANES (node); ++i)
6937 : 307767 : perm.quick_push ({ 0, i });
6938 : 291682 : if (to_layout_i > 0)
6939 : 138427 : vect_slp_permute (m_perms[to_layout_i], perm, true);
6940 : 291682 : auto count = vectorizable_slp_permutation_1 (m_vinfo, nullptr, node, perm,
6941 : : children, false);
6942 : 291682 : if (count >= 0)
6943 : 287204 : return MAX (count, 1);
6944 : :
6945 : : /* ??? In principle we could try changing via layout 0, giving two
6946 : : layout changes rather than 1. Doing that would require
6947 : : corresponding support in get_result_with_layout. */
6948 : : return -1;
6949 : 291682 : }
6950 : :
6951 : : /* Return the costs of assigning layout LAYOUT_I to partition PARTITION_I. */
6952 : :
6953 : : inline slpg_partition_layout_costs &
6954 : 968904 : vect_optimize_slp_pass::partition_layout_costs (unsigned int partition_i,
6955 : : unsigned int layout_i)
6956 : : {
6957 : 1937808 : return m_partition_layout_costs[partition_i * m_perms.length () + layout_i];
6958 : : }
6959 : :
6960 : : /* Change PERM in one of two ways:
6961 : :
6962 : : - if IN_LAYOUT_I < 0, accept input operand I in the layout that has been
6963 : : chosen for child I of NODE.
6964 : :
6965 : : - if IN_LAYOUT >= 0, accept all inputs operands with that layout.
6966 : :
6967 : : In both cases, arrange for the output to have layout OUT_LAYOUT_I */
6968 : :
6969 : : void
6970 : 27358 : vect_optimize_slp_pass::
6971 : : change_vec_perm_layout (slp_tree node, lane_permutation_t &perm,
6972 : : int in_layout_i, unsigned int out_layout_i)
6973 : : {
6974 : 159792 : for (auto &entry : perm)
6975 : : {
6976 : 77718 : int this_in_layout_i = in_layout_i;
6977 : 77718 : if (this_in_layout_i < 0)
6978 : : {
6979 : 55395 : slp_tree in_node = SLP_TREE_CHILDREN (node)[entry.first];
6980 : 55395 : unsigned int in_partition_i = m_vertices[in_node->vertex].partition;
6981 : 55395 : if (in_partition_i == -1u)
6982 : 329 : continue;
6983 : 55066 : this_in_layout_i = m_partitions[in_partition_i].layout;
6984 : : }
6985 : 77389 : if (this_in_layout_i > 0)
6986 : 17057 : entry.second = m_perms[this_in_layout_i][entry.second];
6987 : : }
6988 : 27358 : if (out_layout_i > 0)
6989 : 6235 : vect_slp_permute (m_perms[out_layout_i], perm, true);
6990 : 27358 : }
6991 : :
6992 : : /* Check whether the target allows NODE to be rearranged so that the node's
6993 : : output has layout OUT_LAYOUT_I. Return the cost of the change if so,
6994 : : in the same arbitrary units as for change_layout_cost. Return -1 otherwise.
6995 : :
6996 : : If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I < 0, also check whether
6997 : : NODE can adapt to the layout changes that have (perhaps provisionally)
6998 : : been chosen for NODE's children, so that no extra permutations are
6999 : : needed on either the input or the output of NODE.
7000 : :
7001 : : If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I >= 0, instead assume
7002 : : that all inputs will be forced into layout IN_LAYOUT_I beforehand.
7003 : :
7004 : : IN_LAYOUT_I has no meaning for other types of node.
7005 : :
7006 : : Keeping the node as-is is always valid. If the target doesn't appear
7007 : : to support the node as-is, but might realistically support other layouts,
7008 : : then layout 0 instead has the cost of a worst-case permutation. On the
7009 : : one hand, this ensures that every node has at least one valid layout,
7010 : : avoiding what would otherwise be an awkward special case. On the other,
7011 : : it still encourages the pass to change an invalid pre-existing layout
7012 : : choice into a valid one. */
7013 : :
7014 : : int
7015 : 206368 : vect_optimize_slp_pass::internal_node_cost (slp_tree node, int in_layout_i,
7016 : : unsigned int out_layout_i)
7017 : : {
7018 : 206368 : const int fallback_cost = 1;
7019 : :
7020 : 206368 : if (SLP_TREE_PERMUTE_P (node))
7021 : : {
7022 : 23112 : auto_lane_permutation_t tmp_perm;
7023 : 23112 : tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
7024 : :
7025 : : /* Check that the child nodes support the chosen layout. Checking
7026 : : the first child is enough, since any second child would have the
7027 : : same shape. */
7028 : 23112 : auto first_child = SLP_TREE_CHILDREN (node)[0];
7029 : 23112 : if (in_layout_i > 0
7030 : 23112 : && !is_compatible_layout (first_child, in_layout_i))
7031 : : return -1;
7032 : :
7033 : 22585 : change_vec_perm_layout (node, tmp_perm, in_layout_i, out_layout_i);
7034 : 45170 : int count = vectorizable_slp_permutation_1 (m_vinfo, nullptr,
7035 : : node, tmp_perm,
7036 : 22585 : SLP_TREE_CHILDREN (node),
7037 : : false);
7038 : 22585 : if (count < 0)
7039 : : {
7040 : 1430 : if (in_layout_i == 0 && out_layout_i == 0)
7041 : : {
7042 : : /* Use the fallback cost if the node could in principle support
7043 : : some nonzero layout for both the inputs and the outputs.
7044 : : Otherwise assume that the node will be rejected later
7045 : : and rebuilt from scalars. */
7046 : 335 : if (SLP_TREE_LANES (node) == SLP_TREE_LANES (first_child))
7047 : : return fallback_cost;
7048 : 261 : return 0;
7049 : : }
7050 : : return -1;
7051 : : }
7052 : :
7053 : : /* We currently have no way of telling whether the new layout is cheaper
7054 : : or more expensive than the old one. But at least in principle,
7055 : : it should be worth making zero permutations (whole-vector shuffles)
7056 : : cheaper than real permutations, in case the pass is able to remove
7057 : : the latter. */
7058 : 21155 : return count == 0 ? 0 : 1;
7059 : 23112 : }
7060 : :
7061 : 183256 : stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
7062 : 183256 : if (rep
7063 : 182361 : && STMT_VINFO_DATA_REF (rep)
7064 : 57094 : && DR_IS_READ (STMT_VINFO_DATA_REF (rep))
7065 : 223841 : && SLP_TREE_LOAD_PERMUTATION (node).exists ())
7066 : : {
7067 : 34628 : auto_load_permutation_t tmp_perm;
7068 : 34628 : tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
7069 : 34628 : if (out_layout_i > 0)
7070 : 12216 : vect_slp_permute (m_perms[out_layout_i], tmp_perm, true);
7071 : :
7072 : 34628 : poly_uint64 vf = 1;
7073 : 34628 : if (auto loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo))
7074 : 7832 : vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7075 : 34628 : unsigned int n_perms;
7076 : 34628 : if (!vect_transform_slp_perm_load_1 (m_vinfo, node, tmp_perm, vNULL,
7077 : : nullptr, vf, true, false, &n_perms))
7078 : : {
7079 : 1482 : auto rep = SLP_TREE_REPRESENTATIVE (node);
7080 : 1482 : if (out_layout_i == 0)
7081 : : {
7082 : : /* Use the fallback cost if the load is an N-to-N permutation.
7083 : : Otherwise assume that the node will be rejected later
7084 : : and rebuilt from scalars. */
7085 : 1077 : if (STMT_VINFO_GROUPED_ACCESS (rep)
7086 : 2154 : && (DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (rep))
7087 : 1077 : == SLP_TREE_LANES (node)))
7088 : 563 : return fallback_cost;
7089 : : return 0;
7090 : : }
7091 : : return -1;
7092 : : }
7093 : :
7094 : : /* See the comment above the corresponding VEC_PERM_EXPR handling. */
7095 : 33146 : return n_perms == 0 ? 0 : 1;
7096 : 34628 : }
7097 : :
7098 : : return 0;
7099 : : }
7100 : :
7101 : : /* Decide which element layouts we should consider using. Calculate the
7102 : : weights associated with inserting layout changes on partition edges.
7103 : : Also mark partitions that cannot change layout, by setting their
7104 : : layout to zero. */
7105 : :
7106 : : void
7107 : 625746 : vect_optimize_slp_pass::start_choosing_layouts ()
7108 : : {
7109 : : /* Used to assign unique permutation indices. */
7110 : 625746 : using perm_hash = unbounded_hashmap_traits<
7111 : : vec_free_hash_base<int_hash_base<unsigned>>,
7112 : : int_hash<int, -1, -2>
7113 : : >;
7114 : 625746 : hash_map<vec<unsigned>, int, perm_hash> layout_ids;
7115 : :
7116 : : /* Layout 0 is "no change". */
7117 : 625746 : m_perms.safe_push (vNULL);
7118 : :
7119 : : /* Create layouts from existing permutations. */
7120 : 625746 : auto_load_permutation_t tmp_perm;
7121 : 5491142 : for (unsigned int node_i : m_partitioned_nodes)
7122 : : {
7123 : : /* Leafs also double as entries to the reverse graph. Allow the
7124 : : layout of those to be changed. */
7125 : 3613904 : auto &vertex = m_vertices[node_i];
7126 : 3613904 : auto &partition = m_partitions[vertex.partition];
7127 : 3613904 : if (!m_slpg->vertices[node_i].succ)
7128 : 976041 : partition.layout = 0;
7129 : :
7130 : : /* Loads and VEC_PERM_EXPRs are the only things generating permutes. */
7131 : 3613904 : slp_tree node = vertex.node;
7132 : 3613904 : stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
7133 : 3613904 : slp_tree child;
7134 : 3613904 : unsigned HOST_WIDE_INT imin, imax = 0;
7135 : 3613904 : bool any_permute = false;
7136 : 3613904 : tmp_perm.truncate (0);
7137 : 3613904 : if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
7138 : : {
7139 : : /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the node
7140 : : unpermuted, record a layout that reverses this permutation.
7141 : :
7142 : : We would need more work to cope with loads that are internally
7143 : : permuted and also have inputs (such as masks for
7144 : : IFN_MASK_LOADs). */
7145 : 486831 : gcc_assert (partition.layout == 0 && !m_slpg->vertices[node_i].succ);
7146 : 486831 : if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
7147 : : {
7148 : 343608 : partition.layout = -1;
7149 : 3598421 : continue;
7150 : : }
7151 : 143223 : dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
7152 : 143223 : imin = DR_GROUP_SIZE (dr_stmt) + 1;
7153 : 143223 : tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
7154 : : }
7155 : 6097596 : else if (SLP_TREE_PERMUTE_P (node)
7156 : 171689 : && SLP_TREE_CHILDREN (node).length () == 1
7157 : 156550 : && (child = SLP_TREE_CHILDREN (node)[0])
7158 : 3283623 : && (TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (child))
7159 : 156550 : .is_constant (&imin)))
7160 : : {
7161 : : /* If the child has the same vector size as this node,
7162 : : reversing the permutation can make the permutation a no-op.
7163 : : In other cases it can change a true permutation into a
7164 : : full-vector extract. */
7165 : 156550 : tmp_perm.reserve (SLP_TREE_LANES (node));
7166 : 411351 : for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
7167 : 254801 : tmp_perm.quick_push (SLP_TREE_LANE_PERMUTATION (node)[j].second);
7168 : : }
7169 : : else
7170 : 2970523 : continue;
7171 : :
7172 : 795741 : for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
7173 : : {
7174 : 495968 : unsigned idx = tmp_perm[j];
7175 : 495968 : imin = MIN (imin, idx);
7176 : 495968 : imax = MAX (imax, idx);
7177 : 495968 : if (idx - tmp_perm[0] != j)
7178 : 153354 : any_permute = true;
7179 : : }
7180 : : /* If the span doesn't match we'd disrupt VF computation, avoid
7181 : : that for now. */
7182 : 299773 : if (imax - imin + 1 != SLP_TREE_LANES (node))
7183 : 99910 : continue;
7184 : : /* If there's no permute no need to split one out. In this case
7185 : : we can consider turning a load into a permuted load, if that
7186 : : turns out to be cheaper than alternatives. */
7187 : 199863 : if (!any_permute)
7188 : : {
7189 : 184234 : partition.layout = -1;
7190 : 184234 : continue;
7191 : : }
7192 : :
7193 : : /* For now only handle true permutes, like
7194 : : vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
7195 : : when permuting constants and invariants keeping the permute
7196 : : bijective. */
7197 : 15629 : auto_sbitmap load_index (SLP_TREE_LANES (node));
7198 : 15629 : bitmap_clear (load_index);
7199 : 60753 : for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
7200 : 45124 : bitmap_set_bit (load_index, tmp_perm[j] - imin);
7201 : : unsigned j;
7202 : 60044 : for (j = 0; j < SLP_TREE_LANES (node); ++j)
7203 : 44561 : if (!bitmap_bit_p (load_index, j))
7204 : : break;
7205 : 15629 : if (j != SLP_TREE_LANES (node))
7206 : 146 : continue;
7207 : :
7208 : 15483 : vec<unsigned> perm = vNULL;
7209 : 15483 : perm.safe_grow (SLP_TREE_LANES (node), true);
7210 : 59796 : for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
7211 : 44313 : perm[j] = tmp_perm[j] - imin;
7212 : :
7213 : 30966 : if (int (m_perms.length ()) >= param_vect_max_layout_candidates)
7214 : : {
7215 : : /* Continue to use existing layouts, but don't add any more. */
7216 : 0 : int *entry = layout_ids.get (perm);
7217 : 0 : partition.layout = entry ? *entry : 0;
7218 : 0 : perm.release ();
7219 : : }
7220 : : else
7221 : : {
7222 : 15483 : bool existed;
7223 : 15483 : int &layout_i = layout_ids.get_or_insert (perm, &existed);
7224 : 15483 : if (existed)
7225 : 5385 : perm.release ();
7226 : : else
7227 : : {
7228 : 10098 : layout_i = m_perms.length ();
7229 : 10098 : m_perms.safe_push (perm);
7230 : : }
7231 : 15483 : partition.layout = layout_i;
7232 : : }
7233 : 15629 : }
7234 : :
7235 : : /* Initially assume that every layout is possible and has zero cost
7236 : : in every partition. */
7237 : 625746 : m_partition_layout_costs.safe_grow_cleared (m_partitions.length ()
7238 : 1251492 : * m_perms.length ());
7239 : :
7240 : : /* We have to mark outgoing permutations facing non-associating-reduction
7241 : : graph entries that are not represented as to be materialized.
7242 : : slp_inst_kind_bb_reduc currently only covers associatable reductions. */
7243 : 3547055 : for (slp_instance instance : m_vinfo->slp_instances)
7244 : 1669817 : if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor)
7245 : : {
7246 : 5403 : unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
7247 : 5403 : m_partitions[m_vertices[node_i].partition].layout = 0;
7248 : : }
7249 : 1664414 : else if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_reduc_chain)
7250 : : {
7251 : 1379 : stmt_vec_info stmt_info
7252 : 1379 : = SLP_TREE_REPRESENTATIVE (SLP_INSTANCE_TREE (instance));
7253 : 1379 : vect_reduc_info reduc_info
7254 : 1379 : = info_for_reduction (as_a <loop_vec_info> (m_vinfo),
7255 : : SLP_INSTANCE_TREE (instance));
7256 : 1379 : if (needs_fold_left_reduction_p (TREE_TYPE
7257 : : (gimple_get_lhs (stmt_info->stmt)),
7258 : : VECT_REDUC_INFO_CODE (reduc_info)))
7259 : : {
7260 : 64 : unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
7261 : 64 : m_partitions[m_vertices[node_i].partition].layout = 0;
7262 : : }
7263 : : }
7264 : :
7265 : : /* Check which layouts each node and partition can handle. Calculate the
7266 : : weights associated with inserting layout changes on edges. */
7267 : 5491142 : for (unsigned int node_i : m_partitioned_nodes)
7268 : : {
7269 : 3613904 : auto &vertex = m_vertices[node_i];
7270 : 3613904 : auto &partition = m_partitions[vertex.partition];
7271 : 3613904 : slp_tree node = vertex.node;
7272 : :
7273 : 3613904 : if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
7274 : : {
7275 : 3609633 : vertex.weight = vect_slp_node_weight (node);
7276 : :
7277 : : /* We do not handle stores with a permutation, so all
7278 : : incoming permutations must have been materialized.
7279 : :
7280 : : We also don't handle masked grouped loads, which lack a
7281 : : permutation vector. In this case the memory locations
7282 : : form an implicit second input to the loads, on top of the
7283 : : explicit mask input, and the memory input's layout cannot
7284 : : be changed.
7285 : :
7286 : : On the other hand, we do support permuting gather loads and
7287 : : masked gather loads, where each scalar load is independent
7288 : : of the others. This can be useful if the address/index input
7289 : : benefits from permutation. */
7290 : 3609633 : if (STMT_VINFO_DATA_REF (rep)
7291 : 1647532 : && STMT_VINFO_GROUPED_ACCESS (rep)
7292 : 4716609 : && !SLP_TREE_LOAD_PERMUTATION (node).exists ())
7293 : 963753 : partition.layout = 0;
7294 : :
7295 : : /* We cannot change the layout of an operation that is
7296 : : not independent on lanes. Note this is an explicit
7297 : : negative list since that's much shorter than the respective
7298 : : positive one but it's critical to keep maintaining it. */
7299 : 3609633 : if (is_gimple_call (STMT_VINFO_STMT (rep)))
7300 : 23329 : switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep)))
7301 : : {
7302 : 1062 : case CFN_COMPLEX_ADD_ROT90:
7303 : 1062 : case CFN_COMPLEX_ADD_ROT270:
7304 : 1062 : case CFN_COMPLEX_MUL:
7305 : 1062 : case CFN_COMPLEX_MUL_CONJ:
7306 : 1062 : case CFN_VEC_ADDSUB:
7307 : 1062 : case CFN_VEC_FMADDSUB:
7308 : 1062 : case CFN_VEC_FMSUBADD:
7309 : 1062 : partition.layout = 0;
7310 : : default:;
7311 : : }
7312 : : }
7313 : :
7314 : 7997056 : auto process_edge = [&](graph_edge *ud, unsigned int other_node_i)
7315 : : {
7316 : 4383152 : auto &other_vertex = m_vertices[other_node_i];
7317 : :
7318 : : /* Count the number of edges from earlier partitions and the number
7319 : : of edges to later partitions. */
7320 : 4383152 : if (other_vertex.partition < vertex.partition)
7321 : 2191576 : partition.in_degree += 1;
7322 : : else
7323 : 2191576 : partition.out_degree += 1;
7324 : :
7325 : : /* If the current node uses the result of OTHER_NODE_I, accumulate
7326 : : the effects of that. */
7327 : 4383152 : if (ud->src == int (node_i))
7328 : : {
7329 : 2191576 : other_vertex.out_weight += vertex.weight;
7330 : 2191576 : other_vertex.out_degree += 1;
7331 : : }
7332 : 7997056 : };
7333 : 3613904 : for_each_partition_edge (node_i, process_edge);
7334 : : }
7335 : 625746 : }
7336 : :
7337 : : /* Return the incoming costs for node NODE_I, assuming that each input keeps
7338 : : its current (provisional) choice of layout. The inputs do not necessarily
7339 : : have the same layout as each other. */
7340 : :
7341 : : slpg_layout_cost
7342 : 3035 : vect_optimize_slp_pass::total_in_cost (unsigned int node_i)
7343 : : {
7344 : 3035 : auto &vertex = m_vertices[node_i];
7345 : 3035 : slpg_layout_cost cost;
7346 : 11081 : auto add_cost = [&](graph_edge *, unsigned int other_node_i)
7347 : : {
7348 : 8046 : auto &other_vertex = m_vertices[other_node_i];
7349 : 8046 : if (other_vertex.partition < vertex.partition)
7350 : : {
7351 : 5110 : auto &other_partition = m_partitions[other_vertex.partition];
7352 : 10220 : auto &other_costs = partition_layout_costs (other_vertex.partition,
7353 : 5110 : other_partition.layout);
7354 : 5110 : slpg_layout_cost this_cost = other_costs.in_cost;
7355 : 5110 : this_cost.add_serial_cost (other_costs.internal_cost);
7356 : 5110 : this_cost.split (other_partition.out_degree);
7357 : 5110 : cost.add_parallel_cost (this_cost);
7358 : : }
7359 : 11081 : };
7360 : 3035 : for_each_partition_edge (node_i, add_cost);
7361 : 3035 : return cost;
7362 : : }
7363 : :
7364 : : /* Return the cost of switching between layout LAYOUT1_I (at node NODE1_I)
7365 : : and layout LAYOUT2_I on cross-partition use-to-def edge UD. Return
7366 : : slpg_layout_cost::impossible () if the change isn't possible. */
7367 : :
7368 : : slpg_layout_cost
7369 : 673385 : vect_optimize_slp_pass::
7370 : : edge_layout_cost (graph_edge *ud, unsigned int node1_i, unsigned int layout1_i,
7371 : : unsigned int layout2_i)
7372 : : {
7373 : 673385 : auto &def_vertex = m_vertices[ud->dest];
7374 : 673385 : auto &use_vertex = m_vertices[ud->src];
7375 : 673385 : auto def_layout_i = ud->dest == int (node1_i) ? layout1_i : layout2_i;
7376 : 673385 : auto use_layout_i = ud->dest == int (node1_i) ? layout2_i : layout1_i;
7377 : 673385 : auto factor = change_layout_cost (def_vertex.node, def_layout_i,
7378 : : use_layout_i);
7379 : 673385 : if (factor < 0)
7380 : 5009 : return slpg_layout_cost::impossible ();
7381 : :
7382 : : /* We have a choice of putting the layout change at the site of the
7383 : : definition or at the site of the use. Prefer the former when
7384 : : optimizing for size or when the execution frequency of the
7385 : : definition is no greater than the combined execution frequencies of
7386 : : the uses. When putting the layout change at the site of the definition,
7387 : : divvy up the cost among all consumers. */
7388 : 668376 : if (m_optimize_size || def_vertex.weight <= def_vertex.out_weight)
7389 : : {
7390 : 650570 : slpg_layout_cost cost = { def_vertex.weight * factor, m_optimize_size };
7391 : 650570 : cost.split (def_vertex.out_degree);
7392 : 650570 : return cost;
7393 : : }
7394 : 17806 : return { use_vertex.weight * factor, m_optimize_size };
7395 : : }
7396 : :
7397 : : /* UD represents a use-def link between FROM_NODE_I and a node in a later
7398 : : partition; FROM_NODE_I could be the definition node or the use node.
7399 : : The node at the other end of the link wants to use layout TO_LAYOUT_I.
7400 : : Return the cost of any necessary fix-ups on edge UD, or return
7401 : : slpg_layout_cost::impossible () if the change isn't possible.
7402 : :
7403 : : At this point, FROM_NODE_I's partition has chosen the cheapest
7404 : : layout based on the information available so far, but this choice
7405 : : is only provisional. */
7406 : :
7407 : : slpg_layout_cost
7408 : 176710 : vect_optimize_slp_pass::forward_cost (graph_edge *ud, unsigned int from_node_i,
7409 : : unsigned int to_layout_i)
7410 : : {
7411 : 176710 : auto &from_vertex = m_vertices[from_node_i];
7412 : 176710 : unsigned int from_partition_i = from_vertex.partition;
7413 : 176710 : slpg_partition_info &from_partition = m_partitions[from_partition_i];
7414 : 176710 : gcc_assert (from_partition.layout >= 0);
7415 : :
7416 : : /* First calculate the cost on the assumption that FROM_PARTITION sticks
7417 : : with its current layout preference. */
7418 : 176710 : slpg_layout_cost cost = slpg_layout_cost::impossible ();
7419 : 176710 : auto edge_cost = edge_layout_cost (ud, from_node_i,
7420 : 176710 : from_partition.layout, to_layout_i);
7421 : 176710 : if (edge_cost.is_possible ())
7422 : : {
7423 : 348280 : auto &from_costs = partition_layout_costs (from_partition_i,
7424 : 174140 : from_partition.layout);
7425 : 174140 : cost = from_costs.in_cost;
7426 : 174140 : cost.add_serial_cost (from_costs.internal_cost);
7427 : 174140 : cost.split (from_partition.out_degree);
7428 : 174140 : cost.add_serial_cost (edge_cost);
7429 : : }
7430 : 2570 : else if (from_partition.layout == 0)
7431 : : /* We must allow the source partition to have layout 0 as a fallback,
7432 : : in case all other options turn out to be impossible. */
7433 : 2570 : return cost;
7434 : :
7435 : : /* Take the minimum of that cost and the cost that applies if
7436 : : FROM_PARTITION instead switches to TO_LAYOUT_I. */
7437 : 174140 : auto &direct_layout_costs = partition_layout_costs (from_partition_i,
7438 : : to_layout_i);
7439 : 174140 : if (direct_layout_costs.is_possible ())
7440 : : {
7441 : 157561 : slpg_layout_cost direct_cost = direct_layout_costs.in_cost;
7442 : 157561 : direct_cost.add_serial_cost (direct_layout_costs.internal_cost);
7443 : 157561 : direct_cost.split (from_partition.out_degree);
7444 : 157561 : if (!cost.is_possible ()
7445 : 157561 : || direct_cost.is_better_than (cost, m_optimize_size))
7446 : 42018 : cost = direct_cost;
7447 : : }
7448 : :
7449 : 174140 : return cost;
7450 : : }
7451 : :
7452 : : /* UD represents a use-def link between TO_NODE_I and a node in an earlier
7453 : : partition; TO_NODE_I could be the definition node or the use node.
7454 : : The node at the other end of the link wants to use layout FROM_LAYOUT_I;
7455 : : return the cost of any necessary fix-ups on edge UD, or
7456 : : slpg_layout_cost::impossible () if the choice cannot be made.
7457 : :
7458 : : At this point, TO_NODE_I's partition has a fixed choice of layout. */
7459 : :
7460 : : slpg_layout_cost
7461 : 164003 : vect_optimize_slp_pass::backward_cost (graph_edge *ud, unsigned int to_node_i,
7462 : : unsigned int from_layout_i)
7463 : : {
7464 : 164003 : auto &to_vertex = m_vertices[to_node_i];
7465 : 164003 : unsigned int to_partition_i = to_vertex.partition;
7466 : 164003 : slpg_partition_info &to_partition = m_partitions[to_partition_i];
7467 : 164003 : gcc_assert (to_partition.layout >= 0);
7468 : :
7469 : : /* If TO_NODE_I is a VEC_PERM_EXPR consumer, see whether it can be
7470 : : adjusted for this input having layout FROM_LAYOUT_I. Assume that
7471 : : any other inputs keep their current choice of layout. */
7472 : 164003 : auto &to_costs = partition_layout_costs (to_partition_i,
7473 : : to_partition.layout);
7474 : 164003 : if (ud->src == int (to_node_i)
7475 : 163841 : && SLP_TREE_PERMUTE_P (to_vertex.node))
7476 : : {
7477 : 9105 : auto &from_partition = m_partitions[m_vertices[ud->dest].partition];
7478 : 9105 : auto old_layout = from_partition.layout;
7479 : 9105 : from_partition.layout = from_layout_i;
7480 : 18210 : int factor = internal_node_cost (to_vertex.node, -1,
7481 : 9105 : to_partition.layout);
7482 : 9105 : from_partition.layout = old_layout;
7483 : 9105 : if (factor >= 0)
7484 : : {
7485 : 8495 : slpg_layout_cost cost = to_costs.out_cost;
7486 : 16990 : cost.add_serial_cost ({ to_vertex.weight * factor,
7487 : 8495 : m_optimize_size });
7488 : 8495 : cost.split (to_partition.in_degree);
7489 : 8495 : return cost;
7490 : : }
7491 : : }
7492 : :
7493 : : /* Compute the cost if we insert any necessary layout change on edge UD. */
7494 : 155508 : auto edge_cost = edge_layout_cost (ud, to_node_i,
7495 : 155508 : to_partition.layout, from_layout_i);
7496 : 155508 : if (edge_cost.is_possible ())
7497 : : {
7498 : 155508 : slpg_layout_cost cost = to_costs.out_cost;
7499 : 155508 : cost.add_serial_cost (to_costs.internal_cost);
7500 : 155508 : cost.split (to_partition.in_degree);
7501 : 155508 : cost.add_serial_cost (edge_cost);
7502 : 155508 : return cost;
7503 : : }
7504 : :
7505 : 0 : return slpg_layout_cost::impossible ();
7506 : : }
7507 : :
7508 : : /* Make a forward pass through the partitions, accumulating input costs.
7509 : : Make a tentative (provisional) choice of layout for each partition,
7510 : : ensuring that this choice still allows later partitions to keep
7511 : : their original layout. */
7512 : :
7513 : : void
7514 : 5193 : vect_optimize_slp_pass::forward_pass ()
7515 : : {
7516 : 112830 : for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
7517 : : ++partition_i)
7518 : : {
7519 : 107637 : auto &partition = m_partitions[partition_i];
7520 : :
7521 : : /* If the partition consists of a single VEC_PERM_EXPR, precompute
7522 : : the incoming cost that would apply if every predecessor partition
7523 : : keeps its current layout. This is used within the loop below. */
7524 : 107637 : slpg_layout_cost in_cost;
7525 : 107637 : slp_tree single_node = nullptr;
7526 : 107637 : if (partition.node_end == partition.node_begin + 1)
7527 : : {
7528 : 103752 : unsigned int node_i = m_partitioned_nodes[partition.node_begin];
7529 : 103752 : single_node = m_vertices[node_i].node;
7530 : 103752 : if (SLP_TREE_PERMUTE_P (single_node))
7531 : 3035 : in_cost = total_in_cost (node_i);
7532 : : }
7533 : :
7534 : : /* Go through the possible layouts. Decide which ones are valid
7535 : : for this partition and record which of the valid layouts has
7536 : : the lowest cost. */
7537 : 107637 : unsigned int min_layout_i = 0;
7538 : 107637 : slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
7539 : 328846 : for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
7540 : : {
7541 : 221209 : auto &layout_costs = partition_layout_costs (partition_i, layout_i);
7542 : 221209 : if (!layout_costs.is_possible ())
7543 : 49237 : continue;
7544 : :
7545 : : /* If the recorded layout is already 0 then the layout cannot
7546 : : change. */
7547 : 221209 : if (partition.layout == 0 && layout_i != 0)
7548 : : {
7549 : 35603 : layout_costs.mark_impossible ();
7550 : 35603 : continue;
7551 : : }
7552 : :
7553 : 185606 : bool is_possible = true;
7554 : 376559 : for (unsigned int order_i = partition.node_begin;
7555 : 376559 : order_i < partition.node_end; ++order_i)
7556 : : {
7557 : 202360 : unsigned int node_i = m_partitioned_nodes[order_i];
7558 : 202360 : auto &vertex = m_vertices[node_i];
7559 : :
7560 : : /* Reject the layout if it is individually incompatible
7561 : : with any node in the partition. */
7562 : 202360 : if (!is_compatible_layout (vertex.node, layout_i))
7563 : : {
7564 : 10401 : is_possible = false;
7565 : 11407 : break;
7566 : : }
7567 : :
7568 : 535821 : auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
7569 : : {
7570 : 343862 : auto &other_vertex = m_vertices[other_node_i];
7571 : 343862 : if (other_vertex.partition < vertex.partition)
7572 : : {
7573 : : /* Accumulate the incoming costs from earlier
7574 : : partitions, plus the cost of any layout changes
7575 : : on UD itself. */
7576 : 176710 : auto cost = forward_cost (ud, other_node_i, layout_i);
7577 : 176710 : if (!cost.is_possible ())
7578 : 2570 : is_possible = false;
7579 : : else
7580 : 174140 : layout_costs.in_cost.add_parallel_cost (cost);
7581 : : }
7582 : : else
7583 : : /* Reject the layout if it would make layout 0 impossible
7584 : : for later partitions. This amounts to testing that the
7585 : : target supports reversing the layout change on edges
7586 : : to later partitions.
7587 : :
7588 : : In principle, it might be possible to push a layout
7589 : : change all the way down a graph, so that it never
7590 : : needs to be reversed and so that the target doesn't
7591 : : need to support the reverse operation. But it would
7592 : : be awkward to bail out if we hit a partition that
7593 : : does not support the new layout, especially since
7594 : : we are not dealing with a lattice. */
7595 : 167152 : is_possible &= edge_layout_cost (ud, other_node_i, 0,
7596 : 167152 : layout_i).is_possible ();
7597 : 535821 : };
7598 : 191959 : for_each_partition_edge (node_i, add_cost);
7599 : :
7600 : : /* Accumulate the cost of using LAYOUT_I within NODE,
7601 : : both for the inputs and the outputs. */
7602 : 191959 : int factor = internal_node_cost (vertex.node, layout_i,
7603 : : layout_i);
7604 : 191959 : if (factor < 0)
7605 : : {
7606 : 1006 : is_possible = false;
7607 : 1006 : break;
7608 : : }
7609 : 190953 : else if (factor)
7610 : 30817 : layout_costs.internal_cost.add_serial_cost
7611 : 30817 : ({ vertex.weight * factor, m_optimize_size });
7612 : : }
7613 : 185606 : if (!is_possible)
7614 : : {
7615 : 13634 : layout_costs.mark_impossible ();
7616 : 13634 : continue;
7617 : : }
7618 : :
7619 : : /* Combine the incoming and partition-internal costs. */
7620 : 171972 : slpg_layout_cost combined_cost = layout_costs.in_cost;
7621 : 171972 : combined_cost.add_serial_cost (layout_costs.internal_cost);
7622 : :
7623 : : /* If this partition consists of a single VEC_PERM_EXPR, see
7624 : : if the VEC_PERM_EXPR can be changed to support output layout
7625 : : LAYOUT_I while keeping all the provisional choices of input
7626 : : layout. */
7627 : 171972 : if (single_node && SLP_TREE_PERMUTE_P (single_node))
7628 : : {
7629 : 5304 : int factor = internal_node_cost (single_node, -1, layout_i);
7630 : 5304 : if (factor >= 0)
7631 : : {
7632 : 4893 : auto weight = m_vertices[single_node->vertex].weight;
7633 : 4893 : slpg_layout_cost internal_cost
7634 : 4893 : = { weight * factor, m_optimize_size };
7635 : :
7636 : 4893 : slpg_layout_cost alt_cost = in_cost;
7637 : 4893 : alt_cost.add_serial_cost (internal_cost);
7638 : 4893 : if (alt_cost.is_better_than (combined_cost, m_optimize_size))
7639 : : {
7640 : 1552 : combined_cost = alt_cost;
7641 : 1552 : layout_costs.in_cost = in_cost;
7642 : 1552 : layout_costs.internal_cost = internal_cost;
7643 : : }
7644 : : }
7645 : : }
7646 : :
7647 : : /* Record the layout with the lowest cost. Prefer layout 0 in
7648 : : the event of a tie between it and another layout. */
7649 : 171972 : if (!min_layout_cost.is_possible ()
7650 : 64335 : || combined_cost.is_better_than (min_layout_cost,
7651 : 64335 : m_optimize_size))
7652 : : {
7653 : 120775 : min_layout_i = layout_i;
7654 : 120775 : min_layout_cost = combined_cost;
7655 : : }
7656 : : }
7657 : :
7658 : : /* This loop's handling of earlier partitions should ensure that
7659 : : choosing the original layout for the current partition is no
7660 : : less valid than it was in the original graph, even with the
7661 : : provisional layout choices for those earlier partitions. */
7662 : 107637 : gcc_assert (min_layout_cost.is_possible ());
7663 : 107637 : partition.layout = min_layout_i;
7664 : : }
7665 : 5193 : }
7666 : :
7667 : : /* Make a backward pass through the partitions, accumulating output costs.
7668 : : Make a final choice of layout for each partition. */
7669 : :
7670 : : void
7671 : 5193 : vect_optimize_slp_pass::backward_pass ()
7672 : : {
7673 : 118023 : for (unsigned int partition_i = m_partitions.length (); partition_i-- > 0;)
7674 : : {
7675 : 107637 : auto &partition = m_partitions[partition_i];
7676 : :
7677 : 107637 : unsigned int min_layout_i = 0;
7678 : 107637 : slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
7679 : 328846 : for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
7680 : : {
7681 : 221209 : auto &layout_costs = partition_layout_costs (partition_i, layout_i);
7682 : 221209 : if (!layout_costs.is_possible ())
7683 : 49237 : continue;
7684 : :
7685 : : /* Accumulate the costs from successor partitions. */
7686 : 171972 : bool is_possible = true;
7687 : 360650 : for (unsigned int order_i = partition.node_begin;
7688 : 360650 : order_i < partition.node_end; ++order_i)
7689 : : {
7690 : 188678 : unsigned int node_i = m_partitioned_nodes[order_i];
7691 : 188678 : auto &vertex = m_vertices[node_i];
7692 : 526696 : auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
7693 : : {
7694 : 338018 : auto &other_vertex = m_vertices[other_node_i];
7695 : 338018 : auto &other_partition = m_partitions[other_vertex.partition];
7696 : 338018 : if (other_vertex.partition > vertex.partition)
7697 : : {
7698 : : /* Accumulate the incoming costs from later
7699 : : partitions, plus the cost of any layout changes
7700 : : on UD itself. */
7701 : 164003 : auto cost = backward_cost (ud, other_node_i, layout_i);
7702 : 164003 : if (!cost.is_possible ())
7703 : 0 : is_possible = false;
7704 : : else
7705 : 164003 : layout_costs.out_cost.add_parallel_cost (cost);
7706 : : }
7707 : : else
7708 : : /* Make sure that earlier partitions can (if necessary
7709 : : or beneficial) keep the layout that they chose in
7710 : : the forward pass. This ensures that there is at
7711 : : least one valid choice of layout. */
7712 : 174015 : is_possible &= edge_layout_cost (ud, other_node_i,
7713 : 174015 : other_partition.layout,
7714 : 174015 : layout_i).is_possible ();
7715 : 526696 : };
7716 : 188678 : for_each_partition_edge (node_i, add_cost);
7717 : : }
7718 : 171972 : if (!is_possible)
7719 : : {
7720 : 0 : layout_costs.mark_impossible ();
7721 : 0 : continue;
7722 : : }
7723 : :
7724 : : /* Locally combine the costs from the forward and backward passes.
7725 : : (This combined cost is not passed on, since that would lead
7726 : : to double counting.) */
7727 : 171972 : slpg_layout_cost combined_cost = layout_costs.in_cost;
7728 : 171972 : combined_cost.add_serial_cost (layout_costs.internal_cost);
7729 : 171972 : combined_cost.add_serial_cost (layout_costs.out_cost);
7730 : :
7731 : : /* Record the layout with the lowest cost. Prefer layout 0 in
7732 : : the event of a tie between it and another layout. */
7733 : 171972 : if (!min_layout_cost.is_possible ()
7734 : 64335 : || combined_cost.is_better_than (min_layout_cost,
7735 : 64335 : m_optimize_size))
7736 : : {
7737 : 115372 : min_layout_i = layout_i;
7738 : 115372 : min_layout_cost = combined_cost;
7739 : : }
7740 : : }
7741 : :
7742 : 107637 : gcc_assert (min_layout_cost.is_possible ());
7743 : 107637 : partition.layout = min_layout_i;
7744 : : }
7745 : 5193 : }
7746 : :
7747 : : /* Return a node that applies layout TO_LAYOUT_I to the original form of NODE.
7748 : : NODE already has the layout that was selected for its partition. */
7749 : :
7750 : : slp_tree
7751 : 145494 : vect_optimize_slp_pass::get_result_with_layout (slp_tree node,
7752 : : unsigned int to_layout_i)
7753 : : {
7754 : 145494 : unsigned int result_i = node->vertex * m_perms.length () + to_layout_i;
7755 : 145494 : slp_tree result = m_node_layouts[result_i];
7756 : 145494 : if (result)
7757 : : return result;
7758 : :
7759 : 145023 : if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
7760 : 145023 : || (SLP_TREE_DEF_TYPE (node) == vect_external_def
7761 : : /* We can't permute vector defs in place. */
7762 : 20145 : && SLP_TREE_VEC_DEFS (node).is_empty ()))
7763 : : {
7764 : : /* If the vector is uniform or unchanged, there's nothing to do. */
7765 : 37836 : if (to_layout_i == 0 || vect_slp_tree_uniform_p (node))
7766 : : result = node;
7767 : : else
7768 : : {
7769 : 1928 : auto scalar_ops = SLP_TREE_SCALAR_OPS (node).copy ();
7770 : 1928 : result = vect_create_new_slp_node (scalar_ops);
7771 : 1928 : vect_slp_permute (m_perms[to_layout_i], scalar_ops, true);
7772 : : }
7773 : : }
7774 : : else
7775 : : {
7776 : 107187 : unsigned int partition_i = m_vertices[node->vertex].partition;
7777 : 107187 : unsigned int from_layout_i = m_partitions[partition_i].layout;
7778 : 107187 : if (from_layout_i == to_layout_i)
7779 : 106599 : return node;
7780 : :
7781 : : /* If NODE is itself a VEC_PERM_EXPR, try to create a parallel
7782 : : permutation instead of a serial one. Leave the new permutation
7783 : : in TMP_PERM on success. */
7784 : 588 : auto_lane_permutation_t tmp_perm;
7785 : 588 : unsigned int num_inputs = 1;
7786 : 588 : if (SLP_TREE_PERMUTE_P (node))
7787 : : {
7788 : 7 : tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
7789 : 7 : if (from_layout_i != 0)
7790 : 7 : vect_slp_permute (m_perms[from_layout_i], tmp_perm, false);
7791 : 7 : if (to_layout_i != 0)
7792 : 4 : vect_slp_permute (m_perms[to_layout_i], tmp_perm, true);
7793 : 7 : if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
7794 : : tmp_perm,
7795 : 7 : SLP_TREE_CHILDREN (node),
7796 : : false) >= 0)
7797 : 7 : num_inputs = SLP_TREE_CHILDREN (node).length ();
7798 : : else
7799 : 0 : tmp_perm.truncate (0);
7800 : : }
7801 : :
7802 : 588 : if (dump_enabled_p ())
7803 : : {
7804 : 68 : if (tmp_perm.length () > 0)
7805 : 6 : dump_printf_loc (MSG_NOTE, vect_location,
7806 : : "duplicating permutation node %p with"
7807 : : " layout %d\n",
7808 : : (void *) node, to_layout_i);
7809 : : else
7810 : 62 : dump_printf_loc (MSG_NOTE, vect_location,
7811 : : "inserting permutation node in place of %p\n",
7812 : : (void *) node);
7813 : : }
7814 : :
7815 : 588 : unsigned int num_lanes = SLP_TREE_LANES (node);
7816 : 588 : result = vect_create_new_slp_node (num_inputs, VEC_PERM_EXPR);
7817 : 588 : if (SLP_TREE_SCALAR_STMTS (node).length ())
7818 : : {
7819 : 587 : auto &stmts = SLP_TREE_SCALAR_STMTS (result);
7820 : 587 : stmts.safe_splice (SLP_TREE_SCALAR_STMTS (node));
7821 : 587 : if (from_layout_i != 0)
7822 : 321 : vect_slp_permute (m_perms[from_layout_i], stmts, false);
7823 : 587 : if (to_layout_i != 0)
7824 : 270 : vect_slp_permute (m_perms[to_layout_i], stmts, true);
7825 : : }
7826 : 588 : SLP_TREE_REPRESENTATIVE (result) = SLP_TREE_REPRESENTATIVE (node);
7827 : 588 : SLP_TREE_LANES (result) = num_lanes;
7828 : 588 : SLP_TREE_VECTYPE (result) = SLP_TREE_VECTYPE (node);
7829 : 588 : result->vertex = -1;
7830 : :
7831 : 588 : auto &lane_perm = SLP_TREE_LANE_PERMUTATION (result);
7832 : 588 : if (tmp_perm.length ())
7833 : : {
7834 : 7 : lane_perm.safe_splice (tmp_perm);
7835 : 7 : SLP_TREE_CHILDREN (result).safe_splice (SLP_TREE_CHILDREN (node));
7836 : : }
7837 : : else
7838 : : {
7839 : 581 : lane_perm.create (num_lanes);
7840 : 1807 : for (unsigned j = 0; j < num_lanes; ++j)
7841 : 1226 : lane_perm.quick_push ({ 0, j });
7842 : 581 : if (from_layout_i != 0)
7843 : 314 : vect_slp_permute (m_perms[from_layout_i], lane_perm, false);
7844 : 581 : if (to_layout_i != 0)
7845 : 267 : vect_slp_permute (m_perms[to_layout_i], lane_perm, true);
7846 : 581 : SLP_TREE_CHILDREN (result).safe_push (node);
7847 : : }
7848 : 2356 : for (slp_tree child : SLP_TREE_CHILDREN (result))
7849 : 592 : child->refcnt++;
7850 : 588 : }
7851 : 38424 : m_node_layouts[result_i] = result;
7852 : 38424 : return result;
7853 : : }
7854 : :
7855 : : /* Apply the chosen vector layouts to the SLP graph. */
7856 : :
7857 : : void
7858 : 9677 : vect_optimize_slp_pass::materialize ()
7859 : : {
7860 : : /* We no longer need the costs, so avoid having two O(N * P) arrays
7861 : : live at the same time. */
7862 : 9677 : m_partition_layout_costs.release ();
7863 : 29031 : m_node_layouts.safe_grow_cleared (m_vertices.length () * m_perms.length ());
7864 : :
7865 : 19354 : auto_sbitmap fully_folded (m_vertices.length ());
7866 : 9677 : bitmap_clear (fully_folded);
7867 : 153264 : for (unsigned int node_i : m_partitioned_nodes)
7868 : : {
7869 : 124233 : auto &vertex = m_vertices[node_i];
7870 : 124233 : slp_tree node = vertex.node;
7871 : 124233 : int layout_i = m_partitions[vertex.partition].layout;
7872 : 124233 : gcc_assert (layout_i >= 0);
7873 : :
7874 : : /* Rearrange the scalar statements to match the chosen layout. */
7875 : 124233 : if (layout_i > 0)
7876 : 14879 : vect_slp_permute (m_perms[layout_i],
7877 : 14879 : SLP_TREE_SCALAR_STMTS (node), true);
7878 : :
7879 : : /* Update load and lane permutations. */
7880 : 124233 : if (SLP_TREE_PERMUTE_P (node))
7881 : : {
7882 : : /* First try to absorb the input vector layouts. If that fails,
7883 : : force the inputs to have layout LAYOUT_I too. We checked that
7884 : : that was possible before deciding to use nonzero output layouts.
7885 : : (Note that at this stage we don't really have any guarantee that
7886 : : the target supports the original VEC_PERM_EXPR.) */
7887 : 4438 : auto &perm = SLP_TREE_LANE_PERMUTATION (node);
7888 : 4438 : auto_lane_permutation_t tmp_perm;
7889 : 4438 : tmp_perm.safe_splice (perm);
7890 : 4438 : change_vec_perm_layout (node, tmp_perm, -1, layout_i);
7891 : 4438 : if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
7892 : : tmp_perm,
7893 : 4438 : SLP_TREE_CHILDREN (node),
7894 : : false) >= 0)
7895 : : {
7896 : 4103 : if (dump_enabled_p ()
7897 : 4919 : && !std::equal (tmp_perm.begin (), tmp_perm.end (),
7898 : : perm.begin ()))
7899 : 58 : dump_printf_loc (MSG_NOTE, vect_location,
7900 : : "absorbing input layouts into %p\n",
7901 : : (void *) node);
7902 : 23390 : std::copy (tmp_perm.begin (), tmp_perm.end (), perm.begin ());
7903 : 4103 : bitmap_set_bit (fully_folded, node_i);
7904 : : }
7905 : : else
7906 : : {
7907 : : /* Not MSG_MISSED because it would make no sense to users. */
7908 : 335 : if (dump_enabled_p ())
7909 : 46 : dump_printf_loc (MSG_NOTE, vect_location,
7910 : : "failed to absorb input layouts into %p\n",
7911 : : (void *) node);
7912 : 335 : change_vec_perm_layout (nullptr, perm, layout_i, layout_i);
7913 : : }
7914 : 4438 : }
7915 : : else
7916 : : {
7917 : 119795 : gcc_assert (!SLP_TREE_LANE_PERMUTATION (node).exists ());
7918 : 119795 : auto &load_perm = SLP_TREE_LOAD_PERMUTATION (node);
7919 : 119795 : if (layout_i > 0)
7920 : : /* ??? When we handle non-bijective permutes the idea
7921 : : is that we can force the load-permutation to be
7922 : : { min, min + 1, min + 2, ... max }. But then the
7923 : : scalar defs might no longer match the lane content
7924 : : which means wrong-code with live lane vectorization.
7925 : : So we possibly have to have NULL entries for those. */
7926 : 14780 : vect_slp_permute (m_perms[layout_i], load_perm, true);
7927 : : }
7928 : : }
7929 : :
7930 : : /* Do this before any nodes disappear, since it involves a walk
7931 : : over the leaves. */
7932 : 9677 : remove_redundant_permutations ();
7933 : :
7934 : : /* Replace each child with a correctly laid-out version. */
7935 : 153264 : for (unsigned int node_i : m_partitioned_nodes)
7936 : : {
7937 : : /* Skip nodes that have already been handled above. */
7938 : 124233 : if (bitmap_bit_p (fully_folded, node_i))
7939 : 4103 : continue;
7940 : :
7941 : 120130 : auto &vertex = m_vertices[node_i];
7942 : 120130 : int in_layout_i = m_partitions[vertex.partition].layout;
7943 : 120130 : gcc_assert (in_layout_i >= 0);
7944 : :
7945 : : unsigned j;
7946 : : slp_tree child;
7947 : 359381 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (vertex.node), j, child)
7948 : : {
7949 : 149157 : if (!child)
7950 : 3663 : continue;
7951 : :
7952 : 145494 : slp_tree new_child = get_result_with_layout (child, in_layout_i);
7953 : 145494 : if (new_child != child)
7954 : : {
7955 : 2729 : vect_free_slp_tree (child);
7956 : 2729 : SLP_TREE_CHILDREN (vertex.node)[j] = new_child;
7957 : 2729 : new_child->refcnt += 1;
7958 : : }
7959 : : }
7960 : : }
7961 : 9677 : }
7962 : :
7963 : : /* Elide load permutations that are not necessary. Such permutations might
7964 : : be pre-existing, rather than created by the layout optimizations. */
7965 : :
7966 : : void
7967 : 625746 : vect_optimize_slp_pass::remove_redundant_permutations ()
7968 : : {
7969 : 4562838 : for (unsigned int node_i : m_leafs)
7970 : : {
7971 : 2685600 : slp_tree node = m_vertices[node_i].node;
7972 : 2685600 : if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
7973 : 2198769 : continue;
7974 : :
7975 : : /* In basic block vectorization we allow any subchain of an interleaving
7976 : : chain.
7977 : : FORNOW: not in loop SLP because of realignment complications. */
7978 : 486831 : if (is_a <bb_vec_info> (m_vinfo))
7979 : : {
7980 : 151708 : bool subchain_p = true;
7981 : : stmt_vec_info next_load_info = NULL;
7982 : : stmt_vec_info load_info;
7983 : : unsigned j;
7984 : 151708 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
7985 : : {
7986 : 123105 : if (j != 0
7987 : 123105 : && (next_load_info != load_info
7988 : 57503 : || ! load_info
7989 : 57503 : || DR_GROUP_GAP (load_info) != 1))
7990 : : {
7991 : : subchain_p = false;
7992 : : break;
7993 : : }
7994 : 101431 : next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
7995 : : }
7996 : 50277 : if (subchain_p)
7997 : : {
7998 : 28603 : SLP_TREE_LOAD_PERMUTATION (node).release ();
7999 : 28603 : continue;
8000 : : }
8001 : : }
8002 : : else
8003 : : {
8004 : 436554 : loop_vec_info loop_vinfo = as_a<loop_vec_info> (m_vinfo);
8005 : 436554 : bool this_load_permuted = !vect_load_perm_consecutive_p (node);
8006 : : /* When this isn't a grouped access we know it's single element
8007 : : and contiguous. */
8008 : 436554 : if (!STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (node)[0]))
8009 : : {
8010 : 343608 : if (!this_load_permuted
8011 : 343608 : && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
8012 : 343063 : || SLP_TREE_LANES (node) == 1))
8013 : 342884 : SLP_TREE_LOAD_PERMUTATION (node).release ();
8014 : 343608 : continue;
8015 : : }
8016 : 92946 : stmt_vec_info first_stmt_info
8017 : 92946 : = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
8018 : 93494 : if (!this_load_permuted
8019 : : /* The load requires permutation when unrolling exposes
8020 : : a gap either because the group is larger than the SLP
8021 : : group-size or because there is a gap between the groups. */
8022 : 92946 : && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
8023 : 87014 : || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
8024 : 120 : && DR_GROUP_GAP (first_stmt_info) == 0)))
8025 : : {
8026 : 548 : SLP_TREE_LOAD_PERMUTATION (node).release ();
8027 : 548 : continue;
8028 : : }
8029 : : }
8030 : : }
8031 : 625746 : }
8032 : :
8033 : : /* Print the partition graph and layout information to the dump file. */
8034 : :
8035 : : void
8036 : 639 : vect_optimize_slp_pass::dump ()
8037 : : {
8038 : 639 : dump_printf_loc (MSG_NOTE, vect_location,
8039 : : "SLP optimize permutations:\n");
8040 : 1291 : for (unsigned int layout_i = 1; layout_i < m_perms.length (); ++layout_i)
8041 : : {
8042 : 652 : dump_printf_loc (MSG_NOTE, vect_location, " %d: { ", layout_i);
8043 : 652 : const char *sep = "";
8044 : 5537 : for (unsigned int idx : m_perms[layout_i])
8045 : : {
8046 : 3581 : dump_printf (MSG_NOTE, "%s%d", sep, idx);
8047 : 3581 : sep = ", ";
8048 : : }
8049 : 652 : dump_printf (MSG_NOTE, " }\n");
8050 : : }
8051 : 639 : dump_printf_loc (MSG_NOTE, vect_location,
8052 : : "SLP optimize partitions:\n");
8053 : 5086 : for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
8054 : : ++partition_i)
8055 : : {
8056 : 4447 : auto &partition = m_partitions[partition_i];
8057 : 4447 : dump_printf_loc (MSG_NOTE, vect_location, " -------------\n");
8058 : 4447 : dump_printf_loc (MSG_NOTE, vect_location,
8059 : : " partition %d (layout %d):\n",
8060 : : partition_i, partition.layout);
8061 : 4447 : dump_printf_loc (MSG_NOTE, vect_location, " nodes:\n");
8062 : 9116 : for (unsigned int order_i = partition.node_begin;
8063 : 9116 : order_i < partition.node_end; ++order_i)
8064 : : {
8065 : 4669 : auto &vertex = m_vertices[m_partitioned_nodes[order_i]];
8066 : 9338 : dump_printf_loc (MSG_NOTE, vect_location, " - %p:\n",
8067 : 4669 : (void *) vertex.node);
8068 : 4669 : dump_printf_loc (MSG_NOTE, vect_location,
8069 : : " weight: %f\n",
8070 : : vertex.weight.to_double ());
8071 : 4669 : if (vertex.out_degree)
8072 : 3594 : dump_printf_loc (MSG_NOTE, vect_location,
8073 : : " out weight: %f (degree %d)\n",
8074 : : vertex.out_weight.to_double (),
8075 : : vertex.out_degree);
8076 : 4669 : if (SLP_TREE_PERMUTE_P (vertex.node))
8077 : 454 : dump_printf_loc (MSG_NOTE, vect_location,
8078 : : " op: VEC_PERM_EXPR\n");
8079 : 4215 : else if (auto rep = SLP_TREE_REPRESENTATIVE (vertex.node))
8080 : 4197 : dump_printf_loc (MSG_NOTE, vect_location,
8081 : : " op template: %G", rep->stmt);
8082 : : }
8083 : 4447 : dump_printf_loc (MSG_NOTE, vect_location, " edges:\n");
8084 : 9116 : for (unsigned int order_i = partition.node_begin;
8085 : 9116 : order_i < partition.node_end; ++order_i)
8086 : : {
8087 : 4669 : unsigned int node_i = m_partitioned_nodes[order_i];
8088 : 4669 : auto &vertex = m_vertices[node_i];
8089 : 13965 : auto print_edge = [&](graph_edge *, unsigned int other_node_i)
8090 : : {
8091 : 9296 : auto &other_vertex = m_vertices[other_node_i];
8092 : 9296 : if (other_vertex.partition < vertex.partition)
8093 : 4648 : dump_printf_loc (MSG_NOTE, vect_location,
8094 : : " - %p [%d] --> %p\n",
8095 : 4648 : (void *) other_vertex.node,
8096 : : other_vertex.partition,
8097 : 4648 : (void *) vertex.node);
8098 : : else
8099 : 4648 : dump_printf_loc (MSG_NOTE, vect_location,
8100 : : " - %p --> [%d] %p\n",
8101 : 4648 : (void *) vertex.node,
8102 : : other_vertex.partition,
8103 : 4648 : (void *) other_vertex.node);
8104 : 13965 : };
8105 : 4669 : for_each_partition_edge (node_i, print_edge);
8106 : : }
8107 : :
8108 : 13540 : for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
8109 : : {
8110 : 9093 : auto &layout_costs = partition_layout_costs (partition_i, layout_i);
8111 : 9093 : if (layout_costs.is_possible ())
8112 : : {
8113 : 7394 : dump_printf_loc (MSG_NOTE, vect_location,
8114 : : " layout %d:%s\n", layout_i,
8115 : 7394 : partition.layout == int (layout_i)
8116 : : ? " (*)" : "");
8117 : 7394 : slpg_layout_cost combined_cost = layout_costs.in_cost;
8118 : 7394 : combined_cost.add_serial_cost (layout_costs.internal_cost);
8119 : 7394 : combined_cost.add_serial_cost (layout_costs.out_cost);
8120 : : #define TEMPLATE "{depth: %f, total: %f}"
8121 : 7394 : dump_printf_loc (MSG_NOTE, vect_location,
8122 : : " " TEMPLATE "\n",
8123 : : layout_costs.in_cost.depth.to_double (),
8124 : : layout_costs.in_cost.total.to_double ());
8125 : 7394 : dump_printf_loc (MSG_NOTE, vect_location,
8126 : : " + " TEMPLATE "\n",
8127 : : layout_costs.internal_cost.depth.to_double (),
8128 : : layout_costs.internal_cost.total.to_double ());
8129 : 7394 : dump_printf_loc (MSG_NOTE, vect_location,
8130 : : " + " TEMPLATE "\n",
8131 : : layout_costs.out_cost.depth.to_double (),
8132 : : layout_costs.out_cost.total.to_double ());
8133 : 7394 : dump_printf_loc (MSG_NOTE, vect_location,
8134 : : " = " TEMPLATE "\n",
8135 : : combined_cost.depth.to_double (),
8136 : : combined_cost.total.to_double ());
8137 : : #undef TEMPLATE
8138 : : }
8139 : : else
8140 : 1699 : dump_printf_loc (MSG_NOTE, vect_location,
8141 : : " layout %d: rejected\n", layout_i);
8142 : : }
8143 : : }
8144 : 639 : }
8145 : :
8146 : : /* Masked load lanes discovery. */
8147 : :
8148 : : void
8149 : 625746 : vect_optimize_slp_pass::decide_masked_load_lanes ()
8150 : : {
8151 : 7139237 : for (auto v : m_vertices)
8152 : : {
8153 : 5261999 : slp_tree node = v.node;
8154 : 5261999 : if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
8155 : 3612503 : || SLP_TREE_PERMUTE_P (node))
8156 : 1821771 : continue;
8157 : 3440228 : stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
8158 : 1490522 : if (! STMT_VINFO_GROUPED_ACCESS (stmt_info)
8159 : : /* The mask has to be uniform. */
8160 : 950317 : || STMT_VINFO_SLP_VECT_ONLY (stmt_info)
8161 : 950251 : || ! is_a <gcall *> (STMT_VINFO_STMT (stmt_info))
8162 : 3440283 : || ! gimple_call_internal_p (STMT_VINFO_STMT (stmt_info),
8163 : : IFN_MASK_LOAD))
8164 : 3440225 : continue;
8165 : 3 : stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
8166 : 6 : if (STMT_VINFO_STRIDED_P (stmt_info)
8167 : 3 : || compare_step_with_zero (m_vinfo, stmt_info) <= 0
8168 : 3 : || vect_load_lanes_supported (SLP_TREE_VECTYPE (node),
8169 : 0 : DR_GROUP_SIZE (stmt_info),
8170 : : true) == IFN_LAST)
8171 : 3 : continue;
8172 : :
8173 : : /* Uniform masks need to be suitably represented. */
8174 : 0 : slp_tree mask = SLP_TREE_CHILDREN (node)[0];
8175 : 0 : if (!SLP_TREE_PERMUTE_P (mask)
8176 : 0 : || SLP_TREE_CHILDREN (mask).length () != 1)
8177 : 0 : continue;
8178 : 0 : bool match = true;
8179 : 0 : for (auto perm : SLP_TREE_LANE_PERMUTATION (mask))
8180 : 0 : if (perm.first != 0 || perm.second != 0)
8181 : : {
8182 : : match = false;
8183 : : break;
8184 : : }
8185 : 0 : if (!match)
8186 : 0 : continue;
8187 : :
8188 : : /* Now see if the consumer side matches. */
8189 : 0 : for (graph_edge *pred = m_slpg->vertices[node->vertex].pred;
8190 : 0 : pred; pred = pred->pred_next)
8191 : : {
8192 : 0 : slp_tree pred_node = m_vertices[pred->src].node;
8193 : : /* All consumers should be a permute with a single outgoing lane. */
8194 : 0 : if (!SLP_TREE_PERMUTE_P (pred_node)
8195 : 0 : || SLP_TREE_LANES (pred_node) != 1)
8196 : : {
8197 : : match = false;
8198 : : break;
8199 : : }
8200 : 0 : gcc_assert (SLP_TREE_CHILDREN (pred_node).length () == 1);
8201 : : }
8202 : 0 : if (!match)
8203 : 0 : continue;
8204 : : /* Now we can mark the nodes as to use load lanes. */
8205 : 0 : node->ldst_lanes = true;
8206 : 0 : for (graph_edge *pred = m_slpg->vertices[node->vertex].pred;
8207 : 0 : pred; pred = pred->pred_next)
8208 : 0 : m_vertices[pred->src].node->ldst_lanes = true;
8209 : : /* The catch is we have to massage the mask. We have arranged
8210 : : analyzed uniform masks to be represented by a splat VEC_PERM
8211 : : which we can now simply elide as we cannot easily re-do SLP
8212 : : discovery here. */
8213 : 0 : slp_tree new_mask = SLP_TREE_CHILDREN (mask)[0];
8214 : 0 : SLP_TREE_REF_COUNT (new_mask)++;
8215 : 0 : SLP_TREE_CHILDREN (node)[0] = new_mask;
8216 : 0 : vect_free_slp_tree (mask);
8217 : : }
8218 : 625746 : }
8219 : :
8220 : : /* Perform legitimizing attempts. This is intended to improve the
8221 : : situation when layout 0 is not valid which is a situation the cost
8222 : : based propagation does not handle well.
8223 : : Return true if further layout optimization is possible, false if
8224 : : the layout configuration should be considered final. */
8225 : :
8226 : : bool
8227 : 9677 : vect_optimize_slp_pass::legitimize ()
8228 : : {
8229 : : /* Perform a very simple legitimizing attempt by attempting to choose
8230 : : a single layout for all partitions that will make all permutations
8231 : : a noop. That should also be the optimal layout choice in case
8232 : : layout zero is legitimate.
8233 : : ??? Disconnected components of the SLP graph could have distinct
8234 : : single layouts. */
8235 : 9677 : int single_layout_i = -1;
8236 : 9677 : unsigned deferred_up_to = -1U;
8237 : 29298 : for (unsigned partition_i = 0; partition_i < m_partitions.length ();
8238 : : ++partition_i)
8239 : : {
8240 : 24808 : auto &partition = m_partitions[partition_i];
8241 : 24808 : if (single_layout_i == -1)
8242 : : {
8243 : 12994 : single_layout_i = partition.layout;
8244 : 12994 : deferred_up_to = partition_i;
8245 : : }
8246 : 11814 : else if (partition.layout == single_layout_i || partition.layout == -1)
8247 : : ;
8248 : : else
8249 : : single_layout_i = 0;
8250 : 21761 : if (single_layout_i == 0)
8251 : : return true;
8252 : :
8253 : 19681 : if (single_layout_i != -1
8254 : 19681 : && !is_compatible_layout (partition, single_layout_i))
8255 : : return true;
8256 : : }
8257 : :
8258 : 4490 : if (single_layout_i <= 0)
8259 : : return true;
8260 : :
8261 : 4606 : for (unsigned partition_i = 0; partition_i < deferred_up_to; ++partition_i)
8262 : 122 : if (!is_compatible_layout (m_partitions[partition_i],
8263 : : single_layout_i))
8264 : : return true;
8265 : :
8266 : 11343 : for (unsigned partition_i = 0; partition_i < m_partitions.length ();
8267 : : ++partition_i)
8268 : : {
8269 : 6859 : auto &partition = m_partitions[partition_i];
8270 : 6859 : partition.layout = single_layout_i;
8271 : : }
8272 : :
8273 : : return false;
8274 : : }
8275 : :
8276 : : /* Main entry point for the SLP graph optimization pass. */
8277 : :
8278 : : void
8279 : 625746 : vect_optimize_slp_pass::run ()
8280 : : {
8281 : 625746 : build_graph ();
8282 : 625746 : create_partitions ();
8283 : 625746 : start_choosing_layouts ();
8284 : 625746 : if (m_perms.length () > 1)
8285 : : {
8286 : 9677 : if (legitimize ())
8287 : : {
8288 : 5193 : forward_pass ();
8289 : 5193 : backward_pass ();
8290 : : }
8291 : 9677 : if (dump_enabled_p ())
8292 : 639 : dump ();
8293 : 9677 : materialize ();
8294 : 39129 : while (!m_perms.is_empty ())
8295 : 19775 : m_perms.pop ().release ();
8296 : : }
8297 : : else
8298 : 616069 : remove_redundant_permutations ();
8299 : 625746 : free_graph (m_slpg);
8300 : 625746 : build_graph ();
8301 : 625746 : decide_masked_load_lanes ();
8302 : 625746 : free_graph (m_slpg);
8303 : 625746 : }
8304 : :
8305 : : /* Apply CSE to NODE and its children using BST_MAP. */
8306 : :
8307 : : static void
8308 : 5677269 : vect_cse_slp_nodes (scalar_stmts_to_slp_tree_map_t *bst_map, slp_tree& node)
8309 : : {
8310 : 5677269 : bool put_p = false;
8311 : 5677269 : if (SLP_TREE_DEF_TYPE (node) == vect_internal_def
8312 : : /* Besides some VEC_PERM_EXPR, two-operator nodes also
8313 : : lack scalar stmts and thus CSE doesn't work via bst_map. Ideally
8314 : : we'd have sth that works for all internal and external nodes. */
8315 : 5677269 : && !SLP_TREE_SCALAR_STMTS (node).is_empty ())
8316 : : {
8317 : 4006098 : slp_tree *leader = bst_map->get (SLP_TREE_SCALAR_STMTS (node));
8318 : 4006098 : if (leader)
8319 : : {
8320 : : /* We've visited this node already. */
8321 : 417111 : if (!*leader || *leader == node)
8322 : : return;
8323 : :
8324 : 2394 : if (dump_enabled_p ())
8325 : 756 : dump_printf_loc (MSG_NOTE, vect_location,
8326 : : "re-using SLP tree %p for %p\n",
8327 : : (void *)*leader, (void *)node);
8328 : 2394 : vect_free_slp_tree (node);
8329 : 2394 : (*leader)->refcnt += 1;
8330 : 2394 : node = *leader;
8331 : 2394 : return;
8332 : : }
8333 : :
8334 : : /* Avoid creating a cycle by populating the map only after recursion. */
8335 : 3588987 : bst_map->put (SLP_TREE_SCALAR_STMTS (node).copy (), nullptr);
8336 : 3588987 : node->refcnt += 1;
8337 : 3588987 : put_p = true;
8338 : : /* And recurse. */
8339 : : }
8340 : :
8341 : 16040006 : for (slp_tree &child : SLP_TREE_CHILDREN (node))
8342 : 4789250 : if (child)
8343 : 4007452 : vect_cse_slp_nodes (bst_map, child);
8344 : :
8345 : : /* Now record the node for CSE in other siblings. */
8346 : 5260158 : if (put_p)
8347 : 3588987 : *bst_map->get (SLP_TREE_SCALAR_STMTS (node)) = node;
8348 : : }
8349 : :
8350 : : /* Optimize the SLP graph of VINFO. */
8351 : :
8352 : : void
8353 : 979620 : vect_optimize_slp (vec_info *vinfo)
8354 : : {
8355 : 979620 : if (vinfo->slp_instances.is_empty ())
8356 : : return;
8357 : 625746 : vect_optimize_slp_pass (vinfo).run ();
8358 : :
8359 : : /* Apply CSE again to nodes after permute optimization. */
8360 : 625746 : scalar_stmts_to_slp_tree_map_t *bst_map
8361 : 625746 : = new scalar_stmts_to_slp_tree_map_t ();
8362 : :
8363 : 3547055 : for (auto inst : vinfo->slp_instances)
8364 : 1669817 : vect_cse_slp_nodes (bst_map, SLP_INSTANCE_TREE (inst));
8365 : :
8366 : 625746 : release_scalar_stmts_to_slp_tree_map (bst_map);
8367 : : }
8368 : :
8369 : : /* Gather loads reachable from the individual SLP graph entries. */
8370 : :
8371 : : void
8372 : 979620 : vect_gather_slp_loads (vec_info *vinfo)
8373 : : {
8374 : 979620 : unsigned i;
8375 : 979620 : slp_instance instance;
8376 : 2649437 : FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
8377 : : {
8378 : 1669817 : hash_set<slp_tree> visited;
8379 : 1669817 : vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
8380 : : SLP_INSTANCE_TREE (instance), visited);
8381 : 1669817 : }
8382 : 979620 : }
8383 : :
8384 : : /* For NODE update VF based on the number of lanes and the vector types
8385 : : used. */
8386 : :
8387 : : static void
8388 : 4832090 : vect_update_slp_vf_for_node (slp_tree node, poly_uint64 &vf,
8389 : : hash_set<slp_tree> &visited)
8390 : : {
8391 : 4832090 : if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
8392 : 2070619 : return;
8393 : 3141635 : if (visited.add (node))
8394 : : return;
8395 : :
8396 : 11101423 : for (slp_tree child : SLP_TREE_CHILDREN (node))
8397 : 3876414 : vect_update_slp_vf_for_node (child, vf, visited);
8398 : :
8399 : : /* We do not visit SLP nodes for constants or externals - those neither
8400 : : have a vector type set yet (vectorizable_* does this) nor do they
8401 : : have max_nunits set. Instead we rely on internal nodes max_nunit
8402 : : to cover constant/external operands.
8403 : : Note that when we stop using fixed size vectors externs and constants
8404 : : shouldn't influence the (minimum) vectorization factor, instead
8405 : : vectorizable_* should honor the vectorization factor when trying to
8406 : : assign vector types to constants and externals and cause iteration
8407 : : to a higher vectorization factor when required. */
8408 : 2761471 : poly_uint64 node_vf
8409 : 2761471 : = calculate_unrolling_factor (node->max_nunits, SLP_TREE_LANES (node));
8410 : 2761471 : vf = force_common_multiple (vf, node_vf);
8411 : :
8412 : : /* For permute nodes that are fed from externs or constants we have to
8413 : : consider their number of lanes as well. Likewise for store-lanes. */
8414 : 2761471 : if (SLP_TREE_PERMUTE_P (node) || node->ldst_lanes)
8415 : 811240 : for (slp_tree child : SLP_TREE_CHILDREN (node))
8416 : 213289 : if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
8417 : : {
8418 : 2718 : poly_uint64 child_vf
8419 : 2718 : = calculate_unrolling_factor (node->max_nunits,
8420 : : SLP_TREE_LANES (child));
8421 : 2718 : vf = force_common_multiple (vf, child_vf);
8422 : : }
8423 : : }
8424 : :
8425 : : /* For each possible SLP instance decide whether to SLP it and calculate overall
8426 : : unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
8427 : : least one instance. */
8428 : :
8429 : : bool
8430 : 379705 : vect_make_slp_decision (loop_vec_info loop_vinfo)
8431 : : {
8432 : 379705 : unsigned int i;
8433 : 379705 : poly_uint64 unrolling_factor = 1;
8434 : 379705 : const vec<slp_instance> &slp_instances
8435 : : = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
8436 : 379705 : slp_instance instance;
8437 : 379705 : int decided_to_slp = 0;
8438 : :
8439 : 379705 : DUMP_VECT_SCOPE ("vect_make_slp_decision");
8440 : :
8441 : 379705 : hash_set<slp_tree> visited;
8442 : 1335381 : FOR_EACH_VEC_ELT (slp_instances, i, instance)
8443 : : {
8444 : 955676 : slp_tree root = SLP_INSTANCE_TREE (instance);
8445 : :
8446 : : /* All unroll factors have the form:
8447 : :
8448 : : GET_MODE_SIZE (vinfo->vector_mode) * X
8449 : :
8450 : : for some rational X, so they must have a common multiple. */
8451 : 955676 : vect_update_slp_vf_for_node (root, unrolling_factor, visited);
8452 : :
8453 : : /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
8454 : : call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
8455 : : loop-based vectorization. Such stmts will be marked as HYBRID. */
8456 : 955676 : vect_mark_slp_stmts (loop_vinfo, root);
8457 : :
8458 : : /* If all instances ended up with vector(1) T roots make sure to
8459 : : not vectorize. RVV for example relies on loop vectorization
8460 : : when some instances are essentially kept scalar. See PR121048. */
8461 : 955676 : if (SLP_TREE_VECTYPE (root)
8462 : 955676 : && known_gt (TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (root)), 1U))
8463 : 750773 : decided_to_slp++;
8464 : : }
8465 : :
8466 : 379705 : LOOP_VINFO_VECT_FACTOR (loop_vinfo) = unrolling_factor;
8467 : :
8468 : 379705 : if (decided_to_slp && dump_enabled_p ())
8469 : : {
8470 : 17790 : dump_printf_loc (MSG_NOTE, vect_location,
8471 : : "Decided to SLP %d instances. Unrolling factor ",
8472 : : decided_to_slp);
8473 : 17790 : dump_dec (MSG_NOTE, unrolling_factor);
8474 : 17790 : dump_printf (MSG_NOTE, "\n");
8475 : : }
8476 : :
8477 : 379705 : return (decided_to_slp > 0);
8478 : 379705 : }
8479 : :
8480 : : /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
8481 : :
8482 : 2376869 : _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
8483 : : : vec_info (vec_info::bb, shared),
8484 : 2376869 : roots (vNULL)
8485 : : {
8486 : : /* The region we are operating on. bbs[0] is the entry, excluding
8487 : : its PHI nodes. In the future we might want to track an explicit
8488 : : entry edge to cover bbs[0] PHI nodes and have a region entry
8489 : : insert location. */
8490 : 2376869 : bbs = _bbs.address ();
8491 : 2376869 : nbbs = _bbs.length ();
8492 : :
8493 : 18376151 : for (unsigned i = 0; i < nbbs; ++i)
8494 : : {
8495 : 15999282 : if (i != 0)
8496 : 20814999 : for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
8497 : 7192586 : gsi_next (&si))
8498 : : {
8499 : 7192586 : gphi *phi = si.phi ();
8500 : 7192586 : gimple_set_uid (phi, 0);
8501 : 7192586 : add_stmt (phi);
8502 : : }
8503 : 31998564 : for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
8504 : 134043672 : !gsi_end_p (gsi); gsi_next (&gsi))
8505 : : {
8506 : 118044390 : gimple *stmt = gsi_stmt (gsi);
8507 : 118044390 : gimple_set_uid (stmt, 0);
8508 : 118044390 : if (is_gimple_debug (stmt))
8509 : 71889737 : continue;
8510 : 46154653 : add_stmt (stmt);
8511 : : }
8512 : : }
8513 : 2376869 : }
8514 : :
8515 : :
8516 : : /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
8517 : : stmts in the basic block. */
8518 : :
8519 : 2376869 : _bb_vec_info::~_bb_vec_info ()
8520 : : {
8521 : : /* Reset region marker. */
8522 : 18376151 : for (unsigned i = 0; i < nbbs; ++i)
8523 : : {
8524 : 15999282 : if (i != 0)
8525 : 20829945 : for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
8526 : 7207532 : gsi_next (&si))
8527 : : {
8528 : 7207532 : gphi *phi = si.phi ();
8529 : 7207532 : gimple_set_uid (phi, -1);
8530 : : }
8531 : 31998564 : for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
8532 : 133968318 : !gsi_end_p (gsi); gsi_next (&gsi))
8533 : : {
8534 : 117969036 : gimple *stmt = gsi_stmt (gsi);
8535 : 117969036 : gimple_set_uid (stmt, -1);
8536 : : }
8537 : : }
8538 : :
8539 : 3597071 : for (unsigned i = 0; i < roots.length (); ++i)
8540 : : {
8541 : 1220202 : roots[i].stmts.release ();
8542 : 1220202 : roots[i].roots.release ();
8543 : 1220202 : roots[i].remain.release ();
8544 : : }
8545 : 2376869 : roots.release ();
8546 : 2376869 : }
8547 : :
8548 : : /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
8549 : : given then that child nodes have already been processed, and that
8550 : : their def types currently match their SLP node's def type. */
8551 : :
8552 : : static bool
8553 : 2412314 : vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
8554 : : slp_instance node_instance,
8555 : : stmt_vector_for_cost *cost_vec)
8556 : : {
8557 : : /* Handle purely internal nodes. */
8558 : 2412314 : if (SLP_TREE_PERMUTE_P (node))
8559 : : {
8560 : 98795 : if (!vectorizable_slp_permutation (vinfo, NULL, node, cost_vec))
8561 : : return false;
8562 : :
8563 : : stmt_vec_info slp_stmt_info;
8564 : : unsigned int i;
8565 : 255635 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
8566 : : {
8567 : 158018 : if (slp_stmt_info
8568 : 153204 : && STMT_VINFO_LIVE_P (slp_stmt_info)
8569 : 158036 : && !vectorizable_live_operation (vinfo, slp_stmt_info, node,
8570 : : node_instance, i,
8571 : : false, cost_vec))
8572 : : return false;
8573 : : }
8574 : 97617 : SLP_TREE_TYPE (node) = permute_info_type;
8575 : 97617 : return true;
8576 : : }
8577 : :
8578 : 2313519 : return vect_analyze_stmt (vinfo, node, node_instance, cost_vec);
8579 : : }
8580 : :
8581 : : static int
8582 : 1849405 : sort_ints (const void *a_, const void *b_)
8583 : : {
8584 : 1849405 : int a = *(const int *)a_;
8585 : 1849405 : int b = *(const int *)b_;
8586 : 1849405 : return a - b;
8587 : : }
8588 : :
8589 : : /* Verify if we can externalize a set of internal defs. */
8590 : :
8591 : : static bool
8592 : 390988 : vect_slp_can_convert_to_external (const vec<stmt_vec_info> &stmts)
8593 : : {
8594 : : /* Constant generation uses get_later_stmt which can only handle
8595 : : defs from the same BB or a set of defs that can be ordered
8596 : : with a dominance query. */
8597 : 390988 : basic_block bb = NULL;
8598 : 390988 : bool all_same = true;
8599 : 390988 : auto_vec<int> bbs;
8600 : 781976 : bbs.reserve_exact (stmts.length ());
8601 : 2111532 : for (stmt_vec_info stmt : stmts)
8602 : : {
8603 : 938568 : if (!stmt)
8604 : : return false;
8605 : 938568 : else if (!bb)
8606 : 390988 : bb = gimple_bb (stmt->stmt);
8607 : 547580 : else if (gimple_bb (stmt->stmt) != bb)
8608 : 172825 : all_same = false;
8609 : 938568 : bbs.quick_push (gimple_bb (stmt->stmt)->index);
8610 : : }
8611 : 390988 : if (all_same)
8612 : : return true;
8613 : :
8614 : : /* Produce a vector of unique BB indexes for the defs. */
8615 : 129467 : bbs.qsort (sort_ints);
8616 : : unsigned i, j;
8617 : 315454 : for (i = 1, j = 1; i < bbs.length (); ++i)
8618 : 185987 : if (bbs[i] != bbs[j-1])
8619 : 138256 : bbs[j++] = bbs[i];
8620 : 129467 : gcc_assert (j >= 2);
8621 : 129467 : bbs.truncate (j);
8622 : :
8623 : 258934 : if (bbs.length () == 2)
8624 : 126009 : return (dominated_by_p (CDI_DOMINATORS,
8625 : 126009 : BASIC_BLOCK_FOR_FN (cfun, bbs[0]),
8626 : 126009 : BASIC_BLOCK_FOR_FN (cfun, bbs[1]))
8627 : 241662 : || dominated_by_p (CDI_DOMINATORS,
8628 : 115653 : BASIC_BLOCK_FOR_FN (cfun, bbs[1]),
8629 : 115653 : BASIC_BLOCK_FOR_FN (cfun, bbs[0])));
8630 : :
8631 : : /* ??? For more than two BBs we can sort the vector and verify the
8632 : : result is a total order. But we can't use vec::qsort with a
8633 : : compare function using a dominance query since there's no way to
8634 : : signal failure and any fallback for an unordered pair would
8635 : : fail qsort_chk later.
8636 : : For now simply hope that ordering after BB index provides the
8637 : : best candidate total order. If required we can implement our
8638 : : own mergesort or export an entry without checking. */
8639 : 406602 : for (unsigned i = 1; i < bbs.length (); ++i)
8640 : 12221 : if (!dominated_by_p (CDI_DOMINATORS,
8641 : 12221 : BASIC_BLOCK_FOR_FN (cfun, bbs[i]),
8642 : 12221 : BASIC_BLOCK_FOR_FN (cfun, bbs[i-1])))
8643 : : return false;
8644 : :
8645 : : return true;
8646 : 390988 : }
8647 : :
8648 : : /* Try to build NODE from scalars, returning true on success.
8649 : : NODE_INSTANCE is the SLP instance that contains NODE. */
8650 : :
8651 : : static bool
8652 : 530920 : vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
8653 : : slp_instance node_instance)
8654 : : {
8655 : 530920 : stmt_vec_info stmt_info;
8656 : 530920 : unsigned int i;
8657 : :
8658 : 530920 : if (!is_a <bb_vec_info> (vinfo)
8659 : 74238 : || node == SLP_INSTANCE_TREE (node_instance)
8660 : 22285 : || !SLP_TREE_SCALAR_STMTS (node).exists ()
8661 : 22241 : || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node))
8662 : : /* Force the mask use to be built from scalars instead. */
8663 : 20048 : || VECTOR_BOOLEAN_TYPE_P (SLP_TREE_VECTYPE (node))
8664 : 550786 : || !vect_slp_can_convert_to_external (SLP_TREE_SCALAR_STMTS (node)))
8665 : 511054 : return false;
8666 : :
8667 : 19866 : if (dump_enabled_p ())
8668 : 73 : dump_printf_loc (MSG_NOTE, vect_location,
8669 : : "Building vector operands of %p from scalars instead\n",
8670 : : (void *) node);
8671 : :
8672 : : /* Don't remove and free the child nodes here, since they could be
8673 : : referenced by other structures. The analysis and scheduling phases
8674 : : (need to) ignore child nodes of anything that isn't vect_internal_def. */
8675 : 19866 : unsigned int group_size = SLP_TREE_LANES (node);
8676 : 19866 : SLP_TREE_DEF_TYPE (node) = vect_external_def;
8677 : : /* Invariants get their vector type from the uses. */
8678 : 19866 : SLP_TREE_VECTYPE (node) = NULL_TREE;
8679 : 19866 : SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
8680 : 19866 : SLP_TREE_LOAD_PERMUTATION (node).release ();
8681 : 69596 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
8682 : : {
8683 : 49730 : tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
8684 : 49730 : SLP_TREE_SCALAR_OPS (node)[i] = lhs;
8685 : : }
8686 : : return true;
8687 : : }
8688 : :
8689 : : /* Return true if all elements of the slice are the same. */
8690 : : bool
8691 : 460851 : vect_scalar_ops_slice::all_same_p () const
8692 : : {
8693 : 510258 : for (unsigned int i = 1; i < length; ++i)
8694 : 433256 : if (!operand_equal_p (op (0), op (i)))
8695 : : return false;
8696 : : return true;
8697 : : }
8698 : :
8699 : : hashval_t
8700 : 398830 : vect_scalar_ops_slice_hash::hash (const value_type &s)
8701 : : {
8702 : 398830 : hashval_t hash = 0;
8703 : 1534198 : for (unsigned i = 0; i < s.length; ++i)
8704 : 1135368 : hash = iterative_hash_expr (s.op (i), hash);
8705 : 398830 : return hash;
8706 : : }
8707 : :
8708 : : bool
8709 : 218378 : vect_scalar_ops_slice_hash::equal (const value_type &s1,
8710 : : const compare_type &s2)
8711 : : {
8712 : 218378 : if (s1.length != s2.length)
8713 : : return false;
8714 : 377677 : for (unsigned i = 0; i < s1.length; ++i)
8715 : 329817 : if (!operand_equal_p (s1.op (i), s2.op (i)))
8716 : : return false;
8717 : : return true;
8718 : : }
8719 : :
8720 : : /* Compute the prologue cost for invariant or constant operands represented
8721 : : by NODE. */
8722 : :
8723 : : static void
8724 : 1030877 : vect_prologue_cost_for_slp (vec_info *vinfo, slp_tree node,
8725 : : stmt_vector_for_cost *cost_vec)
8726 : : {
8727 : : /* There's a special case of an existing vector, that costs nothing. */
8728 : 1030877 : if (SLP_TREE_SCALAR_OPS (node).length () == 0
8729 : 1030877 : && !SLP_TREE_VEC_DEFS (node).is_empty ())
8730 : 1464 : return;
8731 : : /* Without looking at the actual initializer a vector of
8732 : : constants can be implemented as load from the constant pool.
8733 : : When all elements are the same we can use a splat. */
8734 : 1029413 : tree vectype = SLP_TREE_VECTYPE (node);
8735 : 1029413 : unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
8736 : 1029413 : unsigned HOST_WIDE_INT const_nunits;
8737 : 1029413 : unsigned nelt_limit;
8738 : 1029413 : unsigned nvectors = vect_get_num_copies (vinfo, node);
8739 : 1029413 : auto ops = &SLP_TREE_SCALAR_OPS (node);
8740 : 1029413 : auto_vec<unsigned int> starts (nvectors);
8741 : 1029413 : if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
8742 : 1029413 : && ! multiple_p (const_nunits, group_size))
8743 : : {
8744 : 63628 : nelt_limit = const_nunits;
8745 : 63628 : hash_set<vect_scalar_ops_slice_hash> vector_ops;
8746 : 263165 : for (unsigned int i = 0; i < nvectors; ++i)
8747 : 199537 : if (!vector_ops.add ({ ops, i * nelt_limit, nelt_limit }))
8748 : 151677 : starts.quick_push (i * nelt_limit);
8749 : 63628 : }
8750 : : else
8751 : : {
8752 : : /* If either the vector has variable length or the vectors
8753 : : are composed of repeated whole groups we only need to
8754 : : cost construction once. All vectors will be the same. */
8755 : 965785 : nelt_limit = group_size;
8756 : 965785 : starts.quick_push (0);
8757 : : }
8758 : : /* ??? We're just tracking whether vectors in a single node are the same.
8759 : : Ideally we'd do something more global. */
8760 : 1029413 : bool passed = false;
8761 : 4205701 : for (unsigned int start : starts)
8762 : : {
8763 : 1117462 : vect_cost_for_stmt kind;
8764 : 1117462 : if (SLP_TREE_DEF_TYPE (node) == vect_constant_def)
8765 : : kind = vector_load;
8766 : 460851 : else if (vect_scalar_ops_slice { ops, start, nelt_limit }.all_same_p ())
8767 : : kind = scalar_to_vec;
8768 : : else
8769 : 383849 : kind = vec_construct;
8770 : : /* The target cost hook has no idea which part of the SLP node
8771 : : we are costing so avoid passing it down more than once. Pass
8772 : : it to the first vec_construct or scalar_to_vec part since for those
8773 : : the x86 backend tries to account for GPR to XMM register moves. */
8774 : 1117462 : record_stmt_cost (cost_vec, 1, kind, nullptr,
8775 : 1117462 : (kind != vector_load && !passed) ? node : nullptr,
8776 : : vectype, 0, vect_prologue);
8777 : 1117462 : if (kind != vector_load)
8778 : 460851 : passed = true;
8779 : : }
8780 : 1029413 : }
8781 : :
8782 : : /* Analyze statements contained in SLP tree NODE after recursively analyzing
8783 : : the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
8784 : :
8785 : : Return true if the operations are supported. */
8786 : :
8787 : : static bool
8788 : 4488531 : vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
8789 : : slp_instance node_instance,
8790 : : hash_set<slp_tree> &visited_set,
8791 : : vec<slp_tree> &visited_vec,
8792 : : stmt_vector_for_cost *cost_vec)
8793 : : {
8794 : 4488531 : int i, j;
8795 : 4488531 : slp_tree child;
8796 : :
8797 : : /* Assume we can code-generate all invariants. */
8798 : 4488531 : if (!node
8799 : 4196810 : || SLP_TREE_DEF_TYPE (node) == vect_constant_def
8800 : 3494910 : || SLP_TREE_DEF_TYPE (node) == vect_external_def)
8801 : : return true;
8802 : :
8803 : 2978966 : if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
8804 : : {
8805 : 10 : if (dump_enabled_p ())
8806 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
8807 : : "Failed cyclic SLP reference in %p\n", (void *) node);
8808 : 10 : return false;
8809 : : }
8810 : 2978956 : gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
8811 : :
8812 : : /* If we already analyzed the exact same set of scalar stmts we're done.
8813 : : We share the generated vector stmts for those. */
8814 : 2978956 : if (visited_set.add (node))
8815 : : return true;
8816 : 2714889 : visited_vec.safe_push (node);
8817 : :
8818 : 2714889 : bool res = true;
8819 : 2714889 : unsigned visited_rec_start = visited_vec.length ();
8820 : 2714889 : unsigned cost_vec_rec_start = cost_vec->length ();
8821 : 2714889 : bool seen_non_constant_child = false;
8822 : 5688846 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
8823 : : {
8824 : 3276385 : res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
8825 : : visited_set, visited_vec,
8826 : : cost_vec);
8827 : 3276385 : if (!res)
8828 : : break;
8829 : 2973957 : if (child && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
8830 : 2973957 : seen_non_constant_child = true;
8831 : : }
8832 : : /* We're having difficulties scheduling nodes with just constant
8833 : : operands and no scalar stmts since we then cannot compute a stmt
8834 : : insertion place. */
8835 : 2714889 : if (res
8836 : 2714889 : && !seen_non_constant_child
8837 : 2714889 : && SLP_TREE_SCALAR_STMTS (node).is_empty ())
8838 : : {
8839 : 147 : if (dump_enabled_p ())
8840 : 6 : dump_printf_loc (MSG_NOTE, vect_location,
8841 : : "Cannot vectorize all-constant op node %p\n",
8842 : : (void *) node);
8843 : : res = false;
8844 : : }
8845 : :
8846 : 2714742 : if (res)
8847 : 2412314 : res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
8848 : : cost_vec);
8849 : : /* If analysis failed we have to pop all recursive visited nodes
8850 : : plus ourselves. */
8851 : 2714889 : if (!res)
8852 : : {
8853 : 2610754 : while (visited_vec.length () >= visited_rec_start)
8854 : 774457 : visited_set.remove (visited_vec.pop ());
8855 : 530920 : cost_vec->truncate (cost_vec_rec_start);
8856 : : }
8857 : :
8858 : : /* When the node can be vectorized cost invariant nodes it references.
8859 : : This is not done in DFS order to allow the refering node
8860 : : vectorizable_* calls to nail down the invariant nodes vector type
8861 : : and possibly unshare it if it needs a different vector type than
8862 : : other referrers. */
8863 : 2714889 : if (res)
8864 : 4868036 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
8865 : 2684067 : if (child
8866 : 2449212 : && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
8867 : 2449212 : || SLP_TREE_DEF_TYPE (child) == vect_external_def)
8868 : : /* Perform usual caching, note code-generation still
8869 : : code-gens these nodes multiple times but we expect
8870 : : to CSE them later. */
8871 : 3776537 : && !visited_set.add (child))
8872 : : {
8873 : 1067947 : visited_vec.safe_push (child);
8874 : : /* ??? After auditing more code paths make a "default"
8875 : : and push the vector type from NODE to all children
8876 : : if it is not already set. */
8877 : : /* Compute the number of vectors to be generated. */
8878 : 1067947 : tree vector_type = SLP_TREE_VECTYPE (child);
8879 : 1067947 : if (!vector_type)
8880 : : {
8881 : : /* Masked loads can have an undefined (default SSA definition)
8882 : : else operand. We do not need to cost it. */
8883 : 37070 : vec<tree> ops = SLP_TREE_SCALAR_OPS (child);
8884 : 38103 : if (SLP_TREE_TYPE (node) == load_vec_info_type
8885 : 38103 : && ((ops.length ()
8886 : 1033 : && TREE_CODE (ops[0]) == SSA_NAME
8887 : 0 : && SSA_NAME_IS_DEFAULT_DEF (ops[0])
8888 : 0 : && VAR_P (SSA_NAME_VAR (ops[0])))
8889 : 1033 : || SLP_TREE_DEF_TYPE (child) == vect_constant_def))
8890 : 1033 : continue;
8891 : :
8892 : : /* For shifts with a scalar argument we don't need
8893 : : to cost or code-generate anything.
8894 : : ??? Represent this more explicitely. */
8895 : 36037 : gcc_assert (SLP_TREE_TYPE (node) == shift_vec_info_type
8896 : : && j == 1);
8897 : 36037 : continue;
8898 : 36037 : }
8899 : :
8900 : : /* And cost them. */
8901 : 1030877 : vect_prologue_cost_for_slp (vinfo, child, cost_vec);
8902 : : }
8903 : :
8904 : : /* If this node or any of its children can't be vectorized, try pruning
8905 : : the tree here rather than felling the whole thing. */
8906 : 530920 : if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
8907 : : {
8908 : : /* We'll need to revisit this for invariant costing and number
8909 : : of vectorized stmt setting. */
8910 : : res = true;
8911 : : }
8912 : :
8913 : : return res;
8914 : : }
8915 : :
8916 : : /* Given a definition DEF, analyze if it will have any live scalar use after
8917 : : performing SLP vectorization whose information is represented by BB_VINFO,
8918 : : and record result into hash map SCALAR_USE_MAP as cache for later fast
8919 : : check. If recursion DEPTH exceeds a limit, stop analysis and make a
8920 : : conservative assumption. Return 0 if no scalar use, 1 if there is, -1
8921 : : means recursion is limited. */
8922 : :
8923 : : static int
8924 : 549037 : vec_slp_has_scalar_use (bb_vec_info bb_vinfo, tree def,
8925 : : hash_map<tree, int> &scalar_use_map,
8926 : : int depth = 0)
8927 : : {
8928 : 549037 : const int depth_limit = 2;
8929 : 549037 : imm_use_iterator use_iter;
8930 : 549037 : gimple *use_stmt;
8931 : :
8932 : 549037 : if (int *res = scalar_use_map.get (def))
8933 : 22116 : return *res;
8934 : :
8935 : 526921 : int scalar_use = 1;
8936 : :
8937 : 1726991 : FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
8938 : : {
8939 : 784524 : if (is_gimple_debug (use_stmt))
8940 : 167434 : continue;
8941 : :
8942 : 617090 : stmt_vec_info use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
8943 : :
8944 : 617090 : if (!use_stmt_info)
8945 : : break;
8946 : :
8947 : 619562 : if (PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
8948 : 503658 : continue;
8949 : :
8950 : : /* Do not step forward when encounter PHI statement, since it may
8951 : : involve cyclic reference and cause infinite recursive invocation. */
8952 : 107255 : if (gimple_code (use_stmt) == GIMPLE_PHI)
8953 : : break;
8954 : :
8955 : : /* When pattern recognition is involved, a statement whose definition is
8956 : : consumed in some pattern, may not be included in the final replacement
8957 : : pattern statements, so would be skipped when building SLP graph.
8958 : :
8959 : : * Original
8960 : : char a_c = *(char *) a;
8961 : : char b_c = *(char *) b;
8962 : : unsigned short a_s = (unsigned short) a_c;
8963 : : int a_i = (int) a_s;
8964 : : int b_i = (int) b_c;
8965 : : int r_i = a_i - b_i;
8966 : :
8967 : : * After pattern replacement
8968 : : a_s = (unsigned short) a_c;
8969 : : a_i = (int) a_s;
8970 : :
8971 : : patt_b_s = (unsigned short) b_c; // b_i = (int) b_c
8972 : : patt_b_i = (int) patt_b_s; // b_i = (int) b_c
8973 : :
8974 : : patt_r_s = widen_minus(a_c, b_c); // r_i = a_i - b_i
8975 : : patt_r_i = (int) patt_r_s; // r_i = a_i - b_i
8976 : :
8977 : : The definitions of a_i(original statement) and b_i(pattern statement)
8978 : : are related to, but actually not part of widen_minus pattern.
8979 : : Vectorizing the pattern does not cause these definition statements to
8980 : : be marked as PURE_SLP. For this case, we need to recursively check
8981 : : whether their uses are all absorbed into vectorized code. But there
8982 : : is an exception that some use may participate in an vectorized
8983 : : operation via an external SLP node containing that use as an element.
8984 : : The parameter "scalar_use_map" tags such kind of SSA as having scalar
8985 : : use in advance. */
8986 : 88421 : tree lhs = gimple_get_lhs (use_stmt);
8987 : :
8988 : 88421 : if (!lhs || TREE_CODE (lhs) != SSA_NAME)
8989 : : break;
8990 : :
8991 : 56326 : if (depth_limit && depth >= depth_limit)
8992 : 10758 : return -1;
8993 : :
8994 : 45568 : if ((scalar_use = vec_slp_has_scalar_use (bb_vinfo, lhs, scalar_use_map,
8995 : : depth + 1)))
8996 : : break;
8997 : 10758 : }
8998 : :
8999 : 516163 : if (end_imm_use_stmt_p (&use_iter))
9000 : 415546 : scalar_use = 0;
9001 : :
9002 : : /* If recursion is limited, do not cache result for non-root defs. */
9003 : 516163 : if (!depth || scalar_use >= 0)
9004 : : {
9005 : 505405 : bool added = scalar_use_map.put (def, scalar_use);
9006 : 505405 : gcc_assert (!added);
9007 : : }
9008 : :
9009 : 516163 : return scalar_use;
9010 : : }
9011 : :
9012 : : /* Mark lanes of NODE that are live outside of the basic-block vectorized
9013 : : region and that can be vectorized using vectorizable_live_operation
9014 : : with STMT_VINFO_LIVE_P. Not handled live operations will cause the
9015 : : scalar code computing it to be retained. */
9016 : :
9017 : : static void
9018 : 908646 : vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
9019 : : slp_instance instance,
9020 : : stmt_vector_for_cost *cost_vec,
9021 : : hash_map<tree, int> &scalar_use_map,
9022 : : hash_set<stmt_vec_info> &svisited,
9023 : : hash_set<slp_tree> &visited)
9024 : : {
9025 : 908646 : if (visited.add (node))
9026 : 38660 : return;
9027 : :
9028 : 869986 : unsigned i;
9029 : 869986 : stmt_vec_info stmt_info;
9030 : 869986 : stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
9031 : 3150448 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
9032 : : {
9033 : 2280462 : if (!stmt_info || svisited.contains (stmt_info))
9034 : 29783 : continue;
9035 : 2259232 : stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
9036 : 2259232 : if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
9037 : 11471 : && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
9038 : : /* Only the pattern root stmt computes the original scalar value. */
9039 : 8553 : continue;
9040 : 2250679 : bool mark_visited = true;
9041 : 2250679 : gimple *orig_stmt = orig_stmt_info->stmt;
9042 : 2250679 : ssa_op_iter op_iter;
9043 : 2250679 : def_operand_p def_p;
9044 : 5004827 : FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
9045 : : {
9046 : 503469 : if (vec_slp_has_scalar_use (bb_vinfo, DEF_FROM_PTR (def_p),
9047 : : scalar_use_map))
9048 : : {
9049 : 89854 : STMT_VINFO_LIVE_P (stmt_info) = true;
9050 : 89854 : if (vectorizable_live_operation (bb_vinfo, stmt_info, node,
9051 : : instance, i, false, cost_vec))
9052 : : /* ??? So we know we can vectorize the live stmt from one SLP
9053 : : node. If we cannot do so from all or none consistently
9054 : : we'd have to record which SLP node (and lane) we want to
9055 : : use for the live operation. So make sure we can
9056 : : code-generate from all nodes. */
9057 : : mark_visited = false;
9058 : : else
9059 : 0 : STMT_VINFO_LIVE_P (stmt_info) = false;
9060 : : }
9061 : :
9062 : : /* We have to verify whether we can insert the lane extract
9063 : : before all uses. The following is a conservative approximation.
9064 : : We cannot put this into vectorizable_live_operation because
9065 : : iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
9066 : : doesn't work.
9067 : : Note that while the fact that we emit code for loads at the
9068 : : first load should make this a non-problem leafs we construct
9069 : : from scalars are vectorized after the last scalar def.
9070 : : ??? If we'd actually compute the insert location during
9071 : : analysis we could use sth less conservative than the last
9072 : : scalar stmt in the node for the dominance check. */
9073 : : /* ??? What remains is "live" uses in vector CTORs in the same
9074 : : SLP graph which is where those uses can end up code-generated
9075 : : right after their definition instead of close to their original
9076 : : use. But that would restrict us to code-generate lane-extracts
9077 : : from the latest stmt in a node. So we compensate for this
9078 : : during code-generation, simply not replacing uses for those
9079 : : hopefully rare cases. */
9080 : 503469 : imm_use_iterator use_iter;
9081 : 503469 : gimple *use_stmt;
9082 : 503469 : stmt_vec_info use_stmt_info;
9083 : :
9084 : 503469 : if (STMT_VINFO_LIVE_P (stmt_info))
9085 : 612032 : FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
9086 : 432324 : if (!is_gimple_debug (use_stmt)
9087 : 321723 : && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
9088 : 311601 : || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
9089 : 612535 : && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
9090 : : {
9091 : 16606 : if (dump_enabled_p ())
9092 : 282 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9093 : : "Cannot determine insertion place for "
9094 : : "lane extract\n");
9095 : 16606 : STMT_VINFO_LIVE_P (stmt_info) = false;
9096 : 16606 : mark_visited = true;
9097 : 89854 : }
9098 : : }
9099 : 2250679 : if (mark_visited)
9100 : 2174418 : svisited.add (stmt_info);
9101 : : }
9102 : :
9103 : : slp_tree child;
9104 : 2513100 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9105 : 877194 : if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
9106 : 226575 : vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance, cost_vec,
9107 : : scalar_use_map, svisited, visited);
9108 : : }
9109 : :
9110 : : /* Traverse all slp instances of BB_VINFO, and mark lanes of every node that
9111 : : are live outside of the basic-block vectorized region and that can be
9112 : : vectorized using vectorizable_live_operation with STMT_VINFO_LIVE_P. */
9113 : :
9114 : : static void
9115 : 282966 : vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo)
9116 : : {
9117 : 282966 : if (bb_vinfo->slp_instances.is_empty ())
9118 : 32088 : return;
9119 : :
9120 : 250878 : hash_set<stmt_vec_info> svisited;
9121 : 250878 : hash_set<slp_tree> visited;
9122 : 250878 : hash_map<tree, int> scalar_use_map;
9123 : 250878 : auto_vec<slp_tree> worklist;
9124 : :
9125 : 1434705 : for (slp_instance instance : bb_vinfo->slp_instances)
9126 : : {
9127 : 682071 : if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc)
9128 : 57278 : for (tree op : SLP_INSTANCE_REMAIN_DEFS (instance))
9129 : 16479 : if (TREE_CODE (op) == SSA_NAME)
9130 : 13911 : scalar_use_map.put (op, 1);
9131 : 682071 : if (!visited.add (SLP_INSTANCE_TREE (instance)))
9132 : 680143 : worklist.safe_push (SLP_INSTANCE_TREE (instance));
9133 : : }
9134 : :
9135 : 1519364 : do
9136 : : {
9137 : 1519364 : slp_tree node = worklist.pop ();
9138 : :
9139 : 1519364 : if (SLP_TREE_DEF_TYPE (node) == vect_external_def)
9140 : : {
9141 : 1593198 : for (tree op : SLP_TREE_SCALAR_OPS (node))
9142 : 704552 : if (TREE_CODE (op) == SSA_NAME)
9143 : 474827 : scalar_use_map.put (op, 1);
9144 : : }
9145 : : else
9146 : : {
9147 : 3631684 : for (slp_tree child : SLP_TREE_CHILDREN (node))
9148 : 877170 : if (child && !visited.add (child))
9149 : 839221 : worklist.safe_push (child);
9150 : : }
9151 : : }
9152 : 3038728 : while (!worklist.is_empty ());
9153 : :
9154 : 250878 : visited.empty ();
9155 : :
9156 : 1434705 : for (slp_instance instance : bb_vinfo->slp_instances)
9157 : : {
9158 : 682071 : vect_location = instance->location ();
9159 : 682071 : vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
9160 : : instance, &instance->cost_vec,
9161 : : scalar_use_map, svisited, visited);
9162 : : }
9163 : 250878 : }
9164 : :
9165 : : /* Determine whether we can vectorize the reduction epilogue for INSTANCE. */
9166 : :
9167 : : static bool
9168 : 75720 : vectorizable_bb_reduc_epilogue (slp_instance instance,
9169 : : stmt_vector_for_cost *cost_vec)
9170 : : {
9171 : 75720 : gassign *stmt = as_a <gassign *> (instance->root_stmts[0]->stmt);
9172 : 75720 : enum tree_code reduc_code = gimple_assign_rhs_code (stmt);
9173 : 75720 : if (reduc_code == MINUS_EXPR)
9174 : 0 : reduc_code = PLUS_EXPR;
9175 : 75720 : internal_fn reduc_fn;
9176 : 75720 : tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
9177 : 75720 : if (!vectype
9178 : 75708 : || !reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
9179 : 75708 : || reduc_fn == IFN_LAST
9180 : 75708 : || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH)
9181 : 111974 : || !useless_type_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
9182 : 36254 : TREE_TYPE (vectype)))
9183 : : {
9184 : 51744 : if (dump_enabled_p ())
9185 : 271 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9186 : : "not vectorized: basic block reduction epilogue "
9187 : : "operation unsupported.\n");
9188 : 51744 : return false;
9189 : : }
9190 : :
9191 : : /* There's no way to cost a horizontal vector reduction via REDUC_FN so
9192 : : cost log2 vector operations plus shuffles and one extraction. */
9193 : 23976 : unsigned steps = floor_log2 (vect_nunits_for_cost (vectype));
9194 : 23976 : record_stmt_cost (cost_vec, steps, vector_stmt, instance->root_stmts[0],
9195 : : vectype, 0, vect_body);
9196 : 23976 : record_stmt_cost (cost_vec, steps, vec_perm, instance->root_stmts[0],
9197 : : vectype, 0, vect_body);
9198 : 23976 : record_stmt_cost (cost_vec, 1, vec_to_scalar, instance->root_stmts[0],
9199 : : vectype, 0, vect_body);
9200 : :
9201 : : /* Since we replace all stmts of a possibly longer scalar reduction
9202 : : chain account for the extra scalar stmts for that. */
9203 : 23976 : if (!instance->remain_defs.is_empty ())
9204 : 19200 : record_stmt_cost (cost_vec, instance->remain_defs.length (), scalar_stmt,
9205 : 9600 : instance->root_stmts[0], 0, vect_body);
9206 : : return true;
9207 : : }
9208 : :
9209 : : /* Prune from ROOTS all stmts that are computed as part of lanes of NODE
9210 : : and recurse to children. */
9211 : :
9212 : : static void
9213 : 186759 : vect_slp_prune_covered_roots (slp_tree node, hash_set<stmt_vec_info> &roots,
9214 : : hash_set<slp_tree> &visited)
9215 : : {
9216 : 186759 : if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
9217 : 186759 : || visited.add (node))
9218 : 82511 : return;
9219 : :
9220 : : stmt_vec_info stmt;
9221 : : unsigned i;
9222 : 346243 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
9223 : 241995 : if (stmt)
9224 : 245519 : roots.remove (vect_orig_stmt (stmt));
9225 : :
9226 : : slp_tree child;
9227 : 236985 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9228 : 132737 : if (child)
9229 : 125969 : vect_slp_prune_covered_roots (child, roots, visited);
9230 : : }
9231 : :
9232 : : /* Analyze statements in SLP instances of VINFO. Return true if the
9233 : : operations are supported. */
9234 : :
9235 : : bool
9236 : 607208 : vect_slp_analyze_operations (vec_info *vinfo)
9237 : : {
9238 : 607208 : slp_instance instance;
9239 : 607208 : int i;
9240 : :
9241 : 607208 : DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
9242 : :
9243 : 607208 : hash_set<slp_tree> visited;
9244 : 1608179 : for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
9245 : : {
9246 : 1212146 : auto_vec<slp_tree> visited_vec;
9247 : 1212146 : stmt_vector_for_cost cost_vec;
9248 : 1212146 : cost_vec.create (2);
9249 : 1212146 : if (is_a <bb_vec_info> (vinfo))
9250 : 787274 : vect_location = instance->location ();
9251 : 1212146 : if (!vect_slp_analyze_node_operations (vinfo,
9252 : : SLP_INSTANCE_TREE (instance),
9253 : : instance, visited, visited_vec,
9254 : : &cost_vec)
9255 : : /* CTOR instances require vectorized defs for the SLP tree root. */
9256 : 1003510 : || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor
9257 : 4712 : && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
9258 : : != vect_internal_def
9259 : : /* Make sure we vectorized with the expected type. */
9260 : 4708 : || !useless_type_conversion_p
9261 : 4708 : (TREE_TYPE (TREE_TYPE (gimple_assign_rhs1
9262 : : (instance->root_stmts[0]->stmt))),
9263 : 4708 : TREE_TYPE (SLP_TREE_VECTYPE
9264 : : (SLP_INSTANCE_TREE (instance))))))
9265 : : /* Check we can vectorize the reduction. */
9266 : 1003491 : || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc
9267 : 75720 : && !vectorizable_bb_reduc_epilogue (instance, &cost_vec))
9268 : : /* Check we can vectorize the gcond. */
9269 : 2163893 : || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_gcond
9270 : 56693 : && !vectorizable_early_exit (as_a <loop_vec_info> (vinfo),
9271 : 56693 : SLP_INSTANCE_ROOT_STMTS (instance)[0],
9272 : : NULL,
9273 : : SLP_INSTANCE_TREE (instance),
9274 : : &cost_vec)))
9275 : : {
9276 : 314891 : cost_vec.release ();
9277 : 314891 : slp_tree node = SLP_INSTANCE_TREE (instance);
9278 : 314891 : stmt_vec_info stmt_info;
9279 : 314891 : if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
9280 : 246658 : stmt_info = SLP_INSTANCE_ROOT_STMTS (instance)[0];
9281 : 68233 : else if (!SLP_TREE_SCALAR_STMTS (node).is_empty ()
9282 : 68233 : && SLP_TREE_SCALAR_STMTS (node)[0])
9283 : : stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
9284 : : else
9285 : 0 : stmt_info = SLP_TREE_REPRESENTATIVE (node);
9286 : 314891 : if (is_a <loop_vec_info> (vinfo))
9287 : : {
9288 : 211175 : if (dump_enabled_p ())
9289 : 6214 : dump_printf_loc (MSG_NOTE, vect_location,
9290 : : "unsupported SLP instance starting from: %G",
9291 : : stmt_info->stmt);
9292 : 211175 : return false;
9293 : : }
9294 : 103716 : if (dump_enabled_p ())
9295 : 322 : dump_printf_loc (MSG_NOTE, vect_location,
9296 : : "removing SLP instance operations starting from: %G",
9297 : : stmt_info->stmt);
9298 : 103716 : vect_free_slp_instance (instance);
9299 : 103716 : vinfo->slp_instances.ordered_remove (i);
9300 : 1554488 : while (!visited_vec.is_empty ())
9301 : 346132 : visited.remove (visited_vec.pop ());
9302 : : }
9303 : : else
9304 : : {
9305 : 897255 : i++;
9306 : 897255 : if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
9307 : : {
9308 : 213697 : add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
9309 : 213697 : cost_vec.release ();
9310 : : }
9311 : : else
9312 : : /* For BB vectorization remember the SLP graph entry
9313 : : cost for later. */
9314 : 683558 : instance->cost_vec = cost_vec;
9315 : : }
9316 : 1212146 : }
9317 : :
9318 : : /* Now look for SLP instances with a root that are covered by other
9319 : : instances and remove them. */
9320 : 396033 : hash_set<stmt_vec_info> roots;
9321 : 1633403 : for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
9322 : 872001 : if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
9323 : 30664 : roots.add (SLP_INSTANCE_ROOT_STMTS (instance)[0]);
9324 : 396033 : if (!roots.is_empty ())
9325 : : {
9326 : 12167 : visited.empty ();
9327 : 72957 : for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
9328 : 60790 : vect_slp_prune_covered_roots (SLP_INSTANCE_TREE (instance), roots,
9329 : : visited);
9330 : 72957 : for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
9331 : 60790 : if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()
9332 : 30664 : && !roots.contains (SLP_INSTANCE_ROOT_STMTS (instance)[0]))
9333 : : {
9334 : 1487 : stmt_vec_info root = SLP_INSTANCE_ROOT_STMTS (instance)[0];
9335 : 1487 : if (dump_enabled_p ())
9336 : 20 : dump_printf_loc (MSG_NOTE, vect_location,
9337 : : "removing SLP instance operations starting "
9338 : : "from: %G", root->stmt);
9339 : 1487 : vect_free_slp_instance (instance);
9340 : 1487 : vinfo->slp_instances.ordered_remove (i);
9341 : : }
9342 : : else
9343 : 59303 : ++i;
9344 : : }
9345 : :
9346 : : /* Compute vectorizable live stmts. */
9347 : 396033 : if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
9348 : 282966 : vect_bb_slp_mark_live_stmts (bb_vinfo);
9349 : :
9350 : 792066 : return !vinfo->slp_instances.is_empty ();
9351 : 1003241 : }
9352 : :
9353 : : /* Get the SLP instance leader from INSTANCE_LEADER thereby transitively
9354 : : closing the eventual chain. */
9355 : :
9356 : : static slp_instance
9357 : 741271 : get_ultimate_leader (slp_instance instance,
9358 : : hash_map<slp_instance, slp_instance> &instance_leader)
9359 : : {
9360 : 741271 : auto_vec<slp_instance *, 8> chain;
9361 : 741271 : slp_instance *tem;
9362 : 811650 : while (*(tem = instance_leader.get (instance)) != instance)
9363 : : {
9364 : 70379 : chain.safe_push (tem);
9365 : 70379 : instance = *tem;
9366 : : }
9367 : 811650 : while (!chain.is_empty ())
9368 : 70379 : *chain.pop () = instance;
9369 : 741271 : return instance;
9370 : 741271 : }
9371 : :
9372 : : namespace {
9373 : : /* Subroutine of vect_bb_partition_graph_r. Map KEY to INSTANCE in
9374 : : KEY_TO_INSTANCE, making INSTANCE the leader of any previous mapping
9375 : : for KEY. Return true if KEY was already in KEY_TO_INSTANCE.
9376 : :
9377 : : INSTANCE_LEADER is as for get_ultimate_leader. */
9378 : :
9379 : : template<typename T>
9380 : : bool
9381 : 3283458 : vect_map_to_instance (slp_instance instance, T key,
9382 : : hash_map<T, slp_instance> &key_to_instance,
9383 : : hash_map<slp_instance, slp_instance> &instance_leader)
9384 : : {
9385 : : bool existed_p;
9386 : 3283458 : slp_instance &key_instance = key_to_instance.get_or_insert (key, &existed_p);
9387 : 3283458 : if (!existed_p)
9388 : : ;
9389 : 163400 : else if (key_instance != instance)
9390 : : {
9391 : : /* If we're running into a previously marked key make us the
9392 : : leader of the current ultimate leader. This keeps the
9393 : : leader chain acyclic and works even when the current instance
9394 : : connects two previously independent graph parts. */
9395 : 59200 : slp_instance key_leader
9396 : 59200 : = get_ultimate_leader (key_instance, instance_leader);
9397 : 59200 : if (key_leader != instance)
9398 : 17758 : instance_leader.put (key_leader, instance);
9399 : : }
9400 : 3283458 : key_instance = instance;
9401 : 3283458 : return existed_p;
9402 : : }
9403 : : }
9404 : :
9405 : : /* Worker of vect_bb_partition_graph, recurse on NODE. */
9406 : :
9407 : : static void
9408 : 908646 : vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
9409 : : slp_instance instance, slp_tree node,
9410 : : hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
9411 : : hash_map<slp_tree, slp_instance> &node_to_instance,
9412 : : hash_map<slp_instance, slp_instance> &instance_leader)
9413 : : {
9414 : 908646 : stmt_vec_info stmt_info;
9415 : 908646 : unsigned i;
9416 : :
9417 : 3283458 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
9418 : 2374812 : if (stmt_info)
9419 : 2374812 : vect_map_to_instance (instance, stmt_info, stmt_to_instance,
9420 : : instance_leader);
9421 : :
9422 : 908646 : if (vect_map_to_instance (instance, node, node_to_instance,
9423 : : instance_leader))
9424 : 908646 : return;
9425 : :
9426 : : slp_tree child;
9427 : 1747180 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9428 : 877194 : if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
9429 : 226575 : vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
9430 : : node_to_instance, instance_leader);
9431 : : }
9432 : :
9433 : : /* Partition the SLP graph into pieces that can be costed independently. */
9434 : :
9435 : : static void
9436 : 250878 : vect_bb_partition_graph (bb_vec_info bb_vinfo)
9437 : : {
9438 : 250878 : DUMP_VECT_SCOPE ("vect_bb_partition_graph");
9439 : :
9440 : : /* First walk the SLP graph assigning each involved scalar stmt a
9441 : : corresponding SLP graph entry and upon visiting a previously
9442 : : marked stmt, make the stmts leader the current SLP graph entry. */
9443 : 250878 : hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
9444 : 250878 : hash_map<slp_tree, slp_instance> node_to_instance;
9445 : 250878 : hash_map<slp_instance, slp_instance> instance_leader;
9446 : 250878 : slp_instance instance;
9447 : 932949 : for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
9448 : : {
9449 : 682071 : instance_leader.put (instance, instance);
9450 : 682071 : vect_bb_partition_graph_r (bb_vinfo,
9451 : : instance, SLP_INSTANCE_TREE (instance),
9452 : : stmt_to_instance, node_to_instance,
9453 : : instance_leader);
9454 : : }
9455 : :
9456 : : /* Then collect entries to each independent subgraph. */
9457 : 1183827 : for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
9458 : : {
9459 : 682071 : slp_instance leader = get_ultimate_leader (instance, instance_leader);
9460 : 682071 : leader->subgraph_entries.safe_push (instance);
9461 : 682071 : if (dump_enabled_p ()
9462 : 682071 : && leader != instance)
9463 : 67 : dump_printf_loc (MSG_NOTE, vect_location,
9464 : : "instance %p is leader of %p\n",
9465 : : (void *) leader, (void *) instance);
9466 : : }
9467 : 250878 : }
9468 : :
9469 : : /* Compute the set of scalar stmts participating in internal and external
9470 : : nodes. */
9471 : :
9472 : : static void
9473 : 1546472 : vect_slp_gather_vectorized_scalar_stmts (vec_info *vinfo, slp_tree node,
9474 : : hash_set<slp_tree> &visited,
9475 : : hash_set<stmt_vec_info> &vstmts,
9476 : : hash_set<stmt_vec_info> &estmts)
9477 : : {
9478 : 1546472 : int i;
9479 : 1546472 : stmt_vec_info stmt_info;
9480 : 1546472 : slp_tree child;
9481 : :
9482 : 1546472 : if (visited.add (node))
9483 : 38658 : return;
9484 : :
9485 : 1507814 : if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
9486 : : {
9487 : 3093474 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
9488 : 2231970 : if (stmt_info)
9489 : 2231970 : vstmts.add (stmt_info);
9490 : :
9491 : 3134763 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9492 : 867711 : if (child)
9493 : 867711 : vect_slp_gather_vectorized_scalar_stmts (vinfo, child, visited,
9494 : : vstmts, estmts);
9495 : : }
9496 : : else
9497 : 3628936 : for (tree def : SLP_TREE_SCALAR_OPS (node))
9498 : : {
9499 : 1691016 : stmt_vec_info def_stmt = vinfo->lookup_def (def);
9500 : 1691016 : if (def_stmt)
9501 : 335095 : estmts.add (def_stmt);
9502 : : }
9503 : : }
9504 : :
9505 : :
9506 : : /* Compute the scalar cost of the SLP node NODE and its children
9507 : : and return it. Do not account defs that are marked in LIFE and
9508 : : update LIFE according to uses of NODE. */
9509 : :
9510 : : static void
9511 : 899381 : vect_bb_slp_scalar_cost (vec_info *vinfo,
9512 : : slp_tree node, vec<bool, va_heap> *life,
9513 : : stmt_vector_for_cost *cost_vec,
9514 : : hash_set<stmt_vec_info> &vectorized_scalar_stmts,
9515 : : hash_set<stmt_vec_info> &scalar_stmts_in_externs,
9516 : : hash_set<slp_tree> &visited)
9517 : : {
9518 : 899381 : unsigned i;
9519 : 899381 : stmt_vec_info stmt_info;
9520 : 899381 : slp_tree child;
9521 : :
9522 : 899381 : if (visited.add (node))
9523 : 37860 : return;
9524 : :
9525 : 3093525 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
9526 : : {
9527 : 2232004 : ssa_op_iter op_iter;
9528 : 2232004 : def_operand_p def_p;
9529 : :
9530 : 2262292 : if (!stmt_info
9531 : 2232004 : || (*life)[i]
9532 : : /* Defs also used in external nodes are not in the
9533 : : vectorized_scalar_stmts set as they need to be preserved.
9534 : : Honor that. */
9535 : 4436633 : || scalar_stmts_in_externs.contains (stmt_info))
9536 : 102069 : continue;
9537 : :
9538 : 2201716 : stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
9539 : 2201716 : gimple *orig_stmt = orig_stmt_info->stmt;
9540 : :
9541 : : /* If there is a non-vectorized use of the defs then the scalar
9542 : : stmt is kept live in which case we do not account it or any
9543 : : required defs in the SLP children in the scalar cost. This
9544 : : way we make the vectorization more costly when compared to
9545 : : the scalar cost. */
9546 : 2201716 : if (!STMT_VINFO_LIVE_P (stmt_info))
9547 : : {
9548 : 2134893 : auto_vec<gimple *, 8> worklist;
9549 : 2134893 : hash_set<gimple *> *worklist_visited = NULL;
9550 : 2134893 : worklist.quick_push (orig_stmt);
9551 : 2139915 : do
9552 : : {
9553 : 2139915 : gimple *work_stmt = worklist.pop ();
9554 : 4675224 : FOR_EACH_PHI_OR_STMT_DEF (def_p, work_stmt, op_iter, SSA_OP_DEF)
9555 : : {
9556 : 414255 : imm_use_iterator use_iter;
9557 : 414255 : gimple *use_stmt;
9558 : 1022371 : FOR_EACH_IMM_USE_STMT (use_stmt, use_iter,
9559 : : DEF_FROM_PTR (def_p))
9560 : 626977 : if (!is_gimple_debug (use_stmt))
9561 : : {
9562 : 484588 : stmt_vec_info use_stmt_info
9563 : 484588 : = vinfo->lookup_stmt (use_stmt);
9564 : 484588 : if (!use_stmt_info
9565 : 484588 : || !vectorized_scalar_stmts.contains (use_stmt_info))
9566 : : {
9567 : 23972 : if (use_stmt_info
9568 : 21496 : && STMT_VINFO_IN_PATTERN_P (use_stmt_info))
9569 : : {
9570 : : /* For stmts participating in patterns we have
9571 : : to check its uses recursively. */
9572 : 5111 : if (!worklist_visited)
9573 : 3962 : worklist_visited = new hash_set<gimple *> ();
9574 : 5111 : if (!worklist_visited->add (use_stmt))
9575 : 5111 : worklist.safe_push (use_stmt);
9576 : 5111 : continue;
9577 : : }
9578 : 18861 : (*life)[i] = true;
9579 : 18861 : goto next_lane;
9580 : : }
9581 : 414255 : }
9582 : : }
9583 : : }
9584 : 4242108 : while (!worklist.is_empty ());
9585 : 2116032 : next_lane:
9586 : 2134893 : if (worklist_visited)
9587 : 3962 : delete worklist_visited;
9588 : 2134893 : if ((*life)[i])
9589 : 18861 : continue;
9590 : 2134893 : }
9591 : :
9592 : : /* Count scalar stmts only once. */
9593 : 2182855 : if (gimple_visited_p (orig_stmt))
9594 : 25118 : continue;
9595 : 2157737 : gimple_set_visited (orig_stmt, true);
9596 : :
9597 : 2157737 : vect_cost_for_stmt kind;
9598 : 2157737 : if (STMT_VINFO_DATA_REF (orig_stmt_info))
9599 : : {
9600 : 1963410 : data_reference_p dr = STMT_VINFO_DATA_REF (orig_stmt_info);
9601 : 1963410 : tree base = get_base_address (DR_REF (dr));
9602 : : /* When the scalar access is to a non-global not address-taken
9603 : : decl that is not BLKmode assume we can access it with a single
9604 : : non-load/store instruction. */
9605 : 1963410 : if (DECL_P (base)
9606 : 1528838 : && !is_global_var (base)
9607 : 1454274 : && !TREE_ADDRESSABLE (base)
9608 : 2517472 : && DECL_MODE (base) != BLKmode)
9609 : : kind = scalar_stmt;
9610 : 1818172 : else if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
9611 : : kind = scalar_load;
9612 : : else
9613 : 1596140 : kind = scalar_store;
9614 : : }
9615 : 194327 : else if (vect_nop_conversion_p (orig_stmt_info))
9616 : 20608 : continue;
9617 : : /* For single-argument PHIs assume coalescing which means zero cost
9618 : : for the scalar and the vector PHIs. This avoids artificially
9619 : : favoring the vector path (but may pessimize it in some cases). */
9620 : 173719 : else if (is_a <gphi *> (orig_stmt_info->stmt)
9621 : 173719 : && gimple_phi_num_args
9622 : 79483 : (as_a <gphi *> (orig_stmt_info->stmt)) == 1)
9623 : 7194 : continue;
9624 : : else
9625 : : kind = scalar_stmt;
9626 : 2129935 : record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
9627 : : SLP_TREE_VECTYPE (node), 0, vect_body);
9628 : : }
9629 : :
9630 : 1723042 : auto_vec<bool, 20> subtree_life;
9631 : 2488503 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9632 : : {
9633 : 867735 : if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
9634 : : {
9635 : : /* Do not directly pass LIFE to the recursive call, copy it to
9636 : : confine changes in the callee to the current child/subtree. */
9637 : 220620 : if (SLP_TREE_PERMUTE_P (node))
9638 : : {
9639 : 3548 : subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
9640 : 12396 : for (unsigned j = 0;
9641 : 12396 : j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
9642 : : {
9643 : 8848 : auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
9644 : 8848 : if (perm.first == i)
9645 : 4700 : subtree_life[perm.second] = (*life)[j];
9646 : : }
9647 : : }
9648 : : else
9649 : : {
9650 : 217072 : gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
9651 : 217072 : subtree_life.safe_splice (*life);
9652 : : }
9653 : 220620 : vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
9654 : : vectorized_scalar_stmts,
9655 : : scalar_stmts_in_externs, visited);
9656 : 220620 : subtree_life.truncate (0);
9657 : : }
9658 : : }
9659 : : }
9660 : :
9661 : : /* Comparator for the loop-index sorted cost vectors. */
9662 : :
9663 : : static int
9664 : 17451312 : li_cost_vec_cmp (const void *a_, const void *b_)
9665 : : {
9666 : 17451312 : auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
9667 : 17451312 : auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
9668 : 17451312 : if (a->first < b->first)
9669 : : return -1;
9670 : 16746607 : else if (a->first == b->first)
9671 : 16125198 : return 0;
9672 : : return 1;
9673 : : }
9674 : :
9675 : : /* Check if vectorization of the basic block is profitable for the
9676 : : subgraph denoted by SLP_INSTANCES. */
9677 : :
9678 : : static bool
9679 : 661130 : vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
9680 : : vec<slp_instance> slp_instances,
9681 : : loop_p orig_loop)
9682 : : {
9683 : 661130 : slp_instance instance;
9684 : 661130 : int i;
9685 : 661130 : unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
9686 : 661130 : unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
9687 : :
9688 : 661130 : if (dump_enabled_p ())
9689 : : {
9690 : 96 : dump_printf_loc (MSG_NOTE, vect_location, "Costing subgraph: \n");
9691 : 96 : hash_set<slp_tree> visited;
9692 : 387 : FOR_EACH_VEC_ELT (slp_instances, i, instance)
9693 : 99 : vect_print_slp_graph (MSG_NOTE, vect_location,
9694 : : SLP_INSTANCE_TREE (instance), visited);
9695 : 96 : }
9696 : :
9697 : : /* Compute the set of scalar stmts we know will go away 'locally' when
9698 : : vectorizing. This used to be tracked with just PURE_SLP_STMT but that's
9699 : : not accurate for nodes promoted extern late or for scalar stmts that
9700 : : are used both in extern defs and in vectorized defs. */
9701 : 661130 : hash_set<stmt_vec_info> vectorized_scalar_stmts;
9702 : 661130 : hash_set<stmt_vec_info> scalar_stmts_in_externs;
9703 : 661130 : hash_set<slp_tree> visited;
9704 : 1339891 : FOR_EACH_VEC_ELT (slp_instances, i, instance)
9705 : : {
9706 : 678761 : vect_slp_gather_vectorized_scalar_stmts (bb_vinfo,
9707 : : SLP_INSTANCE_TREE (instance),
9708 : : visited,
9709 : : vectorized_scalar_stmts,
9710 : : scalar_stmts_in_externs);
9711 : 782530 : for (stmt_vec_info rstmt : SLP_INSTANCE_ROOT_STMTS (instance))
9712 : 49903 : vectorized_scalar_stmts.add (rstmt);
9713 : : }
9714 : : /* Scalar stmts used as defs in external nodes need to be preseved, so
9715 : : remove them from vectorized_scalar_stmts. */
9716 : 960388 : for (stmt_vec_info stmt : scalar_stmts_in_externs)
9717 : 299258 : vectorized_scalar_stmts.remove (stmt);
9718 : :
9719 : : /* Calculate scalar cost and sum the cost for the vector stmts
9720 : : previously collected. */
9721 : 661130 : stmt_vector_for_cost scalar_costs = vNULL;
9722 : 661130 : stmt_vector_for_cost vector_costs = vNULL;
9723 : 661130 : visited.empty ();
9724 : 1339891 : FOR_EACH_VEC_ELT (slp_instances, i, instance)
9725 : : {
9726 : 678761 : auto_vec<bool, 20> life;
9727 : 678761 : life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
9728 : : true);
9729 : 678761 : if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
9730 : 53866 : record_stmt_cost (&scalar_costs,
9731 : 26933 : SLP_INSTANCE_ROOT_STMTS (instance).length (),
9732 : : scalar_stmt,
9733 : 26933 : SLP_INSTANCE_ROOT_STMTS (instance)[0], 0, vect_body);
9734 : 678761 : vect_bb_slp_scalar_cost (bb_vinfo,
9735 : : SLP_INSTANCE_TREE (instance),
9736 : : &life, &scalar_costs, vectorized_scalar_stmts,
9737 : : scalar_stmts_in_externs, visited);
9738 : 678761 : vector_costs.safe_splice (instance->cost_vec);
9739 : 678761 : instance->cost_vec.release ();
9740 : 678761 : }
9741 : :
9742 : 661130 : if (dump_enabled_p ())
9743 : 96 : dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
9744 : :
9745 : : /* When costing non-loop vectorization we need to consider each covered
9746 : : loop independently and make sure vectorization is profitable. For
9747 : : now we assume a loop may be not entered or executed an arbitrary
9748 : : number of iterations (??? static information can provide more
9749 : : precise info here) which means we can simply cost each containing
9750 : : loops stmts separately. */
9751 : :
9752 : : /* First produce cost vectors sorted by loop index. */
9753 : 661130 : auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
9754 : 661130 : li_scalar_costs (scalar_costs.length ());
9755 : 661130 : auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
9756 : 661130 : li_vector_costs (vector_costs.length ());
9757 : 661130 : stmt_info_for_cost *cost;
9758 : 2817998 : FOR_EACH_VEC_ELT (scalar_costs, i, cost)
9759 : : {
9760 : 2156868 : unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
9761 : 2156868 : li_scalar_costs.quick_push (std::make_pair (l, cost));
9762 : : }
9763 : : /* Use a random used loop as fallback in case the first vector_costs
9764 : : entry does not have a stmt_info associated with it. */
9765 : 661130 : unsigned l = li_scalar_costs[0].first;
9766 : 2424464 : FOR_EACH_VEC_ELT (vector_costs, i, cost)
9767 : : {
9768 : : /* We inherit from the previous COST, invariants, externals and
9769 : : extracts immediately follow the cost for the related stmt. */
9770 : 1763334 : if (cost->stmt_info)
9771 : 1041159 : l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
9772 : 1763334 : li_vector_costs.quick_push (std::make_pair (l, cost));
9773 : : }
9774 : 661130 : li_scalar_costs.qsort (li_cost_vec_cmp);
9775 : 661130 : li_vector_costs.qsort (li_cost_vec_cmp);
9776 : :
9777 : : /* Now cost the portions individually. */
9778 : : unsigned vi = 0;
9779 : : unsigned si = 0;
9780 : 1145853 : bool profitable = true;
9781 : 1145853 : while (si < li_scalar_costs.length ()
9782 : 1811361 : && vi < li_vector_costs.length ())
9783 : : {
9784 : 665508 : unsigned sl = li_scalar_costs[si].first;
9785 : 665508 : unsigned vl = li_vector_costs[vi].first;
9786 : 665508 : if (sl != vl)
9787 : : {
9788 : 1254 : if (dump_enabled_p ())
9789 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
9790 : : "Scalar %d and vector %d loop part do not "
9791 : : "match up, skipping scalar part\n", sl, vl);
9792 : : /* Skip the scalar part, assuming zero cost on the vector side. */
9793 : 2547 : do
9794 : : {
9795 : 2547 : si++;
9796 : : }
9797 : 2547 : while (si < li_scalar_costs.length ()
9798 : 4697 : && li_scalar_costs[si].first == sl);
9799 : 1254 : continue;
9800 : : }
9801 : :
9802 : 664254 : class vector_costs *scalar_target_cost_data = init_cost (bb_vinfo, true);
9803 : 2137841 : do
9804 : : {
9805 : 2137841 : add_stmt_cost (scalar_target_cost_data, li_scalar_costs[si].second);
9806 : 2137841 : si++;
9807 : : }
9808 : 2137841 : while (si < li_scalar_costs.length ()
9809 : 4282623 : && li_scalar_costs[si].first == sl);
9810 : 664254 : scalar_target_cost_data->finish_cost (nullptr);
9811 : 664254 : scalar_cost = scalar_target_cost_data->body_cost ();
9812 : :
9813 : : /* Complete the target-specific vector cost calculation. */
9814 : 664254 : class vector_costs *vect_target_cost_data = init_cost (bb_vinfo, false);
9815 : 1731092 : do
9816 : : {
9817 : 1731092 : add_stmt_cost (vect_target_cost_data, li_vector_costs[vi].second);
9818 : 1731092 : vi++;
9819 : : }
9820 : 1731092 : while (vi < li_vector_costs.length ()
9821 : 3470070 : && li_vector_costs[vi].first == vl);
9822 : 664254 : vect_target_cost_data->finish_cost (scalar_target_cost_data);
9823 : 664254 : vec_prologue_cost = vect_target_cost_data->prologue_cost ();
9824 : 664254 : vec_inside_cost = vect_target_cost_data->body_cost ();
9825 : 664254 : vec_epilogue_cost = vect_target_cost_data->epilogue_cost ();
9826 : 664254 : delete scalar_target_cost_data;
9827 : 664254 : delete vect_target_cost_data;
9828 : :
9829 : 664254 : vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
9830 : :
9831 : 664254 : if (dump_enabled_p ())
9832 : : {
9833 : 96 : dump_printf_loc (MSG_NOTE, vect_location,
9834 : : "Cost model analysis for part in loop %d:\n", sl);
9835 : 96 : dump_printf (MSG_NOTE, " Vector cost: %d\n",
9836 : : vec_inside_cost + vec_outside_cost);
9837 : 96 : dump_printf (MSG_NOTE, " Scalar cost: %d\n", scalar_cost);
9838 : : }
9839 : :
9840 : : /* Vectorization is profitable if its cost is more than the cost of scalar
9841 : : version. Note that we err on the vector side for equal cost because
9842 : : the cost estimate is otherwise quite pessimistic (constant uses are
9843 : : free on the scalar side but cost a load on the vector side for
9844 : : example). */
9845 : 664254 : if (vec_outside_cost + vec_inside_cost > scalar_cost)
9846 : : {
9847 : : profitable = false;
9848 : : break;
9849 : : }
9850 : : }
9851 : 1141462 : if (profitable && vi < li_vector_costs.length ())
9852 : : {
9853 : 1042 : if (dump_enabled_p ())
9854 : 12 : dump_printf_loc (MSG_NOTE, vect_location,
9855 : : "Excess vector cost for part in loop %d:\n",
9856 : 6 : li_vector_costs[vi].first);
9857 : : profitable = false;
9858 : : }
9859 : :
9860 : : /* Unset visited flag. This is delayed when the subgraph is profitable
9861 : : and we process the loop for remaining unvectorized if-converted code. */
9862 : 661130 : if (!orig_loop || !profitable)
9863 : 2816699 : FOR_EACH_VEC_ELT (scalar_costs, i, cost)
9864 : 2155662 : gimple_set_visited (cost->stmt_info->stmt, false);
9865 : :
9866 : 661130 : scalar_costs.release ();
9867 : 661130 : vector_costs.release ();
9868 : :
9869 : 661130 : return profitable;
9870 : 661130 : }
9871 : :
9872 : : /* qsort comparator for lane defs. */
9873 : :
9874 : : static int
9875 : 40 : vld_cmp (const void *a_, const void *b_)
9876 : : {
9877 : 40 : auto *a = (const std::pair<unsigned, tree> *)a_;
9878 : 40 : auto *b = (const std::pair<unsigned, tree> *)b_;
9879 : 40 : return a->first - b->first;
9880 : : }
9881 : :
9882 : : /* Return true if USE_STMT is a vector lane insert into VEC and set
9883 : : *THIS_LANE to the lane number that is set. */
9884 : :
9885 : : static bool
9886 : 244 : vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
9887 : : {
9888 : 244 : gassign *use_ass = dyn_cast <gassign *> (use_stmt);
9889 : 91 : if (!use_ass
9890 : 91 : || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
9891 : 22 : || (vec
9892 : 22 : ? gimple_assign_rhs1 (use_ass) != vec
9893 : 24 : : ((vec = gimple_assign_rhs1 (use_ass)), false))
9894 : 46 : || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
9895 : 46 : TREE_TYPE (gimple_assign_rhs2 (use_ass)))
9896 : 46 : || !constant_multiple_p
9897 : 46 : (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
9898 : 92 : tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
9899 : : this_lane))
9900 : 198 : return false;
9901 : : return true;
9902 : : }
9903 : :
9904 : : /* Find any vectorizable constructors and add them to the grouped_store
9905 : : array. */
9906 : :
9907 : : static void
9908 : 2376869 : vect_slp_check_for_roots (bb_vec_info bb_vinfo)
9909 : : {
9910 : 18376151 : for (unsigned i = 0; i < bb_vinfo->nbbs; ++i)
9911 : 31998564 : for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
9912 : 134043672 : !gsi_end_p (gsi); gsi_next (&gsi))
9913 : : {
9914 : 118044390 : gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
9915 : : /* This can be used to start SLP discovery for early breaks for BB early breaks
9916 : : when we get that far. */
9917 : 118044390 : if (!assign)
9918 : 174948111 : continue;
9919 : :
9920 : 31514412 : tree rhs = gimple_assign_rhs1 (assign);
9921 : 31514412 : enum tree_code code = gimple_assign_rhs_code (assign);
9922 : 31514412 : use_operand_p use_p;
9923 : 31514412 : gimple *use_stmt;
9924 : 31514412 : if (code == CONSTRUCTOR)
9925 : : {
9926 : 1622961 : if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
9927 : 60865 : || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
9928 : 89414 : CONSTRUCTOR_NELTS (rhs))
9929 : 41590 : || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
9930 : 1664551 : || uniform_vector_p (rhs))
9931 : 1611046 : continue;
9932 : :
9933 : : unsigned j;
9934 : : tree val;
9935 : 59642 : FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
9936 : 47727 : if (TREE_CODE (val) != SSA_NAME
9937 : 47727 : || !bb_vinfo->lookup_def (val))
9938 : : break;
9939 : 29636 : if (j != CONSTRUCTOR_NELTS (rhs))
9940 : 2903 : continue;
9941 : :
9942 : 11915 : vec<stmt_vec_info> roots = vNULL;
9943 : 11915 : roots.safe_push (bb_vinfo->lookup_stmt (assign));
9944 : 11915 : vec<stmt_vec_info> stmts;
9945 : 11915 : stmts.create (CONSTRUCTOR_NELTS (rhs));
9946 : 67116 : FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
9947 : 43286 : stmts.quick_push
9948 : 43286 : (vect_stmt_to_vectorize (bb_vinfo->lookup_def (val)));
9949 : 11915 : bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
9950 : 11915 : stmts, roots));
9951 : : }
9952 : 29891451 : else if (code == BIT_INSERT_EXPR
9953 : 889 : && VECTOR_TYPE_P (TREE_TYPE (rhs))
9954 : 590 : && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
9955 : 590 : && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
9956 : 587 : && integer_zerop (gimple_assign_rhs3 (assign))
9957 : 335 : && useless_type_conversion_p
9958 : 335 : (TREE_TYPE (TREE_TYPE (rhs)),
9959 : 335 : TREE_TYPE (gimple_assign_rhs2 (assign)))
9960 : 29892061 : && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
9961 : : {
9962 : : /* We start to match on insert to lane zero but since the
9963 : : inserts need not be ordered we'd have to search both
9964 : : the def and the use chains. */
9965 : 211 : tree vectype = TREE_TYPE (rhs);
9966 : 211 : unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
9967 : 211 : auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
9968 : 211 : auto_sbitmap lanes (nlanes);
9969 : 211 : bitmap_clear (lanes);
9970 : 211 : bitmap_set_bit (lanes, 0);
9971 : 211 : tree def = gimple_assign_lhs (assign);
9972 : 211 : lane_defs.quick_push
9973 : 211 : (std::make_pair (0, gimple_assign_rhs2 (assign)));
9974 : 211 : unsigned lanes_found = 1;
9975 : : /* Start with the use chains, the last stmt will be the root. */
9976 : 211 : stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
9977 : 211 : vec<stmt_vec_info> roots = vNULL;
9978 : 211 : roots.safe_push (last);
9979 : 213 : do
9980 : : {
9981 : 213 : use_operand_p use_p;
9982 : 213 : gimple *use_stmt;
9983 : 213 : if (!single_imm_use (def, &use_p, &use_stmt))
9984 : : break;
9985 : 207 : unsigned this_lane;
9986 : 207 : if (!bb_vinfo->lookup_stmt (use_stmt)
9987 : 207 : || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
9988 : 229 : || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
9989 : : break;
9990 : 22 : if (bitmap_bit_p (lanes, this_lane))
9991 : : break;
9992 : 2 : lanes_found++;
9993 : 2 : bitmap_set_bit (lanes, this_lane);
9994 : 2 : gassign *use_ass = as_a <gassign *> (use_stmt);
9995 : 2 : lane_defs.quick_push (std::make_pair
9996 : 2 : (this_lane, gimple_assign_rhs2 (use_ass)));
9997 : 2 : last = bb_vinfo->lookup_stmt (use_ass);
9998 : 2 : roots.safe_push (last);
9999 : 2 : def = gimple_assign_lhs (use_ass);
10000 : : }
10001 : 2 : while (lanes_found < nlanes);
10002 : 211 : if (roots.length () > 1)
10003 : 2 : std::swap(roots[0], roots[roots.length () - 1]);
10004 : 211 : if (lanes_found < nlanes)
10005 : : {
10006 : : /* Now search the def chain. */
10007 : 211 : def = gimple_assign_rhs1 (assign);
10008 : 213 : do
10009 : : {
10010 : 213 : if (TREE_CODE (def) != SSA_NAME
10011 : 213 : || !has_single_use (def))
10012 : : break;
10013 : 56 : gimple *def_stmt = SSA_NAME_DEF_STMT (def);
10014 : 56 : unsigned this_lane;
10015 : 56 : if (!bb_vinfo->lookup_stmt (def_stmt)
10016 : 37 : || !vect_slp_is_lane_insert (def_stmt,
10017 : : NULL_TREE, &this_lane)
10018 : 80 : || !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
10019 : : break;
10020 : 24 : if (bitmap_bit_p (lanes, this_lane))
10021 : : break;
10022 : 4 : lanes_found++;
10023 : 4 : bitmap_set_bit (lanes, this_lane);
10024 : 8 : lane_defs.quick_push (std::make_pair
10025 : 4 : (this_lane,
10026 : 4 : gimple_assign_rhs2 (def_stmt)));
10027 : 4 : roots.safe_push (bb_vinfo->lookup_stmt (def_stmt));
10028 : 4 : def = gimple_assign_rhs1 (def_stmt);
10029 : : }
10030 : 4 : while (lanes_found < nlanes);
10031 : : }
10032 : 211 : if (lanes_found == nlanes)
10033 : : {
10034 : : /* Sort lane_defs after the lane index and register the root. */
10035 : 2 : lane_defs.qsort (vld_cmp);
10036 : 2 : vec<stmt_vec_info> stmts;
10037 : 2 : stmts.create (nlanes);
10038 : 10 : for (unsigned i = 0; i < nlanes; ++i)
10039 : 8 : stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
10040 : 2 : bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
10041 : 2 : stmts, roots));
10042 : : }
10043 : : else
10044 : 209 : roots.release ();
10045 : 211 : }
10046 : 29891240 : else if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
10047 : 28937209 : && (associative_tree_code (code) || code == MINUS_EXPR)
10048 : : /* ??? This pessimizes a two-element reduction. PR54400.
10049 : : ??? In-order reduction could be handled if we only
10050 : : traverse one operand chain in vect_slp_linearize_chain. */
10051 : 33806774 : && !needs_fold_left_reduction_p (TREE_TYPE (rhs), code)
10052 : : /* Ops with constants at the tail can be stripped here. */
10053 : 5807422 : && TREE_CODE (rhs) == SSA_NAME
10054 : 5741492 : && TREE_CODE (gimple_assign_rhs2 (assign)) == SSA_NAME
10055 : : /* Should be the chain end. */
10056 : 32186145 : && (!single_imm_use (gimple_assign_lhs (assign),
10057 : : &use_p, &use_stmt)
10058 : 1774404 : || !is_gimple_assign (use_stmt)
10059 : 1181936 : || (gimple_assign_rhs_code (use_stmt) != code
10060 : 872010 : && ((code != PLUS_EXPR && code != MINUS_EXPR)
10061 : 487518 : || (gimple_assign_rhs_code (use_stmt)
10062 : 487518 : != (code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR))))))
10063 : : {
10064 : : /* We start the match at the end of a possible association
10065 : : chain. */
10066 : 1891888 : auto_vec<chain_op_t> chain;
10067 : 1891888 : auto_vec<std::pair<tree_code, gimple *> > worklist;
10068 : 1891888 : auto_vec<gimple *> chain_stmts;
10069 : 1891888 : gimple *code_stmt = NULL, *alt_code_stmt = NULL;
10070 : 1891888 : if (code == MINUS_EXPR)
10071 : 316026 : code = PLUS_EXPR;
10072 : 1891888 : internal_fn reduc_fn;
10073 : 2168997 : if (!reduction_fn_for_scalar_code (code, &reduc_fn)
10074 : 1891888 : || reduc_fn == IFN_LAST)
10075 : 277109 : continue;
10076 : 1614779 : vect_slp_linearize_chain (bb_vinfo, worklist, chain, code, assign,
10077 : : /* ??? */
10078 : : code_stmt, alt_code_stmt, &chain_stmts);
10079 : 3229558 : if (chain.length () > 1)
10080 : : {
10081 : : /* Sort the chain according to def_type and operation. */
10082 : 1614779 : chain.sort (dt_sort_cmp, bb_vinfo);
10083 : : /* ??? Now we'd want to strip externals and constants
10084 : : but record those to be handled in the epilogue. */
10085 : : /* ??? For now do not allow mixing ops or externs/constants. */
10086 : 1614779 : bool invalid = false;
10087 : 1614779 : unsigned remain_cnt = 0;
10088 : 1614779 : unsigned last_idx = 0;
10089 : 4863350 : for (unsigned i = 0; i < chain.length (); ++i)
10090 : : {
10091 : 3586294 : if (chain[i].code != code)
10092 : : {
10093 : : invalid = true;
10094 : : break;
10095 : : }
10096 : 3248571 : if (chain[i].dt != vect_internal_def
10097 : : /* Avoid stmts where the def is not the LHS, like
10098 : : ASMs. */
10099 : 6265201 : || (gimple_get_lhs (bb_vinfo->lookup_def
10100 : 3016630 : (chain[i].op)->stmt)
10101 : 3016630 : != chain[i].op))
10102 : 234885 : remain_cnt++;
10103 : : else
10104 : : last_idx = i;
10105 : : }
10106 : : /* Make sure to have an even number of lanes as we later do
10107 : : all-or-nothing discovery, not trying to split further. */
10108 : 1614779 : if ((chain.length () - remain_cnt) & 1)
10109 : 188485 : remain_cnt++;
10110 : 1614779 : if (!invalid && chain.length () - remain_cnt > 1)
10111 : : {
10112 : 1208285 : vec<stmt_vec_info> stmts;
10113 : 1208285 : vec<tree> remain = vNULL;
10114 : 1208285 : stmts.create (chain.length ());
10115 : 1208285 : if (remain_cnt > 0)
10116 : 111280 : remain.create (remain_cnt);
10117 : 3879883 : for (unsigned i = 0; i < chain.length (); ++i)
10118 : : {
10119 : 2671598 : stmt_vec_info stmt_info;
10120 : 2671598 : if (chain[i].dt == vect_internal_def
10121 : 2633999 : && ((stmt_info = bb_vinfo->lookup_def (chain[i].op)),
10122 : 2633999 : gimple_get_lhs (stmt_info->stmt) == chain[i].op)
10123 : 5305513 : && (i != last_idx
10124 : 1208285 : || (stmts.length () & 1)))
10125 : 2549038 : stmts.quick_push (stmt_info);
10126 : : else
10127 : 122560 : remain.quick_push (chain[i].op);
10128 : : }
10129 : 1208285 : vec<stmt_vec_info> roots;
10130 : 1208285 : roots.create (chain_stmts.length ());
10131 : 2671598 : for (unsigned i = 0; i < chain_stmts.length (); ++i)
10132 : 1463313 : roots.quick_push (bb_vinfo->lookup_stmt (chain_stmts[i]));
10133 : 1208285 : bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_bb_reduc,
10134 : 1208285 : stmts, roots, remain));
10135 : : }
10136 : : }
10137 : 1891888 : }
10138 : : }
10139 : 2376869 : }
10140 : :
10141 : : /* Walk the grouped store chains and replace entries with their
10142 : : pattern variant if any. */
10143 : :
10144 : : static void
10145 : 638598 : vect_fixup_store_groups_with_patterns (vec_info *vinfo)
10146 : : {
10147 : 638598 : stmt_vec_info first_element;
10148 : 638598 : unsigned i;
10149 : :
10150 : 1527657 : FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
10151 : : {
10152 : : /* We also have CTORs in this array. */
10153 : 889059 : if (!STMT_VINFO_GROUPED_ACCESS (first_element))
10154 : 0 : continue;
10155 : 889059 : if (STMT_VINFO_IN_PATTERN_P (first_element))
10156 : : {
10157 : 241 : stmt_vec_info orig = first_element;
10158 : 241 : first_element = STMT_VINFO_RELATED_STMT (first_element);
10159 : 241 : DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
10160 : 241 : DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
10161 : 241 : DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
10162 : 241 : DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
10163 : 241 : vinfo->grouped_stores[i] = first_element;
10164 : : }
10165 : 889059 : stmt_vec_info prev = first_element;
10166 : 2499452 : while (DR_GROUP_NEXT_ELEMENT (prev))
10167 : : {
10168 : 1610393 : stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
10169 : 1610393 : if (STMT_VINFO_IN_PATTERN_P (elt))
10170 : : {
10171 : 857 : stmt_vec_info orig = elt;
10172 : 857 : elt = STMT_VINFO_RELATED_STMT (elt);
10173 : 857 : DR_GROUP_NEXT_ELEMENT (prev) = elt;
10174 : 857 : DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
10175 : 857 : DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
10176 : : }
10177 : 1610393 : DR_GROUP_FIRST_ELEMENT (elt) = first_element;
10178 : 1610393 : prev = elt;
10179 : : }
10180 : : }
10181 : 638598 : }
10182 : :
10183 : : /* Check if the region described by BB_VINFO can be vectorized, returning
10184 : : true if so. When returning false, set FATAL to true if the same failure
10185 : : would prevent vectorization at other vector sizes, false if it is still
10186 : : worth trying other sizes. N_STMTS is the number of statements in the
10187 : : region. */
10188 : :
10189 : : static bool
10190 : 2376869 : vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
10191 : : vec<int> *dataref_groups)
10192 : : {
10193 : 2376869 : DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
10194 : :
10195 : 2376869 : slp_instance instance;
10196 : 2376869 : int i;
10197 : :
10198 : : /* The first group of checks is independent of the vector size. */
10199 : 2376869 : fatal = true;
10200 : :
10201 : : /* Analyze the data references. */
10202 : :
10203 : 2376869 : if (!vect_analyze_data_refs (bb_vinfo, NULL))
10204 : : {
10205 : 0 : if (dump_enabled_p ())
10206 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10207 : : "not vectorized: unhandled data-ref in basic "
10208 : : "block.\n");
10209 : 0 : return false;
10210 : : }
10211 : :
10212 : 2376869 : if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
10213 : : {
10214 : 0 : if (dump_enabled_p ())
10215 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10216 : : "not vectorized: unhandled data access in "
10217 : : "basic block.\n");
10218 : 0 : return false;
10219 : : }
10220 : :
10221 : 2376869 : vect_slp_check_for_roots (bb_vinfo);
10222 : :
10223 : : /* If there are no grouped stores and no constructors in the region
10224 : : there is no need to continue with pattern recog as vect_analyze_slp
10225 : : will fail anyway. */
10226 : 2376869 : if (bb_vinfo->grouped_stores.is_empty ()
10227 : 2014595 : && bb_vinfo->roots.is_empty ())
10228 : : {
10229 : 1738271 : if (dump_enabled_p ())
10230 : 1012 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10231 : : "not vectorized: no grouped stores in "
10232 : : "basic block.\n");
10233 : 1738271 : return false;
10234 : : }
10235 : :
10236 : : /* While the rest of the analysis below depends on it in some way. */
10237 : 638598 : fatal = false;
10238 : :
10239 : 638598 : vect_pattern_recog (bb_vinfo);
10240 : :
10241 : : /* Update store groups from pattern processing. */
10242 : 638598 : vect_fixup_store_groups_with_patterns (bb_vinfo);
10243 : :
10244 : : /* Check the SLP opportunities in the basic block, analyze and build SLP
10245 : : trees. */
10246 : 638598 : if (!vect_analyze_slp (bb_vinfo, n_stmts, false))
10247 : : {
10248 : 0 : if (dump_enabled_p ())
10249 : : {
10250 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10251 : : "Failed to SLP the basic block.\n");
10252 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10253 : : "not vectorized: failed to find SLP opportunities "
10254 : : "in basic block.\n");
10255 : : }
10256 : 0 : return false;
10257 : : }
10258 : :
10259 : : /* Optimize permutations. */
10260 : 638598 : vect_optimize_slp (bb_vinfo);
10261 : :
10262 : : /* Gather the loads reachable from the SLP graph entries. */
10263 : 638598 : vect_gather_slp_loads (bb_vinfo);
10264 : :
10265 : 638598 : vect_record_base_alignments (bb_vinfo);
10266 : :
10267 : : /* Analyze and verify the alignment of data references and the
10268 : : dependence in the SLP instances. */
10269 : 1433515 : for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
10270 : : {
10271 : 794917 : vect_location = instance->location ();
10272 : 794917 : if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
10273 : 794917 : || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
10274 : : {
10275 : 7643 : slp_tree node = SLP_INSTANCE_TREE (instance);
10276 : 7643 : stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
10277 : 7643 : if (dump_enabled_p ())
10278 : 4 : dump_printf_loc (MSG_NOTE, vect_location,
10279 : : "removing SLP instance operations starting from: %G",
10280 : : stmt_info->stmt);
10281 : 7643 : vect_free_slp_instance (instance);
10282 : 7643 : BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
10283 : 7643 : continue;
10284 : 7643 : }
10285 : :
10286 : : /* Mark all the statements that we want to vectorize as pure SLP and
10287 : : relevant. */
10288 : 787274 : vect_mark_slp_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance));
10289 : 787274 : vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
10290 : 787274 : unsigned j;
10291 : 787274 : stmt_vec_info root;
10292 : : /* Likewise consider instance root stmts as vectorized. */
10293 : 1740523 : FOR_EACH_VEC_ELT (SLP_INSTANCE_ROOT_STMTS (instance), j, root)
10294 : 165975 : STMT_SLP_TYPE (root) = pure_slp;
10295 : :
10296 : 787274 : i++;
10297 : : }
10298 : 2408957 : if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
10299 : : return false;
10300 : :
10301 : 282966 : if (!vect_slp_analyze_operations (bb_vinfo))
10302 : : {
10303 : 32088 : if (dump_enabled_p ())
10304 : 78 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10305 : : "not vectorized: bad operation in basic block.\n");
10306 : 32088 : return false;
10307 : : }
10308 : :
10309 : 250878 : vect_bb_partition_graph (bb_vinfo);
10310 : :
10311 : 250878 : return true;
10312 : : }
10313 : :
10314 : : /* Subroutine of vect_slp_bb. Try to vectorize the statements for all
10315 : : basic blocks in BBS, returning true on success.
10316 : : The region has N_STMTS statements and has the datarefs given by DATAREFS. */
10317 : :
10318 : : static bool
10319 : 2044789 : vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
10320 : : vec<int> *dataref_groups, unsigned int n_stmts,
10321 : : loop_p orig_loop)
10322 : : {
10323 : 2044789 : bb_vec_info bb_vinfo;
10324 : 2044789 : auto_vector_modes vector_modes;
10325 : :
10326 : : /* Autodetect first vector size we try. */
10327 : 2044789 : machine_mode next_vector_mode = VOIDmode;
10328 : 2044789 : targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
10329 : 2044789 : unsigned int mode_i = 0;
10330 : :
10331 : 2044789 : vec_info_shared shared;
10332 : :
10333 : 2044789 : machine_mode autodetected_vector_mode = VOIDmode;
10334 : 2708949 : while (1)
10335 : : {
10336 : 2376869 : bool vectorized = false;
10337 : 2376869 : bool fatal = false;
10338 : 2376869 : bb_vinfo = new _bb_vec_info (bbs, &shared);
10339 : :
10340 : 2376869 : bool first_time_p = shared.datarefs.is_empty ();
10341 : 2376869 : BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
10342 : 2376869 : if (first_time_p)
10343 : 2067706 : bb_vinfo->shared->save_datarefs ();
10344 : : else
10345 : 309163 : bb_vinfo->shared->check_datarefs ();
10346 : 2376869 : bb_vinfo->vector_mode = next_vector_mode;
10347 : :
10348 : 2376869 : if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
10349 : : {
10350 : 250878 : if (dump_enabled_p ())
10351 : : {
10352 : 1478 : dump_printf_loc (MSG_NOTE, vect_location,
10353 : : "***** Analysis succeeded with vector mode"
10354 : 739 : " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
10355 : 739 : dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
10356 : : }
10357 : :
10358 : 250878 : bb_vinfo->shared->check_datarefs ();
10359 : :
10360 : 250878 : bool force_clear = false;
10361 : 250878 : auto_vec<slp_instance> profitable_subgraphs;
10362 : 1434705 : for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo))
10363 : : {
10364 : 682071 : if (instance->subgraph_entries.is_empty ())
10365 : 217343 : continue;
10366 : :
10367 : 664313 : dump_user_location_t saved_vect_location = vect_location;
10368 : 664313 : vect_location = instance->location ();
10369 : 664313 : if (!unlimited_cost_model (NULL)
10370 : 1325443 : && !vect_bb_vectorization_profitable_p
10371 : 661130 : (bb_vinfo, instance->subgraph_entries, orig_loop))
10372 : : {
10373 : 181827 : if (dump_enabled_p ())
10374 : 26 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10375 : : "not vectorized: vectorization is not "
10376 : : "profitable.\n");
10377 : 181827 : vect_location = saved_vect_location;
10378 : 181827 : continue;
10379 : : }
10380 : :
10381 : 482486 : vect_location = saved_vect_location;
10382 : 482486 : if (!dbg_cnt (vect_slp))
10383 : : {
10384 : 0 : force_clear = true;
10385 : 0 : continue;
10386 : : }
10387 : :
10388 : 482486 : profitable_subgraphs.safe_push (instance);
10389 : : }
10390 : :
10391 : : /* When we're vectorizing an if-converted loop body make sure
10392 : : we vectorized all if-converted code. */
10393 : 418815 : if ((!profitable_subgraphs.is_empty () || force_clear) && orig_loop)
10394 : : {
10395 : 103 : gcc_assert (bb_vinfo->nbbs == 1);
10396 : 206 : for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]);
10397 : 4256 : !gsi_end_p (gsi); gsi_next (&gsi))
10398 : : {
10399 : : /* The costing above left us with DCEable vectorized scalar
10400 : : stmts having the visited flag set on profitable
10401 : : subgraphs. Do the delayed clearing of the flag here. */
10402 : 4153 : if (gimple_visited_p (gsi_stmt (gsi)))
10403 : : {
10404 : 1180 : gimple_set_visited (gsi_stmt (gsi), false);
10405 : 1180 : continue;
10406 : : }
10407 : 2973 : if (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED)
10408 : 867 : continue;
10409 : :
10410 : 6121 : if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi)))
10411 : 2558 : if (gimple_assign_rhs_code (ass) == COND_EXPR)
10412 : : {
10413 : 63 : if (!profitable_subgraphs.is_empty ()
10414 : 26 : && dump_enabled_p ())
10415 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
10416 : : "not profitable because of "
10417 : : "unprofitable if-converted scalar "
10418 : : "code\n");
10419 : 37 : profitable_subgraphs.truncate (0);
10420 : : }
10421 : : }
10422 : : }
10423 : :
10424 : : /* Finally schedule the profitable subgraphs. */
10425 : 1069202 : for (slp_instance instance : profitable_subgraphs)
10426 : : {
10427 : 482450 : if (!vectorized && dump_enabled_p ())
10428 : 716 : dump_printf_loc (MSG_NOTE, vect_location,
10429 : : "Basic block will be vectorized "
10430 : : "using SLP\n");
10431 : 482450 : vectorized = true;
10432 : :
10433 : : /* Dump before scheduling as store vectorization will remove
10434 : : the original stores and mess with the instance tree
10435 : : so querying its location will eventually ICE. */
10436 : 482450 : if (flag_checking)
10437 : 1939518 : for (slp_instance sub : instance->subgraph_entries)
10438 : 492168 : gcc_assert (SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub)));
10439 : 482450 : unsigned HOST_WIDE_INT bytes;
10440 : 482450 : if (dump_enabled_p ())
10441 : 3415 : for (slp_instance sub : instance->subgraph_entries)
10442 : : {
10443 : 904 : tree vtype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub));
10444 : 1808 : if (GET_MODE_SIZE (TYPE_MODE (vtype)).is_constant (&bytes))
10445 : 904 : dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
10446 : 904 : sub->location (),
10447 : : "basic block part vectorized using %wu "
10448 : : "byte vectors\n", bytes);
10449 : : else
10450 : : dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
10451 : : sub->location (),
10452 : : "basic block part vectorized using "
10453 : : "variable length vectors\n");
10454 : : }
10455 : :
10456 : 482450 : dump_user_location_t saved_vect_location = vect_location;
10457 : 482450 : vect_location = instance->location ();
10458 : :
10459 : 482450 : vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
10460 : :
10461 : 482450 : vect_location = saved_vect_location;
10462 : : }
10463 : :
10464 : :
10465 : : /* Generate the invariant statements. */
10466 : 250878 : if (!gimple_seq_empty_p (bb_vinfo->inv_pattern_def_seq))
10467 : : {
10468 : 23 : if (dump_enabled_p ())
10469 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
10470 : : "------>generating invariant statements\n");
10471 : :
10472 : 23 : bb_vinfo->insert_seq_on_entry (NULL,
10473 : : bb_vinfo->inv_pattern_def_seq);
10474 : : }
10475 : 250878 : }
10476 : : else
10477 : : {
10478 : 2125991 : if (dump_enabled_p ())
10479 : 1289 : dump_printf_loc (MSG_NOTE, vect_location,
10480 : : "***** Analysis failed with vector mode %s\n",
10481 : 1289 : GET_MODE_NAME (bb_vinfo->vector_mode));
10482 : : }
10483 : :
10484 : 2376869 : if (mode_i == 0)
10485 : 2044789 : autodetected_vector_mode = bb_vinfo->vector_mode;
10486 : :
10487 : 2376869 : if (!fatal)
10488 : 3366620 : while (mode_i < vector_modes.length ()
10489 : 1840852 : && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
10490 : : {
10491 : 351153 : if (dump_enabled_p ())
10492 : 1622 : dump_printf_loc (MSG_NOTE, vect_location,
10493 : : "***** The result for vector mode %s would"
10494 : : " be the same\n",
10495 : 811 : GET_MODE_NAME (vector_modes[mode_i]));
10496 : 351153 : mode_i += 1;
10497 : : }
10498 : :
10499 : 2376869 : delete bb_vinfo;
10500 : :
10501 : 2376869 : if (mode_i < vector_modes.length ()
10502 : 2191551 : && VECTOR_MODE_P (autodetected_vector_mode)
10503 : 2089744 : && (related_vector_mode (vector_modes[mode_i],
10504 : : GET_MODE_INNER (autodetected_vector_mode))
10505 : 1044872 : == autodetected_vector_mode)
10506 : 4568420 : && (related_vector_mode (autodetected_vector_mode,
10507 : 544462 : GET_MODE_INNER (vector_modes[mode_i]))
10508 : 1088924 : == vector_modes[mode_i]))
10509 : : {
10510 : 544462 : if (dump_enabled_p ())
10511 : 206 : dump_printf_loc (MSG_NOTE, vect_location,
10512 : : "***** Skipping vector mode %s, which would"
10513 : : " repeat the analysis for %s\n",
10514 : 206 : GET_MODE_NAME (vector_modes[mode_i]),
10515 : 206 : GET_MODE_NAME (autodetected_vector_mode));
10516 : 544462 : mode_i += 1;
10517 : : }
10518 : :
10519 : 2376869 : if (vectorized
10520 : 2208958 : || mode_i == vector_modes.length ()
10521 : 2023683 : || autodetected_vector_mode == VOIDmode
10522 : : /* If vect_slp_analyze_bb_1 signaled that analysis for all
10523 : : vector sizes will fail do not bother iterating. */
10524 : 3253873 : || fatal)
10525 : 4089578 : return vectorized;
10526 : :
10527 : : /* Try the next biggest vector size. */
10528 : 332080 : next_vector_mode = vector_modes[mode_i++];
10529 : 332080 : if (dump_enabled_p ())
10530 : 207 : dump_printf_loc (MSG_NOTE, vect_location,
10531 : : "***** Re-trying analysis with vector mode %s\n",
10532 : 207 : GET_MODE_NAME (next_vector_mode));
10533 : 332080 : }
10534 : 2044789 : }
10535 : :
10536 : :
10537 : : /* Main entry for the BB vectorizer. Analyze and transform BBS, returns
10538 : : true if anything in the basic-block was vectorized. */
10539 : :
10540 : : static bool
10541 : 2044789 : vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop)
10542 : : {
10543 : 2044789 : vec<data_reference_p> datarefs = vNULL;
10544 : 2044789 : auto_vec<int> dataref_groups;
10545 : 2044789 : int insns = 0;
10546 : 2044789 : int current_group = 0;
10547 : :
10548 : 13045334 : for (unsigned i = 0; i < bbs.length (); i++)
10549 : : {
10550 : 11000545 : basic_block bb = bbs[i];
10551 : 90015150 : for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
10552 : 79014605 : gsi_next (&gsi))
10553 : : {
10554 : 79014605 : gimple *stmt = gsi_stmt (gsi);
10555 : 79014605 : if (is_gimple_debug (stmt))
10556 : 48792109 : continue;
10557 : :
10558 : 30222496 : insns++;
10559 : :
10560 : 30222496 : if (gimple_location (stmt) != UNKNOWN_LOCATION)
10561 : 27143433 : vect_location = stmt;
10562 : :
10563 : 30222496 : if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
10564 : : &dataref_groups, current_group))
10565 : 5211536 : ++current_group;
10566 : : }
10567 : : /* New BBs always start a new DR group. */
10568 : 11000545 : ++current_group;
10569 : : }
10570 : :
10571 : 2044789 : return vect_slp_region (bbs, datarefs, &dataref_groups, insns, orig_loop);
10572 : 2044789 : }
10573 : :
10574 : : /* Special entry for the BB vectorizer. Analyze and transform a single
10575 : : if-converted BB with ORIG_LOOPs body being the not if-converted
10576 : : representation. Returns true if anything in the basic-block was
10577 : : vectorized. */
10578 : :
10579 : : bool
10580 : 16902 : vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop)
10581 : : {
10582 : 16902 : auto_vec<basic_block> bbs;
10583 : 16902 : bbs.safe_push (bb);
10584 : 16902 : return vect_slp_bbs (bbs, orig_loop);
10585 : 16902 : }
10586 : :
10587 : : /* Main entry for the BB vectorizer. Analyze and transform BB, returns
10588 : : true if anything in the basic-block was vectorized. */
10589 : :
10590 : : bool
10591 : 910473 : vect_slp_function (function *fun)
10592 : : {
10593 : 910473 : bool r = false;
10594 : 910473 : int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
10595 : 910473 : auto_bitmap exit_bbs;
10596 : 910473 : bitmap_set_bit (exit_bbs, EXIT_BLOCK);
10597 : 910473 : edge entry = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (fun));
10598 : 910473 : unsigned n = rev_post_order_and_mark_dfs_back_seme (fun, entry, exit_bbs,
10599 : 910473 : true, rpo, NULL);
10600 : :
10601 : : /* For the moment split the function into pieces to avoid making
10602 : : the iteration on the vector mode moot. Split at points we know
10603 : : to not handle well which is CFG merges (SLP discovery doesn't
10604 : : handle non-loop-header PHIs) and loop exits. Since pattern
10605 : : recog requires reverse iteration to visit uses before defs
10606 : : simply chop RPO into pieces. */
10607 : 910473 : auto_vec<basic_block> bbs;
10608 : 11924297 : for (unsigned i = 0; i < n; i++)
10609 : : {
10610 : 11013824 : basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
10611 : 11013824 : bool split = false;
10612 : :
10613 : : /* Split when a BB is not dominated by the first block. */
10614 : 20757542 : if (!bbs.is_empty ()
10615 : 9743718 : && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
10616 : : {
10617 : 776379 : if (dump_enabled_p ())
10618 : 146 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10619 : : "splitting region at dominance boundary bb%d\n",
10620 : : bb->index);
10621 : : split = true;
10622 : : }
10623 : : /* Split when the loop determined by the first block
10624 : : is exited. This is because we eventually insert
10625 : : invariants at region begin. */
10626 : 19204784 : else if (!bbs.is_empty ()
10627 : 8967339 : && bbs[0]->loop_father != bb->loop_father
10628 : 2261280 : && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
10629 : : {
10630 : 4455 : if (dump_enabled_p ())
10631 : 6 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10632 : : "splitting region at loop %d exit at bb%d\n",
10633 : 3 : bbs[0]->loop_father->num, bb->index);
10634 : : split = true;
10635 : : }
10636 : 10232990 : else if (!bbs.is_empty ()
10637 : 8962884 : && bb->loop_father->header == bb
10638 : 484287 : && bb->loop_father->dont_vectorize)
10639 : : {
10640 : 7128 : if (dump_enabled_p ())
10641 : 72 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10642 : : "splitting region at dont-vectorize loop %d "
10643 : : "entry at bb%d\n",
10644 : : bb->loop_father->num, bb->index);
10645 : : split = true;
10646 : : }
10647 : :
10648 : 11801786 : if (split && !bbs.is_empty ())
10649 : : {
10650 : 787962 : r |= vect_slp_bbs (bbs, NULL);
10651 : 787962 : bbs.truncate (0);
10652 : : }
10653 : :
10654 : 11013824 : if (bbs.is_empty ())
10655 : : {
10656 : : /* We need to be able to insert at the head of the region which
10657 : : we cannot for region starting with a returns-twice call. */
10658 : 2058068 : if (gcall *first = safe_dyn_cast <gcall *> (first_stmt (bb)))
10659 : 405722 : if (gimple_call_flags (first) & ECF_RETURNS_TWICE)
10660 : : {
10661 : 294 : if (dump_enabled_p ())
10662 : 2 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10663 : : "skipping bb%d as start of region as it "
10664 : : "starts with returns-twice call\n",
10665 : : bb->index);
10666 : 30181 : continue;
10667 : : }
10668 : : /* If the loop this BB belongs to is marked as not to be vectorized
10669 : : honor that also for BB vectorization. */
10670 : 2057774 : if (bb->loop_father->dont_vectorize)
10671 : 29887 : continue;
10672 : : }
10673 : :
10674 : 10983643 : bbs.safe_push (bb);
10675 : :
10676 : : /* When we have a stmt ending this block and defining a
10677 : : value we have to insert on edges when inserting after it for
10678 : : a vector containing its definition. Avoid this for now. */
10679 : 21967286 : if (gimple *last = *gsi_last_bb (bb))
10680 : 8904351 : if (gimple_get_lhs (last)
10681 : 8904351 : && is_ctrl_altering_stmt (last))
10682 : : {
10683 : 329459 : if (dump_enabled_p ())
10684 : 2 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10685 : : "splitting region at control altering "
10686 : : "definition %G", last);
10687 : 329459 : r |= vect_slp_bbs (bbs, NULL);
10688 : 329459 : bbs.truncate (0);
10689 : : }
10690 : : }
10691 : :
10692 : 910473 : if (!bbs.is_empty ())
10693 : 910466 : r |= vect_slp_bbs (bbs, NULL);
10694 : :
10695 : 910473 : free (rpo);
10696 : :
10697 : 910473 : return r;
10698 : 910473 : }
10699 : :
10700 : : /* Build a variable-length vector in which the elements in ELTS are repeated
10701 : : to a fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
10702 : : RESULTS and add any new instructions to SEQ.
10703 : :
10704 : : The approach we use is:
10705 : :
10706 : : (1) Find a vector mode VM with integer elements of mode IM.
10707 : :
10708 : : (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
10709 : : ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
10710 : : from small vectors to IM.
10711 : :
10712 : : (3) Duplicate each ELTS'[I] into a vector of mode VM.
10713 : :
10714 : : (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
10715 : : correct byte contents.
10716 : :
10717 : : (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
10718 : :
10719 : : We try to find the largest IM for which this sequence works, in order
10720 : : to cut down on the number of interleaves. */
10721 : :
10722 : : void
10723 : 0 : duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
10724 : : const vec<tree> &elts, unsigned int nresults,
10725 : : vec<tree> &results)
10726 : : {
10727 : 0 : unsigned int nelts = elts.length ();
10728 : 0 : tree element_type = TREE_TYPE (vector_type);
10729 : :
10730 : : /* (1) Find a vector mode VM with integer elements of mode IM. */
10731 : 0 : unsigned int nvectors = 1;
10732 : 0 : tree new_vector_type;
10733 : 0 : tree permutes[2];
10734 : 0 : if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
10735 : : &nvectors, &new_vector_type,
10736 : : permutes))
10737 : 0 : gcc_unreachable ();
10738 : :
10739 : : /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
10740 : 0 : unsigned int partial_nelts = nelts / nvectors;
10741 : 0 : tree partial_vector_type = build_vector_type (element_type, partial_nelts);
10742 : :
10743 : 0 : tree_vector_builder partial_elts;
10744 : 0 : auto_vec<tree, 32> pieces (nvectors * 2);
10745 : 0 : pieces.quick_grow_cleared (nvectors * 2);
10746 : 0 : for (unsigned int i = 0; i < nvectors; ++i)
10747 : : {
10748 : : /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
10749 : : ELTS' has mode IM. */
10750 : 0 : partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
10751 : 0 : for (unsigned int j = 0; j < partial_nelts; ++j)
10752 : 0 : partial_elts.quick_push (elts[i * partial_nelts + j]);
10753 : 0 : tree t = gimple_build_vector (seq, &partial_elts);
10754 : 0 : t = gimple_build (seq, VIEW_CONVERT_EXPR,
10755 : 0 : TREE_TYPE (new_vector_type), t);
10756 : :
10757 : : /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
10758 : 0 : pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
10759 : : }
10760 : :
10761 : : /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
10762 : : correct byte contents.
10763 : :
10764 : : Conceptually, we need to repeat the following operation log2(nvectors)
10765 : : times, where hi_start = nvectors / 2:
10766 : :
10767 : : out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
10768 : : out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
10769 : :
10770 : : However, if each input repeats every N elements and the VF is
10771 : : a multiple of N * 2, the HI result is the same as the LO result.
10772 : : This will be true for the first N1 iterations of the outer loop,
10773 : : followed by N2 iterations for which both the LO and HI results
10774 : : are needed. I.e.:
10775 : :
10776 : : N1 + N2 = log2(nvectors)
10777 : :
10778 : : Each "N1 iteration" doubles the number of redundant vectors and the
10779 : : effect of the process as a whole is to have a sequence of nvectors/2**N1
10780 : : vectors that repeats 2**N1 times. Rather than generate these redundant
10781 : : vectors, we halve the number of vectors for each N1 iteration. */
10782 : : unsigned int in_start = 0;
10783 : : unsigned int out_start = nvectors;
10784 : : unsigned int new_nvectors = nvectors;
10785 : 0 : for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
10786 : : {
10787 : 0 : unsigned int hi_start = new_nvectors / 2;
10788 : 0 : unsigned int out_i = 0;
10789 : 0 : for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i)
10790 : : {
10791 : 0 : if ((in_i & 1) != 0
10792 : 0 : && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
10793 : : 2 * in_repeat))
10794 : 0 : continue;
10795 : :
10796 : 0 : tree output = make_ssa_name (new_vector_type);
10797 : 0 : tree input1 = pieces[in_start + (in_i / 2)];
10798 : 0 : tree input2 = pieces[in_start + (in_i / 2) + hi_start];
10799 : 0 : gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
10800 : : input1, input2,
10801 : : permutes[in_i & 1]);
10802 : 0 : gimple_seq_add_stmt (seq, stmt);
10803 : 0 : pieces[out_start + out_i] = output;
10804 : 0 : out_i += 1;
10805 : : }
10806 : 0 : std::swap (in_start, out_start);
10807 : 0 : new_nvectors = out_i;
10808 : : }
10809 : :
10810 : : /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
10811 : 0 : results.reserve (nresults);
10812 : 0 : for (unsigned int i = 0; i < nresults; ++i)
10813 : 0 : if (i < new_nvectors)
10814 : 0 : results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
10815 : 0 : pieces[in_start + i]));
10816 : : else
10817 : 0 : results.quick_push (results[i - new_nvectors]);
10818 : 0 : }
10819 : :
10820 : :
10821 : : /* For constant and loop invariant defs in OP_NODE this function creates
10822 : : vector defs that will be used in the vectorized stmts and stores them
10823 : : to SLP_TREE_VEC_DEFS of OP_NODE. */
10824 : :
10825 : : static void
10826 : 494844 : vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
10827 : : {
10828 : 494844 : unsigned HOST_WIDE_INT nunits;
10829 : 494844 : tree vec_cst;
10830 : 494844 : unsigned j, number_of_places_left_in_vector;
10831 : 494844 : tree vector_type;
10832 : 494844 : tree vop;
10833 : 494844 : int group_size = op_node->ops.length ();
10834 : 494844 : unsigned int vec_num, i;
10835 : 494844 : unsigned number_of_copies = 1;
10836 : 494844 : bool constant_p;
10837 : 494844 : gimple_seq ctor_seq = NULL;
10838 : 494844 : auto_vec<tree, 16> permute_results;
10839 : :
10840 : : /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
10841 : 494844 : vector_type = SLP_TREE_VECTYPE (op_node);
10842 : :
10843 : 494844 : unsigned int number_of_vectors = vect_get_num_copies (vinfo, op_node);
10844 : 494844 : SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
10845 : 494844 : auto_vec<tree> voprnds (number_of_vectors);
10846 : :
10847 : : /* NUMBER_OF_COPIES is the number of times we need to use the same values in
10848 : : created vectors. It is greater than 1 if unrolling is performed.
10849 : :
10850 : : For example, we have two scalar operands, s1 and s2 (e.g., group of
10851 : : strided accesses of size two), while NUNITS is four (i.e., four scalars
10852 : : of this type can be packed in a vector). The output vector will contain
10853 : : two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
10854 : : will be 2).
10855 : :
10856 : : If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
10857 : : containing the operands.
10858 : :
10859 : : For example, NUNITS is four as before, and the group size is 8
10860 : : (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
10861 : : {s5, s6, s7, s8}. */
10862 : :
10863 : : /* When using duplicate_and_interleave, we just need one element for
10864 : : each scalar statement. */
10865 : 494844 : if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
10866 : : nunits = group_size;
10867 : :
10868 : 494844 : number_of_copies = nunits * number_of_vectors / group_size;
10869 : :
10870 : 494844 : number_of_places_left_in_vector = nunits;
10871 : 494844 : constant_p = true;
10872 : 494844 : tree uniform_elt = NULL_TREE;
10873 : 494844 : tree_vector_builder elts (vector_type, nunits, 1);
10874 : 494844 : elts.quick_grow (nunits);
10875 : 494844 : stmt_vec_info insert_after = NULL;
10876 : 1496182 : for (j = 0; j < number_of_copies; j++)
10877 : : {
10878 : 1001338 : tree op;
10879 : 3820192 : for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
10880 : : {
10881 : : /* Create 'vect_ = {op0,op1,...,opn}'. */
10882 : 1817516 : tree orig_op = op;
10883 : 1817516 : if (number_of_places_left_in_vector == nunits)
10884 : : uniform_elt = op;
10885 : 1191064 : else if (uniform_elt && operand_equal_p (uniform_elt, op))
10886 : 763862 : op = elts[number_of_places_left_in_vector];
10887 : : else
10888 : : uniform_elt = NULL_TREE;
10889 : 1817516 : number_of_places_left_in_vector--;
10890 : 1817516 : if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
10891 : : {
10892 : 281171 : if (CONSTANT_CLASS_P (op))
10893 : : {
10894 : 102481 : if (VECTOR_BOOLEAN_TYPE_P (vector_type))
10895 : : {
10896 : : /* Can't use VIEW_CONVERT_EXPR for booleans because
10897 : : of possibly different sizes of scalar value and
10898 : : vector element. */
10899 : 53 : if (integer_zerop (op))
10900 : 53 : op = build_int_cst (TREE_TYPE (vector_type), 0);
10901 : 0 : else if (integer_onep (op))
10902 : 0 : op = build_all_ones_cst (TREE_TYPE (vector_type));
10903 : : else
10904 : 0 : gcc_unreachable ();
10905 : : }
10906 : : else
10907 : 102428 : op = fold_unary (VIEW_CONVERT_EXPR,
10908 : : TREE_TYPE (vector_type), op);
10909 : 102481 : gcc_assert (op && CONSTANT_CLASS_P (op));
10910 : : }
10911 : : else
10912 : : {
10913 : 178690 : tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
10914 : 178690 : gimple *init_stmt;
10915 : 178690 : if (VECTOR_BOOLEAN_TYPE_P (vector_type))
10916 : : {
10917 : 403 : tree true_val
10918 : 403 : = build_all_ones_cst (TREE_TYPE (vector_type));
10919 : 403 : tree false_val
10920 : 403 : = build_zero_cst (TREE_TYPE (vector_type));
10921 : 403 : gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
10922 : 403 : init_stmt = gimple_build_assign (new_temp, COND_EXPR,
10923 : : op, true_val,
10924 : : false_val);
10925 : : }
10926 : : else
10927 : : {
10928 : 178287 : op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
10929 : : op);
10930 : 178287 : init_stmt
10931 : 178287 : = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
10932 : : op);
10933 : : }
10934 : 178690 : gimple_seq_add_stmt (&ctor_seq, init_stmt);
10935 : 178690 : op = new_temp;
10936 : : }
10937 : : }
10938 : 1817516 : elts[number_of_places_left_in_vector] = op;
10939 : 1817516 : if (!CONSTANT_CLASS_P (op))
10940 : 325439 : constant_p = false;
10941 : : /* For BB vectorization we have to compute an insert location
10942 : : when a def is inside the analyzed region since we cannot
10943 : : simply insert at the BB start in this case. */
10944 : 1817516 : stmt_vec_info opdef;
10945 : 1817516 : if (TREE_CODE (orig_op) == SSA_NAME
10946 : 188256 : && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
10947 : 168628 : && is_a <bb_vec_info> (vinfo)
10948 : 1930261 : && (opdef = vinfo->lookup_def (orig_op)))
10949 : : {
10950 : 90164 : if (!insert_after)
10951 : : insert_after = opdef;
10952 : : else
10953 : 49718 : insert_after = get_later_stmt (insert_after, opdef);
10954 : : }
10955 : :
10956 : 1817516 : if (number_of_places_left_in_vector == 0)
10957 : : {
10958 : 626452 : auto type_nunits = TYPE_VECTOR_SUBPARTS (vector_type);
10959 : 626452 : if (uniform_elt)
10960 : 655158 : vec_cst = gimple_build_vector_from_val (&ctor_seq, vector_type,
10961 : 327579 : elts[0]);
10962 : 597746 : else if (constant_p
10963 : 597746 : ? multiple_p (type_nunits, nunits)
10964 : 114102 : : known_eq (type_nunits, nunits))
10965 : 298873 : vec_cst = gimple_build_vector (&ctor_seq, &elts);
10966 : : else
10967 : : {
10968 : 0 : if (permute_results.is_empty ())
10969 : 0 : duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
10970 : : elts, number_of_vectors,
10971 : : permute_results);
10972 : 0 : vec_cst = permute_results[number_of_vectors - j - 1];
10973 : : }
10974 : 626452 : if (!gimple_seq_empty_p (ctor_seq))
10975 : : {
10976 : 141186 : if (insert_after)
10977 : : {
10978 : 40446 : gimple_stmt_iterator gsi;
10979 : 40446 : if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
10980 : : {
10981 : 773 : gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
10982 : 773 : gsi_insert_seq_before (&gsi, ctor_seq,
10983 : : GSI_CONTINUE_LINKING);
10984 : : }
10985 : 39673 : else if (!stmt_ends_bb_p (insert_after->stmt))
10986 : : {
10987 : 39673 : gsi = gsi_for_stmt (insert_after->stmt);
10988 : 39673 : gsi_insert_seq_after (&gsi, ctor_seq,
10989 : : GSI_CONTINUE_LINKING);
10990 : : }
10991 : : else
10992 : : {
10993 : : /* When we want to insert after a def where the
10994 : : defining stmt throws then insert on the fallthru
10995 : : edge. */
10996 : 0 : edge e = find_fallthru_edge
10997 : 0 : (gimple_bb (insert_after->stmt)->succs);
10998 : 0 : basic_block new_bb
10999 : 0 : = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
11000 : 0 : gcc_assert (!new_bb);
11001 : : }
11002 : : }
11003 : : else
11004 : 100740 : vinfo->insert_seq_on_entry (NULL, ctor_seq);
11005 : 141186 : ctor_seq = NULL;
11006 : : }
11007 : 626452 : voprnds.quick_push (vec_cst);
11008 : 626452 : insert_after = NULL;
11009 : 626452 : number_of_places_left_in_vector = nunits;
11010 : 626452 : constant_p = true;
11011 : 626452 : elts.new_vector (vector_type, nunits, 1);
11012 : 626452 : elts.quick_grow (nunits);
11013 : : }
11014 : : }
11015 : : }
11016 : :
11017 : : /* Since the vectors are created in the reverse order, we should invert
11018 : : them. */
11019 : 494844 : vec_num = voprnds.length ();
11020 : 1121296 : for (j = vec_num; j != 0; j--)
11021 : : {
11022 : 626452 : vop = voprnds[j - 1];
11023 : 626452 : SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
11024 : : }
11025 : :
11026 : : /* In case that VF is greater than the unrolling factor needed for the SLP
11027 : : group of stmts, NUMBER_OF_VECTORS to be created is greater than
11028 : : NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
11029 : : to replicate the vectors. */
11030 : 494844 : while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
11031 : 494844 : for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
11032 : : i++)
11033 : 0 : SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
11034 : 494844 : }
11035 : :
11036 : : /* Get the scalar definition of the Nth lane from SLP_NODE or NULL_TREE
11037 : : if there is no definition for it in the scalar IL or it is not known. */
11038 : :
11039 : : tree
11040 : 1833 : vect_get_slp_scalar_def (slp_tree slp_node, unsigned n)
11041 : : {
11042 : 1833 : if (SLP_TREE_DEF_TYPE (slp_node) == vect_internal_def)
11043 : : {
11044 : 1823 : if (!SLP_TREE_SCALAR_STMTS (slp_node).exists ())
11045 : : return NULL_TREE;
11046 : 1823 : stmt_vec_info def = SLP_TREE_SCALAR_STMTS (slp_node)[n];
11047 : 1823 : if (!def)
11048 : : return NULL_TREE;
11049 : 1823 : return gimple_get_lhs (STMT_VINFO_STMT (def));
11050 : : }
11051 : : else
11052 : 10 : return SLP_TREE_SCALAR_OPS (slp_node)[n];
11053 : : }
11054 : :
11055 : : /* Get the Ith vectorized definition from SLP_NODE. */
11056 : :
11057 : : tree
11058 : 139979 : vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
11059 : : {
11060 : 139979 : return SLP_TREE_VEC_DEFS (slp_node)[i];
11061 : : }
11062 : :
11063 : : /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
11064 : :
11065 : : void
11066 : 928886 : vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
11067 : : {
11068 : 1857772 : vec_defs->create (SLP_TREE_VEC_DEFS (slp_node).length ());
11069 : 928886 : vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
11070 : 928886 : }
11071 : :
11072 : : /* Get N vectorized definitions for SLP_NODE. */
11073 : :
11074 : : void
11075 : 2935 : vect_get_slp_defs (vec_info *,
11076 : : slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
11077 : : {
11078 : 2935 : if (n == -1U)
11079 : 2935 : n = SLP_TREE_CHILDREN (slp_node).length ();
11080 : :
11081 : 10371 : for (unsigned i = 0; i < n; ++i)
11082 : : {
11083 : 7436 : slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
11084 : 7436 : vec<tree> vec_defs = vNULL;
11085 : 7436 : vect_get_slp_defs (child, &vec_defs);
11086 : 7436 : vec_oprnds->quick_push (vec_defs);
11087 : : }
11088 : 2935 : }
11089 : :
11090 : : /* A subroutine of vect_transform_slp_perm_load with two extra arguments:
11091 : : - PERM gives the permutation that the caller wants to use for NODE,
11092 : : which might be different from SLP_LOAD_PERMUTATION.
11093 : : - DUMP_P controls whether the function dumps information. */
11094 : :
11095 : : static bool
11096 : 111495 : vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node,
11097 : : load_permutation_t &perm,
11098 : : const vec<tree> &dr_chain,
11099 : : gimple_stmt_iterator *gsi, poly_uint64 vf,
11100 : : bool analyze_only, bool dump_p,
11101 : : unsigned *n_perms, unsigned int *n_loads,
11102 : : bool dce_chain)
11103 : : {
11104 : 111495 : stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
11105 : 111495 : int vec_index = 0;
11106 : 111495 : tree vectype = SLP_TREE_VECTYPE (node);
11107 : 111495 : unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
11108 : 111495 : unsigned int mask_element;
11109 : 111495 : unsigned dr_group_size;
11110 : 111495 : machine_mode mode;
11111 : :
11112 : 111495 : if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
11113 : : {
11114 : : /* We have both splats of the same non-grouped load and groups
11115 : : of distinct invariant loads entering here. */
11116 : 1005 : unsigned max_idx = 0;
11117 : 5577 : for (auto idx : perm)
11118 : 2562 : max_idx = idx > max_idx ? idx : max_idx;
11119 : 1005 : dr_group_size = max_idx + 1;
11120 : : }
11121 : : else
11122 : : {
11123 : 110490 : stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
11124 : 110490 : dr_group_size = DR_GROUP_SIZE (stmt_info);
11125 : : }
11126 : :
11127 : 111495 : mode = TYPE_MODE (vectype);
11128 : 111495 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
11129 : 111495 : unsigned int nstmts = vect_get_num_copies (vinfo, node);
11130 : :
11131 : : /* Initialize the vect stmts of NODE to properly insert the generated
11132 : : stmts later. */
11133 : 111495 : if (! analyze_only)
11134 : 52194 : for (unsigned i = SLP_TREE_VEC_DEFS (node).length (); i < nstmts; i++)
11135 : 19982 : SLP_TREE_VEC_DEFS (node).quick_push (NULL_TREE);
11136 : :
11137 : : /* Generate permutation masks for every NODE. Number of masks for each NODE
11138 : : is equal to GROUP_SIZE.
11139 : : E.g., we have a group of three nodes with three loads from the same
11140 : : location in each node, and the vector size is 4. I.e., we have a
11141 : : a0b0c0a1b1c1... sequence and we need to create the following vectors:
11142 : : for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
11143 : : for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
11144 : : ...
11145 : :
11146 : : The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
11147 : : The last mask is illegal since we assume two operands for permute
11148 : : operation, and the mask element values can't be outside that range.
11149 : : Hence, the last mask must be converted into {2,5,5,5}.
11150 : : For the first two permutations we need the first and the second input
11151 : : vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
11152 : : we need the second and the third vectors: {b1,c1,a2,b2} and
11153 : : {c2,a3,b3,c3}. */
11154 : :
11155 : 111495 : int vect_stmts_counter = 0;
11156 : 111495 : unsigned int index = 0;
11157 : 111495 : int first_vec_index = -1;
11158 : 111495 : int second_vec_index = -1;
11159 : 111495 : bool noop_p = true;
11160 : 111495 : *n_perms = 0;
11161 : :
11162 : 111495 : vec_perm_builder mask;
11163 : 111495 : unsigned int nelts_to_build;
11164 : 111495 : unsigned int nvectors_per_build;
11165 : 111495 : unsigned int in_nlanes;
11166 : 111495 : bool repeating_p = (group_size == dr_group_size
11167 : 142726 : && multiple_p (nunits, group_size));
11168 : 111495 : if (repeating_p)
11169 : : {
11170 : : /* A single vector contains a whole number of copies of the node, so:
11171 : : (a) all permutes can use the same mask; and
11172 : : (b) the permutes only need a single vector input. */
11173 : 29091 : mask.new_vector (nunits, group_size, 3);
11174 : 29091 : nelts_to_build = mask.encoded_nelts ();
11175 : : /* It's possible to obtain zero nstmts during analyze_only, so make
11176 : : it at least one to ensure the later computation for n_perms
11177 : : proceed. */
11178 : 29091 : nvectors_per_build = nstmts > 0 ? nstmts : 1;
11179 : 29091 : in_nlanes = dr_group_size * 3;
11180 : : }
11181 : : else
11182 : : {
11183 : : /* We need to construct a separate mask for each vector statement. */
11184 : 82404 : unsigned HOST_WIDE_INT const_nunits, const_vf;
11185 : 82404 : if (!nunits.is_constant (&const_nunits)
11186 : 82404 : || !vf.is_constant (&const_vf))
11187 : : return false;
11188 : 82404 : mask.new_vector (const_nunits, const_nunits, 1);
11189 : 82404 : nelts_to_build = const_vf * group_size;
11190 : 82404 : nvectors_per_build = 1;
11191 : 82404 : in_nlanes = const_vf * dr_group_size;
11192 : : }
11193 : 111495 : auto_sbitmap used_in_lanes (in_nlanes);
11194 : 111495 : bitmap_clear (used_in_lanes);
11195 : 111495 : auto_bitmap used_defs;
11196 : :
11197 : 111495 : unsigned int count = mask.encoded_nelts ();
11198 : 111495 : mask.quick_grow (count);
11199 : 111495 : vec_perm_indices indices;
11200 : :
11201 : 602992 : for (unsigned int j = 0; j < nelts_to_build; j++)
11202 : : {
11203 : 501152 : unsigned int iter_num = j / group_size;
11204 : 501152 : unsigned int stmt_num = j % group_size;
11205 : 501152 : unsigned int i = (iter_num * dr_group_size + perm[stmt_num]);
11206 : 501152 : bitmap_set_bit (used_in_lanes, i);
11207 : 501152 : if (repeating_p)
11208 : : {
11209 : : first_vec_index = 0;
11210 : : mask_element = i;
11211 : : }
11212 : : else
11213 : : {
11214 : : /* Enforced before the loop when !repeating_p. */
11215 : 313544 : unsigned int const_nunits = nunits.to_constant ();
11216 : 313544 : vec_index = i / const_nunits;
11217 : 313544 : mask_element = i % const_nunits;
11218 : 313544 : if (vec_index == first_vec_index
11219 : 313544 : || first_vec_index == -1)
11220 : : {
11221 : : first_vec_index = vec_index;
11222 : : }
11223 : 126359 : else if (vec_index == second_vec_index
11224 : 126359 : || second_vec_index == -1)
11225 : : {
11226 : 120042 : second_vec_index = vec_index;
11227 : 120042 : mask_element += const_nunits;
11228 : : }
11229 : : else
11230 : : {
11231 : 6317 : if (dump_p)
11232 : 229 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11233 : : "permutation requires at "
11234 : : "least three vectors %G",
11235 : : stmt_info->stmt);
11236 : 6317 : gcc_assert (analyze_only);
11237 : : return false;
11238 : : }
11239 : :
11240 : 307227 : gcc_assert (mask_element < 2 * const_nunits);
11241 : : }
11242 : :
11243 : 494835 : if (mask_element != index)
11244 : 321476 : noop_p = false;
11245 : 494835 : mask[index++] = mask_element;
11246 : :
11247 : 494835 : if (index == count)
11248 : : {
11249 : 128878 : if (!noop_p)
11250 : : {
11251 : 179376 : indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
11252 : 106919 : if (!can_vec_perm_const_p (mode, mode, indices))
11253 : : {
11254 : 3338 : if (dump_p)
11255 : : {
11256 : 73 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11257 : : "unsupported vect permute { ");
11258 : 607 : for (i = 0; i < count; ++i)
11259 : : {
11260 : 534 : dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
11261 : 534 : dump_printf (MSG_MISSED_OPTIMIZATION, " ");
11262 : : }
11263 : 73 : dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
11264 : : }
11265 : 3338 : gcc_assert (analyze_only);
11266 : : return false;
11267 : : }
11268 : :
11269 : 103581 : tree mask_vec = NULL_TREE;
11270 : 103581 : if (!analyze_only)
11271 : 18601 : mask_vec = vect_gen_perm_mask_checked (vectype, indices);
11272 : :
11273 : 103581 : if (second_vec_index == -1)
11274 : 32611 : second_vec_index = first_vec_index;
11275 : :
11276 : 209162 : for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
11277 : : {
11278 : 105581 : ++*n_perms;
11279 : 105581 : if (analyze_only)
11280 : 86697 : continue;
11281 : : /* Generate the permute statement if necessary. */
11282 : 18884 : tree first_vec = dr_chain[first_vec_index + ri];
11283 : 18884 : tree second_vec = dr_chain[second_vec_index + ri];
11284 : 18884 : gassign *stmt = as_a<gassign *> (stmt_info->stmt);
11285 : 18884 : tree perm_dest
11286 : 18884 : = vect_create_destination_var (gimple_assign_lhs (stmt),
11287 : : vectype);
11288 : 18884 : perm_dest = make_ssa_name (perm_dest);
11289 : 18884 : gimple *perm_stmt
11290 : 18884 : = gimple_build_assign (perm_dest, VEC_PERM_EXPR, first_vec,
11291 : : second_vec, mask_vec);
11292 : 18884 : vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
11293 : : gsi);
11294 : 18884 : if (dce_chain)
11295 : : {
11296 : 18107 : bitmap_set_bit (used_defs, first_vec_index + ri);
11297 : 18107 : bitmap_set_bit (used_defs, second_vec_index + ri);
11298 : : }
11299 : :
11300 : : /* Store the vector statement in NODE. */
11301 : 18884 : SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = perm_dest;
11302 : : }
11303 : : }
11304 : 21959 : else if (!analyze_only)
11305 : : {
11306 : 2196 : for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
11307 : : {
11308 : 1098 : tree first_vec = dr_chain[first_vec_index + ri];
11309 : : /* If mask was NULL_TREE generate the requested
11310 : : identity transform. */
11311 : 1098 : if (dce_chain)
11312 : 1097 : bitmap_set_bit (used_defs, first_vec_index + ri);
11313 : :
11314 : : /* Store the vector statement in NODE. */
11315 : 1098 : SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = first_vec;
11316 : : }
11317 : : }
11318 : :
11319 : : index = 0;
11320 : : first_vec_index = -1;
11321 : : second_vec_index = -1;
11322 : : noop_p = true;
11323 : : }
11324 : : }
11325 : :
11326 : 101840 : if (n_loads)
11327 : : {
11328 : 0 : if (repeating_p)
11329 : 0 : *n_loads = nstmts;
11330 : : else
11331 : : {
11332 : : /* Enforced above when !repeating_p. */
11333 : 0 : unsigned int const_nunits = nunits.to_constant ();
11334 : 0 : *n_loads = 0;
11335 : 0 : bool load_seen = false;
11336 : 0 : for (unsigned i = 0; i < in_nlanes; ++i)
11337 : : {
11338 : 0 : if (i % const_nunits == 0)
11339 : : {
11340 : 0 : if (load_seen)
11341 : 0 : *n_loads += 1;
11342 : : load_seen = false;
11343 : : }
11344 : 0 : if (bitmap_bit_p (used_in_lanes, i))
11345 : 0 : load_seen = true;
11346 : : }
11347 : 0 : if (load_seen)
11348 : 0 : *n_loads += 1;
11349 : : }
11350 : : }
11351 : :
11352 : 101840 : if (dce_chain)
11353 : 158569 : for (unsigned i = 0; i < dr_chain.length (); ++i)
11354 : 31597 : if (!bitmap_bit_p (used_defs, i))
11355 : : {
11356 : 2227 : tree def = dr_chain[i];
11357 : 2286 : do
11358 : : {
11359 : 2286 : gimple *stmt = SSA_NAME_DEF_STMT (def);
11360 : 2286 : if (is_gimple_assign (stmt)
11361 : 2286 : && (gimple_assign_rhs_code (stmt) == VIEW_CONVERT_EXPR
11362 : 2286 : || gimple_assign_rhs_code (stmt) == CONSTRUCTOR))
11363 : 397 : def = single_ssa_tree_operand (stmt, SSA_OP_USE);
11364 : : else
11365 : : def = NULL;
11366 : 2286 : gimple_stmt_iterator rgsi = gsi_for_stmt (stmt);
11367 : 2286 : gsi_remove (&rgsi, true);
11368 : 2286 : release_defs (stmt);
11369 : : }
11370 : 2286 : while (def);
11371 : : }
11372 : :
11373 : : return true;
11374 : 111495 : }
11375 : :
11376 : : /* Generate vector permute statements from a list of loads in DR_CHAIN.
11377 : : If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
11378 : : permute statements for the SLP node NODE. Store the number of vector
11379 : : permute instructions in *N_PERMS and the number of vector load
11380 : : instructions in *N_LOADS. If DCE_CHAIN is true, remove all definitions
11381 : : that were not needed. */
11382 : :
11383 : : bool
11384 : 76867 : vect_transform_slp_perm_load (vec_info *vinfo,
11385 : : slp_tree node, const vec<tree> &dr_chain,
11386 : : gimple_stmt_iterator *gsi, poly_uint64 vf,
11387 : : bool analyze_only, unsigned *n_perms,
11388 : : unsigned int *n_loads, bool dce_chain)
11389 : : {
11390 : 76867 : return vect_transform_slp_perm_load_1 (vinfo, node,
11391 : 76867 : SLP_TREE_LOAD_PERMUTATION (node),
11392 : : dr_chain, gsi, vf, analyze_only,
11393 : : dump_enabled_p (), n_perms, n_loads,
11394 : 76867 : dce_chain);
11395 : : }
11396 : :
11397 : : /* Produce the next vector result for SLP permutation NODE by adding a vector
11398 : : statement at GSI. If MASK_VEC is nonnull, add:
11399 : :
11400 : : <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC>
11401 : :
11402 : : otherwise add:
11403 : :
11404 : : <new SSA name> = FIRST_DEF. */
11405 : :
11406 : : static void
11407 : 30200 : vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
11408 : : slp_tree node, tree first_def, tree second_def,
11409 : : tree mask_vec, poly_uint64 identity_offset)
11410 : : {
11411 : 30200 : tree vectype = SLP_TREE_VECTYPE (node);
11412 : :
11413 : : /* ??? We SLP match existing vector element extracts but
11414 : : allow punning which we need to re-instantiate at uses
11415 : : but have no good way of explicitly representing. */
11416 : 30200 : if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)), TYPE_SIZE (vectype))
11417 : 30200 : && !types_compatible_p (TREE_TYPE (first_def), vectype))
11418 : : {
11419 : 15 : gassign *conv_stmt
11420 : 15 : = gimple_build_assign (make_ssa_name (vectype),
11421 : : build1 (VIEW_CONVERT_EXPR, vectype, first_def));
11422 : 15 : vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
11423 : 15 : first_def = gimple_assign_lhs (conv_stmt);
11424 : : }
11425 : 30200 : gassign *perm_stmt;
11426 : 30200 : tree perm_dest = make_ssa_name (vectype);
11427 : 30200 : if (mask_vec)
11428 : : {
11429 : 27213 : if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)),
11430 : 27213 : TYPE_SIZE (vectype))
11431 : 27213 : && !types_compatible_p (TREE_TYPE (second_def), vectype))
11432 : : {
11433 : 8 : gassign *conv_stmt
11434 : 8 : = gimple_build_assign (make_ssa_name (vectype),
11435 : : build1 (VIEW_CONVERT_EXPR,
11436 : : vectype, second_def));
11437 : 8 : vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
11438 : 8 : second_def = gimple_assign_lhs (conv_stmt);
11439 : : }
11440 : 27213 : perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
11441 : : first_def, second_def,
11442 : : mask_vec);
11443 : : }
11444 : 2987 : else if (!types_compatible_p (TREE_TYPE (first_def), vectype))
11445 : : {
11446 : : /* For identity permutes we still need to handle the case
11447 : : of offsetted extracts or concats. */
11448 : 191 : unsigned HOST_WIDE_INT c;
11449 : 191 : auto first_def_nunits
11450 : 191 : = TYPE_VECTOR_SUBPARTS (TREE_TYPE (first_def));
11451 : 191 : if (known_le (TYPE_VECTOR_SUBPARTS (vectype), first_def_nunits))
11452 : : {
11453 : 187 : unsigned HOST_WIDE_INT elsz
11454 : 187 : = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (first_def))));
11455 : 374 : tree lowpart = build3 (BIT_FIELD_REF, vectype, first_def,
11456 : 187 : TYPE_SIZE (vectype),
11457 : 187 : bitsize_int (identity_offset * elsz));
11458 : 187 : perm_stmt = gimple_build_assign (perm_dest, lowpart);
11459 : : }
11460 : 4 : else if (constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype),
11461 : 4 : first_def_nunits, &c) && c == 2)
11462 : : {
11463 : 4 : tree ctor = build_constructor_va (vectype, 2, NULL_TREE, first_def,
11464 : : NULL_TREE, second_def);
11465 : 4 : perm_stmt = gimple_build_assign (perm_dest, ctor);
11466 : : }
11467 : : else
11468 : 0 : gcc_unreachable ();
11469 : : }
11470 : : else
11471 : : {
11472 : : /* We need a copy here in case the def was external. */
11473 : 2796 : perm_stmt = gimple_build_assign (perm_dest, first_def);
11474 : : }
11475 : 30200 : vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
11476 : : /* Store the vector statement in NODE. */
11477 : 30200 : node->push_vec_def (perm_stmt);
11478 : 30200 : }
11479 : :
11480 : : /* Subroutine of vectorizable_slp_permutation. Check whether the target
11481 : : can perform permutation PERM on the (1 or 2) input nodes in CHILDREN.
11482 : : If GSI is nonnull, emit the permutation there.
11483 : :
11484 : : When GSI is null, the only purpose of NODE is to give properties
11485 : : of the result, such as the vector type and number of SLP lanes.
11486 : : The node does not need to be a VEC_PERM_EXPR.
11487 : :
11488 : : If the target supports the operation, return the number of individual
11489 : : VEC_PERM_EXPRs needed, otherwise return -1. Print information to the
11490 : : dump file if DUMP_P is true. */
11491 : :
11492 : : static int
11493 : 433410 : vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
11494 : : slp_tree node, lane_permutation_t &perm,
11495 : : vec<slp_tree> &children, bool dump_p)
11496 : : {
11497 : 433410 : tree vectype = SLP_TREE_VECTYPE (node);
11498 : :
11499 : : /* ??? We currently only support all same vector input types
11500 : : while the SLP IL should really do a concat + select and thus accept
11501 : : arbitrary mismatches. */
11502 : 433410 : slp_tree child;
11503 : 433410 : unsigned i;
11504 : 433410 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
11505 : 433410 : bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node));
11506 : : /* True if we're permuting a single input of 2N vectors down
11507 : : to N vectors. This case doesn't generalize beyond 2 since
11508 : : VEC_PERM_EXPR only takes 2 inputs. */
11509 : 433410 : bool pack_p = false;
11510 : : /* If we're permuting inputs of N vectors each into X*N outputs,
11511 : : this is the value of X, otherwise it is 1. */
11512 : 433410 : unsigned int unpack_factor = 1;
11513 : 433410 : tree op_vectype = NULL_TREE;
11514 : 434581 : FOR_EACH_VEC_ELT (children, i, child)
11515 : 434502 : if (SLP_TREE_VECTYPE (child))
11516 : : {
11517 : : op_vectype = SLP_TREE_VECTYPE (child);
11518 : : break;
11519 : : }
11520 : 433410 : if (!op_vectype)
11521 : 79 : op_vectype = vectype;
11522 : 925640 : FOR_EACH_VEC_ELT (children, i, child)
11523 : : {
11524 : 492230 : if ((SLP_TREE_DEF_TYPE (child) != vect_internal_def
11525 : 9491 : && !vect_maybe_update_slp_op_vectype (child, op_vectype))
11526 : 492230 : || !types_compatible_p (SLP_TREE_VECTYPE (child), op_vectype)
11527 : 984460 : || !types_compatible_p (TREE_TYPE (vectype), TREE_TYPE (op_vectype)))
11528 : : {
11529 : 0 : if (dump_p)
11530 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11531 : : "Unsupported vector types in lane permutation\n");
11532 : 0 : return -1;
11533 : : }
11534 : 492230 : auto op_nunits = TYPE_VECTOR_SUBPARTS (op_vectype);
11535 : 492230 : unsigned int this_unpack_factor;
11536 : : /* Detect permutations of external, pre-existing vectors. The external
11537 : : node's SLP_TREE_LANES stores the total number of units in the vector,
11538 : : or zero if the vector has variable length.
11539 : :
11540 : : We are expected to keep the original VEC_PERM_EXPR for such cases.
11541 : : There is no repetition to model. */
11542 : 492230 : if (SLP_TREE_DEF_TYPE (child) == vect_external_def
11543 : 492230 : && SLP_TREE_SCALAR_OPS (child).is_empty ())
11544 : : repeating_p = false;
11545 : : /* Check whether the input has twice as many lanes per vector. */
11546 : 484816 : else if (children.length () == 1
11547 : 484816 : && known_eq (SLP_TREE_LANES (child) * nunits,
11548 : : SLP_TREE_LANES (node) * op_nunits * 2))
11549 : : pack_p = true;
11550 : : /* Check whether the output has N times as many lanes per vector. */
11551 : 492230 : else if (constant_multiple_p (SLP_TREE_LANES (node) * op_nunits,
11552 : 442768 : SLP_TREE_LANES (child) * nunits,
11553 : : &this_unpack_factor)
11554 : 408161 : && (i == 0 || unpack_factor == this_unpack_factor))
11555 : : unpack_factor = this_unpack_factor;
11556 : : else
11557 : : repeating_p = false;
11558 : : }
11559 : :
11560 : 866820 : gcc_assert (perm.length () == SLP_TREE_LANES (node));
11561 : :
11562 : : /* Load-lanes permute. This permute only acts as a forwarder to
11563 : : select the correct vector def of the load-lanes load which
11564 : : has the permuted vectors in its vector defs like
11565 : : { v0, w0, r0, v1, w1, r1 ... } for a ld3. All costs are
11566 : : accounted for in the costing for the actual load so we
11567 : : return zero here. */
11568 : 433410 : if (node->ldst_lanes)
11569 : : {
11570 : 0 : gcc_assert (children.length () == 1);
11571 : 0 : if (!gsi)
11572 : : /* This is a trivial op always supported. */
11573 : : return 0;
11574 : 0 : slp_tree child = children[0];
11575 : 0 : unsigned vec_idx = (SLP_TREE_LANE_PERMUTATION (node)[0].second
11576 : 0 : / SLP_TREE_LANES (node));
11577 : 0 : unsigned vec_num = SLP_TREE_LANES (child) / SLP_TREE_LANES (node);
11578 : 0 : unsigned nvectors = vect_get_num_copies (vinfo, node);
11579 : 0 : for (unsigned i = 0; i < nvectors; ++i)
11580 : : {
11581 : 0 : tree def = SLP_TREE_VEC_DEFS (child)[i * vec_num + vec_idx];
11582 : 0 : node->push_vec_def (def);
11583 : : }
11584 : : return 0;
11585 : : }
11586 : :
11587 : : /* Set REPEATING_P to true if the permutations are cylical wrt UNPACK_FACTOR
11588 : : and if we can generate the vectors in a vector-length agnostic way.
11589 : : This requires UNPACK_STEP == NUNITS / UNPACK_FACTOR to be known at
11590 : : compile time.
11591 : :
11592 : : The significance of UNPACK_STEP is that, when PACK_P is false,
11593 : : output vector I operates on a window of UNPACK_STEP elements from each
11594 : : input, starting at lane UNPACK_STEP * (I % UNPACK_FACTOR). For example,
11595 : : when UNPACK_FACTOR is 2, the first output vector operates on lanes
11596 : : [0, NUNITS / 2 - 1] of each input vector and the second output vector
11597 : : operates on lanes [NUNITS / 2, NUNITS - 1] of each input vector.
11598 : :
11599 : : When REPEATING_P is true, NOUTPUTS holds the total number of outputs
11600 : : that we actually need to generate. */
11601 : 433410 : uint64_t noutputs = 0;
11602 : 433410 : poly_uint64 unpack_step = 0;
11603 : 433410 : loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo);
11604 : 146835 : if (!linfo
11605 : 471522 : || !multiple_p (nunits, unpack_factor, &unpack_step)
11606 : 145960 : || !constant_multiple_p (LOOP_VINFO_VECT_FACTOR (linfo)
11607 : 145960 : * SLP_TREE_LANES (node), nunits, &noutputs))
11608 : : repeating_p = false;
11609 : :
11610 : : /* We can handle the conditions described for REPEATING_P above for
11611 : : both variable- and constant-length vectors. The fallback requires
11612 : : us to generate every element of every permute vector explicitly,
11613 : : which is only possible for constant-length permute vectors.
11614 : :
11615 : : Set:
11616 : :
11617 : : - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
11618 : : mask vectors that we want to build.
11619 : :
11620 : : - NCOPIES to the number of copies of PERM that we need in order
11621 : : to build the necessary permute mask vectors. */
11622 : 145960 : uint64_t npatterns;
11623 : 145960 : unsigned nelts_per_pattern;
11624 : 145960 : uint64_t ncopies;
11625 : 145960 : if (repeating_p)
11626 : : {
11627 : : /* We need permute mask vectors that have the form:
11628 : :
11629 : : { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }
11630 : :
11631 : : In other words, the original n-element permute in PERM is
11632 : : "unrolled" to fill a full vector. The stepped vector encoding
11633 : : that we use for permutes requires 3n elements. */
11634 : 107848 : npatterns = SLP_TREE_LANES (node);
11635 : 107848 : nelts_per_pattern = ncopies = 3;
11636 : : }
11637 : : else
11638 : : {
11639 : : /* Calculate every element of every permute mask vector explicitly,
11640 : : instead of relying on the pattern described above. */
11641 : 325562 : if (!nunits.is_constant (&npatterns)
11642 : 325562 : || !TYPE_VECTOR_SUBPARTS (op_vectype).is_constant ())
11643 : : {
11644 : : if (dump_p)
11645 : : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11646 : : "unsupported permutation %p on variable-length"
11647 : : " vectors\n", (void *) node);
11648 : : return -1;
11649 : : }
11650 : 325562 : nelts_per_pattern = ncopies = 1;
11651 : 325562 : if (linfo && !LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
11652 : : {
11653 : : if (dump_p)
11654 : : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11655 : : "unsupported permutation %p for variable VF\n",
11656 : : (void *) node);
11657 : : return -1;
11658 : : }
11659 : : pack_p = false;
11660 : : unpack_factor = 1;
11661 : : }
11662 : 433410 : unsigned olanes = unpack_factor * ncopies * SLP_TREE_LANES (node);
11663 : 433410 : gcc_assert (repeating_p || multiple_p (olanes, nunits));
11664 : :
11665 : : /* Compute the { { SLP operand, vector index}, lane } permutation sequence
11666 : : from the { SLP operand, scalar lane } permutation as recorded in the
11667 : : SLP node as intermediate step. This part should already work
11668 : : with SLP children with arbitrary number of lanes. */
11669 : 433410 : auto_vec<std::pair<std::pair<unsigned, unsigned>, poly_uint64>> vperm;
11670 : 433410 : auto_vec<poly_uint64> active_lane;
11671 : 433410 : vperm.create (olanes);
11672 : 433410 : active_lane.safe_grow_cleared (children.length (), true);
11673 : 873733 : for (unsigned int ui = 0; ui < unpack_factor; ++ui)
11674 : : {
11675 : 1892710 : for (unsigned j = 0; j < children.length (); ++j)
11676 : 506032 : active_lane[j] = ui * unpack_step;
11677 : 1209283 : for (unsigned i = 0; i < ncopies; ++i)
11678 : : {
11679 : 4796092 : for (unsigned pi = 0; pi < perm.length (); ++pi)
11680 : : {
11681 : 1629086 : std::pair<unsigned, unsigned> p = perm[pi];
11682 : 1629086 : tree vtype = SLP_TREE_VECTYPE (children[p.first]);
11683 : 1629086 : if (repeating_p)
11684 : 615492 : vperm.quick_push ({{p.first, 0},
11685 : 615492 : p.second + active_lane[p.first]});
11686 : : else
11687 : : {
11688 : : /* We checked above that the vectors are constant-length. */
11689 : 1013594 : unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype)
11690 : 1013594 : .to_constant ();
11691 : 1013594 : unsigned lane = active_lane[p.first].to_constant ();
11692 : 1013594 : unsigned vi = (lane + p.second) / vnunits;
11693 : 1013594 : unsigned vl = (lane + p.second) % vnunits;
11694 : 1013594 : vperm.quick_push ({{p.first, vi}, vl});
11695 : : }
11696 : : }
11697 : : /* Advance to the next group. */
11698 : 1655826 : for (unsigned j = 0; j < children.length (); ++j)
11699 : 886866 : active_lane[j] += SLP_TREE_LANES (children[j]);
11700 : : }
11701 : : }
11702 : :
11703 : 433410 : if (dump_p)
11704 : : {
11705 : 8058 : dump_printf_loc (MSG_NOTE, vect_location,
11706 : : "vectorizing permutation %p", (void *)node);
11707 : 29171 : for (unsigned i = 0; i < perm.length (); ++i)
11708 : 21113 : dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
11709 : 8058 : if (repeating_p)
11710 : 6818 : dump_printf (MSG_NOTE, " (repeat %d)", SLP_TREE_LANES (node));
11711 : 8058 : dump_printf (MSG_NOTE, "\n");
11712 : 8058 : dump_printf_loc (MSG_NOTE, vect_location, "as");
11713 : 81419 : for (unsigned i = 0; i < vperm.length (); ++i)
11714 : : {
11715 : 73361 : if (i != 0
11716 : 73361 : && (repeating_p
11717 : 50749 : ? multiple_p (i, npatterns)
11718 : 54214 : : multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype))))
11719 : 22139 : dump_printf (MSG_NOTE, ",");
11720 : 73361 : dump_printf (MSG_NOTE, " vops%u[%u][",
11721 : 73361 : vperm[i].first.first, vperm[i].first.second);
11722 : 73361 : dump_dec (MSG_NOTE, vperm[i].second);
11723 : 73361 : dump_printf (MSG_NOTE, "]");
11724 : : }
11725 : 8058 : dump_printf (MSG_NOTE, "\n");
11726 : : }
11727 : :
11728 : : /* We can only handle two-vector permutes, everything else should
11729 : : be lowered on the SLP level. The following is closely inspired
11730 : : by vect_transform_slp_perm_load and is supposed to eventually
11731 : : replace it.
11732 : : ??? As intermediate step do code-gen in the SLP tree representation
11733 : : somehow? */
11734 : 433410 : std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
11735 : 433410 : std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
11736 : 433410 : unsigned int index = 0;
11737 : 433410 : poly_uint64 mask_element;
11738 : 433410 : vec_perm_builder mask;
11739 : 433410 : mask.new_vector (nunits, npatterns, nelts_per_pattern);
11740 : 433410 : unsigned int count = mask.encoded_nelts ();
11741 : 433410 : mask.quick_grow (count);
11742 : 433410 : vec_perm_indices indices;
11743 : 433410 : unsigned nperms = 0;
11744 : : /* When REPEATING_P is true, we only have UNPACK_FACTOR unique permute
11745 : : vectors to check during analysis, but we need to generate NOUTPUTS
11746 : : vectors during transformation. */
11747 : 433410 : unsigned total_nelts = olanes;
11748 : 433410 : unsigned process_nelts = olanes;
11749 : 433410 : if (repeating_p)
11750 : : {
11751 : 107848 : total_nelts = (total_nelts / unpack_factor) * noutputs;
11752 : 107848 : if (gsi)
11753 : 9569 : process_nelts = total_nelts;
11754 : : }
11755 : 433410 : unsigned last_ei = (total_nelts - 1) % process_nelts;
11756 : 2070567 : for (unsigned i = 0; i < process_nelts; ++i)
11757 : : {
11758 : : /* VI is the input vector index when generating code for REPEATING_P. */
11759 : 1644578 : unsigned vi = i / olanes * (pack_p ? 2 : 1);
11760 : 1644578 : unsigned ei = i % olanes;
11761 : 1644578 : mask_element = vperm[ei].second;
11762 : 1644578 : if (pack_p)
11763 : : {
11764 : : /* In this case, we have N outputs and the single child provides 2N
11765 : : inputs. Output X permutes inputs 2X and 2X+1.
11766 : :
11767 : : The mask indices are taken directly from the SLP permutation node.
11768 : : Index X selects from the first vector if (X / NUNITS) % 2 == 0;
11769 : : X selects from the second vector otherwise. These conditions
11770 : : are only known at compile time for constant-length vectors. */
11771 : : first_vec = std::make_pair (0, 0);
11772 : : second_vec = std::make_pair (0, 1);
11773 : : }
11774 : 1485689 : else if (first_vec.first == -1U
11775 : 1485689 : || first_vec == vperm[ei].first)
11776 : 1293509 : first_vec = vperm[ei].first;
11777 : 192180 : else if (second_vec.first == -1U
11778 : 192180 : || second_vec == vperm[ei].first)
11779 : : {
11780 : 191792 : second_vec = vperm[ei].first;
11781 : 191792 : mask_element += nunits;
11782 : : }
11783 : : else
11784 : : {
11785 : 388 : if (dump_p)
11786 : 7 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11787 : : "permutation requires at "
11788 : : "least three vectors\n");
11789 : 388 : gcc_assert (!gsi);
11790 : : return -1;
11791 : : }
11792 : :
11793 : 1644190 : mask[index++] = mask_element;
11794 : :
11795 : 1644190 : if (index == count)
11796 : : {
11797 : 713405 : indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
11798 : : TYPE_VECTOR_SUBPARTS (op_vectype));
11799 : 569110 : bool identity_p = (indices.series_p (0, 1, mask[0], 1)
11800 : 882396 : && constant_multiple_p (mask[0], nunits));
11801 : 569110 : machine_mode vmode = TYPE_MODE (vectype);
11802 : 569110 : machine_mode op_vmode = TYPE_MODE (op_vectype);
11803 : 569110 : unsigned HOST_WIDE_INT c;
11804 : 569110 : if ((!identity_p
11805 : 528932 : && !can_vec_perm_const_p (vmode, op_vmode, indices))
11806 : 569110 : || (identity_p
11807 : 40178 : && !known_le (nunits,
11808 : : TYPE_VECTOR_SUBPARTS (op_vectype))
11809 : 7041 : && (!constant_multiple_p (nunits,
11810 : 8 : TYPE_VECTOR_SUBPARTS (op_vectype),
11811 : 8 : &c) || c != 2)))
11812 : : {
11813 : 7033 : if (dump_p)
11814 : : {
11815 : 152 : dump_printf_loc (MSG_MISSED_OPTIMIZATION,
11816 : : vect_location,
11817 : : "unsupported vect permute { ");
11818 : 1586 : for (i = 0; i < count; ++i)
11819 : : {
11820 : 1434 : dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
11821 : 1434 : dump_printf (MSG_MISSED_OPTIMIZATION, " ");
11822 : : }
11823 : 152 : dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
11824 : : }
11825 : 7033 : gcc_assert (!gsi);
11826 : 7421 : return -1;
11827 : : }
11828 : :
11829 : 562077 : if (!identity_p)
11830 : 521899 : nperms += CEIL (total_nelts, process_nelts) - (ei > last_ei);
11831 : 562077 : if (gsi)
11832 : : {
11833 : 30200 : if (second_vec.first == -1U)
11834 : 6664 : second_vec = first_vec;
11835 : :
11836 : 30200 : slp_tree
11837 : 30200 : first_node = children[first_vec.first],
11838 : 30200 : second_node = children[second_vec.first];
11839 : :
11840 : 30200 : tree mask_vec = NULL_TREE;
11841 : 30200 : if (!identity_p)
11842 : 27213 : mask_vec = vect_gen_perm_mask_checked (vectype, indices);
11843 : :
11844 : 30200 : tree first_def
11845 : 30200 : = vect_get_slp_vect_def (first_node, first_vec.second + vi);
11846 : 30200 : tree second_def
11847 : 30200 : = vect_get_slp_vect_def (second_node, second_vec.second + vi);
11848 : 30200 : vect_add_slp_permutation (vinfo, gsi, node, first_def,
11849 : 30200 : second_def, mask_vec, mask[0]);
11850 : : }
11851 : :
11852 : : index = 0;
11853 : : first_vec = std::make_pair (-1U, -1U);
11854 : : second_vec = std::make_pair (-1U, -1U);
11855 : : }
11856 : : }
11857 : :
11858 : 425989 : return nperms;
11859 : 433410 : }
11860 : :
11861 : : /* Vectorize the SLP permutations in NODE as specified
11862 : : in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
11863 : : child number and lane number.
11864 : : Interleaving of two two-lane two-child SLP subtrees (not supported):
11865 : : [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
11866 : : A blend of two four-lane two-child SLP subtrees:
11867 : : [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
11868 : : Highpart of a four-lane one-child SLP subtree (not supported):
11869 : : [ { 0, 2 }, { 0, 3 } ]
11870 : : Where currently only a subset is supported by code generating below. */
11871 : :
11872 : : bool
11873 : 114698 : vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
11874 : : slp_tree node, stmt_vector_for_cost *cost_vec)
11875 : : {
11876 : 114698 : tree vectype = SLP_TREE_VECTYPE (node);
11877 : 114698 : lane_permutation_t &perm = SLP_TREE_LANE_PERMUTATION (node);
11878 : 114698 : int nperms = vectorizable_slp_permutation_1 (vinfo, gsi, node, perm,
11879 : 114698 : SLP_TREE_CHILDREN (node),
11880 : : dump_enabled_p ());
11881 : 114698 : if (nperms < 0)
11882 : : return false;
11883 : :
11884 : 113520 : if (!gsi && nperms != 0)
11885 : 92651 : record_stmt_cost (cost_vec, nperms, vec_perm, node, vectype, 0, vect_body);
11886 : :
11887 : : return true;
11888 : : }
11889 : :
11890 : : /* Vectorize SLP NODE. */
11891 : :
11892 : : static void
11893 : 1470368 : vect_schedule_slp_node (vec_info *vinfo,
11894 : : slp_tree node, slp_instance instance)
11895 : : {
11896 : 1470368 : gimple_stmt_iterator si;
11897 : 1470368 : int i;
11898 : 1470368 : slp_tree child;
11899 : :
11900 : : /* Vectorize externals and constants. */
11901 : 1470368 : if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
11902 : 1470368 : || SLP_TREE_DEF_TYPE (node) == vect_external_def)
11903 : : {
11904 : : /* ??? vectorizable_shift can end up using a scalar operand which is
11905 : : currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
11906 : : node in this case. */
11907 : 501640 : if (!SLP_TREE_VECTYPE (node))
11908 : 501640 : return;
11909 : :
11910 : : /* There are two reasons vector defs might already exist. The first
11911 : : is that we are vectorizing an existing vector def. The second is
11912 : : when performing BB vectorization shared constant/external nodes
11913 : : are not split apart during partitioning so during the code-gen
11914 : : DFS walk we can end up visiting them twice. */
11915 : 495546 : if (! SLP_TREE_VEC_DEFS (node).exists ())
11916 : 494844 : vect_create_constant_vectors (vinfo, node);
11917 : 495546 : return;
11918 : : }
11919 : :
11920 : 968728 : stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
11921 : :
11922 : 968728 : gcc_assert (SLP_TREE_VEC_DEFS (node).is_empty ());
11923 : 968728 : if (SLP_TREE_VECTYPE (node))
11924 : 968722 : SLP_TREE_VEC_DEFS (node).create (vect_get_num_copies (vinfo, node));
11925 : :
11926 : 968728 : if (!SLP_TREE_PERMUTE_P (node) && STMT_VINFO_DATA_REF (stmt_info))
11927 : : {
11928 : : /* Vectorized loads go before the first scalar load to make it
11929 : : ready early, vectorized stores go before the last scalar
11930 : : stmt which is where all uses are ready. */
11931 : 708228 : stmt_vec_info last_stmt_info = NULL;
11932 : 708228 : if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
11933 : 162728 : last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
11934 : : else /* DR_IS_WRITE */
11935 : 545500 : last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
11936 : 708228 : si = gsi_for_stmt (last_stmt_info->stmt);
11937 : 708228 : }
11938 : 260500 : else if (!SLP_TREE_PERMUTE_P (node)
11939 : 244597 : && (SLP_TREE_TYPE (node) == cycle_phi_info_type
11940 : : || SLP_TREE_TYPE (node) == induc_vec_info_type
11941 : : || SLP_TREE_TYPE (node) == phi_info_type))
11942 : : {
11943 : : /* For PHI node vectorization we do not use the insertion iterator. */
11944 : 54735 : si = gsi_none ();
11945 : : }
11946 : : else
11947 : : {
11948 : : /* Emit other stmts after the children vectorized defs which is
11949 : : earliest possible. */
11950 : : gimple *last_stmt = NULL;
11951 : : bool seen_vector_def = false;
11952 : 573533 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
11953 : 367768 : if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
11954 : : {
11955 : : /* For fold-left reductions we are retaining the scalar
11956 : : reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
11957 : : set so the representation isn't perfect. Resort to the
11958 : : last scalar def here. */
11959 : 293528 : if (SLP_TREE_VEC_DEFS (child).is_empty ())
11960 : : {
11961 : 845 : gcc_assert (SLP_TREE_TYPE (child) == cycle_phi_info_type);
11962 : 845 : gphi *phi = as_a <gphi *>
11963 : 845 : (vect_find_last_scalar_stmt_in_slp (child)->stmt);
11964 : 845 : if (!last_stmt)
11965 : : last_stmt = phi;
11966 : 630 : else if (vect_stmt_dominates_stmt_p (last_stmt, phi))
11967 : : last_stmt = phi;
11968 : 619 : else if (vect_stmt_dominates_stmt_p (phi, last_stmt))
11969 : : ;
11970 : : else
11971 : 0 : gcc_unreachable ();
11972 : : }
11973 : : /* We are emitting all vectorized stmts in the same place and
11974 : : the last one is the last.
11975 : : ??? Unless we have a load permutation applied and that
11976 : : figures to re-use an earlier generated load. */
11977 : : unsigned j;
11978 : : tree vdef;
11979 : 693359 : FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
11980 : : {
11981 : 399831 : gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
11982 : 399831 : if (!last_stmt)
11983 : : last_stmt = vstmt;
11984 : 204461 : else if (vect_stmt_dominates_stmt_p (last_stmt, vstmt))
11985 : : last_stmt = vstmt;
11986 : 44759 : else if (vect_stmt_dominates_stmt_p (vstmt, last_stmt))
11987 : : ;
11988 : : else
11989 : 0 : gcc_unreachable ();
11990 : : }
11991 : : }
11992 : 74240 : else if (!SLP_TREE_VECTYPE (child))
11993 : : {
11994 : : /* For externals we use unvectorized at all scalar defs. */
11995 : : unsigned j;
11996 : : tree def;
11997 : 12711 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
11998 : 7234 : if (TREE_CODE (def) == SSA_NAME
11999 : 7234 : && !SSA_NAME_IS_DEFAULT_DEF (def))
12000 : : {
12001 : 163 : gimple *stmt = SSA_NAME_DEF_STMT (def);
12002 : 163 : if (gimple_uid (stmt) == -1u)
12003 : : /* If the stmt is not inside the region do not
12004 : : use it as possible insertion point. */
12005 : : ;
12006 : 155 : else if (!last_stmt)
12007 : : last_stmt = stmt;
12008 : 149 : else if (vect_stmt_dominates_stmt_p (last_stmt, stmt))
12009 : : last_stmt = stmt;
12010 : 149 : else if (vect_stmt_dominates_stmt_p (stmt, last_stmt))
12011 : : ;
12012 : : else
12013 : 0 : gcc_unreachable ();
12014 : : }
12015 : : }
12016 : : else
12017 : : {
12018 : : /* For externals we have to look at all defs since their
12019 : : insertion place is decided per vector. But beware
12020 : : of pre-existing vectors where we need to make sure
12021 : : we do not insert before the region boundary. */
12022 : 68763 : if (SLP_TREE_SCALAR_OPS (child).is_empty ()
12023 : 566 : && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
12024 : : seen_vector_def = true;
12025 : : else
12026 : : {
12027 : : unsigned j;
12028 : : tree vdef;
12029 : 534269 : FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
12030 : 97854 : if (TREE_CODE (vdef) == SSA_NAME
12031 : 97854 : && !SSA_NAME_IS_DEFAULT_DEF (vdef))
12032 : : {
12033 : 19095 : gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
12034 : 19095 : if (!last_stmt)
12035 : : last_stmt = vstmt;
12036 : 10644 : else if (vect_stmt_dominates_stmt_p (last_stmt, vstmt))
12037 : : last_stmt = vstmt;
12038 : 8492 : else if (vect_stmt_dominates_stmt_p (vstmt, last_stmt))
12039 : : ;
12040 : : else
12041 : 0 : gcc_unreachable ();
12042 : : }
12043 : : }
12044 : : }
12045 : : /* This can happen when all children are pre-existing vectors or
12046 : : constants. */
12047 : 205765 : if (!last_stmt)
12048 : 1723 : last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
12049 : 1723 : if (!last_stmt)
12050 : : {
12051 : 0 : gcc_assert (seen_vector_def);
12052 : 0 : si = gsi_after_labels (vinfo->bbs[0]);
12053 : : }
12054 : 205765 : else if (is_ctrl_altering_stmt (last_stmt))
12055 : : {
12056 : : /* We split regions to vectorize at control altering stmts
12057 : : with a definition so this must be an external which
12058 : : we can insert at the start of the region. */
12059 : 0 : si = gsi_after_labels (vinfo->bbs[0]);
12060 : : }
12061 : 205765 : else if (is_a <bb_vec_info> (vinfo)
12062 : 17073 : && !SLP_TREE_PERMUTE_P (node)
12063 : 15759 : && gimple_bb (last_stmt) != gimple_bb (stmt_info->stmt)
12064 : 206978 : && gimple_could_trap_p (stmt_info->stmt))
12065 : : {
12066 : : /* We've constrained possibly trapping operations to all come
12067 : : from the same basic-block, if vectorized defs would allow earlier
12068 : : scheduling still force vectorized stmts to the original block.
12069 : : This is only necessary for BB vectorization since for loop vect
12070 : : all operations are in a single BB and scalar stmt based
12071 : : placement doesn't play well with epilogue vectorization. */
12072 : 51 : gcc_assert (dominated_by_p (CDI_DOMINATORS,
12073 : : gimple_bb (stmt_info->stmt),
12074 : : gimple_bb (last_stmt)));
12075 : 51 : si = gsi_after_labels (gimple_bb (stmt_info->stmt));
12076 : : }
12077 : 205714 : else if (is_a <gphi *> (last_stmt))
12078 : 15783 : si = gsi_after_labels (gimple_bb (last_stmt));
12079 : : else
12080 : : {
12081 : 189931 : si = gsi_for_stmt (last_stmt);
12082 : 189931 : gsi_next (&si);
12083 : :
12084 : : /* Avoid scheduling internal defs outside of the loop when
12085 : : we might have only implicitly tracked loop mask/len defs. */
12086 : 189931 : if (auto loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
12087 : 71 : if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
12088 : 173116 : || LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
12089 : : {
12090 : 71 : gimple_stmt_iterator si2
12091 : 71 : = gsi_after_labels (LOOP_VINFO_LOOP (loop_vinfo)->header);
12092 : 71 : if ((gsi_end_p (si2)
12093 : 0 : && (LOOP_VINFO_LOOP (loop_vinfo)->header
12094 : 0 : != gimple_bb (last_stmt))
12095 : 0 : && dominated_by_p (CDI_DOMINATORS,
12096 : : LOOP_VINFO_LOOP (loop_vinfo)->header,
12097 : 0 : gimple_bb (last_stmt)))
12098 : 71 : || (!gsi_end_p (si2)
12099 : 71 : && last_stmt != *si2
12100 : 70 : && vect_stmt_dominates_stmt_p (last_stmt, *si2)))
12101 : 3 : si = si2;
12102 : : }
12103 : : }
12104 : : }
12105 : :
12106 : 968728 : if (dump_enabled_p ())
12107 : : {
12108 : 69875 : if (stmt_info)
12109 : 69824 : dump_printf_loc (MSG_NOTE, vect_location,
12110 : : "------>vectorizing SLP node starting from: %G",
12111 : : stmt_info->stmt);
12112 : : else
12113 : : {
12114 : 51 : dump_printf_loc (MSG_NOTE, vect_location,
12115 : : "------>vectorizing SLP node:\n");
12116 : 51 : vect_print_slp_tree (MSG_NOTE, vect_location, node);
12117 : : }
12118 : : }
12119 : 968728 : vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
12120 : : }
12121 : :
12122 : : /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
12123 : : For loop vectorization this is done in vectorizable_call, but for SLP
12124 : : it needs to be deferred until end of vect_schedule_slp, because multiple
12125 : : SLP instances may refer to the same scalar stmt. */
12126 : :
12127 : : static void
12128 : 606621 : vect_remove_slp_scalar_calls (vec_info *vinfo,
12129 : : slp_tree node, hash_set<slp_tree> &visited)
12130 : : {
12131 : 606621 : gimple *new_stmt;
12132 : 606621 : gimple_stmt_iterator gsi;
12133 : 606621 : int i;
12134 : 606621 : slp_tree child;
12135 : 606621 : tree lhs;
12136 : 606621 : stmt_vec_info stmt_info;
12137 : :
12138 : 606621 : if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
12139 : 193822 : return;
12140 : :
12141 : 456037 : if (visited.add (node))
12142 : : return;
12143 : :
12144 : 928205 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
12145 : 515406 : vect_remove_slp_scalar_calls (vinfo, child, visited);
12146 : :
12147 : 1300942 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
12148 : : {
12149 : 479393 : if (!stmt_info)
12150 : 3821 : continue;
12151 : 475572 : stmt_info = vect_orig_stmt (stmt_info);
12152 : 475572 : gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
12153 : 5082 : if (!stmt || gimple_bb (stmt) == NULL)
12154 : 470512 : continue;
12155 : 5060 : lhs = gimple_call_lhs (stmt);
12156 : 5060 : if (lhs)
12157 : 4498 : new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
12158 : : else
12159 : : {
12160 : 562 : new_stmt = gimple_build_nop ();
12161 : 562 : unlink_stmt_vdef (stmt_info->stmt);
12162 : : }
12163 : 5060 : gsi = gsi_for_stmt (stmt);
12164 : 5060 : vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
12165 : 5060 : if (lhs)
12166 : 4498 : SSA_NAME_DEF_STMT (lhs) = new_stmt;
12167 : : }
12168 : : }
12169 : :
12170 : : static void
12171 : 91215 : vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
12172 : : {
12173 : 91215 : hash_set<slp_tree> visited;
12174 : 91215 : vect_remove_slp_scalar_calls (vinfo, node, visited);
12175 : 91215 : }
12176 : :
12177 : : /* Vectorize the instance root. */
12178 : :
12179 : : void
12180 : 10220 : vectorize_slp_instance_root_stmt (vec_info *vinfo, slp_tree node, slp_instance instance)
12181 : : {
12182 : 10220 : gassign *rstmt = NULL;
12183 : :
12184 : 10220 : if (instance->kind == slp_inst_kind_ctor)
12185 : : {
12186 : 4291 : if (SLP_TREE_VEC_DEFS (node).length () == 1)
12187 : : {
12188 : 4287 : tree vect_lhs = SLP_TREE_VEC_DEFS (node)[0];
12189 : 4287 : tree root_lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
12190 : 4287 : if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
12191 : 4287 : TREE_TYPE (vect_lhs)))
12192 : 0 : vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
12193 : : vect_lhs);
12194 : 4287 : rstmt = gimple_build_assign (root_lhs, vect_lhs);
12195 : : }
12196 : : else
12197 : : {
12198 : 4 : gcc_assert (SLP_TREE_VEC_DEFS (node).length () > 1);
12199 : 4 : tree child_def;
12200 : 4 : int j;
12201 : 4 : vec<constructor_elt, va_gc> *v;
12202 : 4 : vec_alloc (v, SLP_TREE_VEC_DEFS (node).length ());
12203 : :
12204 : : /* A CTOR can handle V16HI composition from VNx8HI so we
12205 : : do not need to convert vector elements if the types
12206 : : do not match. */
12207 : 12 : FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (node), j, child_def)
12208 : 8 : CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, child_def);
12209 : 4 : tree lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
12210 : 4 : tree rtype
12211 : 4 : = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmts[0]->stmt));
12212 : 4 : tree r_constructor = build_constructor (rtype, v);
12213 : 4 : rstmt = gimple_build_assign (lhs, r_constructor);
12214 : : }
12215 : : }
12216 : 5929 : else if (instance->kind == slp_inst_kind_bb_reduc)
12217 : : {
12218 : : /* Largely inspired by reduction chain epilogue handling in
12219 : : vect_create_epilog_for_reduction. */
12220 : 4291 : vec<tree> vec_defs = vNULL;
12221 : 4291 : vect_get_slp_defs (node, &vec_defs);
12222 : 4291 : enum tree_code reduc_code
12223 : 4291 : = gimple_assign_rhs_code (instance->root_stmts[0]->stmt);
12224 : : /* ??? We actually have to reflect signs somewhere. */
12225 : 4291 : if (reduc_code == MINUS_EXPR)
12226 : 0 : reduc_code = PLUS_EXPR;
12227 : 4291 : gimple_seq epilogue = NULL;
12228 : : /* We may end up with more than one vector result, reduce them
12229 : : to one vector. */
12230 : 4291 : tree vec_def = vec_defs[0];
12231 : 4291 : tree vectype = TREE_TYPE (vec_def);
12232 : 4291 : tree compute_vectype = vectype;
12233 : 4291 : bool pun_for_overflow_p = (ANY_INTEGRAL_TYPE_P (vectype)
12234 : 4092 : && TYPE_OVERFLOW_UNDEFINED (vectype)
12235 : 7227 : && operation_can_overflow (reduc_code));
12236 : 2810 : if (pun_for_overflow_p)
12237 : : {
12238 : 2810 : compute_vectype = unsigned_type_for (vectype);
12239 : 2810 : vec_def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
12240 : : compute_vectype, vec_def);
12241 : : }
12242 : 6653 : for (unsigned i = 1; i < vec_defs.length (); ++i)
12243 : : {
12244 : 2362 : tree def = vec_defs[i];
12245 : 2362 : if (pun_for_overflow_p)
12246 : 2273 : def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
12247 : : compute_vectype, def);
12248 : 2362 : vec_def = gimple_build (&epilogue, reduc_code, compute_vectype,
12249 : : vec_def, def);
12250 : : }
12251 : 4291 : vec_defs.release ();
12252 : : /* ??? Support other schemes than direct internal fn. */
12253 : 4291 : internal_fn reduc_fn;
12254 : 4291 : if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
12255 : 4291 : || reduc_fn == IFN_LAST)
12256 : 0 : gcc_unreachable ();
12257 : 4291 : tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
12258 : 4291 : TREE_TYPE (compute_vectype), vec_def);
12259 : 4291 : if (!SLP_INSTANCE_REMAIN_DEFS (instance).is_empty ())
12260 : : {
12261 : 2809 : tree rem_def = NULL_TREE;
12262 : 12395 : for (auto def : SLP_INSTANCE_REMAIN_DEFS (instance))
12263 : : {
12264 : 9586 : def = gimple_convert (&epilogue, TREE_TYPE (scalar_def), def);
12265 : 9586 : if (!rem_def)
12266 : : rem_def = def;
12267 : : else
12268 : 6777 : rem_def = gimple_build (&epilogue, reduc_code,
12269 : 6777 : TREE_TYPE (scalar_def),
12270 : : rem_def, def);
12271 : : }
12272 : 2809 : scalar_def = gimple_build (&epilogue, reduc_code,
12273 : 2809 : TREE_TYPE (scalar_def),
12274 : : scalar_def, rem_def);
12275 : : }
12276 : 4291 : scalar_def = gimple_convert (&epilogue,
12277 : 4291 : TREE_TYPE (vectype), scalar_def);
12278 : 4291 : gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
12279 : 4291 : gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT);
12280 : 4291 : gimple_assign_set_rhs_from_tree (&rgsi, scalar_def);
12281 : 4291 : update_stmt (gsi_stmt (rgsi));
12282 : 4291 : return;
12283 : : }
12284 : 1638 : else if (instance->kind == slp_inst_kind_gcond)
12285 : : {
12286 : : /* Only support a single root for now as we can't codegen CFG yet and so we
12287 : : can't support lane > 1 at this time. */
12288 : 1638 : gcc_assert (instance->root_stmts.length () == 1);
12289 : 1638 : auto root_stmt_info = instance->root_stmts[0];
12290 : 1638 : auto last_stmt = STMT_VINFO_STMT (vect_orig_stmt (root_stmt_info));
12291 : 1638 : gimple_stmt_iterator rgsi = gsi_for_stmt (last_stmt);
12292 : 1638 : gcc_assert (!SLP_TREE_VEC_DEFS (node).is_empty ());
12293 : 1638 : bool res = vectorizable_early_exit (as_a <loop_vec_info> (vinfo),
12294 : : root_stmt_info, &rgsi, node, NULL);
12295 : 1638 : gcc_assert (res);
12296 : 1638 : return;
12297 : : }
12298 : : else
12299 : 0 : gcc_unreachable ();
12300 : :
12301 : 4291 : gcc_assert (rstmt);
12302 : :
12303 : 4291 : gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
12304 : 4291 : gsi_replace (&rgsi, rstmt, true);
12305 : : }
12306 : :
12307 : : struct slp_scc_info
12308 : : {
12309 : : bool on_stack;
12310 : : int dfs;
12311 : : int lowlink;
12312 : : };
12313 : :
12314 : : /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */
12315 : :
12316 : : static void
12317 : 1470368 : vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
12318 : : hash_map<slp_tree, slp_scc_info> &scc_info,
12319 : : int &maxdfs, vec<slp_tree> &stack)
12320 : : {
12321 : 1470368 : bool existed_p;
12322 : 1470368 : slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
12323 : 1470368 : gcc_assert (!existed_p);
12324 : 1470368 : info->dfs = maxdfs;
12325 : 1470368 : info->lowlink = maxdfs;
12326 : 1470368 : maxdfs++;
12327 : :
12328 : : /* Leaf. */
12329 : 1470368 : if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
12330 : : {
12331 : 501640 : info->on_stack = false;
12332 : 501640 : vect_schedule_slp_node (vinfo, node, instance);
12333 : 1034551 : return;
12334 : : }
12335 : :
12336 : 968728 : info->on_stack = true;
12337 : 968728 : stack.safe_push (node);
12338 : :
12339 : 968728 : unsigned i;
12340 : 968728 : slp_tree child;
12341 : : /* DFS recurse. */
12342 : 2002594 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
12343 : : {
12344 : 1033866 : if (!child)
12345 : 58478 : continue;
12346 : 975388 : slp_scc_info *child_info = scc_info.get (child);
12347 : 975388 : if (!child_info)
12348 : : {
12349 : 887168 : vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
12350 : : /* Recursion might have re-allocated the node. */
12351 : 887168 : info = scc_info.get (node);
12352 : 887168 : child_info = scc_info.get (child);
12353 : 887168 : info->lowlink = MIN (info->lowlink, child_info->lowlink);
12354 : : }
12355 : 88220 : else if (child_info->on_stack)
12356 : 25048 : info->lowlink = MIN (info->lowlink, child_info->dfs);
12357 : : }
12358 : 968728 : if (info->lowlink != info->dfs)
12359 : : return;
12360 : :
12361 : 937457 : auto_vec<slp_tree, 4> phis_to_fixup;
12362 : :
12363 : : /* Singleton. */
12364 : 937457 : if (stack.last () == node)
12365 : : {
12366 : 914005 : stack.pop ();
12367 : 914005 : info->on_stack = false;
12368 : 914005 : vect_schedule_slp_node (vinfo, node, instance);
12369 : 914005 : if (!SLP_TREE_PERMUTE_P (node)
12370 : 914005 : && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
12371 : 31349 : phis_to_fixup.quick_push (node);
12372 : : }
12373 : : else
12374 : : {
12375 : : /* SCC. */
12376 : 23452 : int last_idx = stack.length () - 1;
12377 : 54723 : while (stack[last_idx] != node)
12378 : 31271 : last_idx--;
12379 : : /* We can break the cycle at PHIs who have at least one child
12380 : : code generated. Then we could re-start the DFS walk until
12381 : : all nodes in the SCC are covered (we might have new entries
12382 : : for only back-reachable nodes). But it's simpler to just
12383 : : iterate and schedule those that are ready. */
12384 : 23452 : unsigned todo = stack.length () - last_idx;
12385 : 23773 : do
12386 : : {
12387 : 103987 : for (int idx = stack.length () - 1; idx >= last_idx; --idx)
12388 : : {
12389 : 56441 : slp_tree entry = stack[idx];
12390 : 56441 : if (!entry)
12391 : 928 : continue;
12392 : 55513 : bool phi = (!SLP_TREE_PERMUTE_P (entry)
12393 : 55513 : && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
12394 : 55513 : bool ready = !phi;
12395 : 140569 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
12396 : 109689 : if (!child)
12397 : : {
12398 : 22672 : gcc_assert (phi);
12399 : : ready = true;
12400 : : break;
12401 : : }
12402 : 87017 : else if (scc_info.get (child)->on_stack)
12403 : : {
12404 : 23652 : if (!phi)
12405 : : {
12406 : : ready = false;
12407 : : break;
12408 : : }
12409 : : }
12410 : : else
12411 : : {
12412 : 63365 : if (phi)
12413 : : {
12414 : : ready = true;
12415 : : break;
12416 : : }
12417 : : }
12418 : 32841 : if (ready)
12419 : : {
12420 : 54723 : vect_schedule_slp_node (vinfo, entry, instance);
12421 : 54723 : scc_info.get (entry)->on_stack = false;
12422 : 54723 : stack[idx] = NULL;
12423 : 54723 : todo--;
12424 : 54723 : if (phi)
12425 : 23873 : phis_to_fixup.safe_push (entry);
12426 : : }
12427 : : }
12428 : : }
12429 : 23773 : while (todo != 0);
12430 : :
12431 : : /* Pop the SCC. */
12432 : 23452 : stack.truncate (last_idx);
12433 : : }
12434 : :
12435 : : /* Now fixup the backedge def of the vectorized PHIs in this SCC. */
12436 : : slp_tree phi_node;
12437 : 1930136 : FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
12438 : : {
12439 : 55222 : gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
12440 : 55222 : edge_iterator ei;
12441 : 55222 : edge e;
12442 : 173352 : FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
12443 : : {
12444 : 118130 : unsigned dest_idx = e->dest_idx;
12445 : 118130 : child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
12446 : 118130 : if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
12447 : 68799 : continue;
12448 : 49331 : unsigned n = SLP_TREE_VEC_DEFS (phi_node).length ();
12449 : : /* Simply fill all args. */
12450 : 49331 : if (STMT_VINFO_DEF_TYPE (SLP_TREE_REPRESENTATIVE (phi_node))
12451 : : != vect_first_order_recurrence)
12452 : 105208 : for (unsigned i = 0; i < n; ++i)
12453 : : {
12454 : 55917 : tree phidef = SLP_TREE_VEC_DEFS (phi_node)[i];
12455 : 55917 : gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
12456 : 55917 : add_phi_arg (phi, vect_get_slp_vect_def (child, i),
12457 : : e, gimple_phi_arg_location (phi, dest_idx));
12458 : : }
12459 : : else
12460 : : {
12461 : : /* Unless it is a first order recurrence which needs
12462 : : args filled in for both the PHI node and the permutes. */
12463 : 40 : gimple *perm
12464 : 40 : = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[0]);
12465 : 40 : gimple *rphi = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (perm));
12466 : 40 : add_phi_arg (as_a <gphi *> (rphi),
12467 : : vect_get_slp_vect_def (child, n - 1),
12468 : : e, gimple_phi_arg_location (phi, dest_idx));
12469 : 117 : for (unsigned i = 0; i < n; ++i)
12470 : : {
12471 : 77 : gimple *perm
12472 : 77 : = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[i]);
12473 : 77 : if (i > 0)
12474 : 37 : gimple_assign_set_rhs1 (perm,
12475 : : vect_get_slp_vect_def (child, i - 1));
12476 : 77 : gimple_assign_set_rhs2 (perm,
12477 : : vect_get_slp_vect_def (child, i));
12478 : 77 : update_stmt (perm);
12479 : : }
12480 : : }
12481 : : }
12482 : : }
12483 : 937457 : }
12484 : :
12485 : : /* Generate vector code for SLP_INSTANCES in the loop/basic block. */
12486 : :
12487 : : void
12488 : 543056 : vect_schedule_slp (vec_info *vinfo, const vec<slp_instance> &slp_instances)
12489 : : {
12490 : 543056 : slp_instance instance;
12491 : 543056 : unsigned int i;
12492 : :
12493 : 543056 : hash_map<slp_tree, slp_scc_info> scc_info;
12494 : 543056 : int maxdfs = 0;
12495 : 1126439 : FOR_EACH_VEC_ELT (slp_instances, i, instance)
12496 : : {
12497 : 583383 : slp_tree node = SLP_INSTANCE_TREE (instance);
12498 : 583383 : if (dump_enabled_p ())
12499 : : {
12500 : 15902 : dump_printf_loc (MSG_NOTE, vect_location,
12501 : : "Vectorizing SLP tree:\n");
12502 : : /* ??? Dump all? */
12503 : 15902 : if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
12504 : 407 : dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
12505 : 407 : SLP_INSTANCE_ROOT_STMTS (instance)[0]->stmt);
12506 : 15902 : vect_print_slp_graph (MSG_NOTE, vect_location,
12507 : : SLP_INSTANCE_TREE (instance));
12508 : : }
12509 : : /* Schedule the tree of INSTANCE, scheduling SCCs in a way to
12510 : : have a PHI be the node breaking the cycle. */
12511 : 583383 : auto_vec<slp_tree> stack;
12512 : 583383 : if (!scc_info.get (node))
12513 : 583200 : vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
12514 : :
12515 : 583383 : if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
12516 : 10220 : vectorize_slp_instance_root_stmt (vinfo, node, instance);
12517 : :
12518 : 583383 : if (dump_enabled_p ())
12519 : 15902 : dump_printf_loc (MSG_NOTE, vect_location,
12520 : : "vectorizing stmts using SLP.\n");
12521 : 583383 : }
12522 : :
12523 : 1669495 : FOR_EACH_VEC_ELT (slp_instances, i, instance)
12524 : : {
12525 : 583383 : slp_tree root = SLP_INSTANCE_TREE (instance);
12526 : 583383 : stmt_vec_info store_info;
12527 : 583383 : unsigned int j;
12528 : :
12529 : : /* Remove scalar call stmts. Do not do this for basic-block
12530 : : vectorization as not all uses may be vectorized.
12531 : : ??? Why should this be necessary? DCE should be able to
12532 : : remove the stmts itself.
12533 : : ??? For BB vectorization we can as well remove scalar
12534 : : stmts starting from the SLP tree root if they have no
12535 : : uses. */
12536 : 583383 : if (is_a <loop_vec_info> (vinfo))
12537 : 91215 : vect_remove_slp_scalar_calls (vinfo, root);
12538 : :
12539 : : /* Remove vectorized stores original scalar stmts. */
12540 : 2601114 : for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
12541 : : {
12542 : 1472231 : if (!store_info
12543 : 1472217 : || !STMT_VINFO_DATA_REF (store_info)
12544 : 1442350 : || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
12545 : : break;
12546 : :
12547 : 1434348 : store_info = vect_orig_stmt (store_info);
12548 : : /* Free the attached stmt_vec_info and remove the stmt. */
12549 : 1434348 : vinfo->remove_stmt (store_info);
12550 : :
12551 : : /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it
12552 : : to not crash in vect_free_slp_tree later. */
12553 : 1434348 : if (SLP_TREE_REPRESENTATIVE (root) == store_info)
12554 : 545207 : SLP_TREE_REPRESENTATIVE (root) = NULL;
12555 : : }
12556 : : }
12557 : 543056 : }
|