Branch data Line data Source code
1 : : /* SLP - Basic Block Vectorization
2 : : Copyright (C) 2007-2025 Free Software Foundation, Inc.
3 : : Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 : : and Ira Rosen <irar@il.ibm.com>
5 : :
6 : : This file is part of GCC.
7 : :
8 : : GCC is free software; you can redistribute it and/or modify it under
9 : : the terms of the GNU General Public License as published by the Free
10 : : Software Foundation; either version 3, or (at your option) any later
11 : : version.
12 : :
13 : : GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 : : WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 : : FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 : : for more details.
17 : :
18 : : You should have received a copy of the GNU General Public License
19 : : along with GCC; see the file COPYING3. If not see
20 : : <http://www.gnu.org/licenses/>. */
21 : :
22 : : #include "config.h"
23 : : #define INCLUDE_ALGORITHM
24 : : #include "system.h"
25 : : #include "coretypes.h"
26 : : #include "backend.h"
27 : : #include "target.h"
28 : : #include "rtl.h"
29 : : #include "tree.h"
30 : : #include "gimple.h"
31 : : #include "tree-pass.h"
32 : : #include "ssa.h"
33 : : #include "optabs-tree.h"
34 : : #include "insn-config.h"
35 : : #include "recog.h" /* FIXME: for insn_data */
36 : : #include "fold-const.h"
37 : : #include "stor-layout.h"
38 : : #include "gimple-iterator.h"
39 : : #include "cfgloop.h"
40 : : #include "tree-vectorizer.h"
41 : : #include "langhooks.h"
42 : : #include "gimple-walk.h"
43 : : #include "dbgcnt.h"
44 : : #include "tree-vector-builder.h"
45 : : #include "vec-perm-indices.h"
46 : : #include "gimple-fold.h"
47 : : #include "internal-fn.h"
48 : : #include "dump-context.h"
49 : : #include "cfganal.h"
50 : : #include "tree-eh.h"
51 : : #include "tree-cfg.h"
52 : : #include "alloc-pool.h"
53 : : #include "sreal.h"
54 : : #include "predict.h"
55 : :
56 : : static bool vect_transform_slp_perm_load_1 (vec_info *, slp_tree,
57 : : load_permutation_t &,
58 : : const vec<tree> &,
59 : : gimple_stmt_iterator *,
60 : : poly_uint64, bool, bool,
61 : : unsigned *,
62 : : unsigned * = nullptr,
63 : : bool = false);
64 : : static int vectorizable_slp_permutation_1 (vec_info *, gimple_stmt_iterator *,
65 : : slp_tree, lane_permutation_t &,
66 : : vec<slp_tree> &, bool);
67 : : static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
68 : : slp_tree, stmt_vector_for_cost *);
69 : : static void vect_print_slp_tree (dump_flags_t, dump_location_t, slp_tree);
70 : : static bool vect_slp_can_convert_to_external (const vec<stmt_vec_info> &);
71 : :
72 : : static object_allocator<_slp_tree> *slp_tree_pool;
73 : : static slp_tree slp_first_node;
74 : :
75 : : void
76 : 1080180 : vect_slp_init (void)
77 : : {
78 : 1080180 : slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
79 : 1080180 : }
80 : :
81 : : void
82 : 1080180 : vect_slp_fini (void)
83 : : {
84 : 1562498 : while (slp_first_node)
85 : 482318 : delete slp_first_node;
86 : 2160360 : delete slp_tree_pool;
87 : 1080180 : slp_tree_pool = NULL;
88 : 1080180 : }
89 : :
90 : : void *
91 : 7341262 : _slp_tree::operator new (size_t n)
92 : : {
93 : 7341262 : gcc_assert (n == sizeof (_slp_tree));
94 : 7341262 : return slp_tree_pool->allocate_raw ();
95 : : }
96 : :
97 : : void
98 : 7341262 : _slp_tree::operator delete (void *node, size_t n)
99 : : {
100 : 7341262 : gcc_assert (n == sizeof (_slp_tree));
101 : 7341262 : slp_tree_pool->remove_raw (node);
102 : 7341262 : }
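: : /* All SLP nodes are carved from slp_tree_pool via the operators above
: :    and, in addition, threaded onto a doubly-linked list anchored at
: :    slp_first_node by the constructor below, so vect_slp_fini can
: :    reclaim any node that was never explicitly freed.  A rough sketch
: :    of the lifecycle:
: :
: :      vect_slp_init ();            // create the allocation pool
: :      slp_tree n = new _slp_tree;  // allocate from pool, link into list
: :      ...
: :      vect_slp_fini ();            // delete leftovers, destroy pool  */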
103 : :
104 : :
105 : : /* Initialize an SLP node. */
106 : :
107 : 7341262 : _slp_tree::_slp_tree ()
108 : : {
109 : 7341262 : this->prev_node = NULL;
110 : 7341262 : if (slp_first_node)
111 : 6473002 : slp_first_node->prev_node = this;
112 : 7341262 : this->next_node = slp_first_node;
113 : 7341262 : slp_first_node = this;
114 : 7341262 : SLP_TREE_SCALAR_STMTS (this) = vNULL;
115 : 7341262 : SLP_TREE_SCALAR_OPS (this) = vNULL;
116 : 7341262 : SLP_TREE_VEC_DEFS (this) = vNULL;
117 : 7341262 : SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
118 : 7341262 : SLP_TREE_CHILDREN (this) = vNULL;
119 : 7341262 : SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
120 : 7341262 : SLP_TREE_LANE_PERMUTATION (this) = vNULL;
121 : 7341262 : SLP_TREE_SIMD_CLONE_INFO (this) = vNULL;
122 : 7341262 : SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
123 : 7341262 : SLP_TREE_CODE (this) = ERROR_MARK;
124 : 7341262 : this->ldst_lanes = false;
125 : 7341262 : SLP_TREE_VECTYPE (this) = NULL_TREE;
126 : 7341262 : SLP_TREE_REPRESENTATIVE (this) = NULL;
127 : 7341262 : SLP_TREE_MEMORY_ACCESS_TYPE (this) = VMAT_INVARIANT;
128 : 7341262 : SLP_TREE_REF_COUNT (this) = 1;
129 : 7341262 : this->failed = NULL;
130 : 7341262 : this->max_nunits = 1;
131 : 7341262 : this->lanes = 0;
132 : 7341262 : }
133 : :
134 : : /* Tear down an SLP node. */
135 : :
136 : 7341262 : _slp_tree::~_slp_tree ()
137 : : {
138 : 7341262 : if (this->prev_node)
139 : 4764011 : this->prev_node->next_node = this->next_node;
140 : : else
141 : 2577251 : slp_first_node = this->next_node;
142 : 7341262 : if (this->next_node)
143 : 5433045 : this->next_node->prev_node = this->prev_node;
144 : 7341262 : SLP_TREE_CHILDREN (this).release ();
145 : 7341262 : SLP_TREE_SCALAR_STMTS (this).release ();
146 : 7341262 : SLP_TREE_SCALAR_OPS (this).release ();
147 : 7341262 : SLP_TREE_VEC_DEFS (this).release ();
148 : 7341262 : SLP_TREE_LOAD_PERMUTATION (this).release ();
149 : 7341262 : SLP_TREE_LANE_PERMUTATION (this).release ();
150 : 7341262 : SLP_TREE_SIMD_CLONE_INFO (this).release ();
151 : 7341262 : if (this->failed)
152 : 1867238 : free (failed);
153 : 7341262 : }
154 : :
155 : : /* Push the single SSA definition in DEF to the vector of vector defs. */
156 : :
157 : : void
158 : 469828 : _slp_tree::push_vec_def (gimple *def)
159 : : {
160 : 469828 : if (gphi *phi = dyn_cast <gphi *> (def))
161 : 54255 : vec_defs.quick_push (gimple_phi_result (phi));
162 : : else
163 : : {
164 : 415573 : def_operand_p defop = single_ssa_def_operand (def, SSA_OP_ALL_DEFS);
165 : 415573 : vec_defs.quick_push (get_def_from_ptr (defop));
166 : : }
167 : 469828 : }
168 : :
169 : : /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
170 : :
171 : : void
172 : 14323055 : vect_free_slp_tree (slp_tree node)
173 : : {
174 : 14323055 : int i;
175 : 14323055 : slp_tree child;
176 : :
177 : 14323055 : if (--SLP_TREE_REF_COUNT (node) != 0)
178 : 14323055 : return;
179 : :
180 : 10998238 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
181 : 4139294 : if (child)
182 : 3493493 : vect_free_slp_tree (child);
183 : :
184 : : /* If the node defines any SLP-only patterns then those patterns are no
185 : : longer valid and should be removed. */
186 : 6858944 : stmt_vec_info rep_stmt_info = SLP_TREE_REPRESENTATIVE (node);
187 : 6858944 : if (rep_stmt_info && STMT_VINFO_SLP_VECT_ONLY_PATTERN (rep_stmt_info))
188 : : {
189 : 448 : stmt_vec_info stmt_info = vect_orig_stmt (rep_stmt_info);
190 : 448 : STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
191 : 448 : STMT_SLP_TYPE (stmt_info) = STMT_SLP_TYPE (rep_stmt_info);
192 : : }
193 : :
194 : 6858944 : delete node;
195 : : }
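: : /* Note the reference counting above: a node starts with
: :    SLP_TREE_REF_COUNT of 1 and a child shared by two parents is only
: :    torn down when the last reference goes away.  E.g. with a node C
: :    that is a child of both A and B:
: :
: :      vect_free_slp_tree (A);  // C: refcount 2 -> 1, C stays live
: :      vect_free_slp_tree (B);  // C: refcount 1 -> 0, C is deleted  */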
196 : :
197 : : /* Return a location suitable for dumps related to the SLP instance. */
198 : :
199 : : dump_user_location_t
200 : 3331258 : _slp_instance::location () const
201 : : {
202 : 3331258 : if (!root_stmts.is_empty ())
203 : 256984 : return root_stmts[0]->stmt;
204 : : else
205 : 3074274 : return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
206 : : }
207 : :
208 : :
209 : : /* Free the memory allocated for the SLP instance. */
210 : :
211 : : void
212 : 1591912 : vect_free_slp_instance (slp_instance instance)
213 : : {
214 : 1591912 : vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
215 : 1591912 : SLP_INSTANCE_LOADS (instance).release ();
216 : 1591912 : SLP_INSTANCE_ROOT_STMTS (instance).release ();
217 : 1591912 : SLP_INSTANCE_REMAIN_DEFS (instance).release ();
218 : 1591912 : instance->subgraph_entries.release ();
219 : 1591912 : instance->cost_vec.release ();
220 : 1591912 : free (instance);
221 : 1591912 : }
222 : :
223 : :
224 : : /* Create an SLP node for SCALAR_STMTS. */
225 : :
226 : : slp_tree
227 : 113944 : vect_create_new_slp_node (unsigned nops, tree_code code)
228 : : {
229 : 113944 : slp_tree node = new _slp_tree;
230 : 113944 : SLP_TREE_SCALAR_STMTS (node) = vNULL;
231 : 113944 : SLP_TREE_CHILDREN (node).create (nops);
232 : 113944 : SLP_TREE_DEF_TYPE (node) = vect_internal_def;
233 : 113944 : SLP_TREE_CODE (node) = code;
234 : 113944 : return node;
235 : : }
236 : : /* Create an SLP node for SCALAR_STMTS. */
237 : :
238 : : static slp_tree
239 : 3527897 : vect_create_new_slp_node (slp_tree node,
240 : : vec<stmt_vec_info> scalar_stmts, unsigned nops)
241 : : {
242 : 3527897 : SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
243 : 3527897 : SLP_TREE_CHILDREN (node).create (nops);
244 : 3527897 : SLP_TREE_DEF_TYPE (node) = vect_internal_def;
245 : 3527897 : SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
246 : 3527897 : SLP_TREE_LANES (node) = scalar_stmts.length ();
247 : 3527897 : return node;
248 : : }
249 : :
250 : : /* Create an SLP node for SCALAR_STMTS. */
251 : :
252 : : static slp_tree
253 : 6338 : vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
254 : : {
255 : 6338 : return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
256 : : }
257 : :
258 : : /* Create an SLP node for OPS. */
259 : :
260 : : static slp_tree
261 : 1826312 : vect_create_new_slp_node (slp_tree node, vec<tree> ops)
262 : : {
263 : 1826312 : SLP_TREE_SCALAR_OPS (node) = ops;
264 : 1826312 : SLP_TREE_DEF_TYPE (node) = vect_external_def;
265 : 0 : SLP_TREE_LANES (node) = ops.length ();
266 : 1826312 : return node;
267 : : }
268 : :
269 : : /* Create an SLP node for OPS. */
270 : :
271 : : static slp_tree
272 : 1826312 : vect_create_new_slp_node (vec<tree> ops)
273 : : {
274 : 1826312 : return vect_create_new_slp_node (new _slp_tree, ops);
275 : : }
276 : :
277 : :
278 : : /* This structure is used in creation of an SLP tree. Each instance
279 : : corresponds to the same operand in a group of scalar stmts in an SLP
280 : : node. */
281 : : typedef struct _slp_oprnd_info
282 : : {
283 : : /* Def-stmts for the operands. */
284 : : vec<stmt_vec_info> def_stmts;
285 : : /* Operands. */
286 : : vec<tree> ops;
287 : : /* Information about the first statement: its vector def-type, its type,
288 : : the operand itself in case it's constant, whether it is a pattern
289 : : stmt, and its gather/scatter info. */
290 : : tree first_op_type;
291 : : enum vect_def_type first_dt;
292 : : bool any_pattern;
293 : : bool first_gs_p;
294 : : gather_scatter_info first_gs_info;
295 : : } *slp_oprnd_info;
296 : :
297 : :
298 : : /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
299 : : operand. */
300 : : static vec<slp_oprnd_info>
301 : 3375614 : vect_create_oprnd_info (int nops, int group_size)
302 : : {
303 : 3375614 : int i;
304 : 3375614 : slp_oprnd_info oprnd_info;
305 : 3375614 : vec<slp_oprnd_info> oprnds_info;
306 : :
307 : 3375614 : oprnds_info.create (nops);
308 : 12175973 : for (i = 0; i < nops; i++)
309 : : {
310 : 5424745 : oprnd_info = XNEW (struct _slp_oprnd_info);
311 : 5424745 : oprnd_info->def_stmts.create (group_size);
312 : 5424745 : oprnd_info->ops.create (group_size);
313 : 5424745 : oprnd_info->first_dt = vect_uninitialized_def;
314 : 5424745 : oprnd_info->first_op_type = NULL_TREE;
315 : 5424745 : oprnd_info->any_pattern = false;
316 : 5424745 : oprnd_info->first_gs_p = false;
317 : 5424745 : oprnds_info.quick_push (oprnd_info);
318 : : }
319 : :
320 : 3375614 : return oprnds_info;
321 : : }
322 : :
323 : :
324 : : /* Free operands info. */
325 : :
326 : : static void
327 : 3375614 : vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
328 : : {
329 : 3375614 : int i;
330 : 3375614 : slp_oprnd_info oprnd_info;
331 : :
332 : 8800359 : FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
333 : : {
334 : 5424745 : oprnd_info->def_stmts.release ();
335 : 5424745 : oprnd_info->ops.release ();
336 : 5424745 : XDELETE (oprnd_info);
337 : : }
338 : :
339 : 3375614 : oprnds_info.release ();
340 : 3375614 : }
341 : :
342 : : /* Return the execution frequency of NODE (so that a higher value indicates
343 : : a "more important" node when optimizing for speed). */
344 : :
345 : : static sreal
346 : 3592395 : vect_slp_node_weight (slp_tree node)
347 : : {
348 : 3592395 : stmt_vec_info stmt_info = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (node));
349 : 3592395 : basic_block bb = gimple_bb (stmt_info->stmt);
350 : 3592395 : return bb->count.to_sreal_scale (ENTRY_BLOCK_PTR_FOR_FN (cfun)->count);
351 : : }
352 : :
353 : : /* Return true if STMTS contains a pattern statement. */
354 : :
355 : : static bool
356 : 23915 : vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
357 : : {
358 : 23915 : stmt_vec_info stmt_info;
359 : 23915 : unsigned int i;
360 : 78322 : FOR_EACH_VEC_ELT (stmts, i, stmt_info)
361 : 56454 : if (stmt_info && is_pattern_stmt_p (stmt_info))
362 : : return true;
363 : : return false;
364 : : }
365 : :
366 : : /* Return true when all lanes in the external or constant NODE have
367 : : the same value. */
368 : :
369 : : static bool
370 : 496164 : vect_slp_tree_uniform_p (slp_tree node)
371 : : {
372 : 496164 : gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
373 : : || SLP_TREE_DEF_TYPE (node) == vect_external_def);
374 : :
375 : : /* Pre-existing vectors. */
376 : 857296 : if (SLP_TREE_SCALAR_OPS (node).is_empty ())
377 : : return false;
378 : :
379 : : unsigned i;
380 : : tree op, first = NULL_TREE;
381 : 1152297 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
382 : 1017265 : if (!first)
383 : : first = op;
384 : 521101 : else if (!operand_equal_p (first, op, 0))
385 : : return false;
386 : :
387 : : return true;
388 : : }
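: : /* For example, the external lanes { x_1, x_1, x_1, x_1 } are uniform
: :    while { x_1, x_2, x_1, x_2 } are not; a node backed by pre-existing
: :    vectors records no scalar ops and is conservatively treated as
: :    non-uniform. */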
389 : :
390 : : /* Find the place of the data-ref in STMT_INFO in the interleaving chain
391 : : that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
392 : : of the chain. */
393 : :
394 : : int
395 : 695392 : vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
396 : : stmt_vec_info first_stmt_info)
397 : : {
398 : 695392 : stmt_vec_info next_stmt_info = first_stmt_info;
399 : 695392 : int result = 0;
400 : :
401 : 695392 : if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
402 : : return -1;
403 : :
404 : 1723149 : do
405 : : {
406 : 1723149 : if (next_stmt_info == stmt_info)
407 : : return result;
408 : 1027757 : next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
409 : 1027757 : if (next_stmt_info)
410 : 1027757 : result += DR_GROUP_GAP (next_stmt_info);
411 : : }
412 : 1027757 : while (next_stmt_info);
413 : :
414 : : return -1;
415 : : }
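: : /* As a worked example, for a group accessing a[0], a[2] and a[3]
: :    (one element skipped after a[0]) DR_GROUP_GAP of a[2] is 2 and of
: :    a[3] is 1, so the places computed above are 0, 2 and 3 -- each
: :    element's position within the full interleaving pattern. */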
416 : :
417 : : /* Check whether it is possible to load COUNT elements of type ELT_TYPE
418 : : using the method implemented by duplicate_and_interleave. Return true
419 : : if so, returning the number of intermediate vectors in *NVECTORS_OUT
420 : : (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
421 : : (if nonnull). */
422 : :
423 : : bool
424 : 0 : can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
425 : : tree elt_type, unsigned int *nvectors_out,
426 : : tree *vector_type_out,
427 : : tree *permutes)
428 : : {
429 : 0 : tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
430 : 0 : if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
431 : 0 : return false;
432 : :
433 : 0 : machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
434 : 0 : poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
435 : 0 : unsigned int nvectors = 1;
436 : 0 : for (;;)
437 : : {
438 : 0 : scalar_int_mode int_mode;
439 : 0 : poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
440 : 0 : if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
441 : : {
442 : : /* Get the natural vector type for this SLP group size. */
443 : 0 : tree int_type = build_nonstandard_integer_type
444 : 0 : (GET_MODE_BITSIZE (int_mode), 1);
445 : 0 : tree vector_type
446 : 0 : = get_vectype_for_scalar_type (vinfo, int_type, count);
447 : 0 : poly_int64 half_nelts;
448 : 0 : if (vector_type
449 : 0 : && VECTOR_MODE_P (TYPE_MODE (vector_type))
450 : 0 : && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
451 : : GET_MODE_SIZE (base_vector_mode))
452 : 0 : && multiple_p (GET_MODE_NUNITS (TYPE_MODE (vector_type)),
453 : : 2, &half_nelts))
454 : : {
455 : : /* Try fusing consecutive sequences of COUNT / NVECTORS elements
456 : : together into elements of type INT_TYPE and using the result
457 : : to build NVECTORS vectors. */
458 : 0 : poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
459 : 0 : vec_perm_builder sel1 (nelts, 2, 3);
460 : 0 : vec_perm_builder sel2 (nelts, 2, 3);
461 : :
462 : 0 : for (unsigned int i = 0; i < 3; ++i)
463 : : {
464 : 0 : sel1.quick_push (i);
465 : 0 : sel1.quick_push (i + nelts);
466 : 0 : sel2.quick_push (half_nelts + i);
467 : 0 : sel2.quick_push (half_nelts + i + nelts);
468 : : }
469 : 0 : vec_perm_indices indices1 (sel1, 2, nelts);
470 : 0 : vec_perm_indices indices2 (sel2, 2, nelts);
471 : 0 : machine_mode vmode = TYPE_MODE (vector_type);
472 : 0 : if (can_vec_perm_const_p (vmode, vmode, indices1)
473 : 0 : && can_vec_perm_const_p (vmode, vmode, indices2))
474 : : {
475 : 0 : if (nvectors_out)
476 : 0 : *nvectors_out = nvectors;
477 : 0 : if (vector_type_out)
478 : 0 : *vector_type_out = vector_type;
479 : 0 : if (permutes)
480 : : {
481 : 0 : permutes[0] = vect_gen_perm_mask_checked (vector_type,
482 : : indices1);
483 : 0 : permutes[1] = vect_gen_perm_mask_checked (vector_type,
484 : : indices2);
485 : : }
486 : 0 : return true;
487 : : }
488 : 0 : }
489 : : }
490 : 0 : if (!multiple_p (elt_bytes, 2, &elt_bytes))
491 : : return false;
492 : 0 : nvectors *= 2;
493 : : /* We need to be able to fuse COUNT / NVECTORS elements together. */
494 : 0 : if (!multiple_p (count, nvectors))
495 : : return false;
496 : : }
497 : : }
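: : /* Roughly, duplicate_and_interleave builds { a, b, a, b, ... } for
: :    COUNT == 2 by fusing the pair into one wider integer "ab", building
: :    a duplicated vector { ab, ab, ab, ab } and using the two permutes
: :    computed above to spread the fused halves back into the interleaved
: :    element sequence. */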
498 : :
499 : : /* Return true if DTA and DTB match. */
500 : :
501 : : static bool
502 : 16749611 : vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
503 : : {
504 : 16749611 : return (dta == dtb
505 : 362310 : || ((dta == vect_external_def || dta == vect_constant_def)
506 : 227103 : && (dtb == vect_external_def || dtb == vect_constant_def)));
507 : : }
508 : :
509 : : static const int cond_expr_maps[3][5] = {
510 : : { 4, -1, -2, 1, 2 },
511 : : { 4, -2, -1, 1, 2 },
512 : : { 4, -1, -2, 2, 1 }
513 : : };
514 : : static const int no_arg_map[] = { 0 };
515 : : static const int arg0_map[] = { 1, 0 };
516 : : static const int arg1_map[] = { 1, 1 };
517 : : static const int arg2_arg3_map[] = { 2, 2, 3 };
518 : : static const int arg1_arg3_map[] = { 2, 1, 3 };
519 : : static const int arg1_arg4_arg5_map[] = { 3, 1, 4, 5 };
520 : : static const int arg1_arg3_arg4_map[] = { 3, 1, 3, 4 };
521 : : static const int arg3_arg2_map[] = { 2, 3, 2 };
522 : : static const int op1_op0_map[] = { 2, 1, 0 };
523 : : static const int off_map[] = { 1, -3 };
524 : : static const int off_op0_map[] = { 2, -3, 0 };
525 : : static const int off_arg2_arg3_map[] = { 3, -3, 2, 3 };
526 : : static const int off_arg3_arg2_map[] = { 3, -3, 3, 2 };
527 : : static const int mask_call_maps[6][7] = {
528 : : { 1, 1, },
529 : : { 2, 1, 2, },
530 : : { 3, 1, 2, 3, },
531 : : { 4, 1, 2, 3, 4, },
532 : : { 5, 1, 2, 3, 4, 5, },
533 : : { 6, 1, 2, 3, 4, 5, 6 },
534 : : };
535 : :
536 : : /* For most SLP statements, there is a one-to-one mapping between
537 : : gimple arguments and child nodes. If that is not true for STMT,
538 : : return an array that contains:
539 : :
540 : : - the number of child nodes, followed by
541 : : - for each child node, the index of the argument associated with that node.
542 : : The special index -1 is the first operand of an embedded comparison and
543 : : the special index -2 is the second operand of an embedded comparison.
544 : : The special index -3 is the offset of a gather as analyzed by
545 : : vect_check_gather_scatter.
546 : :
547 : : SWAP is as for vect_get_and_check_slp_defs. */
548 : :
549 : : static const int *
550 : 19380554 : vect_get_operand_map (const gimple *stmt, bool gather_scatter_p = false,
551 : : unsigned char swap = 0)
552 : : {
553 : 19380554 : if (auto assign = dyn_cast<const gassign *> (stmt))
554 : : {
555 : 17850307 : if (gimple_assign_rhs_code (assign) == COND_EXPR
556 : 17850307 : && COMPARISON_CLASS_P (gimple_assign_rhs1 (assign)))
557 : 0 : gcc_unreachable ();
558 : 17850307 : if (TREE_CODE_CLASS (gimple_assign_rhs_code (assign)) == tcc_comparison
559 : 17850307 : && swap)
560 : : return op1_op0_map;
561 : 17368179 : if (gather_scatter_p)
562 : 30796 : return (TREE_CODE (gimple_assign_lhs (assign)) != SSA_NAME
563 : 30796 : ? off_op0_map : off_map);
564 : : }
565 : 18867630 : gcc_assert (!swap);
566 : 18867630 : if (auto call = dyn_cast<const gcall *> (stmt))
567 : : {
568 : 141442 : if (gimple_call_internal_p (call))
569 : 72111 : switch (gimple_call_internal_fn (call))
570 : : {
571 : 12040 : case IFN_MASK_LOAD:
572 : 19600 : return gather_scatter_p ? off_arg2_arg3_map : arg2_arg3_map;
573 : :
574 : 0 : case IFN_GATHER_LOAD:
575 : 0 : return arg1_map;
576 : :
577 : 0 : case IFN_MASK_GATHER_LOAD:
578 : 0 : case IFN_MASK_LEN_GATHER_LOAD:
579 : 0 : return arg1_arg4_arg5_map;
580 : :
581 : 0 : case IFN_SCATTER_STORE:
582 : 0 : return arg1_arg3_map;
583 : :
584 : 0 : case IFN_MASK_SCATTER_STORE:
585 : 0 : case IFN_MASK_LEN_SCATTER_STORE:
586 : 0 : return arg1_arg3_arg4_map;
587 : :
588 : 7042 : case IFN_MASK_STORE:
589 : 12564 : return gather_scatter_p ? off_arg3_arg2_map : arg3_arg2_map;
590 : :
591 : 884 : case IFN_MASK_CALL:
592 : 884 : {
593 : 884 : unsigned nargs = gimple_call_num_args (call);
594 : 884 : if (nargs >= 2 && nargs <= 7)
595 : 884 : return mask_call_maps[nargs-2];
596 : : else
597 : : return nullptr;
598 : : }
599 : :
600 : 140 : case IFN_CLZ:
601 : 140 : case IFN_CTZ:
602 : 140 : return arg0_map;
603 : :
604 : 7280 : case IFN_GOMP_SIMD_LANE:
605 : 7280 : return no_arg_map;
606 : :
607 : : default:
608 : : break;
609 : : }
610 : : }
611 : : return nullptr;
612 : : }
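: : /* Reading the tables above: the first entry is the number of SLP
: :    children, the rest are argument indices.  E.g. arg2_arg3_map
: :    = { 2, 2, 3 } maps the two children of an IFN_MASK_LOAD to call
: :    arguments 2 and 3 (the mask and the else value), and op1_op0_map
: :    = { 2, 1, 0 } swaps the two operands of a comparison. */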
613 : :
614 : : /* Return the SLP node child index for operand OP of STMT. */
615 : :
616 : : int
617 : 1321747 : vect_slp_child_index_for_operand (const gimple *stmt, int op,
618 : : bool gather_scatter_p)
619 : : {
620 : 1321747 : const int *opmap = vect_get_operand_map (stmt, gather_scatter_p);
621 : 1321747 : if (!opmap)
622 : : return op;
623 : 20819 : for (int i = 1; i < 1 + opmap[0]; ++i)
624 : 20819 : if (opmap[i] == op)
625 : 11238 : return i - 1;
626 : 0 : gcc_unreachable ();
627 : : }
628 : :
629 : : /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
630 : : they are of a valid type and that they match the defs of the first stmt of
631 : : the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
632 : : by swapping operands of STMTS[STMT_NUM] when possible. Non-zero SWAP
633 : : indicates swap is required for cond_expr stmts. Specifically, SWAP
634 : : is 1 if STMT is cond and operands of comparison need to be swapped;
635 : : SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
636 : :
637 : : If there was a fatal error return -1; if the error could be corrected by
638 : : swapping operands of the parent node of this one, return 1; if everything is
639 : : ok return 0. */
640 : : static int
641 : 12654993 : vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
642 : : bool *skip_args,
643 : : vec<stmt_vec_info> stmts, unsigned stmt_num,
644 : : vec<slp_oprnd_info> *oprnds_info)
645 : : {
646 : 12654993 : stmt_vec_info stmt_info = stmts[stmt_num];
647 : 12654993 : tree oprnd;
648 : 12654993 : unsigned int i, number_of_oprnds;
649 : 12654993 : enum vect_def_type dt = vect_uninitialized_def;
650 : 12654993 : slp_oprnd_info oprnd_info;
651 : 12654993 : gather_scatter_info gs_info;
652 : 12654993 : unsigned int gs_op = -1u;
653 : 12654993 : unsigned int commutative_op = -1U;
654 : 12654993 : bool first = stmt_num == 0;
655 : :
656 : 12654993 : if (!stmt_info)
657 : : {
658 : 0 : for (auto oi : *oprnds_info)
659 : : {
660 : 0 : oi->def_stmts.quick_push (NULL);
661 : 0 : oi->ops.quick_push (NULL_TREE);
662 : : }
663 : : return 0;
664 : : }
665 : :
666 : 12654993 : if (!is_a<gcall *> (stmt_info->stmt)
667 : : && !is_a<gassign *> (stmt_info->stmt)
668 : : && !is_a<gphi *> (stmt_info->stmt))
669 : : return -1;
670 : :
671 : 12654993 : number_of_oprnds = gimple_num_args (stmt_info->stmt);
672 : 12654993 : const int *map
673 : 25309986 : = vect_get_operand_map (stmt_info->stmt,
674 : 12654993 : STMT_VINFO_GATHER_SCATTER_P (stmt_info), swap);
675 : 12654993 : if (map)
676 : 505629 : number_of_oprnds = *map++;
677 : 12654993 : if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
678 : : {
679 : 42066 : if (gimple_call_internal_p (stmt))
680 : : {
681 : 23313 : internal_fn ifn = gimple_call_internal_fn (stmt);
682 : 23313 : commutative_op = first_commutative_argument (ifn);
683 : : }
684 : : }
685 : 12612927 : else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
686 : : {
687 : 14568745 : if (commutative_tree_code (gimple_assign_rhs_code (stmt)))
688 : 12654993 : commutative_op = 0;
689 : : }
690 : :
691 : 12654993 : bool swapped = (swap != 0);
692 : 12654993 : bool backedge = false;
693 : 12654993 : enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
694 : 34976065 : for (i = 0; i < number_of_oprnds; i++)
695 : : {
696 : 22323702 : oprnd_info = (*oprnds_info)[i];
697 : 22323702 : int opno = map ? map[i] : int (i);
698 : 22323702 : if (opno == -3)
699 : : {
700 : 15902 : gcc_assert (STMT_VINFO_GATHER_SCATTER_P (stmt_info));
701 : 15902 : if (!is_a <loop_vec_info> (vinfo)
702 : 15902 : || !vect_check_gather_scatter (stmt_info,
703 : : as_a <loop_vec_info> (vinfo),
704 : : first ? &oprnd_info->first_gs_info
705 : : : &gs_info))
706 : 2630 : return -1;
707 : :
708 : 15902 : if (first)
709 : : {
710 : 15712 : oprnd_info->first_gs_p = true;
711 : 15712 : oprnd = oprnd_info->first_gs_info.offset;
712 : : }
713 : : else
714 : : {
715 : 190 : gs_op = i;
716 : 190 : oprnd = gs_info.offset;
717 : : }
718 : : }
719 : 22307800 : else if (opno < 0)
720 : 0 : oprnd = TREE_OPERAND (gimple_arg (stmt_info->stmt, 0), -1 - opno);
721 : : else
722 : : {
723 : 22307800 : oprnd = gimple_arg (stmt_info->stmt, opno);
724 : 22307800 : if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
725 : : {
726 : 1450285 : edge e = gimple_phi_arg_edge (stmt, opno);
727 : 2900570 : backedge = (is_a <bb_vec_info> (vinfo)
728 : 2271299 : ? e->flags & EDGE_DFS_BACK
729 : 821014 : : dominated_by_p (CDI_DOMINATORS, e->src,
730 : 821014 : gimple_bb (stmt_info->stmt)));
731 : : }
732 : : }
733 : 22323702 : if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
734 : 2471 : oprnd = TREE_OPERAND (oprnd, 0);
735 : :
736 : 22323702 : stmt_vec_info def_stmt_info;
737 : 22323702 : if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
738 : : {
739 : 1186 : if (dump_enabled_p ())
740 : 4 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
741 : : "Build SLP failed: can't analyze def for %T\n",
742 : : oprnd);
743 : :
744 : 1186 : return -1;
745 : : }
746 : :
747 : 22322516 : if (skip_args[i])
748 : : {
749 : 744166 : oprnd_info->def_stmts.quick_push (NULL);
750 : 744166 : oprnd_info->ops.quick_push (NULL_TREE);
751 : 744166 : oprnd_info->first_dt = vect_uninitialized_def;
752 : 744166 : continue;
753 : : }
754 : :
755 : 21578350 : oprnd_info->def_stmts.quick_push (def_stmt_info);
756 : 21578350 : oprnd_info->ops.quick_push (oprnd);
757 : :
758 : 21578350 : if (def_stmt_info
759 : 21578350 : && is_pattern_stmt_p (def_stmt_info))
760 : : {
761 : 337149 : if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
762 : : != def_stmt_info)
763 : 241523 : oprnd_info->any_pattern = true;
764 : : else
765 : : /* If we promote this to external use the original stmt def. */
766 : 95626 : oprnd_info->ops.last ()
767 : 191252 : = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
768 : : }
769 : :
770 : : /* If there's an extern def on a backedge make sure we can
771 : : code-generate at the region start.
772 : : ??? This is another case that could be fixed by adjusting
773 : : how we split the function but at the moment we'd have conflicting
774 : : goals there. */
775 : 21578350 : if (backedge
776 : 119658 : && dts[i] == vect_external_def
777 : 1468 : && is_a <bb_vec_info> (vinfo)
778 : 1468 : && TREE_CODE (oprnd) == SSA_NAME
779 : 1444 : && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
780 : 21579794 : && !dominated_by_p (CDI_DOMINATORS, vinfo->bbs[0],
781 : 1444 : gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
782 : : {
783 : 1444 : if (dump_enabled_p ())
784 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
785 : : "Build SLP failed: extern def %T only defined "
786 : : "on backedge\n", oprnd);
787 : 1444 : return -1;
788 : : }
789 : :
790 : 21576906 : if (first)
791 : : {
792 : 4687103 : tree type = TREE_TYPE (oprnd);
793 : 4687103 : dt = dts[i];
794 : :
795 : : /* For the swapping logic below force vect_reduction_def
796 : : for the reduction op in a SLP reduction group. */
797 : 4687103 : if (!STMT_VINFO_DATA_REF (stmt_info)
798 : 3559776 : && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
799 : 1248 : && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
800 : 4687715 : && def_stmt_info)
801 : 612 : dts[i] = dt = vect_reduction_def;
802 : :
803 : : /* Check the types of the definition. */
804 : 4687103 : switch (dt)
805 : : {
806 : 4687103 : case vect_external_def:
807 : 4687103 : case vect_constant_def:
808 : 4687103 : case vect_internal_def:
809 : 4687103 : case vect_reduction_def:
810 : 4687103 : case vect_double_reduction_def:
811 : 4687103 : case vect_induction_def:
812 : 4687103 : case vect_nested_cycle:
813 : 4687103 : case vect_first_order_recurrence:
814 : 4687103 : break;
815 : :
816 : 0 : default:
817 : : /* FORNOW: Not supported. */
818 : 0 : if (dump_enabled_p ())
819 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
820 : : "Build SLP failed: illegal type of def %T\n",
821 : : oprnd);
822 : 0 : return -1;
823 : : }
824 : :
825 : 4687103 : oprnd_info->first_dt = dt;
826 : 4687103 : oprnd_info->first_op_type = type;
827 : : }
828 : : }
829 : 12652363 : if (first)
830 : : return 0;
831 : :
832 : : /* Now match the operand definition types to that of the first stmt. */
833 : 25851662 : for (i = 0; i < number_of_oprnds;)
834 : : {
835 : 16868123 : if (skip_args[i])
836 : : {
837 : 10984 : ++i;
838 : 10984 : continue;
839 : : }
840 : :
841 : 16857139 : oprnd_info = (*oprnds_info)[i];
842 : 16857139 : dt = dts[i];
843 : 16857139 : stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
844 : 16857139 : oprnd = oprnd_info->ops[stmt_num];
845 : 16857139 : tree type = TREE_TYPE (oprnd);
846 : :
847 : 16857139 : if (!types_compatible_p (oprnd_info->first_op_type, type))
848 : : {
849 : 110380 : if (dump_enabled_p ())
850 : 170 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
851 : : "Build SLP failed: different operand types\n");
852 : 110380 : return 1;
853 : : }
854 : :
855 : 16746759 : if ((gs_op == i) != oprnd_info->first_gs_p)
856 : : {
857 : 0 : if (dump_enabled_p ())
858 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
859 : : "Build SLP failed: mixed gather and non-gather\n");
860 : 0 : return 1;
861 : : }
862 : 16746759 : else if (gs_op == i)
863 : : {
864 : 182 : if (!operand_equal_p (oprnd_info->first_gs_info.base,
865 : 182 : gs_info.base))
866 : : {
867 : 20 : if (dump_enabled_p ())
868 : 8 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
869 : : "Build SLP failed: different gather base\n");
870 : 20 : return 1;
871 : : }
872 : 162 : if (oprnd_info->first_gs_info.scale != gs_info.scale)
873 : : {
874 : 8 : if (dump_enabled_p ())
875 : 2 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
876 : : "Build SLP failed: different gather scale\n");
877 : 8 : return 1;
878 : : }
879 : : }
880 : :
881 : : /* Not the first stmt of the group; check that the def-stmt/s match
882 : : the def-stmt/s of the first stmt. Allow different definition
883 : : types for reduction chains: the first stmt must be a
884 : : vect_reduction_def (a phi node), and the rest
885 : : end in the reduction chain. */
886 : 16746731 : if ((!vect_def_types_match (oprnd_info->first_dt, dt)
887 : 305862 : && !(oprnd_info->first_dt == vect_reduction_def
888 : 5115 : && !STMT_VINFO_DATA_REF (stmt_info)
889 : 5115 : && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
890 : 1697 : && def_stmt_info
891 : 1691 : && !STMT_VINFO_DATA_REF (def_stmt_info)
892 : 1685 : && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
893 : : == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
894 : 16442339 : || (!STMT_VINFO_DATA_REF (stmt_info)
895 : 15126308 : && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
896 : 4178 : && ((!def_stmt_info
897 : 4086 : || STMT_VINFO_DATA_REF (def_stmt_info)
898 : 7317 : || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
899 : : != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
900 : 4178 : != (oprnd_info->first_dt != vect_reduction_def))))
901 : : {
902 : : /* Try swapping operands if we got a mismatch. For BB
903 : : vectorization only in case it will clearly improve things. */
904 : 306404 : if (i == commutative_op && !swapped
905 : 304707 : && (!is_a <bb_vec_info> (vinfo)
906 : 2169 : || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
907 : 2169 : dts[i+1])
908 : 466 : && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
909 : : || vect_def_types_match
910 : 245 : ((*oprnds_info)[i+1]->first_dt, dts[i])))))
911 : : {
912 : 1697 : if (dump_enabled_p ())
913 : 336 : dump_printf_loc (MSG_NOTE, vect_location,
914 : : "trying swapped operands\n");
915 : 1697 : std::swap (dts[i], dts[i+1]);
916 : 1697 : std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
917 : 1697 : (*oprnds_info)[i+1]->def_stmts[stmt_num]);
918 : 1697 : std::swap ((*oprnds_info)[i]->ops[stmt_num],
919 : 1697 : (*oprnds_info)[i+1]->ops[stmt_num]);
920 : : /* After swapping some operands we lost track of whether an
921 : : operand has any pattern defs, so be conservative here. */
922 : 1697 : if ((*oprnds_info)[i]->any_pattern
923 : 1697 : || (*oprnds_info)[i+1]->any_pattern)
924 : 8 : (*oprnds_info)[i]->any_pattern
925 : 4 : = (*oprnds_info)[i+1]->any_pattern = true;
926 : 1697 : swapped = true;
927 : 1697 : continue;
928 : : }
929 : :
930 : 303010 : if (is_a <bb_vec_info> (vinfo)
931 : 289518 : && !oprnd_info->any_pattern
932 : 592330 : && number_of_oprnds > 1)
933 : : {
934 : : /* Now for commutative ops we should see whether we can
935 : : make the other operand match. */
936 : 116228 : if (dump_enabled_p ())
937 : 149 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
938 : : "treating operand as external\n");
939 : 116228 : oprnd_info->first_dt = dt = vect_external_def;
940 : : }
941 : : else
942 : : {
943 : 186782 : if (dump_enabled_p ())
944 : 503 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
945 : : "Build SLP failed: different types\n");
946 : 186782 : return 1;
947 : : }
948 : : }
949 : :
950 : : /* Make sure to demote the overall operand to external. */
951 : 16558252 : if (dt == vect_external_def)
952 : 379584 : oprnd_info->first_dt = vect_external_def;
953 : : /* For a SLP reduction chain we want to duplicate the reduction to
954 : : each of the chain members. That gets us a sane SLP graph (still
955 : : the stmts are not 100% correct wrt the initial values). */
956 : 16178668 : else if ((dt == vect_internal_def
957 : 16178668 : || dt == vect_reduction_def)
958 : 15302355 : && oprnd_info->first_dt == vect_reduction_def
959 : 14370 : && !STMT_VINFO_DATA_REF (stmt_info)
960 : 14370 : && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
961 : 1881 : && !STMT_VINFO_DATA_REF (def_stmt_info)
962 : 16180549 : && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
963 : : == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
964 : : {
965 : 1881 : oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
966 : 1881 : oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
967 : : }
968 : :
969 : 16558252 : ++i;
970 : : }
971 : :
972 : : /* Swap operands. */
973 : 8983539 : if (swapped)
974 : : {
975 : 123094 : if (dump_enabled_p ())
976 : 1009 : dump_printf_loc (MSG_NOTE, vect_location,
977 : : "swapped operands to match def types in %G",
978 : : stmt_info->stmt);
979 : : }
980 : :
981 : : return 0;
982 : : }
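: : /* A small example of the operand swapping above, with hypothetical
: :    lanes
: :
: :      _1 = x_int + y_ext;   // internal def + external def
: :      _2 = a_ext + b_int;   // external def + internal def
: :
: :    the second lane's commutative operands are swapped so operand 0
: :    collects the internal defs { x_int, b_int } and operand 1 the
: :    external defs { y_ext, a_ext }, giving each operand vector a
: :    uniform definition type. */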
983 : :
984 : : /* Return true if call statements CALL1 and CALL2 are similar enough
985 : : to be combined into the same SLP group. */
986 : :
987 : : bool
988 : 25926 : compatible_calls_p (gcall *call1, gcall *call2)
989 : : {
990 : 25926 : unsigned int nargs = gimple_call_num_args (call1);
991 : 25926 : if (nargs != gimple_call_num_args (call2))
992 : : return false;
993 : :
994 : 23074 : if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
995 : : return false;
996 : :
997 : 23074 : if (gimple_call_internal_p (call1))
998 : : {
999 : 6371 : if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
1000 : 6371 : TREE_TYPE (gimple_call_lhs (call2))))
1001 : : return false;
1002 : 12941 : for (unsigned int i = 0; i < nargs; ++i)
1003 : 6570 : if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
1004 : 6570 : TREE_TYPE (gimple_call_arg (call2, i))))
1005 : : return false;
1006 : : }
1007 : : else
1008 : : {
1009 : 16703 : if (!operand_equal_p (gimple_call_fn (call1),
1010 : 16703 : gimple_call_fn (call2), 0))
1011 : : return false;
1012 : :
1013 : 33084 : if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
1014 : : return false;
1015 : : }
1016 : :
1017 : : /* Check that any unvectorized arguments are equal. */
1018 : 17399 : if (const int *map = vect_get_operand_map (call1))
1019 : : {
1020 : 15 : unsigned int nkept = *map++;
1021 : 15 : unsigned int mapi = 0;
1022 : 57 : for (unsigned int i = 0; i < nargs; ++i)
1023 : 42 : if (mapi < nkept && map[mapi] == int (i))
1024 : 27 : mapi += 1;
1025 : 15 : else if (!operand_equal_p (gimple_call_arg (call1, i),
1026 : 15 : gimple_call_arg (call2, i)))
1027 : : return false;
1028 : : }
1029 : :
1030 : : return true;
1031 : : }
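: : /* For instance, two calls to sqrtf with float arguments can end up in
: :    the same SLP group, while sqrtf next to sqrt (different combined
: :    functions and types) or two calls with different fntypes cannot. */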
1032 : :
1033 : : /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
1034 : : caller's attempt to find the vector type in STMT_INFO with the narrowest
1035 : : element type. Return true if VECTYPE is nonnull and if it is valid
1036 : : for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
1037 : : number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
1038 : : vect_build_slp_tree. */
1039 : :
1040 : : static bool
1041 : 5614441 : vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
1042 : : unsigned int group_size,
1043 : : tree vectype, poly_uint64 *max_nunits)
1044 : : {
1045 : 5614441 : if (!vectype)
1046 : : {
1047 : 0 : if (dump_enabled_p ())
1048 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1049 : : "Build SLP failed: unsupported data-type in %G\n",
1050 : : stmt_info->stmt);
1051 : : /* Fatal mismatch. */
1052 : 0 : return false;
1053 : : }
1054 : :
1055 : : /* If populating the vector type requires unrolling then fail
1056 : : before adjusting *max_nunits for basic-block vectorization. */
1057 : 5614441 : if (is_a <bb_vec_info> (vinfo)
1058 : 5614441 : && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
1059 : : {
1060 : 132170 : if (dump_enabled_p ())
1061 : 34 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1062 : : "Build SLP failed: unrolling required "
1063 : : "in basic block SLP\n");
1064 : : /* Fatal mismatch. */
1065 : 132170 : return false;
1066 : : }
1067 : :
1068 : : /* In case of multiple types we need to detect the smallest type. */
1069 : 5482271 : vect_update_max_nunits (max_nunits, vectype);
1070 : 5482271 : return true;
1071 : : }
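: : /* For example, with basic-block SLP, a group of three ints against a
: :    V4SI vector type fails multiple_p (3, 4) and is rejected above,
: :    while a group of eight ints (two V4SI vectors) passes and records
: :    at least four units in *MAX_NUNITS. */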
1072 : :
1073 : : /* Verify whether the scalar stmts STMTS are isomorphic, whether they
1074 : : require a data permutation, or whether they use unsupported operations.
1075 : : Return true if they are isomorphic, otherwise return false and indicate in *MATCHES
1076 : : which stmts are not isomorphic to the first one. If MATCHES[0]
1077 : : is false then this indicates the comparison could not be
1078 : : carried out or the stmts will never be vectorized by SLP.
1079 : :
1080 : : Note COND_EXPR is possibly isomorphic to another one after swapping its
1081 : : operands. Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
1082 : : the first stmt by swapping the two operands of comparison; set SWAP[i]
1083 : : to 2 if stmt I is isomorphic to the first stmt by inverting the code
1084 : : of comparison. Take A1 >= B1 ? X1 : Y1 as an example: it can be swapped
1085 : : to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1. */
1086 : :
1087 : : static bool
1088 : 5386415 : vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
1089 : : vec<stmt_vec_info> stmts, unsigned int group_size,
1090 : : poly_uint64 *max_nunits, bool *matches,
1091 : : bool *two_operators, tree *node_vectype)
1092 : : {
1093 : 5386415 : unsigned int i;
1094 : 5386415 : stmt_vec_info first_stmt_info = stmts[0];
1095 : 5386415 : code_helper first_stmt_code = ERROR_MARK;
1096 : 5386415 : code_helper alt_stmt_code = ERROR_MARK;
1097 : 5386415 : code_helper first_cond_code = ERROR_MARK;
1098 : 5386415 : bool need_same_oprnds = false;
1099 : 5386415 : tree first_lhs = NULL_TREE;
1100 : 5386415 : tree first_op1 = NULL_TREE;
1101 : 5386415 : stmt_vec_info first_load = NULL, prev_first_load = NULL;
1102 : 5386415 : bool first_stmt_ldst_p = false;
1103 : 5386415 : bool first_stmt_phi_p = false;
1104 : 5386415 : int first_reduc_idx = -1;
1105 : 5386415 : bool maybe_soft_fail = false;
1106 : 5386415 : tree soft_fail_nunits_vectype = NULL_TREE;
1107 : :
1108 : 5386415 : tree vectype, nunits_vectype;
1109 : 5386415 : if (!vect_get_vector_types_for_stmt (vinfo, first_stmt_info, &vectype,
1110 : 5386415 : &nunits_vectype, group_size))
1111 : : {
1112 : : /* Fatal mismatch. */
1113 : 177067 : matches[0] = false;
1114 : 177067 : return false;
1115 : : }
1116 : : /* Record nunits required but continue analysis, producing matches[]
1117 : : as if nunits were not an issue. This allows splitting of groups
1118 : : to happen. */
1119 : 5209348 : if (nunits_vectype
1120 : 5209348 : && !vect_record_max_nunits (vinfo, first_stmt_info, group_size,
1121 : : nunits_vectype, max_nunits))
1122 : : {
1123 : 132170 : gcc_assert (is_a <bb_vec_info> (vinfo));
1124 : 132170 : maybe_soft_fail = true;
1125 : 132170 : soft_fail_nunits_vectype = nunits_vectype;
1126 : : }
1127 : :
1128 : 5209348 : gcc_assert (vectype);
1129 : 5209348 : *node_vectype = vectype;
1130 : :
1131 : : /* For every stmt in NODE find its def stmt/s. */
1132 : 5209348 : stmt_vec_info stmt_info;
1133 : 22072254 : FOR_EACH_VEC_ELT (stmts, i, stmt_info)
1134 : : {
1135 : 17077314 : bool ldst_p = false;
1136 : 17077314 : bool phi_p = false;
1137 : 17077314 : code_helper rhs_code = ERROR_MARK;
1138 : :
1139 : 17077314 : swap[i] = 0;
1140 : 17077314 : matches[i] = false;
1141 : 17077314 : if (!stmt_info)
1142 : : {
1143 : 119675 : matches[i] = true;
1144 : 16982581 : continue;
1145 : : }
1146 : :
1147 : 16957639 : gimple *stmt = stmt_info->stmt;
1148 : 16957639 : if (dump_enabled_p ())
1149 : 227022 : dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
1150 : :
1151 : : /* Fail to vectorize statements that are marked as unvectorizable,
1152 : : can throw, or have volatile operands. */
1153 : 16957639 : if (!STMT_VINFO_VECTORIZABLE (stmt_info)
1154 : 16738577 : || stmt_can_throw_internal (cfun, stmt)
1155 : 32710574 : || gimple_has_volatile_ops (stmt))
1156 : : {
1157 : 226098 : if (dump_enabled_p ())
1158 : 231 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1159 : : "Build SLP failed: unvectorizable statement %G",
1160 : : stmt);
1161 : : /* ??? For BB vectorization we want to commutate operands so as
1162 : : to shuffle all unvectorizable defs into one operand and have
1163 : : the other still vectorized. The following doesn't reliably
1164 : : work for this, but it's the easiest we can do here. */
1165 : 226098 : if (is_a <bb_vec_info> (vinfo) && i != 0)
1166 : 64183 : continue;
1167 : : /* Fatal mismatch. */
1168 : 161915 : matches[0] = false;
1169 : 161915 : return false;
1170 : : }
1171 : :
1172 : 16731541 : gcall *call_stmt = dyn_cast <gcall *> (stmt);
1173 : 16731541 : tree lhs = gimple_get_lhs (stmt);
1174 : 16731541 : if (lhs == NULL_TREE
1175 : 16731541 : && (!call_stmt
1176 : 1956 : || !gimple_call_internal_p (stmt)
1177 : 1956 : || !internal_store_fn_p (gimple_call_internal_fn (stmt))))
1178 : : {
1179 : 60 : if (dump_enabled_p ())
1180 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1181 : : "Build SLP failed: not GIMPLE_ASSIGN nor "
1182 : : "GIMPLE_CALL %G", stmt);
1183 : 60 : if (is_a <bb_vec_info> (vinfo) && i != 0)
1184 : 60 : continue;
1185 : : /* Fatal mismatch. */
1186 : 0 : matches[0] = false;
1187 : 0 : return false;
1188 : : }
1189 : :
1190 : 16731481 : if (call_stmt)
1191 : : {
1192 : 102902 : combined_fn cfn = gimple_call_combined_fn (call_stmt);
1193 : 102902 : if (cfn != CFN_LAST && cfn != CFN_MASK_CALL)
1194 : 51894 : rhs_code = cfn;
1195 : : else
1196 : : rhs_code = CALL_EXPR;
1197 : :
1198 : 102902 : if (cfn == CFN_MASK_LOAD
1199 : 102902 : || cfn == CFN_GATHER_LOAD
1200 : : || cfn == CFN_MASK_GATHER_LOAD
1201 : : || cfn == CFN_MASK_LEN_GATHER_LOAD
1202 : : || cfn == CFN_SCATTER_STORE
1203 : : || cfn == CFN_MASK_SCATTER_STORE
1204 : : || cfn == CFN_MASK_LEN_SCATTER_STORE)
1205 : : ldst_p = true;
1206 : : else if (cfn == CFN_MASK_STORE)
1207 : : {
1208 : : ldst_p = true;
1209 : : rhs_code = CFN_MASK_STORE;
1210 : : }
1211 : : else if (cfn == CFN_GOMP_SIMD_LANE)
1212 : : ;
1213 : 93571 : else if ((cfn != CFN_LAST
1214 : : && cfn != CFN_MASK_CALL
1215 : 42563 : && internal_fn_p (cfn)
1216 : 30148 : && !vectorizable_internal_fn_p (as_internal_fn (cfn)))
1217 : 93506 : || gimple_call_tail_p (call_stmt)
1218 : 93506 : || gimple_call_noreturn_p (call_stmt)
1219 : 187077 : || gimple_call_chain (call_stmt))
1220 : : {
1221 : 665 : if (dump_enabled_p ())
1222 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1223 : : "Build SLP failed: unsupported call type %G",
1224 : : (gimple *) call_stmt);
1225 : 665 : if (is_a <bb_vec_info> (vinfo) && i != 0)
1226 : 87 : continue;
1227 : : /* Fatal mismatch. */
1228 : 578 : matches[0] = false;
1229 : 578 : return false;
1230 : : }
1231 : : }
1232 : 16628579 : else if (gimple_code (stmt) == GIMPLE_PHI)
1233 : : {
1234 : : rhs_code = ERROR_MARK;
1235 : : phi_p = true;
1236 : : }
1237 : : else
1238 : : {
1239 : 15642937 : rhs_code = gimple_assign_rhs_code (stmt);
1240 : 15642937 : ldst_p = STMT_VINFO_DATA_REF (stmt_info) != nullptr;
1241 : : }
1242 : :
1243 : : /* Check the operation. */
1244 : 16730816 : if (i == 0)
1245 : : {
1246 : 5046855 : first_lhs = lhs;
1247 : 5046855 : first_stmt_code = rhs_code;
1248 : 5046855 : first_stmt_ldst_p = ldst_p;
1249 : 5046855 : first_stmt_phi_p = phi_p;
1250 : 5046855 : first_reduc_idx = STMT_VINFO_REDUC_IDX (stmt_info);
1251 : :
1252 : : /* Shift arguments should be equal in all the packed stmts for a
1253 : : vector shift with scalar shift operand. */
1254 : 5046855 : if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
1255 : 4911305 : || rhs_code == LROTATE_EXPR
1256 : 9958114 : || rhs_code == RROTATE_EXPR)
1257 : : {
1258 : : /* First see if we have a vector/vector shift. */
1259 : 135780 : if (!directly_supported_p (rhs_code, vectype, optab_vector))
1260 : : {
1261 : : /* No vector/vector shift, try for a vector/scalar shift. */
1262 : 122876 : if (!directly_supported_p (rhs_code, vectype, optab_scalar))
1263 : : {
1264 : 31365 : if (dump_enabled_p ())
1265 : 463 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1266 : : "Build SLP failed: "
1267 : : "op not supported by target.\n");
1268 : 31365 : if (is_a <bb_vec_info> (vinfo) && i != 0)
1269 : : continue;
1270 : : /* Fatal mismatch. */
1271 : 31365 : matches[0] = false;
1272 : 31365 : return false;
1273 : : }
1274 : 91511 : need_same_oprnds = true;
1275 : 91511 : first_op1 = gimple_assign_rhs2 (stmt);
1276 : : }
1277 : : }
1278 : 4911075 : else if (rhs_code == WIDEN_LSHIFT_EXPR)
1279 : : {
1280 : 0 : need_same_oprnds = true;
1281 : 0 : first_op1 = gimple_assign_rhs2 (stmt);
1282 : : }
1283 : 4911075 : else if (!ldst_p
1284 : 4911075 : && rhs_code == BIT_FIELD_REF)
1285 : : {
1286 : 5791 : tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
1287 : 5791 : if (!is_a <bb_vec_info> (vinfo)
1288 : 5709 : || TREE_CODE (vec) != SSA_NAME
1289 : : /* When the element types are not compatible we pun the
1290 : : source to the target vectype which requires equal size. */
1291 : 11488 : || ((!VECTOR_TYPE_P (TREE_TYPE (vec))
1292 : 4826 : || !types_compatible_p (TREE_TYPE (vectype),
1293 : 4826 : TREE_TYPE (TREE_TYPE (vec))))
1294 : 1064 : && !operand_equal_p (TYPE_SIZE (vectype),
1295 : 1064 : TYPE_SIZE (TREE_TYPE (vec)))))
1296 : : {
1297 : 806 : if (dump_enabled_p ())
1298 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1299 : : "Build SLP failed: "
1300 : : "BIT_FIELD_REF not supported\n");
1301 : : /* Fatal mismatch. */
1302 : 806 : matches[0] = false;
1303 : 806 : return false;
1304 : : }
1305 : : }
1306 : 4905284 : else if (rhs_code == CFN_DIV_POW2)
1307 : : {
1308 : 0 : need_same_oprnds = true;
1309 : 0 : first_op1 = gimple_call_arg (call_stmt, 1);
1310 : : }
1311 : 4905284 : else if (rhs_code == CFN_GOMP_SIMD_LANE)
1312 : : {
1313 : 3640 : need_same_oprnds = true;
1314 : 3640 : first_op1 = gimple_call_arg (call_stmt, 1);
1315 : : }
1316 : : }
1317 : : else
1318 : : {
1319 : 11684972 : if (first_reduc_idx != STMT_VINFO_REDUC_IDX (stmt_info)
1320 : : /* For SLP reduction groups the index isn't necessarily
1321 : : uniform but only that of the first stmt matters. */
1322 : 11683961 : && !(first_reduc_idx != -1
1323 : 1712 : && STMT_VINFO_REDUC_IDX (stmt_info) != -1
1324 : 1712 : && REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
1325 : : {
1326 : 1011 : if (dump_enabled_p ())
1327 : : {
1328 : 56 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1329 : : "Build SLP failed: different reduc_idx "
1330 : : "%d instead of %d in %G",
1331 : : STMT_VINFO_REDUC_IDX (stmt_info),
1332 : : first_reduc_idx, stmt);
1333 : : }
1334 : : /* Mismatch. */
1335 : 1011 : continue;
1336 : : }
1337 : 11682950 : if (!ldst_p
1338 : 9260118 : && first_stmt_code != rhs_code
1339 : 13164813 : && alt_stmt_code == ERROR_MARK)
1340 : : alt_stmt_code = rhs_code;
1341 : 13150489 : if ((!ldst_p
1342 : 9260118 : && first_stmt_code != rhs_code
1343 : 1481863 : && (first_stmt_code != IMAGPART_EXPR
1344 : 112 : || rhs_code != REALPART_EXPR)
1345 : 1481851 : && (first_stmt_code != REALPART_EXPR
1346 : 214 : || rhs_code != IMAGPART_EXPR)
1347 : : /* Handle mismatches in plus/minus by computing both
1348 : : and merging the results. */
1349 : 1481848 : && !((first_stmt_code == PLUS_EXPR
1350 : 1377779 : || first_stmt_code == MINUS_EXPR)
1351 : 126640 : && (alt_stmt_code == PLUS_EXPR
1352 : 116751 : || alt_stmt_code == MINUS_EXPR)
1353 : 19310 : && rhs_code == alt_stmt_code)
1354 : 1462716 : && !(first_stmt_code.is_tree_code ()
1355 : 1354889 : && rhs_code.is_tree_code ()
1356 : 1244210 : && (TREE_CODE_CLASS (tree_code (first_stmt_code))
1357 : : == tcc_comparison)
1358 : 132119 : && (swap_tree_comparison (tree_code (first_stmt_code))
1359 : 132119 : == tree_code (rhs_code))))
1360 : : || (ldst_p
1361 : 4845664 : && (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1362 : 2422832 : != STMT_VINFO_GROUPED_ACCESS (first_stmt_info)))
1363 : : || (ldst_p
1364 : 2376069 : && (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1365 : 2376069 : != STMT_VINFO_GATHER_SCATTER_P (first_stmt_info)))
1366 : 10215526 : || first_stmt_ldst_p != ldst_p
1367 : 21898476 : || first_stmt_phi_p != phi_p)
1368 : : {
1369 : 1467539 : if (dump_enabled_p ())
1370 : : {
1371 : 3258 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1372 : : "Build SLP failed: different operation "
1373 : : "in stmt %G", stmt);
1374 : 3258 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1375 : : "original stmt %G", first_stmt_info->stmt);
1376 : : }
1377 : : /* Mismatch. */
1378 : 1467539 : continue;
1379 : : }
1380 : :
1381 : 10217263 : if (!ldst_p
1382 : 7839444 : && first_stmt_code == BIT_FIELD_REF
1383 : 10220697 : && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
1384 : 5286 : != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
1385 : : {
1386 : 1852 : if (dump_enabled_p ())
1387 : 36 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1388 : : "Build SLP failed: different BIT_FIELD_REF "
1389 : : "arguments in %G", stmt);
1390 : : /* Mismatch. */
1391 : 1852 : continue;
1392 : : }
1393 : :
1394 : 10213559 : if (call_stmt
1395 : 26037 : && first_stmt_code != CFN_MASK_LOAD
1396 : 10239535 : && first_stmt_code != CFN_MASK_STORE)
1397 : : {
1398 : 25926 : if (!is_a <gcall *> (stmts[0]->stmt)
1399 : 25926 : || !compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
1400 : : call_stmt))
1401 : : {
1402 : 8527 : if (dump_enabled_p ())
1403 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1404 : : "Build SLP failed: different calls in %G",
1405 : : stmt);
1406 : : /* Mismatch. */
1407 : 8527 : continue;
1408 : : }
1409 : : }
1410 : :
1411 : 10009757 : if ((phi_p || gimple_could_trap_p (stmt_info->stmt))
1412 : 10873989 : && (gimple_bb (first_stmt_info->stmt)
1413 : 864232 : != gimple_bb (stmt_info->stmt)))
1414 : : {
1415 : 28534 : if (dump_enabled_p ())
1416 : 6 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1417 : : "Build SLP failed: different BB for PHI "
1418 : : "or possibly trapping operation in %G", stmt);
1419 : : /* Mismatch. */
1420 : 28534 : continue;
1421 : : }
1422 : :
1423 : 10176498 : if (need_same_oprnds)
1424 : : {
1425 : 50688 : tree other_op1 = gimple_arg (stmt, 1);
1426 : 50688 : if (!operand_equal_p (first_op1, other_op1, 0))
1427 : : {
1428 : 6789 : if (dump_enabled_p ())
1429 : 121 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1430 : : "Build SLP failed: different shift "
1431 : : "arguments in %G", stmt);
1432 : : /* Mismatch. */
1433 : 6789 : continue;
1434 : : }
1435 : : }
1436 : :
1437 : 10170300 : if (first_lhs
1438 : 10169709 : && lhs
1439 : 10169709 : && !types_compatible_p (TREE_TYPE (lhs), TREE_TYPE (first_lhs)))
1440 : : {
1441 : 591 : if (dump_enabled_p ())
1442 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1443 : : "Build SLP failed: different vector type "
1444 : : "in %G", stmt);
1445 : : /* Mismatch. */
1446 : 591 : continue;
1447 : : }
1448 : : }
1449 : :
1450 : : /* Grouped store or load. */
1451 : 15183802 : if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1452 : : {
1453 : 3856620 : gcc_assert (ldst_p);
1454 : 3856620 : if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
1455 : : {
1456 : : /* Store. */
1457 : 3057645 : gcc_assert (rhs_code == CFN_MASK_STORE
1458 : : || REFERENCE_CLASS_P (lhs)
1459 : : || DECL_P (lhs));
1460 : : }
1461 : : else
1462 : : {
1463 : : /* Load. */
1464 : 798975 : first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
1465 : 798975 : if (prev_first_load)
1466 : : {
1467 : : /* Check that there are no loads from different interleaving
1468 : : chains in the same node. */
1469 : 335973 : if (prev_first_load != first_load)
1470 : : {
1471 : 25528 : if (dump_enabled_p ())
1472 : 1886 : dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1473 : : vect_location,
1474 : : "Build SLP failed: different "
1475 : : "interleaving chains in one node %G",
1476 : : stmt);
1477 : : /* Mismatch. */
1478 : 25528 : continue;
1479 : : }
1480 : : }
1481 : : else
1482 : : prev_first_load = first_load;
1483 : : }
1484 : : }
1485 : : /* Non-grouped store or load. */
1486 : 11327182 : else if (ldst_p)
1487 : : {
1488 : 568478 : if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))
1489 : 376258 : && rhs_code != CFN_GATHER_LOAD
1490 : : && rhs_code != CFN_MASK_GATHER_LOAD
1491 : : && rhs_code != CFN_MASK_LEN_GATHER_LOAD
1492 : : && rhs_code != CFN_SCATTER_STORE
1493 : : && rhs_code != CFN_MASK_SCATTER_STORE
1494 : : && rhs_code != CFN_MASK_LEN_SCATTER_STORE
1495 : 376258 : && !STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1496 : : /* Non-grouped loads are handled as externals for BB
1497 : : vectorization. For loop vectorization we can handle
1498 : : splats the same way we handle single-element interleaving. */
1499 : 933855 : && (is_a <bb_vec_info> (vinfo)
1500 : 365377 : || stmt_info != first_stmt_info))
1501 : : {
1502 : : /* Non-grouped load. */
1503 : 15661 : if (dump_enabled_p ())
1504 : 138 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1505 : : "Build SLP failed: not grouped load %G", stmt);
1506 : :
1507 : 15661 : if (i != 0)
1508 : 15661 : continue;
1509 : : /* Fatal mismatch. */
1510 : 0 : matches[0] = false;
1511 : 0 : return false;
1512 : : }
1513 : : }
1514 : : /* Not memory operation. */
1515 : : /* Not a memory operation. */
1516 : : {
1517 : 10758704 : if (!phi_p
1518 : 9913347 : && rhs_code.is_tree_code ()
1519 : 9869676 : && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_binary
1520 : 1445634 : && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_unary
1521 : 885602 : && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_expression
1522 : 825983 : && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_comparison
1523 : 67592 : && rhs_code != VIEW_CONVERT_EXPR
1524 : : && rhs_code != CALL_EXPR
1525 : : && rhs_code != BIT_FIELD_REF
1526 : 10758704 : && rhs_code != SSA_NAME)
1527 : : {
1528 : 19744 : if (dump_enabled_p ())
1529 : 9 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1530 : : "Build SLP failed: operation unsupported %G",
1531 : : stmt);
1532 : 19744 : if (is_a <bb_vec_info> (vinfo) && i != 0)
1533 : 0 : continue;
1534 : : /* Fatal mismatch. */
1535 : 19744 : matches[0] = false;
1536 : 19744 : return false;
1537 : : }
1538 : :
1539 : 10738960 : if (rhs_code == COND_EXPR)
1540 : : {
1541 : 57303 : tree cond_expr = gimple_assign_rhs1 (stmt);
1542 : 57303 : enum tree_code cond_code = TREE_CODE (cond_expr);
1543 : 57303 : enum tree_code swap_code = ERROR_MARK;
1544 : 57303 : enum tree_code invert_code = ERROR_MARK;
1545 : :
1546 : 57303 : if (i == 0)
1547 : 48407 : first_cond_code = TREE_CODE (cond_expr);
1548 : 8896 : else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1549 : : {
1550 : 0 : bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
1551 : 0 : swap_code = swap_tree_comparison (cond_code);
1552 : 0 : invert_code = invert_tree_comparison (cond_code, honor_nans);
1553 : : }
1554 : :
1555 : 57303 : if (first_cond_code == cond_code)
1556 : : ;
1557 : : /* Isomorphic can be achieved by swapping. */
1558 : 0 : else if (first_cond_code == swap_code)
1559 : 0 : swap[i] = 1;
1560 : : /* Isomorphic can be achieved by inverting. */
1561 : 0 : else if (first_cond_code == invert_code)
1562 : 0 : swap[i] = 2;
1563 : : else
1564 : : {
1565 : 0 : if (dump_enabled_p ())
1566 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1567 : : "Build SLP failed: different"
1568 : : " operation %G", stmt);
1569 : : /* Mismatch. */
1570 : 0 : continue;
1571 : : }
1572 : : }
1573 : :
1574 : 10738960 : if (rhs_code.is_tree_code ()
1575 : 9849932 : && TREE_CODE_CLASS ((tree_code)rhs_code) == tcc_comparison
1576 : 11497351 : && (swap_tree_comparison ((tree_code)first_stmt_code)
1577 : 758391 : == (tree_code)rhs_code))
1578 : 531123 : swap[i] = 1;
1579 : : }
1580 : :
1581 : 15122869 : matches[i] = true;
1582 : : }
1583 : :
1584 : 20215147 : for (i = 0; i < group_size; ++i)
1585 : 15978069 : if (!matches[i])
1586 : : return false;
1587 : :
1588 : : /* If we allowed a two-operation SLP node verify the target can cope
1589 : : with the permute we are going to use. */
1590 : 4237078 : if (alt_stmt_code != ERROR_MARK
1591 : 4237078 : && (!alt_stmt_code.is_tree_code ()
1592 : 53932 : || (TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_reference
1593 : 53932 : && TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_comparison)))
1594 : : {
1595 : 12115 : *two_operators = true;
1596 : : }
1597 : :
1598 : 4237078 : if (maybe_soft_fail)
1599 : : {
1600 : 131665 : unsigned HOST_WIDE_INT const_nunits;
1601 : 131665 : if (!TYPE_VECTOR_SUBPARTS
1602 : 131665 : (soft_fail_nunits_vectype).is_constant (&const_nunits)
1603 : 131665 : || const_nunits > group_size)
1604 : 0 : matches[0] = false;
1605 : : else
1606 : : {
1607 : : /* With constant vector elements simulate a mismatch at the
1608 : : point we need to split. */
1609 : 131665 : unsigned tail = group_size & (const_nunits - 1);
1610 : 131665 : memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
1611 : : }
1612 : 131665 : return false;
1613 : : }
1614 : :
1615 : : return true;
1616 : : }
1617 : :
1618 : : /* Traits for the hash_set to record failed SLP builds for a stmt set.
1619 : : Note we never remove apart from at destruction time so we do not
1620 : : need a special value for deleted that differs from empty. */
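 : : /* Hashing below is over the gimple UIDs of the stmts in the set
 : : while equality compares the stmt_vec_info pointers element-wise
 : : (see bst_traits::hash and bst_traits::equal). */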
1621 : : struct bst_traits
1622 : : {
1623 : : typedef vec <stmt_vec_info> value_type;
1624 : : typedef vec <stmt_vec_info> compare_type;
1625 : : static inline hashval_t hash (value_type);
1626 : : static inline bool equal (value_type existing, value_type candidate);
1627 : 475493816 : static inline bool is_empty (value_type x) { return !x.exists (); }
1628 : 110601728 : static inline bool is_deleted (value_type x) { return !x.exists (); }
1629 : : static const bool empty_zero_p = true;
1630 : 0 : static inline void mark_empty (value_type &x) { x.release (); }
1631 : : static inline void mark_deleted (value_type &x) { x.release (); }
1632 : 8960074 : static inline void remove (value_type &x) { x.release (); }
1633 : : };
1634 : : inline hashval_t
1635 : 95655231 : bst_traits::hash (value_type x)
1636 : : {
1637 : 95655231 : inchash::hash h;
1638 : 416795102 : for (unsigned i = 0; i < x.length (); ++i)
1639 : 321139871 : h.add_int (x[i] ? gimple_uid (x[i]->stmt) : -1);
1640 : 95655231 : return h.end ();
1641 : : }
1642 : : inline bool
1643 : 85515106 : bst_traits::equal (value_type existing, value_type candidate)
1644 : : {
1645 : 256545318 : if (existing.length () != candidate.length ())
1646 : : return false;
1647 : 87732257 : for (unsigned i = 0; i < existing.length (); ++i)
1648 : 83035254 : if (existing[i] != candidate[i])
1649 : : return false;
1650 : : return true;
1651 : : }
1652 : :
1653 : : typedef hash_map <vec <stmt_vec_info>, slp_tree,
1654 : : simple_hashmap_traits <bst_traits, slp_tree> >
1655 : : scalar_stmts_to_slp_tree_map_t;
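 : : /* Cache mapping the scalar stmts of a candidate SLP node to the
 : : node discovered for them (or to a node marked failed), allowing
 : : SLP discovery to share sub-graphs and to reject repeated failed
 : : builds early (see vect_build_slp_tree below). */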
1656 : :
1657 : : /* Release BST_MAP. */
1658 : :
1659 : : static void
1660 : 1602415 : release_scalar_stmts_to_slp_tree_map (scalar_stmts_to_slp_tree_map_t *bst_map)
1661 : : {
1662 : : /* The map keeps a reference on SLP nodes built, release that. */
1663 : 10562489 : for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
1664 : 19522563 : it != bst_map->end (); ++it)
1665 : 8960074 : if ((*it).second)
1666 : 8960074 : vect_free_slp_tree ((*it).second);
1667 : 1602415 : delete bst_map;
1668 : 1602415 : }
1669 : :
1670 : : /* ??? This was std::pair<std::pair<tree_code, vect_def_type>, tree>
1671 : : but then vec::insert does memmove and that's not compatible with
1672 : : std::pair. */
1673 : : struct chain_op_t
1674 : : {
1675 : 3338643 : chain_op_t (tree_code code_, vect_def_type dt_, tree op_)
1676 : 3338643 : : code (code_), dt (dt_), op (op_) {}
1677 : : tree_code code;
1678 : : vect_def_type dt;
1679 : : tree op;
1680 : : };
1681 : :
1682 : : /* Comparator for sorting associatable chains. */
1683 : :
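 : : /* Orders chain entries by definition type first and by operation
 : : code second, so matching entries line up across lanes after each
 : : lane's chain is sorted. */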
1684 : : static int
1685 : 7987895 : dt_sort_cmp (const void *op1_, const void *op2_, void *)
1686 : : {
1687 : 7987895 : auto *op1 = (const chain_op_t *) op1_;
1688 : 7987895 : auto *op2 = (const chain_op_t *) op2_;
1689 : 7987895 : if (op1->dt != op2->dt)
1690 : 965550 : return (int)op1->dt - (int)op2->dt;
1691 : 7022345 : return (int)op1->code - (int)op2->code;
1692 : : }
1693 : :
1694 : : /* Linearize the associatable expression chain at START with the
1695 : : associatable operation CODE (where PLUS_EXPR also allows MINUS_EXPR),
1696 : : filling CHAIN with the result and using WORKLIST as intermediate storage.
1697 : : CODE_STMT and ALT_CODE_STMT are filled with the first stmt using CODE
1698 : : or MINUS_EXPR. *CHAIN_STMTS if not NULL is filled with all computation
1699 : : stmts, starting with START. */
1700 : :
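 : : /* For example, with CODE == PLUS_EXPR a lane computing
 : : x = (a - b) + (c - d) is linearized to the chain
 : : { +c, -d, +a, -b } (in worklist pop order). */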
1701 : : static void
1702 : 1489733 : vect_slp_linearize_chain (vec_info *vinfo,
1703 : : vec<std::pair<tree_code, gimple *> > &worklist,
1704 : : vec<chain_op_t> &chain,
1705 : : enum tree_code code, gimple *start,
1706 : : gimple *&code_stmt, gimple *&alt_code_stmt,
1707 : : vec<gimple *> *chain_stmts)
1708 : : {
1709 : : /* For each lane linearize the addition/subtraction (or other
1710 : : uniform associatable operation) expression tree. */
1711 : 1489733 : worklist.safe_push (std::make_pair (code, start));
1712 : 3338643 : while (!worklist.is_empty ())
1713 : : {
1714 : 1848910 : auto entry = worklist.pop ();
1715 : 1848910 : gassign *stmt = as_a <gassign *> (entry.second);
1716 : 1848910 : enum tree_code in_code = entry.first;
1717 : 3697820 : enum tree_code this_code = gimple_assign_rhs_code (stmt);
1718 : : /* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE. */
1719 : 1848910 : if (!code_stmt
1720 : 1848910 : && gimple_assign_rhs_code (stmt) == code)
1721 : 1262710 : code_stmt = stmt;
1722 : 586200 : else if (!alt_code_stmt
1723 : 586200 : && gimple_assign_rhs_code (stmt) == MINUS_EXPR)
1724 : 298753 : alt_code_stmt = stmt;
1725 : 1848910 : if (chain_stmts)
1726 : 1842733 : chain_stmts->safe_push (stmt);
1727 : 5546730 : for (unsigned opnum = 1; opnum <= 2; ++opnum)
1728 : : {
1729 : 3697820 : tree op = gimple_op (stmt, opnum);
1730 : 3697820 : vect_def_type dt;
1731 : 3697820 : stmt_vec_info def_stmt_info;
1732 : 3697820 : bool res = vect_is_simple_use (op, vinfo, &dt, &def_stmt_info);
1733 : 3697820 : gcc_assert (res);
1734 : 3697820 : if (dt == vect_internal_def
1735 : 3697820 : && is_pattern_stmt_p (def_stmt_info))
1736 : 998 : op = gimple_get_lhs (def_stmt_info->stmt);
1737 : 3697820 : gimple *use_stmt;
1738 : 3697820 : use_operand_p use_p;
1739 : 3697820 : if (dt == vect_internal_def
1740 : 3460235 : && single_imm_use (op, &use_p, &use_stmt)
1741 : 2178500 : && is_gimple_assign (def_stmt_info->stmt)
1742 : 5702156 : && (gimple_assign_rhs_code (def_stmt_info->stmt) == code
1743 : 1670081 : || (code == PLUS_EXPR
1744 : 794962 : && (gimple_assign_rhs_code (def_stmt_info->stmt)
1745 : : == MINUS_EXPR))))
1746 : : {
1747 : 359177 : tree_code op_def_code = this_code;
1748 : 359177 : if (op_def_code == MINUS_EXPR && opnum == 1)
1749 : 52636 : op_def_code = PLUS_EXPR;
1750 : 359177 : if (in_code == MINUS_EXPR)
1751 : 193 : op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1752 : 359177 : worklist.safe_push (std::make_pair (op_def_code,
1753 : 359177 : def_stmt_info->stmt));
1754 : : }
1755 : : else
1756 : : {
1757 : 3338643 : tree_code op_def_code = this_code;
1758 : 3338643 : if (op_def_code == MINUS_EXPR && opnum == 1)
1759 : 250091 : op_def_code = PLUS_EXPR;
1760 : 3338643 : if (in_code == MINUS_EXPR)
1761 : 4277 : op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1762 : 3338643 : chain.safe_push (chain_op_t (op_def_code, dt, op));
1763 : : }
1764 : : }
1765 : : }
1766 : 1489733 : }
1767 : :
1768 : : static slp_tree
1769 : : vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1770 : : vec<stmt_vec_info> stmts, unsigned int group_size,
1771 : : poly_uint64 *max_nunits,
1772 : : bool *matches, unsigned *limit, unsigned *tree_size,
1773 : : scalar_stmts_to_slp_tree_map_t *bst_map);
1774 : :
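 : : /* Wrapper around vect_build_slp_tree_2 that first consults BST_MAP
 : : for an already discovered (or failed) node for STMTS, accounts
 : : multi-lane discovery against LIMIT and records the build result
 : : in BST_MAP. Returns the SLP node or NULL on failure, in which
 : : case MATCHES records per-lane match information. */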
1775 : : static slp_tree
1776 : 5838093 : vect_build_slp_tree (vec_info *vinfo,
1777 : : vec<stmt_vec_info> stmts, unsigned int group_size,
1778 : : poly_uint64 *max_nunits,
1779 : : bool *matches, unsigned *limit, unsigned *tree_size,
1780 : : scalar_stmts_to_slp_tree_map_t *bst_map)
1781 : : {
1782 : 5838093 : if (slp_tree *leader = bst_map->get (stmts))
1783 : : {
1784 : 450176 : if (dump_enabled_p ())
1785 : 19091 : dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
1786 : 19091 : !(*leader)->failed ? "" : "failed ",
1787 : : (void *) *leader);
1788 : 450176 : if (!(*leader)->failed)
1789 : : {
1790 : 403748 : SLP_TREE_REF_COUNT (*leader)++;
1791 : 403748 : vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
1792 : 403748 : stmts.release ();
1793 : 403748 : return *leader;
1794 : : }
1795 : 46428 : memcpy (matches, (*leader)->failed, sizeof (bool) * group_size);
1796 : 46428 : return NULL;
1797 : : }
1798 : :
1799 : : /* Single-lane SLP has no chance of running away, so do not account
1800 : : it against the limit. */
1801 : 5387917 : if (stmts.length () > 1)
1802 : : {
1803 : 3025709 : if (*limit == 0)
1804 : : {
1805 : 1401 : if (dump_enabled_p ())
1806 : 48 : dump_printf_loc (MSG_NOTE, vect_location,
1807 : : "SLP discovery limit exceeded\n");
1808 : 1401 : memset (matches, 0, sizeof (bool) * group_size);
1809 : 1401 : return NULL;
1810 : : }
1811 : 3024308 : --*limit;
1812 : : }
1813 : :
1814 : : /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
1815 : : so we can pick up backedge destinations during discovery. */
1816 : 5386516 : slp_tree res = new _slp_tree;
1817 : 5386516 : SLP_TREE_DEF_TYPE (res) = vect_internal_def;
1818 : 5386516 : SLP_TREE_SCALAR_STMTS (res) = stmts;
1819 : 5386516 : bst_map->put (stmts.copy (), res);
1820 : :
1821 : 5386516 : if (dump_enabled_p ())
1822 : 155966 : dump_printf_loc (MSG_NOTE, vect_location,
1823 : : "starting SLP discovery for node %p\n", (void *) res);
1824 : :
1825 : 5386516 : poly_uint64 this_max_nunits = 1;
1826 : 5386516 : slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
1827 : : &this_max_nunits,
1828 : : matches, limit, tree_size, bst_map);
1829 : 5386516 : if (!res_)
1830 : : {
1831 : 1867238 : if (dump_enabled_p ())
1832 : 8539 : dump_printf_loc (MSG_NOTE, vect_location,
1833 : : "SLP discovery for node %p failed\n", (void *) res);
1834 : : /* Mark the node invalid so we can detect those when still in use
1835 : : as backedge destinations. */
1836 : 1867238 : SLP_TREE_SCALAR_STMTS (res) = vNULL;
1837 : 1867238 : SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1838 : 1867238 : res->failed = XNEWVEC (bool, group_size);
1839 : 1867238 : if (flag_checking)
1840 : : {
1841 : : unsigned i;
1842 : 3563484 : for (i = 0; i < group_size; ++i)
1843 : 3563484 : if (!matches[i])
1844 : : break;
1845 : 1867238 : gcc_assert (i < group_size);
1846 : : }
1847 : 1867238 : memcpy (res->failed, matches, sizeof (bool) * group_size);
1848 : : }
1849 : : else
1850 : : {
1851 : 3519278 : if (dump_enabled_p ())
1852 : 147427 : dump_printf_loc (MSG_NOTE, vect_location,
1853 : : "SLP discovery for node %p succeeded\n",
1854 : : (void *) res);
1855 : 3519278 : gcc_assert (res_ == res);
1856 : 3519278 : res->max_nunits = this_max_nunits;
1857 : 3519278 : vect_update_max_nunits (max_nunits, this_max_nunits);
1858 : : /* Keep a reference for the bst_map use. */
1859 : 3519278 : SLP_TREE_REF_COUNT (res)++;
1860 : : }
1861 : : return res_;
1862 : : }
1863 : :
1864 : : /* Helper for building an associated SLP node chain. */
1865 : :
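 : : /* Builds two children computing OPER1 resp. OPER2 on the shared
 : : operands OP0 and OP1 and turns PERM into a VEC_PERM_EXPR node
 : : selecting each lane from one of the two according to LPERM. */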
1866 : : static void
1867 : 165 : vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype,
1868 : : slp_tree op0, slp_tree op1,
1869 : : stmt_vec_info oper1, stmt_vec_info oper2,
1870 : : vec<std::pair<unsigned, unsigned> > lperm)
1871 : : {
1872 : 165 : unsigned group_size = SLP_TREE_LANES (op1);
1873 : :
1874 : 165 : slp_tree child1 = new _slp_tree;
1875 : 165 : SLP_TREE_DEF_TYPE (child1) = vect_internal_def;
1876 : 165 : SLP_TREE_VECTYPE (child1) = vectype;
1877 : 165 : SLP_TREE_LANES (child1) = group_size;
1878 : 165 : SLP_TREE_CHILDREN (child1).create (2);
1879 : 165 : SLP_TREE_CHILDREN (child1).quick_push (op0);
1880 : 165 : SLP_TREE_CHILDREN (child1).quick_push (op1);
1881 : 165 : SLP_TREE_REPRESENTATIVE (child1) = oper1;
1882 : :
1883 : 165 : slp_tree child2 = new _slp_tree;
1884 : 165 : SLP_TREE_DEF_TYPE (child2) = vect_internal_def;
1885 : 165 : SLP_TREE_VECTYPE (child2) = vectype;
1886 : 165 : SLP_TREE_LANES (child2) = group_size;
1887 : 165 : SLP_TREE_CHILDREN (child2).create (2);
1888 : 165 : SLP_TREE_CHILDREN (child2).quick_push (op0);
1889 : 165 : SLP_TREE_REF_COUNT (op0)++;
1890 : 165 : SLP_TREE_CHILDREN (child2).quick_push (op1);
1891 : 165 : SLP_TREE_REF_COUNT (op1)++;
1892 : 165 : SLP_TREE_REPRESENTATIVE (child2) = oper2;
1893 : :
1894 : 165 : SLP_TREE_DEF_TYPE (perm) = vect_internal_def;
1895 : 165 : SLP_TREE_CODE (perm) = VEC_PERM_EXPR;
1896 : 165 : SLP_TREE_VECTYPE (perm) = vectype;
1897 : 165 : SLP_TREE_LANES (perm) = group_size;
1898 : : /* ??? We should set this NULL but that's not expected. */
1899 : 165 : SLP_TREE_REPRESENTATIVE (perm) = oper1;
1900 : 165 : SLP_TREE_LANE_PERMUTATION (perm) = lperm;
1901 : 165 : SLP_TREE_CHILDREN (perm).quick_push (child1);
1902 : 165 : SLP_TREE_CHILDREN (perm).quick_push (child2);
1903 : 165 : }
1904 : :
1905 : : /* Recursively build an SLP tree starting from NODE.
1906 : : Fail (and return NULL) if def-stmts are not isomorphic, require
1907 : : data permutation or are of unsupported types of operation; in
1908 : : that case MATCHES records which lanes matched up.
1909 : : Otherwise return the built SLP node. */
1911 : :
1912 : : static slp_tree
1913 : 5386516 : vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1914 : : vec<stmt_vec_info> stmts, unsigned int group_size,
1915 : : poly_uint64 *max_nunits,
1916 : : bool *matches, unsigned *limit, unsigned *tree_size,
1917 : : scalar_stmts_to_slp_tree_map_t *bst_map)
1918 : : {
1919 : 5386516 : unsigned nops, i, this_tree_size = 0;
1920 : 5386516 : poly_uint64 this_max_nunits = *max_nunits;
1921 : :
1922 : 5386516 : matches[0] = false;
1923 : :
1924 : 5386516 : stmt_vec_info stmt_info = stmts[0];
1925 : 5386516 : if (!is_a<gcall *> (stmt_info->stmt)
1926 : : && !is_a<gassign *> (stmt_info->stmt)
1927 : : && !is_a<gphi *> (stmt_info->stmt))
1928 : : return NULL;
1929 : :
1930 : 5386415 : nops = gimple_num_args (stmt_info->stmt);
1931 : 5386415 : if (const int *map = vect_get_operand_map (stmt_info->stmt,
1932 : 5386415 : STMT_VINFO_GATHER_SCATTER_P
1933 : : (stmt_info)))
1934 : 23428 : nops = map[0];
1935 : :
1936 : : /* If the SLP node is a PHI (induction or reduction), terminate
1937 : : the recursion. */
1938 : 5386415 : bool *skip_args = XALLOCAVEC (bool, nops);
1939 : 5386415 : memset (skip_args, 0, sizeof (bool) * nops);
1940 : 5386415 : if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
1941 : 2474463 : if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
1942 : : {
1943 : 405093 : tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1944 : 405093 : tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
1945 : : group_size);
1946 : 405093 : if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
1947 : : max_nunits))
1948 : : return NULL;
1949 : :
1950 : 405093 : vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
1951 : 405093 : if (def_type == vect_induction_def)
1952 : : {
1953 : : /* Induction PHIs are not cycles but walk the initial
1954 : : value. Only for inner loops though; for outer loops
1955 : : we need to pick up the value from the actual PHIs
1956 : : to more easily support peeling and epilogue vectorization. */
1957 : 332723 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1958 : 332723 : if (!nested_in_vect_loop_p (loop, stmt_info))
1959 : 331936 : skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1960 : : else
1961 : : loop = loop->inner;
1962 : 332723 : skip_args[loop_latch_edge (loop)->dest_idx] = true;
1963 : : }
1964 : 72370 : else if (def_type == vect_reduction_def
1965 : : || def_type == vect_double_reduction_def
1966 : : || def_type == vect_nested_cycle
1967 : 72370 : || def_type == vect_first_order_recurrence)
1968 : : {
1969 : : /* Else def types have to match. */
1970 : : stmt_vec_info other_info;
1971 : : bool all_same = true;
1972 : 144529 : FOR_EACH_VEC_ELT (stmts, i, other_info)
1973 : : {
1974 : 73390 : if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
1975 : 1514308 : return NULL;
1976 : 73390 : if (other_info != stmt_info)
1977 : 712 : all_same = false;
1978 : : }
1979 : 71139 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1980 : : /* Reduction initial values are not explicitly represented. */
1981 : 71139 : if (def_type != vect_first_order_recurrence
1982 : 71139 : && gimple_bb (stmt_info->stmt) == loop->header)
1983 : 68155 : skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1984 : : /* Reduction chain backedge defs are filled manually.
1985 : : ??? Need a better way to identify a SLP reduction chain PHI.
1986 : : Or a better overall way to SLP match those. */
1987 : 71139 : if (stmts.length () > 1
1988 : 71139 : && all_same && def_type == vect_reduction_def)
1989 : 368 : skip_args[loop_latch_edge (loop)->dest_idx] = true;
1990 : : }
1991 : 1231 : else if (def_type != vect_internal_def)
1992 : : return NULL;
1993 : : }
1994 : :
1995 : :
1996 : 5386415 : bool two_operators = false;
1997 : 5386415 : unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1998 : 5386415 : tree vectype = NULL_TREE;
1999 : 5386415 : if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
2000 : : &this_max_nunits, matches, &two_operators,
2001 : : &vectype))
2002 : : return NULL;
2003 : :
2004 : : /* If the SLP node is a load, terminate the recursion unless masked. */
2005 : 4105413 : if (STMT_VINFO_DATA_REF (stmt_info)
2006 : 1844039 : && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2007 : : {
2008 : 741234 : if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2009 : : gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
2010 : : else
2011 : : {
2012 : 730543 : *max_nunits = this_max_nunits;
2013 : 730543 : (*tree_size)++;
2014 : 730543 : node = vect_create_new_slp_node (node, stmts, 0);
2015 : 730543 : SLP_TREE_VECTYPE (node) = vectype;
2016 : : /* And compute the load permutation. Whether it is actually
2017 : : a permutation depends on the unrolling factor which is
2018 : : decided later. */
2019 : 730543 : vec<unsigned> load_permutation;
2020 : 730543 : int j;
2021 : 730543 : stmt_vec_info load_info;
2022 : 730543 : load_permutation.create (group_size);
2023 : 730543 : stmt_vec_info first_stmt_info
2024 : 730543 : = STMT_VINFO_GROUPED_ACCESS (stmt_info)
2025 : 730543 : ? DR_GROUP_FIRST_ELEMENT (stmt_info) : stmt_info;
2026 : 730543 : bool any_permute = false;
2027 : 1886064 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
2028 : : {
2029 : 1155521 : int load_place;
2030 : 1155521 : if (! load_info)
2031 : : {
2032 : 119675 : if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2033 : : load_place = j;
2034 : : else
2035 : : load_place = 0;
2036 : : }
2037 : 1035846 : else if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2038 : 691707 : load_place = vect_get_place_in_interleaving_chain
2039 : 691707 : (load_info, first_stmt_info);
2040 : : else
2041 : : load_place = 0;
2042 : 811382 : gcc_assert (load_place != -1);
2043 : 1155521 : any_permute |= load_place != j;
2044 : 1155521 : load_permutation.quick_push (load_place);
2045 : : }
2046 : :
2047 : 730543 : if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
2048 : : {
2049 : 2309 : gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD));
2050 : 2309 : bool has_gaps = false;
2051 : 2309 : if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2052 : 217 : for (stmt_vec_info si = DR_GROUP_NEXT_ELEMENT (first_stmt_info);
2053 : 326 : si; si = DR_GROUP_NEXT_ELEMENT (si))
2054 : 109 : if (DR_GROUP_GAP (si) != 1)
2055 : 24 : has_gaps = true;
2056 : : /* We cannot handle permuted masked loads directly, see
2057 : : PR114375. We cannot handle strided masked loads or masked
2058 : : loads with gaps unless the mask is uniform. */
2059 : 2309 : if ((STMT_VINFO_GROUPED_ACCESS (stmt_info)
2060 : 217 : && (DR_GROUP_GAP (first_stmt_info) != 0
2061 : 91 : || (has_gaps
2062 : 24 : && STMT_VINFO_SLP_VECT_ONLY (first_stmt_info))))
2063 : 4468 : || STMT_VINFO_STRIDED_P (stmt_info))
2064 : : {
2065 : 176 : load_permutation.release ();
2066 : 176 : matches[0] = false;
2067 : 728448 : return NULL;
2068 : : }
2069 : :
2070 : : /* For permuted masked loads do an unpermuted masked load of
2071 : : the whole group followed by a SLP permute node. */
2072 : 2133 : if (any_permute
2073 : 2133 : || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2074 : 43 : && DR_GROUP_SIZE (first_stmt_info) != group_size))
2075 : : {
2076 : : /* Discover the whole unpermuted load. */
2077 : 38 : vec<stmt_vec_info> stmts2;
2078 : 38 : unsigned dr_group_size = STMT_VINFO_GROUPED_ACCESS (stmt_info)
2079 : 70 : ? DR_GROUP_SIZE (first_stmt_info) : 1;
2080 : 38 : stmts2.create (dr_group_size);
2081 : 38 : stmts2.quick_grow_cleared (dr_group_size);
2082 : 38 : unsigned i = 0;
2083 : 38 : for (stmt_vec_info si = first_stmt_info;
2084 : 120 : si; si = DR_GROUP_NEXT_ELEMENT (si))
2085 : : {
2086 : 82 : if (si != first_stmt_info)
2087 : 44 : for (unsigned k = 1; k < DR_GROUP_GAP (si); ++k)
2088 : 0 : stmts2[i++] = NULL;
2089 : 82 : stmts2[i++] = si;
2090 : : }
2091 : 38 : bool *matches2 = XALLOCAVEC (bool, dr_group_size);
2092 : 38 : slp_tree unperm_load
2093 : 38 : = vect_build_slp_tree (vinfo, stmts2, dr_group_size,
2094 : : &this_max_nunits, matches2, limit,
2095 : 38 : &this_tree_size, bst_map);
2096 : : /* When we are able to do the full masked load emit that
2097 : : followed by 'node' being the desired final permutation. */
2098 : 38 : if (unperm_load)
2099 : : {
2100 : 28 : gcc_assert
2101 : : (!SLP_TREE_LOAD_PERMUTATION (unperm_load).exists ());
2102 : 28 : lane_permutation_t lperm;
2103 : 28 : lperm.create (group_size);
2104 : 68 : for (unsigned j = 0; j < load_permutation.length (); ++j)
2105 : 40 : lperm.quick_push
2106 : 40 : (std::make_pair (0, load_permutation[j]));
2107 : 28 : SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2108 : 28 : SLP_TREE_CHILDREN (node).safe_push (unperm_load);
2109 : 28 : SLP_TREE_LANE_PERMUTATION (node) = lperm;
2110 : 28 : load_permutation.release ();
2111 : 28 : return node;
2112 : : }
2113 : 10 : stmts2.release ();
2114 : 10 : load_permutation.release ();
2115 : 10 : matches[0] = false;
2116 : 10 : return NULL;
2117 : : }
2118 : 2095 : load_permutation.release ();
2119 : : }
2120 : : else
2121 : : {
2122 : 728234 : if (!any_permute
2123 : 636630 : && STMT_VINFO_GROUPED_ACCESS (stmt_info)
2124 : 1028126 : && group_size == DR_GROUP_SIZE (first_stmt_info))
2125 : 167262 : load_permutation.release ();
2126 : 728234 : SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
2127 : 728234 : return node;
2128 : : }
2129 : : }
2130 : : }
2131 : 3364179 : else if (gimple_assign_single_p (stmt_info->stmt)
2132 : 2210776 : && !gimple_vuse (stmt_info->stmt)
2133 : 3371968 : && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
2134 : : {
2135 : : /* vect_build_slp_tree_1 determined all BIT_FIELD_REFs reference
2136 : : the same SSA name vector, of a type compatible with VECTYPE. */
2137 : 2411 : vec<std::pair<unsigned, unsigned> > lperm = vNULL;
2138 : 2411 : tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
2139 : 2411 : stmt_vec_info estmt_info;
2140 : 7917 : FOR_EACH_VEC_ELT (stmts, i, estmt_info)
2141 : : {
2142 : 5509 : gassign *estmt = as_a <gassign *> (estmt_info->stmt);
2143 : 5509 : tree bfref = gimple_assign_rhs1 (estmt);
2144 : 5509 : HOST_WIDE_INT lane;
2145 : 5509 : if (!known_eq (bit_field_size (bfref),
2146 : : tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
2147 : 11015 : || !constant_multiple_p (bit_field_offset (bfref),
2148 : 5506 : bit_field_size (bfref), &lane))
2149 : : {
2150 : 3 : lperm.release ();
2151 : 3 : matches[0] = false;
2152 : 3 : return NULL;
2153 : : }
2154 : 5506 : lperm.safe_push (std::make_pair (0, (unsigned)lane));
2155 : : }
2156 : 2408 : slp_tree vnode = vect_create_new_slp_node (vNULL);
2157 : 2408 : if (operand_equal_p (TYPE_SIZE (vectype), TYPE_SIZE (TREE_TYPE (vec))))
2158 : : /* ??? We record vectype here but we hide eventually necessary
2159 : : punning and instead rely on code generation to materialize
2160 : : VIEW_CONVERT_EXPRs as necessary. We instead should make
2161 : : this explicit somehow. */
2162 : 649 : SLP_TREE_VECTYPE (vnode) = vectype;
2163 : : else
2164 : : {
2165 : : /* For different size but compatible elements we can still
2166 : : use VEC_PERM_EXPR without punning. */
2167 : 1759 : gcc_assert (VECTOR_TYPE_P (TREE_TYPE (vec))
2168 : : && types_compatible_p (TREE_TYPE (vectype),
2169 : : TREE_TYPE (TREE_TYPE (vec))));
2170 : 1759 : SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
2171 : : }
2172 : 2408 : auto nunits = TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (vnode));
2173 : 2408 : unsigned HOST_WIDE_INT const_nunits;
2174 : 2408 : if (nunits.is_constant (&const_nunits))
2175 : 2408 : SLP_TREE_LANES (vnode) = const_nunits;
2176 : 2408 : SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
2177 : : /* We are always building a permutation node even if it is an identity
2178 : : permute to shield the rest of the vectorizer from the odd node
2179 : : representing an actual vector without any scalar ops.
2180 : : ??? We could hide it completely by making the permute node
2181 : : external? */
2182 : 2408 : node = vect_create_new_slp_node (node, stmts, 1);
2183 : 2408 : SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2184 : 2408 : SLP_TREE_LANE_PERMUTATION (node) = lperm;
2185 : 2408 : SLP_TREE_VECTYPE (node) = vectype;
2186 : 2408 : SLP_TREE_CHILDREN (node).quick_push (vnode);
2187 : 2408 : return node;
2188 : : }
2189 : : /* When discovery reaches an associatable operation see whether we can
2190 : : improve that to match up lanes in a way superior to the operand
2191 : : swapping code which at most looks at two defs.
2192 : : ??? For BB vectorization we cannot do the brute-force search
2193 : : for matching as we can succeed by means of builds from scalars
2194 : : and have no good way to "cost" one build against another. */
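 : : For example, lanes computing a0 + b0 + c0 and b1 + c1 + a1 can
 : : be matched up by linearizing both chains and swapping chain
 : : entries per lane until discovery of the operands succeeds. */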
2195 : 3361768 : else if (is_a <loop_vec_info> (vinfo)
2196 : : /* Do not bother for single-lane SLP. */
2197 : 1875253 : && group_size > 1
2198 : : /* ??? We don't handle !vect_internal_def defs below. */
2199 : 45199 : && STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2200 : : /* ??? Do not associate a reduction, this will wreck REDUC_IDX
2201 : : mapping as long as that exists on the stmt_info level. */
2202 : 40349 : && STMT_VINFO_REDUC_IDX (stmt_info) == -1
2203 : 39948 : && is_gimple_assign (stmt_info->stmt)
2204 : 39744 : && (associative_tree_code (gimple_assign_rhs_code (stmt_info->stmt))
2205 : 32119 : || gimple_assign_rhs_code (stmt_info->stmt) == MINUS_EXPR)
2206 : 3371087 : && ((FLOAT_TYPE_P (vectype) && flag_associative_math)
2207 : 7959 : || (INTEGRAL_TYPE_P (TREE_TYPE (vectype))
2208 : 5844 : && TYPE_OVERFLOW_WRAPS (TREE_TYPE (vectype)))))
2209 : : {
2210 : : /* See if we have a chain of (mixed) adds or subtracts or other
2211 : : associatable ops. */
2212 : 4527 : enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
2213 : 4527 : if (code == MINUS_EXPR)
2214 : 703 : code = PLUS_EXPR;
2215 : 4527 : stmt_vec_info other_op_stmt_info = NULL;
2216 : 4527 : stmt_vec_info op_stmt_info = NULL;
2217 : 4527 : unsigned chain_len = 0;
2218 : 4527 : auto_vec<chain_op_t> chain;
2219 : 4527 : auto_vec<std::pair<tree_code, gimple *> > worklist;
2220 : 4527 : auto_vec<vec<chain_op_t> > chains (group_size);
2221 : 4527 : auto_vec<slp_tree, 4> children;
2222 : 4527 : bool hard_fail = true;
2223 : 5370 : for (unsigned lane = 0; lane < group_size; ++lane)
2224 : : {
2225 : 5064 : if (!stmts[lane])
2226 : : {
2227 : : /* ??? Below we require lane zero is present. */
2228 : 0 : if (lane == 0)
2229 : : {
2230 : : hard_fail = false;
2231 : 4221 : break;
2232 : : }
2233 : 0 : chains.quick_push (vNULL);
2234 : 0 : continue;
2235 : : }
2236 : : /* For each lane linearize the addition/subtraction (or other
2237 : : uniform associatable operation) expression tree. */
2238 : 5064 : gimple *op_stmt = NULL, *other_op_stmt = NULL;
2239 : 5064 : vect_slp_linearize_chain (vinfo, worklist, chain, code,
2240 : 5064 : stmts[lane]->stmt, op_stmt, other_op_stmt,
2241 : : NULL);
2242 : 5064 : if (!op_stmt_info && op_stmt)
2243 : 3977 : op_stmt_info = vinfo->lookup_stmt (op_stmt);
2244 : 5064 : if (!other_op_stmt_info && other_op_stmt)
2245 : 727 : other_op_stmt_info = vinfo->lookup_stmt (other_op_stmt);
2246 : 5064 : if (chain.length () == 2)
2247 : : {
2248 : : /* In a chain of just two elements resort to the regular
2249 : : operand swapping scheme. Likewise if we run into a
2250 : : length mismatch, process regularly; since we did not
2251 : : process the other lanes we cannot report a good hint
2252 : : about which lanes to try swapping in the parent. */
2253 : : hard_fail = false;
2254 : : break;
2255 : : }
2256 : 843 : else if (chain_len == 0)
2257 : 343 : chain_len = chain.length ();
2258 : 1000 : else if (chain.length () != chain_len)
2259 : : {
2260 : : /* ??? Here we could slip in magic to compensate with
2261 : : neutral operands. */
2262 : 0 : matches[lane] = false;
2263 : 0 : if (lane != group_size - 1)
2264 : 0 : matches[0] = false;
2265 : : break;
2266 : : }
2267 : 843 : chains.quick_push (chain.copy ());
2268 : 843 : chain.truncate (0);
2269 : : }
2270 : 9054 : if (chains.length () == group_size)
2271 : : {
2272 : : /* We cannot yet use SLP_TREE_CODE to communicate the operation. */
2273 : 306 : if (!op_stmt_info)
2274 : : {
2275 : 15 : hard_fail = false;
2276 : 15 : goto out;
2277 : : }
2278 : : /* Now we have a set of chains with the same length. */
2279 : : /* 1. pre-sort according to def_type and operation. */
2280 : 1057 : for (unsigned lane = 0; lane < group_size; ++lane)
2281 : 1532 : chains[lane].stablesort (dt_sort_cmp, vinfo);
2282 : 291 : if (dump_enabled_p ())
2283 : : {
2284 : 123 : dump_printf_loc (MSG_NOTE, vect_location,
2285 : : "pre-sorted chains of %s\n",
2286 : : get_tree_code_name (code));
2287 : 522 : for (unsigned lane = 0; lane < group_size; ++lane)
2288 : : {
2289 : 399 : if (!stmts[lane])
2290 : 0 : dump_printf (MSG_NOTE, "--");
2291 : : else
2292 : 1822 : for (unsigned opnum = 0; opnum < chain_len; ++opnum)
2293 : 2846 : dump_printf (MSG_NOTE, "%s %T ",
2294 : 1423 : get_tree_code_name (chains[lane][opnum].code),
2295 : 1423 : chains[lane][opnum].op);
2296 : 399 : dump_printf (MSG_NOTE, "\n");
2297 : : }
2298 : : }
2299 : : /* 2. try to build children nodes, associating as necessary. */
2300 : : /* 2a. prepare and perform early checks to avoid eating into
2301 : : discovery limit unnecessarily. */
2302 : 291 : vect_def_type *dts = XALLOCAVEC (vect_def_type, chain_len);
2303 : 1226 : for (unsigned n = 0; n < chain_len; ++n)
2304 : : {
2305 : 935 : vect_def_type dt = chains[0][n].dt;
2306 : 935 : unsigned lane;
2307 : 3497 : for (lane = 0; lane < group_size; ++lane)
2308 : 5124 : if (stmts[lane] && chains[lane][n].dt != dt)
2309 : : {
2310 : 0 : if (dt == vect_constant_def
2311 : 0 : && chains[lane][n].dt == vect_external_def)
2312 : : dt = vect_external_def;
2313 : 0 : else if (dt == vect_external_def
2314 : 0 : && chains[lane][n].dt == vect_constant_def)
2315 : : ;
2316 : : else
2317 : : break;
2318 : : }
2319 : 935 : if (lane != group_size)
2320 : : {
2321 : 0 : if (dump_enabled_p ())
2322 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
2323 : : "giving up on chain due to mismatched "
2324 : : "def types\n");
2325 : 0 : matches[lane] = false;
2326 : 0 : if (lane != group_size - 1)
2327 : 0 : matches[0] = false;
2328 : 0 : goto out;
2329 : : }
2330 : 935 : dts[n] = dt;
2331 : 935 : if (dt == vect_constant_def
2332 : 935 : || dt == vect_external_def)
2333 : : {
2334 : : /* Check whether we can build the invariant. If we can't
2335 : : we never will be able to. */
2336 : 78 : tree type = TREE_TYPE (chains[0][n].op);
2337 : 935 : if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
2338 : : && (TREE_CODE (type) == BOOLEAN_TYPE
2339 : : || !can_duplicate_and_interleave_p (vinfo, group_size,
2340 : : type)))
2341 : : {
2342 : : matches[0] = false;
2343 : : goto out;
2344 : : }
2345 : : }
2346 : 857 : else if (dt != vect_internal_def)
2347 : : {
2348 : : /* Not sure, we might need something special.
2349 : : gcc.dg/vect/pr96854.c,
2350 : : gfortran.dg/vect/fast-math-pr37021.f90
2351 : : and gfortran.dg/vect/pr61171.f trigger. */
2352 : : /* Soft-fail for now. */
2353 : 0 : hard_fail = false;
2354 : 0 : goto out;
2355 : : }
2356 : : }
2357 : : /* 2b. do the actual build. */
2358 : 1144 : for (unsigned n = 0; n < chain_len; ++n)
2359 : : {
2360 : 880 : vect_def_type dt = dts[n];
2361 : 880 : unsigned lane;
2362 : 880 : if (dt == vect_constant_def
2363 : 880 : || dt == vect_external_def)
2364 : : {
2365 : 78 : vec<tree> ops;
2366 : 78 : ops.create (group_size);
2367 : 387 : for (lane = 0; lane < group_size; ++lane)
2368 : 231 : if (stmts[lane])
2369 : 231 : ops.quick_push (chains[lane][n].op);
2370 : : else
2371 : 0 : ops.quick_push (NULL_TREE);
2372 : 78 : slp_tree child = vect_create_new_slp_node (ops);
2373 : 78 : SLP_TREE_DEF_TYPE (child) = dt;
2374 : 78 : children.safe_push (child);
2375 : : }
2376 : : else
2377 : : {
2378 : 802 : vec<stmt_vec_info> op_stmts;
2379 : 802 : op_stmts.create (group_size);
2380 : 802 : slp_tree child = NULL;
2381 : : /* Brute-force our way. We have to consider a lane
2382 : : failing after fixing an earlier fail up in the
2383 : : SLP discovery recursion. So track the current
2384 : : permute per lane. */
2385 : 802 : unsigned *perms = XALLOCAVEC (unsigned, group_size);
2386 : 802 : memset (perms, 0, sizeof (unsigned) * group_size);
2387 : 931 : do
2388 : : {
2389 : 931 : op_stmts.truncate (0);
2390 : 4437 : for (lane = 0; lane < group_size; ++lane)
2391 : 2575 : if (stmts[lane])
2392 : 2575 : op_stmts.quick_push
2393 : 2575 : (vinfo->lookup_def (chains[lane][n].op));
2394 : : else
2395 : 0 : op_stmts.quick_push (NULL);
2396 : 931 : child = vect_build_slp_tree (vinfo, op_stmts,
2397 : : group_size, &this_max_nunits,
2398 : : matches, limit,
2399 : : &this_tree_size, bst_map);
2400 : : /* ??? We're likely getting too many fatal mismatches
2401 : : here so maybe we want to ignore them (but then we
2402 : : have no idea which lanes fatally mismatched). */
2403 : 931 : if (child || !matches[0])
2404 : : break;
2405 : : /* Swap another lane we have not yet matched up into
2406 : : lanes that did not match. If we run out of
2407 : : permute possibilities for a lane terminate the
2408 : : search. */
2409 : 405 : bool term = false;
2410 : 405 : for (lane = 1; lane < group_size; ++lane)
2411 : 276 : if (!matches[lane])
2412 : : {
2413 : 220 : if (n + perms[lane] + 1 == chain_len)
2414 : : {
2415 : : term = true;
2416 : : break;
2417 : : }
2418 : 193 : if (dump_enabled_p ())
2419 : 119 : dump_printf_loc (MSG_NOTE, vect_location,
2420 : : "swapping operand %d and %d "
2421 : : "of lane %d\n",
2422 : : n, n + perms[lane] + 1, lane);
2423 : 386 : std::swap (chains[lane][n],
2424 : 193 : chains[lane][n + perms[lane] + 1]);
2425 : 193 : perms[lane]++;
2426 : : }
2427 : 156 : if (term)
2428 : : break;
2429 : : }
2430 : : while (1);
2431 : 802 : if (!child)
2432 : : {
2433 : 27 : if (dump_enabled_p ())
2434 : 20 : dump_printf_loc (MSG_NOTE, vect_location,
2435 : : "failed to match up op %d\n", n);
2436 : 27 : op_stmts.release ();
2437 : 27 : if (lane != group_size - 1)
2438 : 12 : matches[0] = false;
2439 : : else
2440 : 15 : matches[lane] = false;
2441 : 27 : goto out;
2442 : : }
2443 : 775 : if (dump_enabled_p ())
2444 : : {
2445 : 312 : dump_printf_loc (MSG_NOTE, vect_location,
2446 : : "matched up op %d to\n", n);
2447 : 312 : vect_print_slp_tree (MSG_NOTE, vect_location, child);
2448 : : }
2449 : 775 : children.safe_push (child);
2450 : : }
2451 : : }
2452 : : /* 3. build SLP nodes to combine the chain. */
2453 : 936 : for (unsigned lane = 0; lane < group_size; ++lane)
2454 : 1360 : if (stmts[lane] && chains[lane][0].code != code)
2455 : : {
2456 : : /* See if there's any alternate all-PLUS entry. */
2457 : : unsigned n;
2458 : 8 : for (n = 1; n < chain_len; ++n)
2459 : : {
2460 : 36 : for (lane = 0; lane < group_size; ++lane)
2461 : 56 : if (stmts[lane] && chains[lane][n].code != code)
2462 : : break;
2463 : 8 : if (lane == group_size)
2464 : : break;
2465 : : }
2466 : 8 : if (n != chain_len)
2467 : : {
2468 : : /* Swap that in at first position. */
2469 : 8 : std::swap (children[0], children[n]);
2470 : 36 : for (lane = 0; lane < group_size; ++lane)
2471 : 28 : if (stmts[lane])
2472 : 28 : std::swap (chains[lane][0], chains[lane][n]);
2473 : : }
2474 : : else
2475 : : {
2476 : : /* ??? When this triggers and we end up with two
2477 : : vect_constant/external_def up-front things break (ICE)
2478 : : spectacularly finding an insertion place for the
2479 : : all-constant op. We should have a fully
2480 : : vect_internal_def operand though(?) so we can swap
2481 : : that into first place and then prepend the all-zero
2482 : : constant. */
2483 : 0 : if (dump_enabled_p ())
2484 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
2485 : : "inserting constant zero to compensate "
2486 : : "for (partially) negated first "
2487 : : "operand\n");
2488 : 0 : chain_len++;
2489 : 0 : for (lane = 0; lane < group_size; ++lane)
2490 : 0 : if (stmts[lane])
2491 : 0 : chains[lane].safe_insert
2492 : 0 : (0, chain_op_t (code, vect_constant_def, NULL_TREE));
2493 : 0 : vec<tree> zero_ops;
2494 : 0 : zero_ops.create (group_size);
2495 : 0 : zero_ops.quick_push (build_zero_cst (TREE_TYPE (vectype)));
2496 : 0 : for (lane = 1; lane < group_size; ++lane)
2497 : 0 : if (stmts[lane])
2498 : 0 : zero_ops.quick_push (zero_ops[0]);
2499 : : else
2500 : 0 : zero_ops.quick_push (NULL_TREE);
2501 : 0 : slp_tree zero = vect_create_new_slp_node (zero_ops);
2502 : 0 : SLP_TREE_DEF_TYPE (zero) = vect_constant_def;
2503 : 0 : children.safe_insert (0, zero);
2504 : : }
2505 : : break;
2506 : : }
2507 : 852 : for (unsigned i = 1; i < children.length (); ++i)
2508 : : {
2509 : 588 : slp_tree op0 = children[i - 1];
2510 : 588 : slp_tree op1 = children[i];
2511 : 588 : bool this_two_op = false;
2512 : 2011 : for (unsigned lane = 0; lane < group_size; ++lane)
2513 : 3176 : if (stmts[lane] && chains[lane][i].code != chains[0][i].code)
2514 : : {
2515 : : this_two_op = true;
2516 : : break;
2517 : : }
2518 : 588 : slp_tree child;
2519 : 588 : if (i == children.length () - 1)
2520 : 264 : child = vect_create_new_slp_node (node, stmts, 2);
2521 : : else
2522 : 324 : child = vect_create_new_slp_node (2, ERROR_MARK);
2523 : 588 : if (this_two_op)
2524 : : {
2525 : 165 : vec<std::pair<unsigned, unsigned> > lperm;
2526 : 165 : lperm.create (group_size);
2527 : 603 : for (unsigned lane = 0; lane < group_size; ++lane)
2528 : 438 : lperm.quick_push (std::make_pair
2529 : 438 : (chains[lane][i].code != chains[0][i].code, lane));
2530 : 330 : vect_slp_build_two_operator_nodes (child, vectype, op0, op1,
2531 : 165 : (chains[0][i].code == code
2532 : : ? op_stmt_info
2533 : : : other_op_stmt_info),
2534 : 165 : (chains[0][i].code == code
2535 : : ? other_op_stmt_info
2536 : : : op_stmt_info),
2537 : : lperm);
2538 : : }
2539 : : else
2540 : : {
2541 : 423 : SLP_TREE_DEF_TYPE (child) = vect_internal_def;
2542 : 423 : SLP_TREE_VECTYPE (child) = vectype;
2543 : 423 : SLP_TREE_LANES (child) = group_size;
2544 : 423 : SLP_TREE_CHILDREN (child).quick_push (op0);
2545 : 423 : SLP_TREE_CHILDREN (child).quick_push (op1);
2546 : 423 : SLP_TREE_REPRESENTATIVE (child)
2547 : 846 : = (chains[0][i].code == code
2548 : 423 : ? op_stmt_info : other_op_stmt_info);
2549 : : }
2550 : 588 : children[i] = child;
2551 : : }
2552 : 264 : *tree_size += this_tree_size + 1;
2553 : 264 : *max_nunits = this_max_nunits;
2554 : 1255 : while (!chains.is_empty ())
2555 : 700 : chains.pop ().release ();
2556 : : return node;
2557 : : }
2558 : 4221 : out:
2559 : 4263 : if (dump_enabled_p ())
2560 : 3126 : dump_printf_loc (MSG_NOTE, vect_location,
2561 : : "failed to line up SLP graph by re-associating "
2562 : : "operations in lanes%s\n",
2563 : : !hard_fail ? " trying regular discovery" : "");
2564 : 4264 : while (!children.is_empty ())
2565 : 1 : vect_free_slp_tree (children.pop ());
2566 : 4406 : while (!chains.is_empty ())
2567 : 143 : chains.pop ().release ();
2568 : : /* Hard-fail, otherwise we might run into quadratic processing of the
2569 : : chains starting one stmt into the chain again. */
2570 : 4263 : if (hard_fail)
2571 : : return NULL;
2572 : : /* Fall thru to normal processing. */
2573 : 4527 : }
2574 : :
2575 : : /* Get at the operands, verifying they are compatible. */
2576 : 3374263 : vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
2577 : 3374263 : slp_oprnd_info oprnd_info;
2578 : 16026626 : FOR_EACH_VEC_ELT (stmts, i, stmt_info)
2579 : : {
2580 : 12654993 : int res = vect_get_and_check_slp_defs (vinfo, swap[i], skip_args,
2581 : : stmts, i, &oprnds_info);
2582 : 12654993 : if (res != 0)
2583 : 597010 : matches[(res == -1) ? 0 : i] = false;
2584 : 12654993 : if (!matches[0])
2585 : : break;
2586 : : }
2587 : 15689143 : for (i = 0; i < group_size; ++i)
2588 : 12548085 : if (!matches[i])
2589 : : {
2590 : 233205 : vect_free_oprnd_info (oprnds_info);
2591 : 233205 : return NULL;
2592 : : }
2593 : 9423174 : swap = NULL;
2594 : :
2595 : 9423174 : bool has_two_operators_perm = false;
2596 : 18846348 : auto_vec<unsigned> two_op_perm_indices[2];
2597 : 3141058 : vec<stmt_vec_info> two_op_scalar_stmts[2] = {vNULL, vNULL};
2598 : :
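 : : /* For a two-operators node whose two operand sets are in fact
 : : drawn from a single set of GROUP_SIZE definition statements,
 : : canonicalize to one operand vector and record per-operand
 : : permute indices to recover the original lanes. */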
2599 : 3152215 : if (two_operators && oprnds_info.length () == 2 && group_size > 2)
2600 : : {
2601 : 1351 : unsigned idx = 0;
2602 : 1351 : hash_map<gimple *, unsigned> seen;
2603 : 1351 : vec<slp_oprnd_info> new_oprnds_info
2604 : 1351 : = vect_create_oprnd_info (1, group_size);
2605 : 1351 : bool success = true;
2606 : :
2607 : 1351 : enum tree_code code = ERROR_MARK;
2608 : 1351 : if (oprnds_info[0]->def_stmts[0]
2609 : 1351 : && is_a<gassign *> (oprnds_info[0]->def_stmts[0]->stmt))
2610 : 1250 : code = gimple_assign_rhs_code (oprnds_info[0]->def_stmts[0]->stmt);
2611 : :
2612 : 4047 : for (unsigned j = 0; j < group_size; ++j)
2613 : : {
2614 : 9891 : FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2615 : : {
2616 : 7195 : stmt_vec_info stmt_info = oprnd_info->def_stmts[j];
2617 : 6959 : if (!stmt_info || !stmt_info->stmt
2618 : 6959 : || !is_a<gassign *> (stmt_info->stmt)
2619 : 6956 : || gimple_assign_rhs_code (stmt_info->stmt) != code
2620 : 13359 : || skip_args[i])
2621 : : {
2622 : 1031 : success = false;
2623 : 1031 : break;
2624 : : }
2625 : :
2626 : 6164 : bool exists;
2627 : 6164 : unsigned &stmt_idx
2628 : 6164 : = seen.get_or_insert (stmt_info->stmt, &exists);
2629 : :
2630 : 6164 : if (!exists)
2631 : : {
2632 : 5204 : new_oprnds_info[0]->def_stmts.safe_push (stmt_info);
2633 : 5204 : new_oprnds_info[0]->ops.safe_push (oprnd_info->ops[j]);
2634 : 5204 : stmt_idx = idx;
2635 : 5204 : idx++;
2636 : : }
2637 : :
2638 : 6164 : two_op_perm_indices[i].safe_push (stmt_idx);
2639 : : }
2640 : :
2641 : 3727 : if (!success)
2642 : : break;
2643 : : }
2644 : :
2645 : 1351 : if (success && idx == group_size)
2646 : : {
2647 : 43 : if (dump_enabled_p ())
2648 : : {
2649 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
2650 : : "Replace two_operators operands:\n");
2651 : :
2652 : 0 : FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2653 : : {
2654 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
2655 : : "Operand %u:\n", i);
2656 : 0 : for (unsigned j = 0; j < group_size; j++)
2657 : 0 : dump_printf_loc (MSG_NOTE, vect_location, "\tstmt %u %G",
2658 : 0 : j, oprnd_info->def_stmts[j]->stmt);
2659 : : }
2660 : :
2661 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
2662 : : "With a single operand:\n");
2663 : 0 : for (unsigned j = 0; j < group_size; j++)
2664 : 0 : dump_printf_loc (MSG_NOTE, vect_location, "\tstmt %u %G",
2665 : 0 : j, new_oprnds_info[0]->def_stmts[j]->stmt);
2666 : : }
2667 : :
2668 : 43 : two_op_scalar_stmts[0].safe_splice (oprnds_info[0]->def_stmts);
2669 : 43 : two_op_scalar_stmts[1].safe_splice (oprnds_info[1]->def_stmts);
2670 : :
2671 : 43 : new_oprnds_info[0]->first_op_type = oprnds_info[0]->first_op_type;
2672 : 43 : new_oprnds_info[0]->first_dt = oprnds_info[0]->first_dt;
2673 : 43 : new_oprnds_info[0]->any_pattern = oprnds_info[0]->any_pattern;
2674 : 43 : new_oprnds_info[0]->first_gs_p = oprnds_info[0]->first_gs_p;
2675 : 43 : new_oprnds_info[0]->first_gs_info = oprnds_info[0]->first_gs_info;
2676 : :
2677 : 43 : vect_free_oprnd_info (oprnds_info);
2678 : 43 : oprnds_info = new_oprnds_info;
2679 : 43 : nops = 1;
2680 : 43 : has_two_operators_perm = true;
2681 : : }
2682 : : else
2683 : 1308 : vect_free_oprnd_info (new_oprnds_info);
2684 : 1351 : }
2685 : :
2686 : 6282116 : auto_vec<slp_tree, 4> children;
2687 : :
2688 : 3141058 : stmt_info = stmts[0];
2689 : :
2690 : : /* Create SLP_TREE nodes for the definition node/s. */
2691 : 7922402 : FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2692 : : {
2693 : 5032406 : slp_tree child = nullptr;
2694 : 5032406 : unsigned int j;
2695 : :
2696 : : /* We're skipping certain operands from processing, for example
2697 : : outer loop reduction initial defs. */
2698 : 5032406 : if (skip_args[i])
2699 : : {
2700 : 733182 : children.safe_push (NULL);
2701 : 4781344 : continue;
2702 : : }
2703 : :
2704 : 4299224 : if (oprnd_info->first_dt == vect_uninitialized_def)
2705 : : {
2706 : : /* COND_EXPRs can have one operand too many if the condition
2707 : : is an SSA name. */
2708 : 0 : gcc_assert (i == 3 && nops == 4);
2709 : 0 : continue;
2710 : : }
2711 : :
2712 : 4299224 : if (is_a <bb_vec_info> (vinfo)
2713 : 1656590 : && oprnd_info->first_dt == vect_internal_def
2714 : 5198102 : && !oprnd_info->any_pattern)
2715 : : {
2716 : : /* For BB vectorization, if all defs are the same do not
2717 : : bother to continue the build along the single-lane
2718 : : graph but use a splat of the scalar value. */
2719 : 855359 : stmt_vec_info first_def = oprnd_info->def_stmts[0];
2720 : 909812 : for (j = 1; j < group_size; ++j)
2721 : 864195 : if (oprnd_info->def_stmts[j] != first_def)
2722 : : break;
2723 : 855359 : if (j == group_size
2724 : : /* But avoid doing this for loads where we may be
2725 : : able to CSE things, unless the stmt is not
2726 : : vectorizable. */
2727 : 855359 : && (!STMT_VINFO_VECTORIZABLE (first_def)
2728 : 56825 : || !gimple_vuse (first_def->stmt)))
2729 : : {
2730 : 36871 : if (dump_enabled_p ())
2731 : 94 : dump_printf_loc (MSG_NOTE, vect_location,
2732 : : "Using a splat of the uniform operand %G",
2733 : : first_def->stmt);
2734 : 36871 : oprnd_info->first_dt = vect_external_def;
2735 : : }
2736 : : }
2737 : :
2738 : 4299224 : if (oprnd_info->first_dt == vect_external_def
2739 : 4299224 : || oprnd_info->first_dt == vect_constant_def)
2740 : : {
2741 : 1562066 : if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ())
2742 : : {
2743 : : tree op0;
2744 : : tree uniform_val = op0 = oprnd_info->ops[0];
2745 : : for (j = 1; j < oprnd_info->ops.length (); ++j)
2746 : : if (oprnd_info->ops[j]
2747 : : && !operand_equal_p (uniform_val, oprnd_info->ops[j]))
2748 : : {
2749 : : uniform_val = NULL_TREE;
2750 : : break;
2751 : : }
2752 : : if (!uniform_val
2753 : : && !can_duplicate_and_interleave_p (vinfo,
2754 : : oprnd_info->ops.length (),
2755 : : TREE_TYPE (op0)))
2756 : : {
2757 : : matches[j] = false;
2758 : : if (dump_enabled_p ())
2759 : : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2760 : : "Build SLP failed: invalid type of def "
2761 : : "for variable-length SLP %T\n", op0);
2762 : : goto fail;
2763 : : }
2764 : : }
2765 : 1562066 : slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
2766 : 1562066 : SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
2767 : 1562066 : oprnd_info->ops = vNULL;
2768 : 1562066 : children.safe_push (invnode);
2769 : 1562066 : continue;
2770 : 1562066 : }
2771 : :
2772 : : /* When we have a masked load with uniform mask discover this
2773 : : as a single-lane mask with a splat permute. This way we can
2774 : : recognize this as a masked load-lane by stripping the splat. */
2775 : 2737158 : if (is_a <gcall *> (STMT_VINFO_STMT (stmt_info))
2776 : 33366 : && gimple_call_internal_p (STMT_VINFO_STMT (stmt_info),
2777 : : IFN_MASK_LOAD)
2778 : 4806 : && STMT_VINFO_GROUPED_ACCESS (stmt_info)
2779 : 2737190 : && ! STMT_VINFO_SLP_VECT_ONLY (DR_GROUP_FIRST_ELEMENT (stmt_info)))
2780 : : {
2781 : 0 : vec<stmt_vec_info> def_stmts2;
2782 : 0 : def_stmts2.create (1);
2783 : 0 : def_stmts2.quick_push (oprnd_info->def_stmts[0]);
2784 : 0 : child = vect_build_slp_tree (vinfo, def_stmts2, 1,
2785 : : &this_max_nunits,
2786 : : matches, limit,
2787 : : &this_tree_size, bst_map);
2788 : 0 : if (child)
2789 : : {
2790 : 0 : slp_tree pnode = vect_create_new_slp_node (1, VEC_PERM_EXPR);
2791 : 0 : SLP_TREE_VECTYPE (pnode) = SLP_TREE_VECTYPE (child);
2792 : 0 : SLP_TREE_LANES (pnode) = group_size;
2793 : 0 : SLP_TREE_SCALAR_STMTS (pnode).create (group_size);
2794 : 0 : SLP_TREE_LANE_PERMUTATION (pnode).create (group_size);
2795 : 0 : for (unsigned k = 0; k < group_size; ++k)
2796 : : {
2797 : 0 : SLP_TREE_SCALAR_STMTS (pnode)
2798 : 0 : .quick_push (oprnd_info->def_stmts[0]);
2799 : 0 : SLP_TREE_LANE_PERMUTATION (pnode)
2800 : 0 : .quick_push (std::make_pair (0u, 0u));
2801 : : }
2802 : 0 : SLP_TREE_CHILDREN (pnode).quick_push (child);
2803 : 0 : pnode->max_nunits = child->max_nunits;
2804 : 0 : children.safe_push (pnode);
2805 : 0 : oprnd_info->def_stmts = vNULL;
2806 : 0 : continue;
2807 : 0 : }
2808 : : else
2809 : 0 : def_stmts2.release ();
2810 : : }
2811 : :
2812 : 2737158 : if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2813 : : group_size, &this_max_nunits,
2814 : : matches, limit,
2815 : : &this_tree_size, bst_map)) != NULL)
2816 : : {
2817 : 2218329 : oprnd_info->def_stmts = vNULL;
2818 : 2218329 : children.safe_push (child);
2819 : 2218329 : continue;
2820 : : }
2821 : :
2822 : : /* If the SLP build for operand zero failed and operands zero
2823 : : and one can be commuted, try that for the scalar stmts
2824 : : that failed the match. */
2825 : 518829 : if (i == 0
2826 : : /* A first scalar stmt mismatch signals a fatal mismatch. */
2827 : 381835 : && matches[0]
2828 : : /* ??? For COND_EXPRs we can swap the comparison operands
2829 : : as well as the arms under some constraints. */
2830 : 228030 : && nops == 2
2831 : 127098 : && oprnds_info[1]->first_dt == vect_internal_def
2832 : 73254 : && is_gimple_assign (stmt_info->stmt)
2833 : : /* Swapping operands for reductions breaks assumptions later on. */
2834 : 573498 : && STMT_VINFO_REDUC_IDX (stmt_info) == -1)
2835 : : {
2836 : : /* See whether we can swap the matching or the non-matching
2837 : : stmt operands. */
2838 : : bool swap_not_matching = true;
2839 : 66692 : do
2840 : : {
2841 : 7053232 : for (j = 0; j < group_size; ++j)
2842 : : {
2843 : 7009307 : if (matches[j] != !swap_not_matching)
2844 : 74109 : continue;
2845 : 6935198 : stmt_vec_info stmt_info = stmts[j];
2846 : : /* Verify if we can swap operands of this stmt. */
2847 : 6935198 : gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
2848 : 6935198 : if (!stmt
2849 : 6935198 : || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
2850 : : {
2851 : 22767 : if (!swap_not_matching)
2852 : 10600 : goto fail;
2853 : : swap_not_matching = false;
2854 : : break;
2855 : : }
2856 : : }
2857 : : }
2858 : 56092 : while (j != group_size);
2859 : :
2860 : : /* Swap mismatched definition stmts. */
2861 : 43925 : if (dump_enabled_p ())
2862 : 350 : dump_printf_loc (MSG_NOTE, vect_location,
2863 : : "Re-trying with swapped operands of stmts ");
2864 : 7017521 : for (j = 0; j < group_size; ++j)
2865 : 6973596 : if (matches[j] == !swap_not_matching)
2866 : : {
2867 : 13824598 : std::swap (oprnds_info[0]->def_stmts[j],
2868 : 6912299 : oprnds_info[1]->def_stmts[j]);
2869 : 13824598 : std::swap (oprnds_info[0]->ops[j],
2870 : 6912299 : oprnds_info[1]->ops[j]);
2871 : 6912299 : if (dump_enabled_p ())
2872 : 973 : dump_printf (MSG_NOTE, "%d ", j);
2873 : : }
2874 : 43925 : if (dump_enabled_p ())
2875 : 350 : dump_printf (MSG_NOTE, "\n");
2876 : : /* After swapping some operands we lost track whether an
2877 : : operand has any pattern defs so be conservative here. */
2878 : 84972 : if (oprnds_info[0]->any_pattern || oprnds_info[1]->any_pattern)
2879 : 3045 : oprnds_info[0]->any_pattern = oprnds_info[1]->any_pattern = true;
2880 : : /* And try again with scratch 'matches' ... */
2881 : 43925 : bool *tem = XALLOCAVEC (bool, group_size);
2882 : 43925 : if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2883 : : group_size, &this_max_nunits,
2884 : : tem, limit,
2885 : : &this_tree_size, bst_map)) != NULL)
2886 : : {
2887 : 7527 : oprnd_info->def_stmts = vNULL;
2888 : 7527 : children.safe_push (child);
2889 : 7527 : continue;
2890 : : }
2891 : : }
2892 : 511302 : fail:
2893 : :
2894 : : /* If the SLP build failed and we analyze a basic-block
2895 : : simply treat nodes we fail to build as externally defined
2896 : : (and thus build vectors from the scalar defs).
2897 : : The cost model will reject outright expensive cases.
2898 : : ??? This doesn't treat cases where permutation ultimately
2899 : : fails (or we don't try permutation below). Ideally we'd
2900 : : even compute a permutation that will end up with the maximum
2901 : : SLP tree size... */
2902 : 511302 : if (is_a <bb_vec_info> (vinfo)
2903 : : /* ??? Rejecting patterns this way doesn't work. We'd have to
2904 : : do extra work to cancel the pattern so the uses see the
2905 : : scalar version. */
2906 : 472287 : && !is_pattern_stmt_p (stmt_info)
2907 : 956818 : && !oprnd_info->any_pattern)
2908 : : {
2909                 :             :           /* But if there's a leading vector-sized set of matching stmts,
2910                 :             :              fail here so we can split the group.  This matches the condition
2911 : : vect_analyze_slp_instance uses. */
2912 : : /* ??? We might want to split here and combine the results to support
2913 : : multiple vector sizes better. */
2914 : 727669 : for (j = 0; j < group_size; ++j)
2915 : 727669 : if (!matches[j])
2916 : : break;
2917 : 563681 : if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype))
2918 : 445184 : && vect_slp_can_convert_to_external (oprnd_info->def_stmts))
2919 : : {
2920 : 260240 : if (dump_enabled_p ())
2921 : 445 : dump_printf_loc (MSG_NOTE, vect_location,
2922 : : "Building vector operands from scalars\n");
2923 : 260240 : this_tree_size++;
2924 : 260240 : child = vect_create_new_slp_node (oprnd_info->ops);
2925 : 260240 : children.safe_push (child);
2926 : 260240 : oprnd_info->ops = vNULL;
2927 : 260240 : continue;
2928 : : }
2929 : : }
2930 : :
2931 : 251062 : gcc_assert (child == NULL);
2932 : 293036 : FOR_EACH_VEC_ELT (children, j, child)
2933 : 41974 : if (child)
2934 : 41974 : vect_free_slp_tree (child);
2935 : 251062 : vect_free_oprnd_info (oprnds_info);
2936 : 251062 : return NULL;
2937 : : }
2938 : :
2939 : 2889996 : vect_free_oprnd_info (oprnds_info);
2940 : :
2941                 :             :   /* If all children of this node are built up from uniform scalars, or
2942                 :             :      if building it requires more than one possibly expensive vector
2943                 :             :      construction, then throw it away, causing it to be built up from scalars.
2944 : : The exception is the SLP node for the vector store. */
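                :             :   /* For example, a node computing { x + 1, x + 1, x + 1, x + 1 } whose
                :             :      children are the uniform vectors { x, x, x, x } and { 1, 1, 1, 1 }
                :             :      is discarded here; the parent is instead built from its scalar
                :             :      defs, which is assumed cheaper.  */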
2945 : 2889996 : if (is_a <bb_vec_info> (vinfo)
2946 : 1045873 : && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
2947 : : /* ??? Rejecting patterns this way doesn't work. We'd have to
2948 : : do extra work to cancel the pattern so the uses see the
2949 : : scalar version. */
2950 : 3277301 : && !is_pattern_stmt_p (stmt_info))
2951 : : {
2952 : : slp_tree child;
2953 : : unsigned j;
2954 : : bool all_uniform_p = true;
2955 : : unsigned n_vector_builds = 0;
2956 : 1073043 : FOR_EACH_VEC_ELT (children, j, child)
2957 : : {
2958 : 709485 : if (!child)
2959 : : ;
2960 : 709485 : else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2961 : : all_uniform_p = false;
2962 : 493665 : else if (!vect_slp_tree_uniform_p (child))
2963 : : {
2964 : 359612 : all_uniform_p = false;
2965 : 359612 : if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
2966 : 326344 : n_vector_builds++;
2967 : : }
2968 : : }
2969 : 363558 : if (all_uniform_p
2970 : 363558 : || n_vector_builds > 1
2971 : 631614 : || (n_vector_builds == children.length ()
2972 : 22288 : && is_a <gphi *> (stmt_info->stmt)))
2973 : : {
2974 : : /* Roll back. */
2975 : 101652 : matches[0] = false;
2976 : 313899 : FOR_EACH_VEC_ELT (children, j, child)
2977 : 212247 : if (child)
2978 : 212247 : vect_free_slp_tree (child);
2979 : :
2980 : 101652 : if (dump_enabled_p ())
2981 : 114 : dump_printf_loc (MSG_NOTE, vect_location,
2982 : : "Building parent vector operands from "
2983 : : "scalars instead\n");
2984 : 101652 : return NULL;
2985 : : }
2986 : : }
2987 : :
2988 : 2788344 : *tree_size += this_tree_size + 1;
2989 : 2788344 : *max_nunits = this_max_nunits;
2990 : :
2991 : 2788344 : if (two_operators)
2992 : : {
2993                 :             :       /* ??? We'd likely want to either cache in bst_map something like
2994 : : { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
2995 : : the true { a+b, a+b, a+b, a+b } ... but there we don't have
2996 : : explicit stmts to put in so the keying on 'stmts' doesn't
2997 : : work (but we have the same issue with nodes that use 'ops'). */
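                :             :       /* For example, for the group { a0 = x0 + y0; a1 = x1 - y1 } we
                :             :          build node 'one' computing all lanes with PLUS_EXPR and node
                :             :          'two' computing all lanes with MINUS_EXPR, both over the same
                :             :          children, and blend them with a VEC_PERM_EXPR that selects
                :             :          lane 0 from 'one' and lane 1 from 'two'.  */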
2998 : :
2999 : 3911 : if (has_two_operators_perm)
3000 : : {
3001 : 23 : slp_tree child = children[0];
3002 : 23 : children.truncate (0);
3003 : 69 : for (i = 0; i < 2; i++)
3004 : : {
3005 : 46 : slp_tree pnode
3006 : 46 : = vect_create_new_slp_node (two_op_scalar_stmts[i], 2);
3007 : 46 : SLP_TREE_CODE (pnode) = VEC_PERM_EXPR;
3008 : 46 : SLP_TREE_VECTYPE (pnode) = vectype;
3009 : 46 : SLP_TREE_CHILDREN (pnode).quick_push (child);
3010 : 46 : SLP_TREE_CHILDREN (pnode).quick_push (child);
3011 : 46 : lane_permutation_t& perm = SLP_TREE_LANE_PERMUTATION (pnode);
3012 : 46 : children.safe_push (pnode);
3013 : :
3014 : 486 : for (unsigned j = 0; j < stmts.length (); j++)
3015 : 440 : perm.safe_push (std::make_pair (0, two_op_perm_indices[i][j]));
3016 : : }
3017 : :
3018 : 23 : SLP_TREE_REF_COUNT (child) += 4;
3019 : : }
3020 : :
3021 : 3911 : slp_tree one = new _slp_tree;
3022 : 3911 : slp_tree two = new _slp_tree;
3023 : 3911 : SLP_TREE_DEF_TYPE (one) = vect_internal_def;
3024 : 3911 : SLP_TREE_DEF_TYPE (two) = vect_internal_def;
3025 : 3911 : SLP_TREE_VECTYPE (one) = vectype;
3026 : 3911 : SLP_TREE_VECTYPE (two) = vectype;
3027 : 3911 : SLP_TREE_CHILDREN (one).safe_splice (children);
3028 : 3911 : SLP_TREE_CHILDREN (two).safe_splice (children);
3029 : 3911 : slp_tree child;
3030 : 15644 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
3031 : 7822 : SLP_TREE_REF_COUNT (child)++;
3032 : :
3033 : : /* Here we record the original defs since this
3034 : : node represents the final lane configuration. */
3035 : 3911 : node = vect_create_new_slp_node (node, stmts, 2);
3036 : 3911 : SLP_TREE_VECTYPE (node) = vectype;
3037 : 3911 : SLP_TREE_CODE (node) = VEC_PERM_EXPR;
3038 : 3911 : SLP_TREE_CHILDREN (node).quick_push (one);
3039 : 3911 : SLP_TREE_CHILDREN (node).quick_push (two);
3040 : 3911 : gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
3041 : 3911 : enum tree_code code0 = gimple_assign_rhs_code (stmt);
3042 : 3911 : enum tree_code ocode = ERROR_MARK;
3043 : 3911 : stmt_vec_info ostmt_info;
3044 : 3911 : unsigned j = 0;
3045 : 14717 : FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
3046 : : {
3047 : 10806 : gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
3048 : 10806 : if (gimple_assign_rhs_code (ostmt) != code0)
3049 : : {
3050 : 5400 : SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
3051 : 5400 : ocode = gimple_assign_rhs_code (ostmt);
3052 : 5400 : j = i;
3053 : : }
3054 : : else
3055 : 5406 : SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
3056 : : }
3057 : :
3058 : 3911 : SLP_TREE_CODE (one) = code0;
3059 : 3911 : SLP_TREE_CODE (two) = ocode;
3060 : 3911 : SLP_TREE_LANES (one) = stmts.length ();
3061 : 3911 : SLP_TREE_LANES (two) = stmts.length ();
3062 : 3911 : SLP_TREE_REPRESENTATIVE (one) = stmts[0];
3063 : 3911 : SLP_TREE_REPRESENTATIVE (two) = stmts[j];
3064 : :
3065 : 3911 : return node;
3066 : : }
3067 : :
3068 : 2784433 : node = vect_create_new_slp_node (node, stmts, nops);
3069 : 2784433 : SLP_TREE_VECTYPE (node) = vectype;
3070 : 2784433 : SLP_TREE_CHILDREN (node).splice (children);
3071 : 2784433 : return node;
3072 : 9423174 : }
3073 : :
3074 : : /* Dump a single SLP tree NODE. */
3075 : :
3076 : : static void
3077 : 470996 : vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
3078 : : slp_tree node)
3079 : : {
3080 : 470996 : unsigned i, j;
3081 : 470996 : slp_tree child;
3082 : 470996 : stmt_vec_info stmt_info;
3083 : 470996 : tree op;
3084 : :
3085 : 470996 : dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
3086 : 470996 : dump_user_location_t user_loc = loc.get_user_location ();
3087 : 470996 : dump_printf_loc (metadata, user_loc,
3088 : : "node%s %p (max_nunits=" HOST_WIDE_INT_PRINT_UNSIGNED
3089 : : ", refcnt=%u)",
3090 : 470996 : SLP_TREE_DEF_TYPE (node) == vect_external_def
3091 : : ? " (external)"
3092 : : : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
3093 : 455159 : ? " (constant)"
3094 : : : ""), (void *) node,
3095 : 470996 : estimated_poly_value (node->max_nunits),
3096 : : SLP_TREE_REF_COUNT (node));
3097 : 470996 : if (SLP_TREE_VECTYPE (node))
3098 : 394575 : dump_printf (metadata, " %T", SLP_TREE_VECTYPE (node));
3099 : 470996 : dump_printf (metadata, "\n");
3100 : 470996 : if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
3101 : : {
3102 : 380763 : if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
3103 : 14846 : dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
3104 : : else
3105 : 365917 : dump_printf_loc (metadata, user_loc, "op template: %G",
3106 : 365917 : SLP_TREE_REPRESENTATIVE (node)->stmt);
3107 : : }
3108 : 470996 : if (SLP_TREE_SCALAR_STMTS (node).exists ())
3109 : 896686 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3110 : 524260 : if (stmt_info)
3111 : 516740 : dump_printf_loc (metadata, user_loc, "\t%sstmt %u %G",
3112 : 516740 : STMT_VINFO_LIVE_P (stmt_info) ? "[l] " : "",
3113 : : i, stmt_info->stmt);
3114 : : else
3115 : 7520 : dump_printf_loc (metadata, user_loc, "\tstmt %u ---\n", i);
3116 : : else
3117 : : {
3118 : 98570 : dump_printf_loc (metadata, user_loc, "\t{ ");
3119 : 311973 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
3120 : 114833 : dump_printf (metadata, "%T%s ", op,
3121 : 114833 : i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
3122 : 98570 : dump_printf (metadata, "}\n");
3123 : : }
3124 : 470996 : if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
3125 : : {
3126 : 65360 : dump_printf_loc (metadata, user_loc, "\tload permutation {");
3127 : 212185 : FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
3128 : 81465 : dump_printf (dump_kind, " %u", j);
3129 : 65360 : dump_printf (dump_kind, " }\n");
3130 : : }
3131 : 470996 : if (SLP_TREE_LANE_PERMUTATION (node).exists ())
3132 : : {
3133 : 14854 : dump_printf_loc (metadata, user_loc, "\tlane permutation {");
3134 : 70821 : for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
3135 : 41113 : dump_printf (dump_kind, " %u[%u]",
3136 : 41113 : SLP_TREE_LANE_PERMUTATION (node)[i].first,
3137 : 41113 : SLP_TREE_LANE_PERMUTATION (node)[i].second);
3138 : 14854 : dump_printf (dump_kind, " }%s\n",
3139 : 14854 : node->ldst_lanes ? " (load-lanes)" : "");
3140 : : }
3141 : 470996 : if (SLP_TREE_CHILDREN (node).is_empty ())
3142 : 177051 : return;
3143 : 293945 : dump_printf_loc (metadata, user_loc, "\tchildren");
3144 : 1072526 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3145 : 484636 : dump_printf (dump_kind, " %p", (void *)child);
3146 : 293945 : dump_printf (dump_kind, "%s\n",
3147 : 293945 : node->ldst_lanes && !SLP_TREE_LANE_PERMUTATION (node).exists ()
3148 : : ? " (store-lanes)" : "");
3149 : : }
3150 : :
3151 : : DEBUG_FUNCTION void
3152 : 0 : debug (slp_tree node)
3153 : : {
3154 : 0 : debug_dump_context ctx;
3155 : 0 : vect_print_slp_tree (MSG_NOTE,
3156 : 0 : dump_location_t::from_location_t (UNKNOWN_LOCATION),
3157 : : node);
3158 : 0 : }
3159 : :
3160 : : /* Recursive helper for the dot producer below. */
3161 : :
3162 : : static void
3163 : 0 : dot_slp_tree (FILE *f, slp_tree node, hash_set<slp_tree> &visited)
3164 : : {
3165 : 0 : if (visited.add (node))
3166 : : return;
3167 : :
3168 : 0 : fprintf (f, "\"%p\" [label=\"", (void *)node);
3169 : 0 : vect_print_slp_tree (MSG_NOTE,
3170 : 0 : dump_location_t::from_location_t (UNKNOWN_LOCATION),
3171 : : node);
3172 : 0 : fprintf (f, "\"];\n");
3173 : :
3174 : :
3175 : 0 : for (slp_tree child : SLP_TREE_CHILDREN (node))
3176 : 0 : fprintf (f, "\"%p\" -> \"%p\";", (void *)node, (void *)child);
3177 : :
3178 : 0 : for (slp_tree child : SLP_TREE_CHILDREN (node))
3179 : 0 : if (child)
3180 : 0 : dot_slp_tree (f, child, visited);
3181 : : }
3182 : :
3183 : : DEBUG_FUNCTION void
3184 : 0 : dot_slp_tree (const char *fname, slp_tree node)
3185 : : {
3186 : 0 : FILE *f = fopen (fname, "w");
3187 : 0 : fprintf (f, "digraph {\n");
3188 : 0 : fflush (f);
3189 : 0 : {
3190 : 0 : debug_dump_context ctx (f);
3191 : 0 : hash_set<slp_tree> visited;
3192 : 0 : dot_slp_tree (f, node, visited);
3193 : 0 : }
3194 : 0 : fflush (f);
3195 : 0 : fprintf (f, "}\n");
3196 : 0 : fclose (f);
3197 : 0 : }
3198 : :
3199 : : DEBUG_FUNCTION void
3200 : 0 : dot_slp_tree (const char *fname, const vec<slp_instance> &slp_instances)
3201 : : {
3202 : 0 : FILE *f = fopen (fname, "w");
3203 : 0 : fprintf (f, "digraph {\n");
3204 : 0 : fflush (f);
3205 : 0 : {
3206 : 0 : debug_dump_context ctx (f);
3207 : 0 : hash_set<slp_tree> visited;
3208 : 0 : for (auto inst : slp_instances)
3209 : 0 : dot_slp_tree (f, SLP_INSTANCE_TREE (inst), visited);
3210 : 0 : }
3211 : 0 : fflush (f);
3212 : 0 : fprintf (f, "}\n");
3213 : 0 : fclose (f);
3214 : 0 : }
3215 : :
3216 : : /* Dump a slp tree NODE using flags specified in DUMP_KIND. */
3217 : :
3218 : : static void
3219 : 511981 : vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
3220 : : slp_tree node, hash_set<slp_tree> &visited)
3221 : : {
3222 : 511981 : unsigned i;
3223 : 511981 : slp_tree child;
3224 : :
3225 : 511981 : if (visited.add (node))
3226 : 511981 : return;
3227 : :
3228 : 470684 : vect_print_slp_tree (dump_kind, loc, node);
3229 : :
3230 : 1425645 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3231 : 484277 : if (child)
3232 : 429476 : vect_print_slp_graph (dump_kind, loc, child, visited);
3233 : : }
3234 : :
3235 : : static void
3236 : 49420 : vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
3237 : : slp_tree entry)
3238 : : {
3239 : 49420 : hash_set<slp_tree> visited;
3240 : 49420 : vect_print_slp_graph (dump_kind, loc, entry, visited);
3241 : 49420 : }
3242 : :
3243 : : DEBUG_FUNCTION void
3244 : 0 : debug (slp_instance instance)
3245 : : {
3246 : 0 : debug_dump_context ctx;
3247 : 0 : vect_print_slp_graph (MSG_NOTE,
3248 : 0 : dump_location_t::from_location_t (UNKNOWN_LOCATION),
3249 : : SLP_INSTANCE_TREE (instance));
3250 : 0 : }
3251 : :
3252 : : /* Mark the tree rooted at NODE with PURE_SLP. */
3253 : :
3254 : : static void
3255 : 6001957 : vect_mark_slp_stmts (vec_info *vinfo, slp_tree node,
3256 : : hash_set<slp_tree> &visited)
3257 : : {
3258 : 6001957 : int i;
3259 : 6001957 : stmt_vec_info stmt_info;
3260 : 6001957 : slp_tree child;
3261 : :
3262 : 6001957 : if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
3263 : : return;
3264 : :
3265 : 4280396 : if (visited.add (node))
3266 : : return;
3267 : :
3268 : 10191709 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3269 : 6165786 : if (stmt_info)
3270 : : {
3271 : 5967001 : STMT_SLP_TYPE (stmt_info) = pure_slp;
3272 : : /* ??? For .MASK_LOAD and .MASK_STORE detected as load/store-lanes
3273                 :             :            when the mask_conversion pattern has been applied we have lost the
3274                 :             :            alternate lanes of the uniform mask, which nevertheless
3275                 :             :            have separate pattern defs.  To not confuse hybrid
3276 : : analysis we mark those as covered as well here. */
3277 : 5967001 : if (node->ldst_lanes)
3278 : 6165786 : if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
3279 : 0 : if (gimple_call_internal_p (call, IFN_MASK_LOAD)
3280 : 0 : || gimple_call_internal_p (call, IFN_MASK_STORE))
3281 : : {
3282 : 0 : tree mask = gimple_call_arg (call,
3283 : : internal_fn_mask_index
3284 : 0 : (gimple_call_internal_fn (call)));
3285 : 0 : if (TREE_CODE (mask) == SSA_NAME)
3286 : 0 : if (stmt_vec_info mask_info = vinfo->lookup_def (mask))
3287 : : {
3288 : 0 : mask_info = vect_stmt_to_vectorize (mask_info);
3289 : 0 : STMT_SLP_TYPE (mask_info) = pure_slp;
3290 : : }
3291 : : }
3292 : : }
3293 : :
3294 : 9430343 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3295 : 5404420 : if (child)
3296 : 4416211 : vect_mark_slp_stmts (vinfo, child, visited);
3297 : : }
3298 : :
3299 : : static void
3300 : 1585746 : vect_mark_slp_stmts (vec_info *vinfo, slp_tree node)
3301 : : {
3302 : 1585746 : hash_set<slp_tree> visited;
3303 : 1585746 : vect_mark_slp_stmts (vinfo, node, visited);
3304 : 1585746 : }
3305 : :
3306 : : /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
3307 : :
3308 : : static void
3309 : 2225868 : vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
3310 : : {
3311 : 2225868 : int i;
3312 : 2225868 : stmt_vec_info stmt_info;
3313 : 2225868 : slp_tree child;
3314 : :
3315 : 2225868 : if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
3316 : : return;
3317 : :
3318 : 1321825 : if (visited.add (node))
3319 : : return;
3320 : :
3321 : 4145127 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3322 : 2911140 : if (stmt_info)
3323 : : {
3324 : 2911140 : gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
3325 : : || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
3326 : 2911140 : STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
3327 : : }
3328 : :
3329 : 2695087 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3330 : 1461100 : if (child)
3331 : 1461100 : vect_mark_slp_stmts_relevant (child, visited);
3332 : : }
3333 : :
3334 : : static void
3335 : 764768 : vect_mark_slp_stmts_relevant (slp_tree node)
3336 : : {
3337 : 764768 : hash_set<slp_tree> visited;
3338 : 764768 : vect_mark_slp_stmts_relevant (node, visited);
3339 : 764768 : }
3340 : :
3341 : :
3342                 :             : /* Gather loads in the SLP graph rooted at NODE and populate the LOADS array.  */
3343 : :
3344 : : static void
3345 : 11176206 : vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
3346 : : hash_set<slp_tree> &visited)
3347 : : {
3348 : 11176206 : if (!node || visited.add (node))
3349 : 2311782 : return;
3350 : :
3351 : 8864424 : if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
3352 : : return;
3353 : :
3354 : 6400319 : if (SLP_TREE_CODE (node) != VEC_PERM_EXPR)
3355 : : {
3356 : 6142482 : stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
3357 : 6142482 : if (STMT_VINFO_DATA_REF (stmt_info)
3358 : 2310836 : && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
3359 : 1242837 : loads.safe_push (node);
3360 : : }
3361 : :
3362 : : unsigned i;
3363 : : slp_tree child;
3364 : 15163635 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3365 : 8763316 : vect_gather_slp_loads (loads, child, visited);
3366 : : }
3367 : :
3368 : :
3369                 :             : /* Find the last scalar stmt in NODE.  */
3370 : :
3371 : : stmt_vec_info
3372 : 2711931 : vect_find_last_scalar_stmt_in_slp (slp_tree node)
3373 : : {
3374 : 2711931 : stmt_vec_info last = NULL;
3375 : 2711931 : stmt_vec_info stmt_vinfo;
3376 : :
3377 : 9782662 : for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
3378 : 7070731 : if (stmt_vinfo)
3379 : : {
3380 : 7070731 : stmt_vinfo = vect_orig_stmt (stmt_vinfo);
3381 : 7070731 : last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
3382 : : }
3383 : :
3384 : 2711931 : return last;
3385 : : }
3386 : :
3387 : : /* Find the first stmt in NODE. */
3388 : :
3389 : : stmt_vec_info
3390 : 542470 : vect_find_first_scalar_stmt_in_slp (slp_tree node)
3391 : : {
3392 : 542470 : stmt_vec_info first = NULL;
3393 : 542470 : stmt_vec_info stmt_vinfo;
3394 : :
3395 : 1812321 : for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
3396 : 1269851 : if (stmt_vinfo)
3397 : : {
3398 : 1267316 : stmt_vinfo = vect_orig_stmt (stmt_vinfo);
3399 : 1267316 : if (!first
3400 : 1267316 : || get_later_stmt (stmt_vinfo, first) == first)
3401 : : first = stmt_vinfo;
3402 : : }
3403 : :
3404 : 542470 : return first;
3405 : : }
3406 : :
3407 : : /* Splits a group of stores, currently beginning at FIRST_VINFO, into
3408 : : two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
3409 : : (also containing the first GROUP1_SIZE stmts, since stores are
3410 : : consecutive), the second containing the remainder.
3411 : : Return the first stmt in the second group. */
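                :             : /* For example, splitting a store group of size 6 with GROUP1_SIZE 4
                :             :    yields a first group of stmts 0..3 and a second group of stmts
                :             :    4..5; DR_GROUP_GAP of the second group grows by 4 and that of the
                :             :    first group by 2, so each group's gap skips over the other
                :             :    group's elements.  */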
3412 : :
3413 : : static stmt_vec_info
3414 : 147693 : vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
3415 : : {
3416 : 147693 : gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
3417 : 147693 : gcc_assert (group1_size > 0);
3418 : 147693 : int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
3419 : 147693 : gcc_assert (group2_size > 0);
3420 : 147693 : DR_GROUP_SIZE (first_vinfo) = group1_size;
3421 : :
3422 : 147693 : stmt_vec_info stmt_info = first_vinfo;
3423 : 495169 : for (unsigned i = group1_size; i > 1; i--)
3424 : : {
3425 : 347476 : stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
3426 : 347476 : gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
3427 : : }
3428 : : /* STMT is now the last element of the first group. */
3429 : 147693 : stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
3430 : 147693 : DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
3431 : :
3432 : 147693 : DR_GROUP_SIZE (group2) = group2_size;
3433 : 414195 : for (stmt_info = group2; stmt_info;
3434 : 266502 : stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
3435 : : {
3436 : 266502 : DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
3437 : 266502 : gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
3438 : : }
3439 : :
3440 : : /* For the second group, the DR_GROUP_GAP is that before the original group,
3441 : : plus skipping over the first vector. */
3442 : 147693 : DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
3443 : :
3444 : : /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
3445 : 147693 : DR_GROUP_GAP (first_vinfo) += group2_size;
3446 : :
3447 : 147693 : if (dump_enabled_p ())
3448 : 59 : dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
3449 : : group1_size, group2_size);
3450 : :
3451 : 147693 : return group2;
3452 : : }
3453 : :
3454 : : /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
3455 : : statements and a vector of NUNITS elements. */
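                :             : /* For example, with NUNITS 4 and GROUP_SIZE 3 the least common
                :             :    multiple is 12, so the instance has to be unrolled by a factor of
                :             :    12 / 3 = 4 to fill whole vectors.  */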
3456 : :
3457 : : static poly_uint64
3458 : 4098704 : calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
3459 : : {
3460 : 4098704 : return exact_div (common_multiple (nunits, group_size), group_size);
3461 : : }
3462 : :
3463                 :             : /* Helper that checks whether a node is a load node.  */
3464 : :
3465 : : static inline bool
3466 : 88 : vect_is_slp_load_node (slp_tree root)
3467 : : {
3468 : 88 : return (SLP_TREE_CODE (root) != VEC_PERM_EXPR
3469 : 76 : && SLP_TREE_DEF_TYPE (root) == vect_internal_def
3470 : 64 : && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
3471 : 128 : && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root))));
3472 : : }
3473 : :
3474 : :
3475 : : /* Helper function of optimize_load_redistribution that performs the operation
3476 : : recursively. */
3477 : :
3478 : : static slp_tree
3479 : 8600 : optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
3480 : : vec_info *vinfo, unsigned int group_size,
3481 : : hash_map<slp_tree, slp_tree> *load_map,
3482 : : slp_tree root)
3483 : : {
3484 : 8600 : if (slp_tree *leader = load_map->get (root))
3485 : 644 : return *leader;
3486 : :
3487 : 7956 : slp_tree node;
3488 : 7956 : unsigned i;
3489 : :
3490 : : /* For now, we don't know anything about externals so do not do anything. */
3491 : 7956 : if (!root || SLP_TREE_DEF_TYPE (root) != vect_internal_def)
3492 : : return NULL;
3493 : 5516 : else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR)
3494 : : {
3495 : : /* First convert this node into a load node and add it to the leaves
3496 : : list and flatten the permute from a lane to a load one. If it's
3497 : : unneeded it will be elided later. */
3498 : 68 : vec<stmt_vec_info> stmts;
3499 : 68 : stmts.create (SLP_TREE_LANES (root));
3500 : 68 : lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
3501 : 108 : for (unsigned j = 0; j < lane_perm.length (); j++)
3502 : : {
3503 : 88 : std::pair<unsigned, unsigned> perm = lane_perm[j];
3504 : 88 : node = SLP_TREE_CHILDREN (root)[perm.first];
3505 : :
3506 : 88 : if (!vect_is_slp_load_node (node)
3507 : 88 : || SLP_TREE_CHILDREN (node).exists ())
3508 : : {
3509 : 48 : stmts.release ();
3510 : 48 : goto next;
3511 : : }
3512 : :
3513 : 40 : stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
3514 : : }
3515 : :
3516 : 20 : if (dump_enabled_p ())
3517 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
3518 : : "converting stmts on permute node %p\n",
3519 : : (void *) root);
3520 : :
3521 : 20 : bool *matches = XALLOCAVEC (bool, group_size);
3522 : 20 : poly_uint64 max_nunits = 1;
3523 : 20 : unsigned tree_size = 0, limit = 1;
3524 : 20 : node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
3525 : : matches, &limit, &tree_size, bst_map);
3526 : 20 : if (!node)
3527 : 0 : stmts.release ();
3528 : :
3529 : 20 : load_map->put (root, node);
3530 : 20 : return node;
3531 : : }
3532 : :
3533 : 5448 : next:
3534 : 5496 : load_map->put (root, NULL);
3535 : :
3536 : 12728 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
3537 : : {
3538 : 7232 : slp_tree value
3539 : 7232 : = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
3540 : : node);
3541 : 7232 : if (value)
3542 : : {
3543 : 20 : SLP_TREE_REF_COUNT (value)++;
3544 : 20 : SLP_TREE_CHILDREN (root)[i] = value;
3545                 :             :           /* ??? We know the original leaves of the replaced nodes will
3546 : : be referenced by bst_map, only the permutes created by
3547 : : pattern matching are not. */
3548 : 20 : if (SLP_TREE_REF_COUNT (node) == 1)
3549 : 20 : load_map->remove (node);
3550 : 20 : vect_free_slp_tree (node);
3551 : : }
3552 : : }
3553 : :
3554 : : return NULL;
3555 : : }
3556 : :
3557 : : /* Temporary workaround for loads not being CSEd during SLP build. This
3558                 :             :    function will traverse the SLP tree rooted in ROOT and find
3559 : : VEC_PERM nodes that blend vectors from multiple nodes that all read from the
3560 : : same DR such that the final operation is equal to a permuted load. Such
3561 : : NODES are then directly converted into LOADS themselves. The nodes are
3562 : : CSEd using BST_MAP. */
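                :             : /* For example, a VEC_PERM node selecting lane 1 of the load
                :             :    { a[0], a[1] } and lane 0 of the load { a[2], a[3] } is rebuilt as
                :             :    the single (permuted) load { a[1], a[2] }.  */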
3563 : :
3564 : : static void
3565 : 1251 : optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
3566 : : vec_info *vinfo, unsigned int group_size,
3567 : : hash_map<slp_tree, slp_tree> *load_map,
3568 : : slp_tree root)
3569 : : {
3570 : 1251 : slp_tree node;
3571 : 1251 : unsigned i;
3572 : :
3573 : 2619 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
3574 : : {
3575 : 1368 : slp_tree value
3576 : 1368 : = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
3577 : : node);
3578 : 1368 : if (value)
3579 : : {
3580 : 0 : SLP_TREE_REF_COUNT (value)++;
3581 : 0 : SLP_TREE_CHILDREN (root)[i] = value;
3582                 :             :           /* ??? We know the original leaves of the replaced nodes will
3583 : : be referenced by bst_map, only the permutes created by
3584 : : pattern matching are not. */
3585 : 0 : if (SLP_TREE_REF_COUNT (node) == 1)
3586 : 0 : load_map->remove (node);
3587 : 0 : vect_free_slp_tree (node);
3588 : : }
3589 : : }
3590 : 1251 : }
3591 : :
3592 : : /* Helper function of vect_match_slp_patterns.
3593 : :
3594 : : Attempts to match patterns against the slp tree rooted in REF_NODE using
3595 : : VINFO. Patterns are matched in post-order traversal.
3596 : :
3597                 :             :    If matching is successful the tree in REF_NODE is updated in place and
3598                 :             :    true is returned; otherwise false is returned and REF_NODE is unchanged.  */
3599 : :
3600 : : static bool
3601 : 6112393 : vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
3602 : : slp_tree_to_load_perm_map_t *perm_cache,
3603 : : slp_compat_nodes_map_t *compat_cache,
3604 : : hash_set<slp_tree> *visited)
3605 : : {
3606 : 6112393 : unsigned i;
3607 : 6112393 : slp_tree node = *ref_node;
3608 : 6112393 : bool found_p = false;
3609 : 6112393 : if (!node || visited->add (node))
3610 : 1094228 : return false;
3611 : :
3612 : : slp_tree child;
3613 : 9538646 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3614 : 4520481 : found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
3615 : : vinfo, perm_cache, compat_cache,
3616 : : visited);
3617 : :
3618 : 15054495 : for (unsigned x = 0; x < num__slp_patterns; x++)
3619 : : {
3620 : 10036330 : vect_pattern *pattern
3621 : 10036330 : = slp_patterns[x] (perm_cache, compat_cache, ref_node);
3622 : 10036330 : if (pattern)
3623 : : {
3624 : 448 : pattern->build (vinfo);
3625 : 448 : delete pattern;
3626 : 448 : found_p = true;
3627 : : }
3628 : : }
3629 : :
3630 : : return found_p;
3631 : : }
3632 : :
3633 : : /* Applies pattern matching to the given SLP tree rooted in REF_NODE using
3634 : : vec_info VINFO.
3635 : :
3636                 :             :    The tree is modified in place and true is returned if any pattern
3637                 :             :    matched.  Patterns are tried in order and multiple patterns may match.  */
3638 : :
3639 : : static bool
3640 : 1591912 : vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
3641 : : hash_set<slp_tree> *visited,
3642 : : slp_tree_to_load_perm_map_t *perm_cache,
3643 : : slp_compat_nodes_map_t *compat_cache)
3644 : : {
3645 : 1591912 : DUMP_VECT_SCOPE ("vect_match_slp_patterns");
3646 : 1591912 : slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
3647 : :
3648 : 1591912 : if (dump_enabled_p ())
3649 : 34167 : dump_printf_loc (MSG_NOTE, vect_location,
3650 : : "Analyzing SLP tree %p for patterns\n",
3651 : 34167 : (void *) SLP_INSTANCE_TREE (instance));
3652 : :
3653 : 1591912 : return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, compat_cache,
3654 : 1591912 : visited);
3655 : : }
3656 : :
3657 : : /* STMT_INFO is a store group of size GROUP_SIZE that we are considering
3658 : : vectorizing with VECTYPE that might be NULL. MASKED_P indicates whether
3659 : : the stores are masked.
3660 : : Return true if we could use IFN_STORE_LANES instead and if that appears
3661 : : to be the better approach. */
3662 : :
3663 : : static bool
3664 : 4332 : vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
3665 : : tree vectype, bool masked_p,
3666 : : unsigned int group_size,
3667 : : unsigned int new_group_size)
3668 : : {
3669 : 4332 : if (!vectype)
3670 : : {
3671 : 4332 : tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3672 : 4332 : vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
3673 : : }
3674 : 4332 : if (!vectype)
3675 : : return false;
3676 : : /* Allow the split if one of the two new groups would operate on full
3677 : : vectors *within* rather than across one scalar loop iteration.
3678 : : This is purely a heuristic, but it should work well for group
3679 : : sizes of 3 and 4, where the possible splits are:
3680 : :
3681 : : 3->2+1: OK if the vector has exactly two elements
3682 : : 4->2+2: Likewise
3683 : : 4->3+1: Less clear-cut. */
3684 : 4332 : if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
3685 : 2728 : || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
3686 : 1652 : return false;
3687 : 2680 : return vect_store_lanes_supported (vectype, group_size, masked_p) != IFN_LAST;
3688 : : }
3689 : :
3690 : : /* Analyze an SLP instance starting from a group of grouped stores. Call
3691 : : vect_build_slp_tree to build a tree of packed stmts if possible.
3692 : : Return FALSE if it's impossible to SLP any stmt in the loop. */
3693 : :
3694 : : static bool
3695 : : vect_analyze_slp_instance (vec_info *vinfo,
3696 : : scalar_stmts_to_slp_tree_map_t *bst_map,
3697 : : stmt_vec_info stmt_info, slp_instance_kind kind,
3698 : : unsigned max_tree_size, unsigned *limit,
3699 : : bool force_single_lane);
3700 : :
3701 : : /* Build an interleaving scheme for the store sources RHS_NODES from
3702 : : SCALAR_STMTS. */
3703 : :
3704 : : static slp_tree
3705 : 6244 : vect_build_slp_store_interleaving (vec<slp_tree> &rhs_nodes,
3706 : : vec<stmt_vec_info> &scalar_stmts,
3707 : : poly_uint64 max_nunits)
3708 : : {
3709 : 6244 : unsigned int group_size = scalar_stmts.length ();
3710 : 12488 : slp_tree node = vect_create_new_slp_node (scalar_stmts,
3711 : 6244 : SLP_TREE_CHILDREN
3712 : : (rhs_nodes[0]).length ());
3713 : 6244 : SLP_TREE_VECTYPE (node) = SLP_TREE_VECTYPE (rhs_nodes[0]);
3714 : 6244 : node->max_nunits = max_nunits;
3715 : 6244 : for (unsigned l = 0;
3716 : 12523 : l < SLP_TREE_CHILDREN (rhs_nodes[0]).length (); ++l)
3717 : : {
3718 : : /* And a permute merging all RHS SLP trees. */
3719 : 6279 : slp_tree perm = vect_create_new_slp_node (rhs_nodes.length (),
3720 : 6279 : VEC_PERM_EXPR);
3721 : 6279 : SLP_TREE_CHILDREN (node).quick_push (perm);
3722 : 6279 : SLP_TREE_LANE_PERMUTATION (perm).create (group_size);
3723 : 6279 : SLP_TREE_VECTYPE (perm) = SLP_TREE_VECTYPE (node);
3724 : 6279 : perm->max_nunits = max_nunits;
3725 : 6279 : SLP_TREE_LANES (perm) = group_size;
3726 : : /* ??? We should set this NULL but that's not expected. */
3727 : 6279 : SLP_TREE_REPRESENTATIVE (perm)
3728 : 6279 : = SLP_TREE_REPRESENTATIVE (SLP_TREE_CHILDREN (rhs_nodes[0])[l]);
3729 : 24637 : for (unsigned j = 0; j < rhs_nodes.length (); ++j)
3730 : : {
3731 : 18358 : SLP_TREE_CHILDREN (perm)
3732 : 18358 : .quick_push (SLP_TREE_CHILDREN (rhs_nodes[j])[l]);
3733 : 18358 : SLP_TREE_CHILDREN (rhs_nodes[j])[l]->refcnt++;
3734 : 18358 : for (unsigned k = 0;
3735 : 39043 : k < SLP_TREE_SCALAR_STMTS (rhs_nodes[j]).length (); ++k)
3736 : : {
3737 : : /* ??? We should populate SLP_TREE_SCALAR_STMTS
3738 : : or SLP_TREE_SCALAR_OPS but then we might have
3739 : : a mix of both in our children. */
3740 : 20685 : SLP_TREE_LANE_PERMUTATION (perm)
3741 : 20685 : .quick_push (std::make_pair (j, k));
3742 : : }
3743 : : }
3744 : :
3745 : : /* Now we have a single permute node but we cannot code-generate
3746 : : the case with more than two inputs.
3747 : : Perform pairwise reduction, reducing the two inputs
3748 : : with the least number of lanes to one and then repeat until
3749 : : we end up with two inputs. That scheme makes sure we end
3750 : : up with permutes satisfying the restriction of requiring at
3751 : : most two vector inputs to produce a single vector output
3752 : : when the number of lanes is even. */
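                :             :       /* For example, with four inputs of 1, 1, 2 and 4 lanes the two
                :             :          single-lane inputs are merged first, then that two-lane result
                :             :          with the two-lane input, leaving the four-lane input and a
                :             :          four-lane merge as the final two children.  */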
3753 : 12079 : while (SLP_TREE_CHILDREN (perm).length () > 2)
3754 : : {
3755 : : /* When we have three equal sized groups left the pairwise
3756 : : reduction does not result in a scheme that avoids using
3757 : : three vectors. Instead merge the first two groups
3758 : : to the final size with do-not-care elements (chosen
3759 : : from the first group) and then merge with the third.
3760 : : { A0, B0, x, A1, B1, x, ... }
3761 : : -> { A0, B0, C0, A1, B1, C1, ... }
3762 : : This handles group size of three (and at least
3763 : : power-of-two multiples of that). */
3764 : 5800 : if (SLP_TREE_CHILDREN (perm).length () == 3
3765 : 2640 : && (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[0])
3766 : 2640 : == SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[1]))
3767 : 5800 : && (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[0])
3768 : 1798 : == SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[2])))
3769 : : {
3770 : 1593 : int ai = 0;
3771 : 1593 : int bi = 1;
3772 : 1593 : slp_tree a = SLP_TREE_CHILDREN (perm)[ai];
3773 : 1593 : slp_tree b = SLP_TREE_CHILDREN (perm)[bi];
3774 : 1593 : unsigned n = SLP_TREE_LANES (perm);
3775 : :
3776 : 1593 : slp_tree permab = vect_create_new_slp_node (2, VEC_PERM_EXPR);
3777 : 1593 : SLP_TREE_LANES (permab) = n;
3778 : 1593 : SLP_TREE_LANE_PERMUTATION (permab).create (n);
3779 : 1593 : SLP_TREE_VECTYPE (permab) = SLP_TREE_VECTYPE (perm);
3780 : 1593 : permab->max_nunits = max_nunits;
3781 : : /* ??? Should be NULL but that's not expected. */
3782 : 1593 : SLP_TREE_REPRESENTATIVE (permab) = SLP_TREE_REPRESENTATIVE (perm);
3783 : 1593 : SLP_TREE_CHILDREN (permab).quick_push (a);
3784 : 3197 : for (unsigned k = 0; k < SLP_TREE_LANES (a); ++k)
3785 : 1604 : SLP_TREE_LANE_PERMUTATION (permab)
3786 : 1604 : .quick_push (std::make_pair (0, k));
3787 : 1593 : SLP_TREE_CHILDREN (permab).quick_push (b);
3788 : 3197 : for (unsigned k = 0; k < SLP_TREE_LANES (b); ++k)
3789 : 1604 : SLP_TREE_LANE_PERMUTATION (permab)
3790 : 1604 : .quick_push (std::make_pair (1, k));
3791 : : /* Push the do-not-care lanes. */
3792 : 3197 : for (unsigned k = 0; k < SLP_TREE_LANES (a); ++k)
3793 : 1604 : SLP_TREE_LANE_PERMUTATION (permab)
3794 : 1604 : .quick_push (std::make_pair (0, k));
3795 : :
3796 : : /* Put the merged node into 'perm', in place of a. */
3797 : 1593 : SLP_TREE_CHILDREN (perm)[ai] = permab;
3798 : : /* Adjust the references to b in the permutation
3799 : : of perm and to the later children which we'll
3800 : : remove. */
3801 : 6405 : for (unsigned k = 0; k < SLP_TREE_LANES (perm); ++k)
3802 : : {
3803 : 4812 : std::pair<unsigned, unsigned> &p
3804 : 4812 : = SLP_TREE_LANE_PERMUTATION (perm)[k];
3805 : 4812 : if (p.first == (unsigned) bi)
3806 : : {
3807 : 1604 : p.first = ai;
3808 : 1604 : p.second += SLP_TREE_LANES (a);
3809 : : }
3810 : 3208 : else if (p.first > (unsigned) bi)
3811 : 1604 : p.first--;
3812 : : }
3813 : 1593 : SLP_TREE_CHILDREN (perm).ordered_remove (bi);
3814 : 1593 : break;
3815 : : }
3816 : :
3817 : : /* Pick the two nodes with the least number of lanes,
3818 : : prefer the earliest candidate and maintain ai < bi. */
3819 : : int ai = -1;
3820 : : int bi = -1;
3821 : 35696 : for (unsigned ci = 0; ci < SLP_TREE_CHILDREN (perm).length (); ++ci)
3822 : : {
3823 : 31489 : if (ai == -1)
3824 : 4207 : ai = ci;
3825 : 27282 : else if (bi == -1)
3826 : 4207 : bi = ci;
3827 : 23075 : else if ((SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ci])
3828 : 23075 : < SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ai]))
3829 : 23075 : || (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ci])
3830 : 18772 : < SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[bi])))
3831 : : {
3832 : 9472 : if (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ai])
3833 : 4736 : <= SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[bi]))
3834 : 2198 : bi = ci;
3835 : : else
3836 : : {
3837 : 2538 : ai = bi;
3838 : 2538 : bi = ci;
3839 : : }
3840 : : }
3841 : : }
3842 : :
3843 : : /* Produce a merge of nodes ai and bi. */
3844 : 4207 : slp_tree a = SLP_TREE_CHILDREN (perm)[ai];
3845 : 4207 : slp_tree b = SLP_TREE_CHILDREN (perm)[bi];
3846 : 4207 : unsigned n = SLP_TREE_LANES (a) + SLP_TREE_LANES (b);
3847 : 4207 : slp_tree permab = vect_create_new_slp_node (2, VEC_PERM_EXPR);
3848 : 4207 : SLP_TREE_LANES (permab) = n;
3849 : 4207 : SLP_TREE_LANE_PERMUTATION (permab).create (n);
3850 : 4207 : SLP_TREE_VECTYPE (permab) = SLP_TREE_VECTYPE (perm);
3851 : 4207 : permab->max_nunits = max_nunits;
3852 : : /* ??? Should be NULL but that's not expected. */
3853 : 4207 : SLP_TREE_REPRESENTATIVE (permab) = SLP_TREE_REPRESENTATIVE (perm);
3854 : 4207 : SLP_TREE_CHILDREN (permab).quick_push (a);
3855 : 10919 : for (unsigned k = 0; k < SLP_TREE_LANES (a); ++k)
3856 : 6712 : SLP_TREE_LANE_PERMUTATION (permab)
3857 : 6712 : .quick_push (std::make_pair (0, k));
3858 : 4207 : SLP_TREE_CHILDREN (permab).quick_push (b);
3859 : 10379 : for (unsigned k = 0; k < SLP_TREE_LANES (b); ++k)
3860 : 6172 : SLP_TREE_LANE_PERMUTATION (permab)
3861 : 6172 : .quick_push (std::make_pair (1, k));
3862 : :
3863 : : /* Put the merged node into 'perm', in place of a. */
3864 : 4207 : SLP_TREE_CHILDREN (perm)[ai] = permab;
3865 : : /* Adjust the references to b in the permutation
3866 : : of perm and to the later children which we'll
3867 : : remove. */
3868 : 56496 : for (unsigned k = 0; k < SLP_TREE_LANES (perm); ++k)
3869 : : {
3870 : 52289 : std::pair<unsigned, unsigned> &p
3871 : 52289 : = SLP_TREE_LANE_PERMUTATION (perm)[k];
3872 : 52289 : if (p.first == (unsigned) bi)
3873 : : {
3874 : 6172 : p.first = ai;
3875 : 6172 : p.second += SLP_TREE_LANES (a);
3876 : : }
3877 : 46117 : else if (p.first > (unsigned) bi)
3878 : 18918 : p.first--;
3879 : : }
3880 : 4207 : SLP_TREE_CHILDREN (perm).ordered_remove (bi);
3881 : : }
3882 : : }
3883 : :
3884 : 6244 : return node;
3885 : : }
3886 : :
3887 : : /* Analyze an SLP instance starting from SCALAR_STMTS which are a group
3888 : : of KIND. Return true if successful. */
3889 : :
3890 : : static bool
3891 : 2968063 : vect_build_slp_instance (vec_info *vinfo,
3892 : : slp_instance_kind kind,
3893 : : vec<stmt_vec_info> &scalar_stmts,
3894 : : vec<stmt_vec_info> &root_stmt_infos,
3895 : : vec<tree> &remain,
3896 : : unsigned max_tree_size, unsigned *limit,
3897 : : scalar_stmts_to_slp_tree_map_t *bst_map,
3898 : : /* ??? We need stmt_info for group splitting. */
3899 : : stmt_vec_info stmt_info_,
3900 : : bool force_single_lane)
3901 : : {
3902 : : /* If there's no budget left bail out early. */
3903 : 2968063 : if (*limit == 0)
3904 : : return false;
3905 : :
3906 : 2942809 : if (kind == slp_inst_kind_ctor)
3907 : : {
3908 : 7717 : if (dump_enabled_p ())
3909 : 66 : dump_printf_loc (MSG_NOTE, vect_location,
3910 : : "Analyzing vectorizable constructor: %G\n",
3911 : 33 : root_stmt_infos[0]->stmt);
3912 : : }
3913 : 2935092 : else if (kind == slp_inst_kind_gcond)
3914 : : {
3915 : 200414 : if (dump_enabled_p ())
3916 : 4884 : dump_printf_loc (MSG_NOTE, vect_location,
3917 : : "Analyzing vectorizable control flow: %G",
3918 : 2442 : root_stmt_infos[0]->stmt);
3919 : : }
3920 : :
3921 : 2942809 : if (dump_enabled_p ())
3922 : : {
3923 : 37688 : dump_printf_loc (MSG_NOTE, vect_location,
3924 : : "Starting SLP discovery for\n");
3925 : 95223 : for (unsigned i = 0; i < scalar_stmts.length (); ++i)
3926 : 115070 : dump_printf_loc (MSG_NOTE, vect_location,
3927 : 57535 : " %G", scalar_stmts[i]->stmt);
3928 : : }
3929 : :
3930 : : /* Build the tree for the SLP instance. */
3931 : 2942809 : unsigned int group_size = scalar_stmts.length ();
3932 : 2942809 : bool *matches = XALLOCAVEC (bool, group_size);
3933 : 2942809 : poly_uint64 max_nunits = 1;
3934 : 2942809 : unsigned tree_size = 0;
3935 : 2942809 : unsigned i;
3936 : :
3937 : 2942809 : slp_tree node = NULL;
3938 : 2942809 : if (group_size > 1 && force_single_lane)
3939 : : {
3940 : 1757 : matches[0] = true;
3941 : 1757 : matches[1] = false;
3942 : : }
3943 : : else
3944 : 2941052 : node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
3945 : : &max_nunits, matches, limit,
3946 : : &tree_size, bst_map);
3947 : 2942809 : if (node != NULL)
3948 : : {
3949 : : /* Calculate the unrolling factor based on the smallest type. */
3950 : 1585668 : poly_uint64 unrolling_factor
3951 : 1585668 : = calculate_unrolling_factor (max_nunits, group_size);
3952 : :
3953 : 1585668 : if (maybe_ne (unrolling_factor, 1U)
3954 : 1585668 : && is_a <bb_vec_info> (vinfo))
3955 : : {
3956 : 0 : unsigned HOST_WIDE_INT const_max_nunits;
3957 : 0 : if (!max_nunits.is_constant (&const_max_nunits)
3958 : 0 : || const_max_nunits > group_size)
3959 : : {
3960 : 0 : if (dump_enabled_p ())
3961 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3962 : : "Build SLP failed: store group "
3963 : : "size not a multiple of the vector size "
3964 : : "in basic block SLP\n");
3965 : 0 : vect_free_slp_tree (node);
3966 : 0 : return false;
3967 : : }
3968 : : /* Fatal mismatch. */
3969 : 0 : if (dump_enabled_p ())
3970 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
3971 : : "SLP discovery succeeded but node needs "
3972 : : "splitting\n");
3973 : 0 : memset (matches, true, group_size);
3974 : 0 : matches[group_size / const_max_nunits * const_max_nunits] = false;
3975 : 0 : vect_free_slp_tree (node);
3976 : : }
3977 : : else
3978 : : {
3979 : : /* Create a new SLP instance. */
3980 : 1585668 : slp_instance new_instance = XNEW (class _slp_instance);
3981 : 1585668 : SLP_INSTANCE_TREE (new_instance) = node;
3982 : 1585668 : SLP_INSTANCE_LOADS (new_instance) = vNULL;
3983 : 1585668 : SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
3984 : 1585668 : SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
3985 : 1585668 : SLP_INSTANCE_KIND (new_instance) = kind;
3986 : 1585668 : new_instance->reduc_phis = NULL;
3987 : 1585668 : new_instance->cost_vec = vNULL;
3988 : 1585668 : new_instance->subgraph_entries = vNULL;
3989 : :
3990 : 1585668 : if (dump_enabled_p ())
3991 : 33239 : dump_printf_loc (MSG_NOTE, vect_location,
3992 : : "SLP size %u vs. limit %u.\n",
3993 : : tree_size, max_tree_size);
3994 : :
3995 : : /* Fixup SLP reduction chains. */
3996 : 1585668 : if (kind == slp_inst_kind_reduc_chain)
3997 : : {
3998 : : /* If this is a reduction chain with a conversion in front
3999 : : amend the SLP tree with a node for that. */
4000 : 345 : gimple *scalar_def
4001 : 345 : = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
4002 : 345 : if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
4003 : : {
4004 : : /* Get at the conversion stmt - we know it's the single use
4005 : : of the last stmt of the reduction chain. */
4006 : 48 : use_operand_p use_p;
4007 : 48 : bool r = single_imm_use (gimple_assign_lhs (scalar_def),
4008 : : &use_p, &scalar_def);
4009 : 48 : gcc_assert (r);
4010 : 48 : stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
4011 : 48 : next_info = vect_stmt_to_vectorize (next_info);
4012 : 48 : scalar_stmts = vNULL;
4013 : 48 : scalar_stmts.create (group_size);
4014 : 150 : for (unsigned i = 0; i < group_size; ++i)
4015 : 102 : scalar_stmts.quick_push (next_info);
4016 : 48 : slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
4017 : 48 : SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
4018 : 48 : SLP_TREE_CHILDREN (conv).quick_push (node);
4019 : 48 : SLP_INSTANCE_TREE (new_instance) = conv;
4020 : : /* We also have to fake this conversion stmt as SLP reduction
4021 : : group so we don't have to mess with too much code
4022 : : elsewhere. */
4023 : 48 : REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
4024 : 48 : REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
4025 : : }
4026 : : /* Fill the backedge child of the PHI SLP node. The
4027 : : general matching code cannot find it because the
4028 : : scalar code does not reflect how we vectorize the
4029 : : reduction. */
4030 : 345 : use_operand_p use_p;
4031 : 345 : imm_use_iterator imm_iter;
4032 : 345 : class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
4033 : 1081 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
4034 : : gimple_get_lhs (scalar_def))
4035 : : /* There are exactly two non-debug uses, the reduction
4036 : : PHI and the loop-closed PHI node. */
4037 : 736 : if (!is_gimple_debug (USE_STMT (use_p))
4038 : 736 : && gimple_bb (USE_STMT (use_p)) == loop->header)
4039 : : {
4040 : 345 : auto_vec<stmt_vec_info, 64> phis (group_size);
4041 : 345 : stmt_vec_info phi_info
4042 : 345 : = vinfo->lookup_stmt (USE_STMT (use_p));
4043 : 2134 : for (unsigned i = 0; i < group_size; ++i)
4044 : 1789 : phis.quick_push (phi_info);
4045 : 345 : slp_tree *phi_node = bst_map->get (phis);
4046 : 345 : unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
4047 : 690 : SLP_TREE_CHILDREN (*phi_node)[dest_idx]
4048 : 345 : = SLP_INSTANCE_TREE (new_instance);
4049 : 345 : SLP_INSTANCE_TREE (new_instance)->refcnt++;
4050 : 345 : }
4051 : : }
4052 : :
4053 : 1585668 : vinfo->slp_instances.safe_push (new_instance);
4054 : :
4055 : : /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
4056 : : the number of scalar stmts in the root in a few places.
4057 : : Verify that assumption holds. */
4058 : 3171336 : gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
4059 : : .length () == group_size);
4060 : :
4061 : 1585668 : if (dump_enabled_p ())
4062 : : {
4063 : 33239 : dump_printf_loc (MSG_NOTE, vect_location,
4064 : : "Final SLP tree for instance %p:\n",
4065 : : (void *) new_instance);
4066 : 33239 : vect_print_slp_graph (MSG_NOTE, vect_location,
4067 : : SLP_INSTANCE_TREE (new_instance));
4068 : : }
4069 : :
4070 : 1585668 : return true;
4071 : : }
4072 : : }
4073 : : /* Failed to SLP. */
4074 : :
4075 : 1357141 : stmt_vec_info stmt_info = stmt_info_;
4076 : : /* Try to break the group up into pieces. */
4077 : 1357141 : if (*limit > 0 && kind == slp_inst_kind_store)
4078 : : {
4079 : : /* ??? We could delay all the actual splitting of store-groups
4080 : : until after SLP discovery of the original group completed.
4081 : : Then we can recurse to vect_build_slp_instance directly. */
4082 : 1039465 : for (i = 0; i < group_size; i++)
4083 : 1039465 : if (!matches[i])
4084 : : break;
4085 : :
4086 : : /* For basic block SLP, try to break the group up into multiples of
4087 : : a vector size. */
4088 : 356897 : if (is_a <bb_vec_info> (vinfo)
4089 : 356897 : && (i > 1 && i < group_size))
4090 : : {
4091 : : /* Free the allocated memory. */
4092 : 145740 : scalar_stmts.release ();
4093 : :
4094 : 145740 : tree scalar_type
4095 : 145740 : = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
4096 : 291480 : tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
4097 : 145740 : 1 << floor_log2 (i));
4098 : 145740 : unsigned HOST_WIDE_INT const_nunits;
4099 : 145740 : if (vectype
4100 : 145740 : && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
4101 : : {
4102 : : /* Split into two groups at the first vector boundary. */
4103 : 145740 : gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
4104 : 145740 : unsigned group1_size = i & ~(const_nunits - 1);
4105 : :
4106 : 145740 : if (dump_enabled_p ())
4107 : 57 : dump_printf_loc (MSG_NOTE, vect_location,
4108 : : "Splitting SLP group at stmt %u\n", i);
4109 : 145740 : stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
4110 : : group1_size);
4111 : 145740 : bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
4112 : : kind, max_tree_size,
4113 : : limit, false);
4114 : : /* Split the rest at the failure point and possibly
4115 : : re-analyze the remaining matching part if it has
4116 : : at least two lanes. */
4117 : 145740 : if (group1_size < i
4118 : 4822 : && (i + 1 < group_size
4119 : 2891 : || i - group1_size > 1))
4120 : : {
4121 : 1953 : stmt_vec_info rest2 = rest;
4122 : 1953 : rest = vect_split_slp_store_group (rest, i - group1_size);
4123 : 1953 : if (i - group1_size > 1)
4124 : 47 : res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
4125 : : kind, max_tree_size,
4126 : : limit, false);
4127 : : }
4128 : : /* Re-analyze the non-matching tail if it has at least
4129 : : two lanes. */
4130 : 145740 : if (i + 1 < group_size)
4131 : 20918 : res |= vect_analyze_slp_instance (vinfo, bst_map,
4132 : : rest, kind, max_tree_size,
4133 : : limit, false);
4134 : 145740 : return res;
4135 : : }
4136 : : }
4137 : :
4138 : : /* For loop vectorization split the RHS into arbitrary pieces of
4139 : : size >= 1. */
4140 : 211157 : else if (is_a <loop_vec_info> (vinfo)
4141 : 211157 : && (group_size != 1 && i < group_size))
4142 : : {
4143 : 6478 : gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
4144 : 35 : bool masked_p = call
4145 : 35 : && gimple_call_internal_p (call)
4146 : 35 : && internal_fn_mask_index (gimple_call_internal_fn (call)) != -1;
4147 : : /* There are targets that cannot do even/odd interleaving schemes
4148 : : so they absolutely need to use load/store-lanes. For now
4149 : : force single-lane SLP for them - they would be happy with
4150 : : uniform power-of-two lanes (but depending on element size),
4151 : : but even if we can use 'i' as indicator we would need to
4152 : : backtrack when later lanes fail to discover with the same
4153 : : granularity. We cannot turn any of strided or scatter store
4154 : : into store-lanes. */
4155 : : /* ??? If this is not in sync with what get_load_store_type
4156 : : later decides the SLP representation is not good for other
4157 : : store vectorization methods. */
4158 : 6478 : bool want_store_lanes
4159 : 6478 : = (! STMT_VINFO_GATHER_SCATTER_P (stmt_info)
4160 : 6478 : && ! STMT_VINFO_STRIDED_P (stmt_info)
4161 : 4380 : && ! STMT_VINFO_SLP_VECT_ONLY (stmt_info)
4162 : 4369 : && compare_step_with_zero (vinfo, stmt_info) > 0
4163 : 10810 : && vect_slp_prefer_store_lanes_p (vinfo, stmt_info, NULL_TREE,
4164 : 12956 : masked_p, group_size, i));
4165 : 6478 : if (want_store_lanes || force_single_lane)
4166 : : i = 1;
4167 : :
4168                 :             :           /* A fatal discovery failure doesn't always mean single-lane SLP
4169 : : isn't a possibility, so try. */
4170 : 4721 : if (i == 0)
4171 : : i = 1;
4172 : :
4173 : 6478 : if (dump_enabled_p ())
4174 : 966 : dump_printf_loc (MSG_NOTE, vect_location,
4175 : : "Splitting SLP group at stmt %u\n", i);
4176 : :
4177 : : /* Analyze the stored values and pinch them together with
4178 : : a permute node so we can preserve the whole store group. */
4179 : 6478 : auto_vec<slp_tree> rhs_nodes;
4180 : 6478 : poly_uint64 max_nunits = 1;
4181 : :
4182 : 6478 : unsigned int rhs_common_nlanes = 0;
4183 : 6478 : unsigned int start = 0, end = i;
4184 : 28804 : while (start < group_size)
4185 : : {
4186 : 22560 : gcc_assert (end - start >= 1);
4187 : 22560 : vec<stmt_vec_info> substmts;
4188 : 22560 : substmts.create (end - start);
4189 : 69047 : for (unsigned j = start; j < end; ++j)
4190 : 46487 : substmts.quick_push (scalar_stmts[j]);
4191 : 22560 : max_nunits = 1;
4192 : 22560 : node = vect_build_slp_tree (vinfo, substmts, end - start,
4193 : : &max_nunits,
4194 : : matches, limit, &tree_size, bst_map);
4195 : 22560 : if (node)
4196 : : {
4197 : 18270 : rhs_nodes.safe_push (node);
4198 : 18270 : vect_update_max_nunits (&max_nunits, node->max_nunits);
4199 : 18270 : if (start == 0)
4200 : 6244 : rhs_common_nlanes = SLP_TREE_LANES (node);
4201 : 12026 : else if (rhs_common_nlanes != SLP_TREE_LANES (node))
4202 : 1381 : rhs_common_nlanes = 0;
4203 : 18270 : start = end;
4204 : 18270 : if (want_store_lanes || force_single_lane)
4205 : 5193 : end = start + 1;
4206 : : else
4207 : : end = group_size;
4208 : : }
4209 : : else
4210 : : {
4211 : 4290 : substmts.release ();
4212 : 4290 : if (end - start == 1)
4213 : : {
4214 : : /* Single-lane discovery failed. Free ressources. */
4215 : 234 : for (auto node : rhs_nodes)
4216 : 0 : vect_free_slp_tree (node);
4217 : 234 : scalar_stmts.release ();
4218 : 234 : if (dump_enabled_p ())
4219 : 38 : dump_printf_loc (MSG_NOTE, vect_location,
4220 : : "SLP discovery failed\n");
4221 : 234 : return false;
4222 : : }
4223 : :
4224 : : /* ??? It really happens that we soft-fail SLP
4225 : : build at a mismatch but the matching part hard-fails
4226 : : later. As we know we arrived here with a group
4227 : : larger than one try a group of size one! */
4228 : 4056 : if (!matches[0])
4229 : 66 : end = start + 1;
4230 : : else
4231 : 9160 : for (unsigned j = start; j < end; j++)
4232 : 9160 : if (!matches[j - start])
4233 : : {
4234 : : end = j;
4235 : : break;
4236 : : }
4237 : : }
4238 : : }
4239 : :
4240 : : /* Now re-assess whether we want store lanes in case the
4241 : : discovery ended up producing all single-lane RHSs. */
4242 : 6244 : if (! want_store_lanes
4243 : 6244 : && rhs_common_nlanes == 1
4244 : 5237 : && ! STMT_VINFO_GATHER_SCATTER_P (stmt_info)
4245 : 5237 : && ! STMT_VINFO_STRIDED_P (stmt_info)
4246 : 3532 : && ! STMT_VINFO_SLP_VECT_ONLY (stmt_info)
4247 : 3521 : && compare_step_with_zero (vinfo, stmt_info) > 0
4248 : 9742 : && (vect_store_lanes_supported (SLP_TREE_VECTYPE (rhs_nodes[0]),
4249 : : group_size, masked_p)
4250 : : != IFN_LAST))
4251 : : want_store_lanes = true;
4252 : :
4253 : : /* Now we assume we can build the root SLP node from all stores. */
4254 : 6244 : if (want_store_lanes)
4255 : : {
4256 : : /* For store-lanes feed the store node with all RHS nodes
4257 : : in order. */
4258 : 0 : node = vect_create_new_slp_node (scalar_stmts,
4259 : 0 : SLP_TREE_CHILDREN
4260 : : (rhs_nodes[0]).length ());
4261 : 0 : SLP_TREE_VECTYPE (node) = SLP_TREE_VECTYPE (rhs_nodes[0]);
4262 : 0 : node->max_nunits = max_nunits;
4263 : 0 : node->ldst_lanes = true;
4264 : 0 : SLP_TREE_CHILDREN (node)
4265 : 0 : .reserve_exact (SLP_TREE_CHILDREN (rhs_nodes[0]).length ()
4266 : 0 : + rhs_nodes.length () - 1);
4267 : : /* First store value and possibly mask. */
4268 : 0 : SLP_TREE_CHILDREN (node)
4269 : 0 : .splice (SLP_TREE_CHILDREN (rhs_nodes[0]));
4270 : : /* Rest of the store values. All mask nodes are the same,
4271 : : this should be guaranteed by dataref group discovery. */
4272 : 0 : for (unsigned j = 1; j < rhs_nodes.length (); ++j)
4273 : 0 : SLP_TREE_CHILDREN (node)
4274 : 0 : .quick_push (SLP_TREE_CHILDREN (rhs_nodes[j])[0]);
4275 : 0 : for (slp_tree child : SLP_TREE_CHILDREN (node))
4276 : 0 : child->refcnt++;
4277 : : }
4278 : : else
4279 : 6244 : node = vect_build_slp_store_interleaving (rhs_nodes, scalar_stmts,
4280 : : max_nunits);
4281 : :
4282 : 24514 : while (!rhs_nodes.is_empty ())
4283 : 18270 : vect_free_slp_tree (rhs_nodes.pop ());
4284 : :
4285 : : /* Create a new SLP instance. */
4286 : 6244 : slp_instance new_instance = XNEW (class _slp_instance);
4287 : 6244 : SLP_INSTANCE_TREE (new_instance) = node;
4288 : 6244 : SLP_INSTANCE_LOADS (new_instance) = vNULL;
4289 : 6244 : SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
4290 : 6244 : SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
4291 : 6244 : SLP_INSTANCE_KIND (new_instance) = kind;
4292 : 6244 : new_instance->reduc_phis = NULL;
4293 : 6244 : new_instance->cost_vec = vNULL;
4294 : 6244 : new_instance->subgraph_entries = vNULL;
4295 : :
4296 : 6244 : if (dump_enabled_p ())
4297 : 928 : dump_printf_loc (MSG_NOTE, vect_location,
4298 : : "SLP size %u vs. limit %u.\n",
4299 : : tree_size, max_tree_size);
4300 : :
4301 : 6244 : vinfo->slp_instances.safe_push (new_instance);
4302 : :
4303 : : /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
4304 : : the number of scalar stmts in the root in a few places.
4305 : : Verify that assumption holds. */
4306 : 12488 : gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
4307 : : .length () == group_size);
4308 : :
4309 : 6244 : if (dump_enabled_p ())
4310 : : {
4311 : 928 : dump_printf_loc (MSG_NOTE, vect_location,
4312 : : "Final SLP tree for instance %p:\n",
4313 : : (void *) new_instance);
4314 : 928 : vect_print_slp_graph (MSG_NOTE, vect_location,
4315 : : SLP_INSTANCE_TREE (new_instance));
4316 : : }
4317 : 6244 : return true;
4318 : 6478 : }
4319 : : else
4320 : : /* Free the allocated memory. */
4321 : 204679 : scalar_stmts.release ();
4322 : :
4323 : : /* Even though the first vector did not all match, we might be able to SLP
4324 : : (some) of the remainder. FORNOW ignore this possibility. */
4325 : : }
4326 : : else
4327 : : /* Free the allocated memory. */
4328 : 1000244 : scalar_stmts.release ();
4329 : :
4330 : : /* Failed to SLP. */
4331 : 1204923 : if (dump_enabled_p ())
4332 : 3426 : dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
4333 : : return false;
4334 : : }
4335 : :
4336 : :
4337 : : /* Analyze an SLP instance starting from a group of grouped stores. Call
4338 : : vect_build_slp_tree to build a tree of packed stmts if possible.
4339 : : Return FALSE if it's impossible to SLP any stmt in the loop. */
4340 : :
4341 : : static bool
4342 : 1022349 : vect_analyze_slp_instance (vec_info *vinfo,
4343 : : scalar_stmts_to_slp_tree_map_t *bst_map,
4344 : : stmt_vec_info stmt_info,
4345 : : slp_instance_kind kind,
4346 : : unsigned max_tree_size, unsigned *limit,
4347 : : bool force_single_lane)
4348 : : {
4349 : 1022349 : vec<stmt_vec_info> scalar_stmts;
4350 : :
4351 : 1022349 : if (is_a <bb_vec_info> (vinfo))
4352 : 1004669 : vect_location = stmt_info->stmt;
4353 : :
4354 : 1022349 : stmt_vec_info next_info = stmt_info;
4355 : 1022349 : if (kind == slp_inst_kind_store)
4356 : : {
4357 : : /* Collect the stores and store them in scalar_stmts. */
4358 : 1021737 : scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
4359 : 5061947 : while (next_info)
4360 : : {
4361 : 3018473 : scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
4362 : 3018473 : next_info = DR_GROUP_NEXT_ELEMENT (next_info);
4363 : : }
4364 : : }
4365 : 612 : else if (kind == slp_inst_kind_reduc_chain)
4366 : : {
4367 : : /* Collect the reduction stmts and store them in scalar_stmts. */
4368 : 612 : scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info));
4369 : 3792 : while (next_info)
4370 : : {
4371 : 2568 : scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
4372 : 2568 : next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
4373 : : }
4374 : : /* Mark the first element of the reduction chain as reduction to properly
4375 : : transform the node. In the reduction analysis phase only the last
4376 : : element of the chain is marked as reduction. */
4377 : 612 : STMT_VINFO_DEF_TYPE (stmt_info)
4378 : 612 : = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
4379 : 612 : STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
4380 : 670 : = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
4381 : : }
4382 : : else
4383 : 0 : gcc_unreachable ();
4384 : :
4385 : 1022349 : vec<stmt_vec_info> roots = vNULL;
4386 : 1022349 : vec<tree> remain = vNULL;
4387 : : /* Build the tree for the SLP instance. */
4388 : 1022961 : bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
4389 : : roots, remain,
4390 : : max_tree_size, limit, bst_map,
4391 : : kind == slp_inst_kind_store
4392 : : ? stmt_info : NULL, force_single_lane);
4393 : :
4394 : : /* ??? If this is slp_inst_kind_store and the above succeeded, here's
4395 : : where we should do store group splitting. */
4396 : :
4397 : 1022349 : return res;
4398 : : }
4399 : :
4400 : : /* qsort comparator ordering SLP load nodes. */
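 : : /* An illustrative sketch of the resulting order (hypothetical
 : : loads, not taken from this coverage run): given same-group loads
 : : L1 = { a[0], a[1] } and L2 = { a[3], a[2] } plus L3 = { b[0] }
 : : from another group, the two a-group loads sort adjacent to each
 : : other; L1 (in-order lanes { 0, 1 }) sorts before L2 (load
 : : permutation { 3, 2 }); and L3 is ordered against the a-group by
 : : the UIDs of the groups' first stmts. */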
4401 : :
4402 : : static int
4403 : 2052640 : vllp_cmp (const void *a_, const void *b_)
4404 : : {
4405 : 2052640 : const slp_tree a = *(const slp_tree *)a_;
4406 : 2052640 : const slp_tree b = *(const slp_tree *)b_;
4407 : 2052640 : stmt_vec_info a0 = SLP_TREE_SCALAR_STMTS (a)[0];
4408 : 2052640 : stmt_vec_info b0 = SLP_TREE_SCALAR_STMTS (b)[0];
4409 : 2052640 : if (STMT_VINFO_GROUPED_ACCESS (a0)
4410 : 1271480 : && STMT_VINFO_GROUPED_ACCESS (b0)
4411 : 3276506 : && DR_GROUP_FIRST_ELEMENT (a0) == DR_GROUP_FIRST_ELEMENT (b0))
4412 : : {
4413 : : /* Same group, order by the number of lanes used. */
4414 : 283321 : if (SLP_TREE_LANES (a) < SLP_TREE_LANES (b))
4415 : : return 1;
4416 : 280500 : else if (SLP_TREE_LANES (a) > SLP_TREE_LANES (b))
4417 : : return -1;
4418 : : else
4419 : : {
4420 : : /* Try to order loads using the same lanes together, breaking
4421 : : the tie with the lane number that first differs. */
4422 : 277247 : if (!SLP_TREE_LOAD_PERMUTATION (a).exists ()
4423 : 277247 : && !SLP_TREE_LOAD_PERMUTATION (b).exists ())
4424 : : return 0;
4425 : 277247 : else if (SLP_TREE_LOAD_PERMUTATION (a).exists ()
4426 : 277247 : && !SLP_TREE_LOAD_PERMUTATION (b).exists ())
4427 : : return 1;
4428 : 276610 : else if (!SLP_TREE_LOAD_PERMUTATION (a).exists ()
4429 : 276610 : && SLP_TREE_LOAD_PERMUTATION (b).exists ())
4430 : : return -1;
4431 : : else
4432 : : {
4433 : 276013 : for (unsigned i = 0; i < SLP_TREE_LANES (a); ++i)
4434 : 276013 : if (SLP_TREE_LOAD_PERMUTATION (a)[i]
4435 : 276013 : != SLP_TREE_LOAD_PERMUTATION (b)[i])
4436 : : {
4437 : : /* In-order lane first; that's what the above case for
4438 : : no permutation does. */
4439 : 275724 : if (SLP_TREE_LOAD_PERMUTATION (a)[i] == i)
4440 : : return -1;
4441 : 164359 : else if (SLP_TREE_LOAD_PERMUTATION (b)[i] == i)
4442 : : return 1;
4443 : 79719 : else if (SLP_TREE_LOAD_PERMUTATION (a)[i]
4444 : 79719 : < SLP_TREE_LOAD_PERMUTATION (b)[i])
4445 : : return -1;
4446 : : else
4447 : : return 1;
4448 : : }
4449 : : return 0;
4450 : : }
4451 : : }
4452 : : }
4453 : : else /* Different groups or non-groups. */
4454 : : {
4455 : : /* Order groups as their first element to keep them together. */
4456 : 1769319 : if (STMT_VINFO_GROUPED_ACCESS (a0))
4457 : 1769319 : a0 = DR_GROUP_FIRST_ELEMENT (a0);
4458 : 1769319 : if (STMT_VINFO_GROUPED_ACCESS (b0))
4459 : 1769319 : b0 = DR_GROUP_FIRST_ELEMENT (b0);
4460 : 1769319 : if (a0 == b0)
4461 : : return 0;
4462 : : /* Tie using UID. */
4463 : 1769295 : else if (gimple_uid (STMT_VINFO_STMT (a0))
4464 : 1769295 : < gimple_uid (STMT_VINFO_STMT (b0)))
4465 : : return -1;
4466 : : else
4467 : : {
4468 : 728505 : gcc_assert (gimple_uid (STMT_VINFO_STMT (a0))
4469 : : != gimple_uid (STMT_VINFO_STMT (b0)));
4470 : : return 1;
4471 : : }
4472 : : }
4473 : : }
4474 : :
4475 : : /* Process the set of LOADS that are all from the same dataref group. */
4476 : :
4477 : : static void
4478 : 124090 : vect_lower_load_permutations (loop_vec_info loop_vinfo,
4479 : : scalar_stmts_to_slp_tree_map_t *bst_map,
4480 : : const array_slice<slp_tree> &loads,
4481 : : bool force_single_lane)
4482 : : {
4483 : : /* We at this point want to lower without a fixed VF or vector
4484 : : size in mind, which means we cannot actually compute whether we
4485 : : need three or more vectors for a load permutation yet. So always
4486 : : lower. */
4487 : 124090 : stmt_vec_info first
4488 : 124090 : = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (loads[0])[0]);
4489 : 124090 : unsigned group_lanes = DR_GROUP_SIZE (first);
4490 : :
4491 : : /* Verify if all load permutations can be implemented with a suitably
4492 : : large element load-lanes operation. */
4493 : 124090 : unsigned ld_lanes_lanes = SLP_TREE_LANES (loads[0]);
4494 : 124090 : if (STMT_VINFO_STRIDED_P (first)
4495 : 121941 : || compare_step_with_zero (loop_vinfo, first) <= 0
4496 : 119554 : || exact_log2 (ld_lanes_lanes) == -1
4497 : : /* ??? For now only support the single-lane case as there is
4498 : : missing support on the store-lane side and code generation
4499 : : isn't up to the task yet. */
4500 : 117699 : || ld_lanes_lanes != 1
4501 : 236535 : || vect_load_lanes_supported (SLP_TREE_VECTYPE (loads[0]),
4502 : : group_lanes / ld_lanes_lanes,
4503 : : false) == IFN_LAST)
4504 : : ld_lanes_lanes = 0;
4505 : : else
4506 : : /* Verify the loads access the same number of lanes aligned to
4507 : : ld_lanes_lanes. */
4508 : 0 : for (slp_tree load : loads)
4509 : : {
4510 : 0 : if (SLP_TREE_LANES (load) != ld_lanes_lanes)
4511 : : {
4512 : : ld_lanes_lanes = 0;
4513 : : break;
4514 : : }
4515 : 0 : unsigned first = SLP_TREE_LOAD_PERMUTATION (load)[0];
4516 : 0 : if (first % ld_lanes_lanes != 0)
4517 : : {
4518 : : ld_lanes_lanes = 0;
4519 : : break;
4520 : : }
4521 : 0 : for (unsigned i = 1; i < SLP_TREE_LANES (load); ++i)
4522 : : if (SLP_TREE_LOAD_PERMUTATION (load)[i] != first + i)
4523 : : {
4524 : : ld_lanes_lanes = 0;
4525 : : break;
4526 : : }
4527 : : }
4528 : :
4529 : : /* Only a power-of-two number of lanes matches interleaving with N levels.
4530 : : ??? An even number of lanes could be reduced to 1<<ceil_log2(N)-1 lanes
4531 : : at each step. */
4532 : 202459 : if (ld_lanes_lanes == 0 && exact_log2 (group_lanes) == -1 && group_lanes != 3)
4533 : : return;
4534 : :
4535 : 232813 : for (slp_tree load : loads)
4536 : : {
4537 : : /* Leave masked or gather loads alone for now. */
4538 : 140806 : if (!SLP_TREE_CHILDREN (load).is_empty ())
4539 : 48397 : continue;
4540 : :
4541 : : /* We want to pattern-match special cases here and keep those
4542 : : alone. Candidates are splats and load-lane. */
4543 : :
4544 : : /* We need to lower only loads of less than half of the group's
4545 : : lanes, including duplicate lanes. Note this leaves nodes
4546 : : with a non-1:1 load permutation around instead of canonicalizing
4547 : : those into a load and a permute node. Removing this early
4548 : : check would do such canonicalization. */
4549 : 140784 : if (SLP_TREE_LANES (load) >= (group_lanes + 1) / 2
4550 : 40703 : && ld_lanes_lanes == 0)
4551 : 40703 : continue;
4552 : :
4553 : : /* Build the permute to get the original load permutation order. */
4554 : 100081 : bool contiguous = true;
4555 : 100081 : lane_permutation_t final_perm;
4556 : 100081 : final_perm.create (SLP_TREE_LANES (load));
4557 : 200683 : for (unsigned i = 0; i < SLP_TREE_LANES (load); ++i)
4558 : : {
4559 : 100602 : final_perm.quick_push
4560 : 100602 : (std::make_pair (0, SLP_TREE_LOAD_PERMUTATION (load)[i]));
4561 : 100602 : if (i != 0
4562 : 100602 : && (SLP_TREE_LOAD_PERMUTATION (load)[i]
4563 : 521 : != SLP_TREE_LOAD_PERMUTATION (load)[i-1] + 1))
4564 : : contiguous = false;
4565 : : }
4566 : :
4567 : : /* When the load permutation accesses a contiguous, unpermuted,
4568 : : power-of-two aligned and sized chunk, leave the load alone.
4569 : : We can likely (re-)load it more efficiently rather than
4570 : : extracting it from the larger load.
4571 : : ??? Long-term some of the lowering should move to where
4572 : : the vector types involved are fixed. */
4573 : 107753 : if (!force_single_lane
4574 : 100081 : && ld_lanes_lanes == 0
4575 : 54334 : && contiguous
4576 : 54129 : && (SLP_TREE_LANES (load) > 1 || loads.size () == 1)
4577 : 15955 : && pow2p_hwi (SLP_TREE_LANES (load))
4578 : 15955 : && pow2p_hwi (group_lanes)
4579 : 7672 : && SLP_TREE_LOAD_PERMUTATION (load)[0] % SLP_TREE_LANES (load) == 0
4580 : 107753 : && group_lanes % SLP_TREE_LANES (load) == 0)
4581 : : {
4582 : 7672 : final_perm.release ();
4583 : 7672 : continue;
4584 : : }
4585 : :
4586 : : /* First build (and possibly re-use) a load node for the
4587 : : unpermuted group. Gaps in the middle and on the end are
4588 : : represented with NULL stmts. */
4589 : 92409 : vec<stmt_vec_info> stmts;
4590 : 92409 : stmts.create (group_lanes);
4591 : 289307 : for (stmt_vec_info s = first; s; s = DR_GROUP_NEXT_ELEMENT (s))
4592 : : {
4593 : 196898 : if (s != first)
4594 : 108102 : for (unsigned i = 1; i < DR_GROUP_GAP (s); ++i)
4595 : 3613 : stmts.quick_push (NULL);
4596 : 196898 : stmts.quick_push (s);
4597 : : }
4598 : 240071 : for (unsigned i = 0; i < DR_GROUP_GAP (first); ++i)
4599 : 147662 : stmts.quick_push (NULL);
4600 : 92409 : poly_uint64 max_nunits = 1;
4601 : 92409 : bool *matches = XALLOCAVEC (bool, group_lanes);
4602 : 92409 : unsigned limit = 1;
4603 : 92409 : unsigned tree_size = 0;
4604 : 92409 : slp_tree l0 = vect_build_slp_tree (loop_vinfo, stmts,
4605 : : group_lanes,
4606 : : &max_nunits, matches, &limit,
4607 : 92409 : &tree_size, bst_map);
4608 : 92409 : gcc_assert (!SLP_TREE_LOAD_PERMUTATION (l0).exists ());
4609 : :
4610 : 92409 : if (ld_lanes_lanes != 0)
4611 : : {
4612 : : /* ??? If this is not in sync with what get_load_store_type
4613 : : later decides, the SLP representation is not good for other
4614 : : store vectorization methods. */
4615 : 0 : l0->ldst_lanes = true;
4616 : 0 : load->ldst_lanes = true;
4617 : : }
4618 : :
4619 : 294757 : while (1)
4620 : : {
4621 : 193583 : unsigned group_lanes = SLP_TREE_LANES (l0);
4622 : 193583 : if (ld_lanes_lanes != 0
4623 : 193583 : || SLP_TREE_LANES (load) >= (group_lanes + 1) / 2)
4624 : : break;
4625 : :
4626 : : /* Try to lower by reducing the group to half its size using an
4627 : : interleaving scheme. For this try to compute whether all
4628 : : elements needed for this load are in even or odd elements of
4629 : : an even/odd decomposition with N consecutive elements.
4630 : : Thus { e, e, o, o, e, e, o, o } would be an even/odd decomposition
4631 : : with N == 2. */
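 : : /* A sketch of the bitmask computation below (hypothetical lanes,
 : : not from this coverage run): in an 8-lane group where final_perm
 : : uses only lanes { 0, 1, 4, 5 }, bit 2 is clear in every used
 : : lane index, so "even" reduces to 2 and even_level == 2; the
 : : resulting even extraction { 0, 1, 4, 5 } halves the group with
 : : N == 2. */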
4632 : : /* ??? Only an even number of lanes can be handled this way, but the
4633 : : fallback below could work for any number. We have to make sure
4634 : : to round up in that case. */
4635 : 101174 : gcc_assert ((group_lanes & 1) == 0 || group_lanes == 3);
4636 : 19111 : unsigned even = 0, odd = 0;
4637 : 19111 : if ((group_lanes & 1) == 0)
4638 : : {
4639 : 19111 : even = (1 << ceil_log2 (group_lanes)) - 1;
4640 : 19111 : odd = even;
4641 : 76886 : for (auto l : final_perm)
4642 : : {
4643 : 19553 : even &= ~l.second;
4644 : 19553 : odd &= l.second;
4645 : : }
4646 : : }
4647 : :
4648 : : /* Now build an even or odd extraction from the unpermuted load. */
4649 : 101174 : lane_permutation_t perm;
4650 : 101174 : perm.create ((group_lanes + 1) / 2);
4651 : 101174 : unsigned even_level = even ? 1 << ctz_hwi (even) : 0;
4652 : 101174 : unsigned odd_level = odd ? 1 << ctz_hwi (odd) : 0;
4653 : 101174 : if (even_level
4654 : 18419 : && group_lanes % (2 * even_level) == 0
4655 : : /* ??? When code generating permutes we do not try to pun
4656 : : to larger component modes, so level != 1 isn't a natural
4657 : : even/odd extract. Prefer one if possible. */
4658 : 18419 : && (even_level == 1 || !odd_level || odd_level != 1))
4659 : : {
4660 : : /* { 0, 1, ... 4, 5 ..., } */
4661 : 87637 : for (unsigned i = 0; i < group_lanes / 2 / even_level; ++i)
4662 : 142671 : for (unsigned j = 0; j < even_level; ++j)
4663 : 71418 : perm.quick_push (std::make_pair (0, 2 * i * even_level + j));
4664 : : }
4665 : 82755 : else if (odd_level)
4666 : : {
4667 : : /* { ..., 2, 3, ... 6, 7 } */
4668 : 2727 : gcc_assert (group_lanes % (2 * odd_level) == 0);
4669 : 12155 : for (unsigned i = 0; i < group_lanes / 2 / odd_level; ++i)
4670 : 18856 : for (unsigned j = 0; j < odd_level; ++j)
4671 : 9428 : perm.quick_push
4672 : 9428 : (std::make_pair (0, (2 * i + 1) * odd_level + j));
4673 : : }
4674 : : else
4675 : : {
4676 : : /* As fallback extract all used lanes and fill to half the
4677 : : group size by repeating the last element.
4678 : : ??? This is quite a bad strathegy for re-use - we could
4679 : : brute force our way to find more optimal filling lanes to
4680 : : maximize re-use when looking at all loads from the group. */
4681 : 82063 : auto_bitmap l;
4682 : 328252 : for (auto p : final_perm)
4683 : 82063 : bitmap_set_bit (l, p.second);
4684 : 82063 : unsigned i = 0;
4685 : 82063 : bitmap_iterator bi;
4686 : 164126 : EXECUTE_IF_SET_IN_BITMAP (l, 0, i, bi)
4687 : 82063 : perm.quick_push (std::make_pair (0, i));
4688 : 328252 : while (perm.length () < (group_lanes + 1) / 2)
4689 : 82063 : perm.quick_push (perm.last ());
4690 : 82063 : }
4691 : :
4692 : : /* Update final_perm with the intermediate permute. */
4693 : 202790 : for (unsigned i = 0; i < final_perm.length (); ++i)
4694 : : {
4695 : 101616 : unsigned l = final_perm[i].second;
4696 : 101616 : unsigned j;
4697 : 108729 : for (j = 0; j < perm.length (); ++j)
4698 : 108729 : if (perm[j].second == l)
4699 : : {
4700 : 101616 : final_perm[i].second = j;
4701 : 101616 : break;
4702 : : }
4703 : 101616 : gcc_assert (j < perm.length ());
4704 : : }
4705 : :
4706 : : /* And create scalar stmts. */
4707 : 101174 : vec<stmt_vec_info> perm_stmts;
4708 : 101174 : perm_stmts.create (perm.length ());
4709 : 346146 : for (unsigned i = 0; i < perm.length (); ++i)
4710 : 244972 : perm_stmts.quick_push (SLP_TREE_SCALAR_STMTS (l0)[perm[i].second]);
4711 : :
4712 : 101174 : slp_tree p = vect_create_new_slp_node (1, VEC_PERM_EXPR);
4713 : 101174 : SLP_TREE_CHILDREN (p).quick_push (l0);
4714 : 101174 : SLP_TREE_LANE_PERMUTATION (p) = perm;
4715 : 101174 : SLP_TREE_VECTYPE (p) = SLP_TREE_VECTYPE (load);
4716 : 101174 : SLP_TREE_LANES (p) = perm.length ();
4717 : 101174 : SLP_TREE_REPRESENTATIVE (p) = SLP_TREE_REPRESENTATIVE (load);
4718 : : /* ??? As we have scalar stmts for this intermediate permute we
4719 : : could CSE it via bst_map but we do not want to pick up
4720 : : another SLP node with a load permutation. We instead should
4721 : : have a "local" CSE map here. */
4722 : 101174 : SLP_TREE_SCALAR_STMTS (p) = perm_stmts;
4723 : :
4724 : : /* We now have a node for (group_lanes + 1) / 2 lanes. */
4725 : 101174 : l0 = p;
4726 : 101174 : }
4727 : :
4728 : : /* And finally from the ordered reduction node create the
4729 : : permute to shuffle the lanes into the original load-permutation
4730 : : order. We replace the original load node with this. */
4731 : 92409 : SLP_TREE_CODE (load) = VEC_PERM_EXPR;
4732 : 92409 : SLP_TREE_LOAD_PERMUTATION (load).release ();
4733 : 92409 : SLP_TREE_LANE_PERMUTATION (load) = final_perm;
4734 : 92409 : SLP_TREE_CHILDREN (load).create (1);
4735 : 92409 : SLP_TREE_CHILDREN (load).quick_push (l0);
4736 : : }
4737 : : }
4738 : :
4739 : : /* Transform SLP loads in the SLP graph created by SLP discovery to
4740 : : group loads from the same group and lower load permutations that
4741 : : are unlikely to be supported into a series of permutes.
4742 : : In the degenerate case of having only single-lane SLP instances
4743 : : this should result in a series of permute nodes emulating an
4744 : : interleaving scheme. */
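 : : /* An illustrative sketch (hypothetical group, not from this
 : : coverage run): four single-lane instances loading a[0], a[1],
 : : a[2] and a[3] from one four-lane group each end up as a
 : : VEC_PERM_EXPR selecting their lane from a shared unpermuted
 : : load of the whole group, built via the even/odd halving steps
 : : implemented by the worker function above. */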
4745 : :
4746 : : static void
4747 : 342434 : vect_lower_load_permutations (loop_vec_info loop_vinfo,
4748 : : scalar_stmts_to_slp_tree_map_t *bst_map,
4749 : : bool force_single_lane)
4750 : : {
4751 : : /* Gather and sort loads across all instances. */
4752 : 342434 : hash_set<slp_tree> visited;
4753 : 342434 : auto_vec<slp_tree> loads;
4754 : 1838266 : for (auto inst : loop_vinfo->slp_instances)
4755 : 820978 : vect_gather_slp_loads (loads, SLP_INSTANCE_TREE (inst), visited);
4756 : 342434 : if (loads.is_empty ())
4757 : 66608 : return;
4758 : 275826 : loads.qsort (vllp_cmp);
4759 : :
4760 : : /* Now process each dataref group separately. */
4761 : 275826 : unsigned firsti = 0;
4762 : 526251 : for (unsigned i = 1; i < loads.length (); ++i)
4763 : : {
4764 : 250425 : slp_tree first = loads[firsti];
4765 : 250425 : slp_tree next = loads[i];
4766 : 250425 : stmt_vec_info a0 = SLP_TREE_SCALAR_STMTS (first)[0];
4767 : 250425 : stmt_vec_info b0 = SLP_TREE_SCALAR_STMTS (next)[0];
4768 : 250425 : if (STMT_VINFO_GROUPED_ACCESS (a0)
4769 : 126941 : && STMT_VINFO_GROUPED_ACCESS (b0)
4770 : 367726 : && DR_GROUP_FIRST_ELEMENT (a0) == DR_GROUP_FIRST_ELEMENT (b0))
4771 : 53932 : continue;
4772 : : /* Now we have one or multiple SLP loads of the same group from
4773 : : firsti to i - 1. */
4774 : 196493 : if (STMT_VINFO_GROUPED_ACCESS (a0))
4775 : 73009 : vect_lower_load_permutations (loop_vinfo, bst_map,
4776 : 73009 : make_array_slice (&loads[firsti],
4777 : : i - firsti),
4778 : : force_single_lane);
4779 : : firsti = i;
4780 : : }
4781 : 551652 : if (firsti < loads.length ()
4782 : 551652 : && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (loads[firsti])[0]))
4783 : 51081 : vect_lower_load_permutations (loop_vinfo, bst_map,
4784 : 51081 : make_array_slice (&loads[firsti],
4785 : 51081 : loads.length () - firsti),
4786 : : force_single_lane);
4787 : 342434 : }
4788 : :
4789 : : /* Check if there are stmts in the loop that can be vectorized using SLP. Build SLP
4790 : : trees of packed scalar stmts if SLP is possible. */
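 : : /* Summarized from the code below: discovery is seeded, in order,
 : : from grouped stores, non-grouped loop stores, basic-block roots,
 : : reduction chains, groups of reductions, only-live stmts on loop
 : : exits, loop gconds and inductions forced live by early breaks. */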
4791 : :
4792 : : opt_result
4793 : 944022 : vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size,
4794 : : bool force_single_lane)
4795 : : {
4796 : 944022 : loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
4797 : 944022 : unsigned int i;
4798 : 944022 : stmt_vec_info first_element;
4799 : 944022 : slp_instance instance;
4800 : :
4801 : 944022 : DUMP_VECT_SCOPE ("vect_analyze_slp");
4802 : :
4803 : 944022 : unsigned limit = max_tree_size;
4804 : :
4805 : 944022 : scalar_stmts_to_slp_tree_map_t *bst_map
4806 : 944022 : = new scalar_stmts_to_slp_tree_map_t ();
4807 : :
4808 : : /* Find SLP sequences starting from groups of grouped stores. */
4809 : 2743076 : FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
4810 : 855032 : vect_analyze_slp_instance (vinfo, bst_map, first_element,
4811 : : slp_inst_kind_store, max_tree_size, &limit,
4812 : : force_single_lane);
4813 : :
4814 : : /* For loops also start SLP discovery from non-grouped stores. */
4815 : 944022 : if (loop_vinfo)
4816 : : {
4817 : : data_reference_p dr;
4818 : 1154250 : FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr)
4819 : 811816 : if (DR_IS_WRITE (dr))
4820 : : {
4821 : 258676 : stmt_vec_info stmt_info = vinfo->lookup_dr (dr)->stmt;
4822 : : /* Grouped stores are already handled above. */
4823 : 258676 : if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
4824 : 66456 : continue;
4825 : 192220 : vec<stmt_vec_info> stmts;
4826 : 192220 : vec<stmt_vec_info> roots = vNULL;
4827 : 192220 : vec<tree> remain = vNULL;
4828 : 192220 : stmts.create (1);
4829 : 192220 : stmts.quick_push (stmt_info);
4830 : 192220 : vect_build_slp_instance (vinfo, slp_inst_kind_store,
4831 : : stmts, roots, remain, max_tree_size,
4832 : : &limit, bst_map, NULL, force_single_lane);
4833 : : }
4834 : : }
4835 : :
4836 : 944022 : if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
4837 : : {
4838 : 1732535 : for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
4839 : : {
4840 : 1130947 : vect_location = bb_vinfo->roots[i].roots[0]->stmt;
4841 : : /* Apply patterns. */
4842 : 3543493 : for (unsigned j = 0; j < bb_vinfo->roots[i].stmts.length (); ++j)
4843 : 4825092 : bb_vinfo->roots[i].stmts[j]
4844 : 2469784 : = vect_stmt_to_vectorize (bb_vinfo->roots[i].stmts[j]);
4845 : 1130947 : if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
4846 : 1130947 : bb_vinfo->roots[i].stmts,
4847 : 1130947 : bb_vinfo->roots[i].roots,
4848 : 1130947 : bb_vinfo->roots[i].remain,
4849 : : max_tree_size, &limit, bst_map, NULL,
4850 : : false))
4851 : : {
4852 : 112369 : bb_vinfo->roots[i].stmts = vNULL;
4853 : 112369 : bb_vinfo->roots[i].roots = vNULL;
4854 : 112369 : bb_vinfo->roots[i].remain = vNULL;
4855 : : }
4856 : : }
4857 : : }
4858 : :
4859 : 944022 : if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4860 : : {
4861 : : /* Find SLP sequences starting from reduction chains. */
4862 : 343050 : FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
4863 : 616 : if (! STMT_VINFO_RELEVANT_P (first_element)
4864 : 4 : && ! STMT_VINFO_LIVE_P (first_element))
4865 : : ;
4866 : 612 : else if (force_single_lane
4867 : 612 : || ! vect_analyze_slp_instance (vinfo, bst_map, first_element,
4868 : : slp_inst_kind_reduc_chain,
4869 : : max_tree_size, &limit,
4870 : : force_single_lane))
4871 : : {
4872 : : /* Dissolve reduction chain group. */
4873 : 267 : stmt_vec_info vinfo = first_element;
4874 : 267 : stmt_vec_info last = NULL;
4875 : 1046 : while (vinfo)
4876 : : {
4877 : 779 : stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
4878 : 779 : REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
4879 : 779 : REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
4880 : 779 : last = vinfo;
4881 : 779 : vinfo = next;
4882 : : }
4883 : 267 : STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
4884 : : /* It can be still vectorized as part of an SLP reduction. */
4885 : 267 : loop_vinfo->reductions.safe_push (last);
4886 : : }
4887 : :
4888 : : /* Find SLP sequences starting from groups of reductions. */
4889 : 342434 : if (loop_vinfo->reductions.length () > 0)
4890 : : {
4891 : : /* Collect reduction statements we can combine into
4892 : : an SLP reduction. */
4893 : 47568 : vec<stmt_vec_info> scalar_stmts;
4894 : 47568 : scalar_stmts.create (loop_vinfo->reductions.length ());
4895 : 211831 : for (auto next_info : loop_vinfo->reductions)
4896 : : {
4897 : 69127 : next_info = vect_stmt_to_vectorize (next_info);
4898 : 69127 : if ((STMT_VINFO_RELEVANT_P (next_info)
4899 : 0 : || STMT_VINFO_LIVE_P (next_info))
4900 : : /* ??? Make sure we didn't skip a conversion around a
4901 : : reduction path. In that case we'd have to reverse
4902 : : engineer that conversion stmt following the chain using
4903 : : reduc_idx and from the PHI using reduc_def. */
4904 : 69127 : && (STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def
4905 : 69127 : || (STMT_VINFO_DEF_TYPE (next_info)
4906 : : == vect_double_reduction_def)))
4907 : : {
4908 : : /* Do not discover SLP reductions combining lane-reducing
4909 : : ops, that will fail later. */
4910 : 69049 : if (!force_single_lane
4911 : 69049 : && !lane_reducing_stmt_p (STMT_VINFO_STMT (next_info)))
4912 : 50866 : scalar_stmts.quick_push (next_info);
4913 : : else
4914 : : {
4915 : : /* Do SLP discovery for single-lane reductions. */
4916 : 18183 : vec<stmt_vec_info> stmts;
4917 : 18183 : vec<stmt_vec_info> roots = vNULL;
4918 : 18183 : vec<tree> remain = vNULL;
4919 : 18183 : stmts.create (1);
4920 : 18183 : stmts.quick_push (next_info);
4921 : 18183 : vect_build_slp_instance (vinfo,
4922 : : slp_inst_kind_reduc_group,
4923 : : stmts, roots, remain,
4924 : : max_tree_size, &limit,
4925 : : bst_map, NULL,
4926 : : force_single_lane);
4927 : : }
4928 : : }
4929 : : }
4930 : : /* Save for re-processing on failure. */
4931 : 47568 : vec<stmt_vec_info> saved_stmts = scalar_stmts.copy ();
4932 : 47568 : vec<stmt_vec_info> roots = vNULL;
4933 : 47568 : vec<tree> remain = vNULL;
4934 : 47568 : if (scalar_stmts.length () <= 1
4935 : 47568 : || !vect_build_slp_instance (loop_vinfo,
4936 : : slp_inst_kind_reduc_group,
4937 : : scalar_stmts, roots, remain,
4938 : : max_tree_size, &limit, bst_map,
4939 : : NULL, force_single_lane))
4940 : : {
4941 : 47259 : if (scalar_stmts.length () <= 1)
4942 : 47259 : scalar_stmts.release ();
4943 : : /* Do SLP discovery for single-lane reductions. */
4944 : 162733 : for (auto stmt_info : saved_stmts)
4945 : : {
4946 : 49966 : vec<stmt_vec_info> stmts;
4947 : 49966 : vec<stmt_vec_info> roots = vNULL;
4948 : 49966 : vec<tree> remain = vNULL;
4949 : 49966 : stmts.create (1);
4950 : 49966 : stmts.quick_push (vect_stmt_to_vectorize (stmt_info));
4951 : 49966 : vect_build_slp_instance (vinfo,
4952 : : slp_inst_kind_reduc_group,
4953 : : stmts, roots, remain,
4954 : : max_tree_size, &limit,
4955 : : bst_map, NULL, force_single_lane);
4956 : : }
4957 : : }
4958 : 47568 : saved_stmts.release ();
4959 : : }
4960 : :
4961 : : /* Make sure to vectorize only-live stmts, usually inductions. */
4962 : 1570150 : for (edge e : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
4963 : 1020541 : for (auto gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi);
4964 : 477693 : gsi_next (&gsi))
4965 : : {
4966 : 477693 : gphi *lc_phi = *gsi;
4967 : 477693 : tree def = gimple_phi_arg_def_from_edge (lc_phi, e);
4968 : 477693 : stmt_vec_info stmt_info;
4969 : 477693 : if (TREE_CODE (def) == SSA_NAME
4970 : 404599 : && !virtual_operand_p (def)
4971 : 197287 : && (stmt_info = loop_vinfo->lookup_def (def))
4972 : 182455 : && ((stmt_info = vect_stmt_to_vectorize (stmt_info)), true)
4973 : 182455 : && STMT_VINFO_RELEVANT (stmt_info) == vect_used_only_live
4974 : 142100 : && STMT_VINFO_LIVE_P (stmt_info)
4975 : 619793 : && (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
4976 : 112736 : || (STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
4977 : 38358 : && STMT_VINFO_REDUC_IDX (stmt_info) == -1)))
4978 : : {
4979 : 67640 : vec<stmt_vec_info> stmts;
4980 : 67640 : vec<stmt_vec_info> roots = vNULL;
4981 : 67640 : vec<tree> remain = vNULL;
4982 : 67640 : stmts.create (1);
4983 : 67640 : stmts.quick_push (vect_stmt_to_vectorize (stmt_info));
4984 : 67640 : vect_build_slp_instance (vinfo,
4985 : : slp_inst_kind_reduc_group,
4986 : : stmts, roots, remain,
4987 : : max_tree_size, &limit,
4988 : : bst_map, NULL, force_single_lane);
4989 : : }
4990 : 342434 : }
4991 : :
4992 : : /* Find SLP sequences starting from gconds. */
4993 : 843470 : for (auto cond : LOOP_VINFO_LOOP_CONDS (loop_vinfo))
4994 : : {
4995 : 200414 : auto cond_info = loop_vinfo->lookup_stmt (cond);
4996 : :
4997 : 200414 : cond_info = vect_stmt_to_vectorize (cond_info);
4998 : 200414 : vec<stmt_vec_info> roots = vNULL;
4999 : 200414 : roots.safe_push (cond_info);
5000 : 200414 : gimple *stmt = STMT_VINFO_STMT (cond_info);
5001 : 200414 : tree args0 = gimple_cond_lhs (stmt);
5002 : 200414 : tree args1 = gimple_cond_rhs (stmt);
5003 : :
5004 : : /* These should be enforced by cond lowering. */
5005 : 200414 : gcc_assert (gimple_cond_code (stmt) == NE_EXPR);
5006 : 200414 : gcc_assert (zerop (args1));
5007 : :
5008 : : /* An argument without a loop def will be codegened from vectorizing the
5009 : : root gcond itself. As such we don't need to try to build an SLP tree
5010 : : from it. If both arguments have a loop def, the resulting SLP tree
5011 : : here is likely incompatible, but we rely on it being split
5012 : : later on. */
5013 : 200414 : auto varg = loop_vinfo->lookup_def (args0);
5014 : 200414 : vec<stmt_vec_info> stmts;
5015 : 200414 : vec<tree> remain = vNULL;
5016 : 200414 : stmts.create (1);
5017 : 200414 : stmts.quick_push (vect_stmt_to_vectorize (varg));
5018 : :
5019 : 200414 : if (! vect_build_slp_instance (vinfo, slp_inst_kind_gcond,
5020 : : stmts, roots, remain,
5021 : : max_tree_size, &limit,
5022 : : bst_map, NULL, force_single_lane))
5023 : 1976 : roots.release ();
5024 : : }
5025 : :
5026 : : /* Find and create slp instances for inductions that have been forced
5027 : : live due to early break. */
5028 : 342434 : edge latch_e = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
5029 : 920840 : for (auto stmt_info : LOOP_VINFO_EARLY_BREAKS_LIVE_IVS (loop_vinfo))
5030 : : {
5031 : 277784 : vec<stmt_vec_info> stmts;
5032 : 277784 : vec<stmt_vec_info> roots = vNULL;
5033 : 277784 : vec<tree> remain = vNULL;
5034 : 277784 : gphi *phi = as_a<gphi *> (STMT_VINFO_STMT (stmt_info));
5035 : 277784 : stmts.create (1);
5036 : 277784 : tree def = gimple_phi_arg_def_from_edge (phi, latch_e);
5037 : 277784 : stmt_vec_info lc_info = loop_vinfo->lookup_def (def);
5038 : 277814 : stmts.quick_push (vect_stmt_to_vectorize (lc_info));
5039 : 277784 : vect_build_slp_instance (vinfo, slp_inst_kind_reduc_group,
5040 : : stmts, roots, remain,
5041 : : max_tree_size, &limit,
5042 : : bst_map, NULL, force_single_lane);
5043 : : /* When the latch def is from a different cycle this can only
5044 : : be an induction. Build a simple instance for this.
5045 : : ??? We should be able to start discovery from the PHI
5046 : : for all inductions, but then there will be stray
5047 : : non-SLP stmts we choke on as needing non-SLP handling. */
5048 : 277784 : auto_vec<stmt_vec_info, 1> tem;
5049 : 277784 : tem.quick_push (stmt_info);
5050 : 277784 : if (!bst_map->get (tem))
5051 : : {
5052 : 4886 : gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info)
5053 : : == vect_induction_def);
5054 : 4886 : stmts.create (1);
5055 : 4886 : stmts.quick_push (stmt_info);
5056 : 4886 : vect_build_slp_instance (vinfo, slp_inst_kind_reduc_group,
5057 : : stmts, roots, remain,
5058 : : max_tree_size, &limit,
5059 : : bst_map, NULL, force_single_lane);
5060 : : }
5061 : 277784 : }
5062 : : }
5063 : :
5064 : 944022 : hash_set<slp_tree> visited_patterns;
5065 : 944022 : slp_tree_to_load_perm_map_t perm_cache;
5066 : 944022 : slp_compat_nodes_map_t compat_cache;
5067 : :
5068 : : /* See if any patterns can be found in the SLP tree. */
5069 : 944022 : bool pattern_found = false;
5070 : 3479956 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
5071 : 1591912 : pattern_found |= vect_match_slp_patterns (instance, vinfo,
5072 : : &visited_patterns, &perm_cache,
5073 : : &compat_cache);
5074 : :
5075 : : /* If any were found optimize permutations of loads. */
5076 : 944022 : if (pattern_found)
5077 : : {
5078 : 224 : hash_map<slp_tree, slp_tree> load_map;
5079 : 1699 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
5080 : : {
5081 : 1251 : slp_tree root = SLP_INSTANCE_TREE (instance);
5082 : 1251 : optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
5083 : : &load_map, root);
5084 : : }
5085 : 224 : }
5086 : :
5087 : : /* Check whether we should force some SLP instances to use load/store-lanes
5088 : : and do so by forcing SLP re-discovery with single lanes. We used
5089 : : to cancel SLP when this applied to all instances in a loop but now
5090 : : we decide this per SLP instance. It's important to do this only
5091 : : after SLP pattern recognition. */
5092 : 944022 : if (is_a <loop_vec_info> (vinfo))
5093 : 1163412 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
5094 : 820978 : if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
5095 : 204717 : && !SLP_INSTANCE_TREE (instance)->ldst_lanes)
5096 : : {
5097 : 204717 : slp_tree slp_root = SLP_INSTANCE_TREE (instance);
5098 : 204717 : unsigned int group_size = SLP_TREE_LANES (slp_root);
5099 : 204717 : tree vectype = SLP_TREE_VECTYPE (slp_root);
5100 : :
5101 : 204717 : stmt_vec_info rep_info = SLP_TREE_REPRESENTATIVE (slp_root);
5102 : 204717 : gimple *rep = STMT_VINFO_STMT (rep_info);
5103 : 204717 : bool masked = (is_gimple_call (rep)
5104 : 1679 : && gimple_call_internal_p (rep)
5105 : 206396 : && internal_fn_mask_index
5106 : 1679 : (gimple_call_internal_fn (rep)) != -1);
5107 : 204717 : if (!STMT_VINFO_GROUPED_ACCESS (rep_info)
5108 : 16766 : || slp_root->ldst_lanes
5109 : 221483 : || (vect_store_lanes_supported (vectype, group_size, masked)
5110 : : == IFN_LAST))
5111 : 204717 : continue;
5112 : :
5113 : 0 : auto_vec<slp_tree> loads;
5114 : 0 : hash_set<slp_tree> visited;
5115 : 0 : vect_gather_slp_loads (loads, slp_root, visited);
5116 : :
5117 : : /* Check whether any load in the SLP instance is possibly
5118 : : permuted. */
5119 : 0 : bool loads_permuted = false;
5120 : 0 : slp_tree load_node;
5121 : 0 : unsigned j;
5122 : 0 : FOR_EACH_VEC_ELT (loads, j, load_node)
5123 : : {
5124 : 0 : if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
5125 : 0 : continue;
5126 : : unsigned k;
5127 : : stmt_vec_info load_info;
5128 : 0 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), k, load_info)
5129 : 0 : if (SLP_TREE_LOAD_PERMUTATION (load_node)[k] != k)
5130 : : {
5131 : : loads_permuted = true;
5132 : : break;
5133 : : }
5134 : : }
5135 : :
5136 : : /* If the loads and stores can use load/store-lanes, force re-discovery
5137 : : with single lanes. */
5138 : 0 : if (loads_permuted)
5139 : : {
5140 : 0 : bool can_use_lanes = true;
5141 : : bool prefer_load_lanes = false;
5142 : 0 : FOR_EACH_VEC_ELT (loads, j, load_node)
5143 : 0 : if (STMT_VINFO_GROUPED_ACCESS
5144 : : (SLP_TREE_REPRESENTATIVE (load_node)))
5145 : : {
5146 : 0 : stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
5147 : : (SLP_TREE_REPRESENTATIVE (load_node));
5148 : 0 : rep = STMT_VINFO_STMT (stmt_vinfo);
5149 : 0 : masked = (is_gimple_call (rep)
5150 : 0 : && gimple_call_internal_p (rep)
5151 : 0 : && internal_fn_mask_index
5152 : 0 : (gimple_call_internal_fn (rep)) != -1);
5153 : : /* Use SLP for strided accesses (or if we can't
5154 : : load-lanes). */
5155 : 0 : if (STMT_VINFO_STRIDED_P (stmt_vinfo)
5156 : 0 : || compare_step_with_zero (vinfo, stmt_vinfo) <= 0
5157 : 0 : || vect_load_lanes_supported
5158 : 0 : (STMT_VINFO_VECTYPE (stmt_vinfo),
5159 : 0 : DR_GROUP_SIZE (stmt_vinfo), masked) == IFN_LAST
5160 : : /* ??? During SLP re-discovery with a single lane
5161 : : a masked grouped load will appear permuted and
5162 : : discovery will fail. We have to rework this
5163 : : on the discovery side - for now avoid ICEing. */
5164 : 0 : || masked)
5165 : : {
5166 : : can_use_lanes = false;
5167 : : break;
5168 : : }
5169 : : /* Make sure that the target would prefer store-lanes
5170 : : for at least one of the loads.
5171 : :
5172 : : ??? Perhaps we should instead require this for
5173 : : all loads? */
5174 : 0 : prefer_load_lanes
5175 : : = (prefer_load_lanes
5176 : 0 : || SLP_TREE_LANES (load_node) == group_size
5177 : 0 : || (vect_slp_prefer_store_lanes_p
5178 : 0 : (vinfo, stmt_vinfo,
5179 : : STMT_VINFO_VECTYPE (stmt_vinfo), masked,
5180 : : group_size, SLP_TREE_LANES (load_node))));
5181 : : }
5182 : :
5183 : 0 : if (can_use_lanes && prefer_load_lanes)
5184 : : {
5185 : 0 : if (dump_enabled_p ())
5186 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
5187 : : "SLP instance %p can use load/store-lanes,"
5188 : : " re-discovering with single-lanes\n",
5189 : : (void *) instance);
5190 : :
5191 : 0 : stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (slp_root);
5192 : :
5193 : 0 : vect_free_slp_instance (instance);
5194 : 0 : limit = max_tree_size;
5195 : 0 : bool res = vect_analyze_slp_instance (vinfo, bst_map,
5196 : : stmt_info,
5197 : : slp_inst_kind_store,
5198 : : max_tree_size, &limit,
5199 : : true);
5200 : 0 : gcc_assert (res);
5201 : 0 : auto new_inst = LOOP_VINFO_SLP_INSTANCES (vinfo).pop ();
5202 : 0 : LOOP_VINFO_SLP_INSTANCES (vinfo)[i] = new_inst;
5203 : : }
5204 : : }
5205 : 0 : }
5206 : :
5207 : : /* When we end up with load permutations that we cannot possibly handle,
5208 : : like those requiring three vector inputs, lower them using
5209 : : interleaving-like schemes. */
5210 : 944022 : if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
5211 : : {
5212 : 342434 : vect_lower_load_permutations (loop_vinfo, bst_map, force_single_lane);
5213 : 342434 : if (dump_enabled_p ())
5214 : : {
5215 : 20390 : dump_printf_loc (MSG_NOTE, vect_location,
5216 : : "SLP graph after lowering permutations:\n");
5217 : 20390 : hash_set<slp_tree> visited;
5218 : 93786 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
5219 : 32926 : vect_print_slp_graph (MSG_NOTE, vect_location,
5220 : : SLP_INSTANCE_TREE (instance), visited);
5221 : 20390 : }
5222 : : }
5223 : :
5224 : 944022 : release_scalar_stmts_to_slp_tree_map (bst_map);
5225 : :
5226 : 944022 : if (pattern_found && dump_enabled_p ())
5227 : : {
5228 : 22 : dump_printf_loc (MSG_NOTE, vect_location,
5229 : : "Pattern matched SLP tree\n");
5230 : 22 : hash_set<slp_tree> visited;
5231 : 142 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
5232 : 76 : vect_print_slp_graph (MSG_NOTE, vect_location,
5233 : : SLP_INSTANCE_TREE (instance), visited);
5234 : 22 : }
5235 : :
5236 : 944022 : return opt_result::success ();
5237 : 944022 : }
5238 : :
5239 : : /* Estimates the cost of inserting layout changes into the SLP graph.
5240 : : It can also say that the insertion is impossible. */
5241 : :
5242 : : struct slpg_layout_cost
5243 : : {
5244 : 10887222 : slpg_layout_cost () = default;
5245 : : slpg_layout_cost (sreal, bool);
5246 : :
5247 : 329610 : static slpg_layout_cost impossible () { return { sreal::max (), 0 }; }
5248 : 3600100 : bool is_possible () const { return depth != sreal::max (); }
5249 : :
5250 : : bool operator== (const slpg_layout_cost &) const;
5251 : : bool operator!= (const slpg_layout_cost &) const;
5252 : :
5253 : : bool is_better_than (const slpg_layout_cost &, bool) const;
5254 : :
5255 : : void add_parallel_cost (const slpg_layout_cost &);
5256 : : void add_serial_cost (const slpg_layout_cost &);
5257 : : void split (unsigned int);
5258 : :
5259 : : /* The longest sequence of layout changes needed during any traversal
5260 : : of the partition dag, weighted by execution frequency.
5261 : :
5262 : : This is the most important metric when optimizing for speed, since
5263 : : it helps to ensure that we keep the number of operations on
5264 : : critical paths to a minimum. */
5265 : : sreal depth = 0;
5266 : :
5267 : : /* An estimate of the total number of operations needed. It is weighted by
5268 : : execution frequency when optimizing for speed but not when optimizing for
5269 : : size. In order to avoid double-counting, a node with a fanout of N will
5270 : : distribute 1/N of its total cost to each successor.
5271 : :
5272 : : This is the most important metric when optimizing for size, since
5273 : : it helps to keep the total number of operations to a minimum. */
5274 : : sreal total = 0;
5275 : : };
5276 : :
5277 : : /* Construct costs for a node with weight WEIGHT. A higher weight
5278 : : indicates more frequent execution. IS_FOR_SIZE is true if we are
5279 : : optimizing for size rather than speed. */
5280 : :
5281 : 817428 : slpg_layout_cost::slpg_layout_cost (sreal weight, bool is_for_size)
5282 : 818194 : : depth (weight), total (is_for_size && weight > 0 ? 1 : weight)
5283 : : {
5284 : 817428 : }
5285 : :
5286 : : bool
5287 : 0 : slpg_layout_cost::operator== (const slpg_layout_cost &other) const
5288 : : {
5289 : 0 : return depth == other.depth && total == other.total;
5290 : : }
5291 : :
5292 : : bool
5293 : 0 : slpg_layout_cost::operator!= (const slpg_layout_cost &other) const
5294 : : {
5295 : 0 : return !operator== (other);
5296 : : }
5297 : :
5298 : : /* Return true if these costs are better than OTHER. IS_FOR_SIZE is
5299 : : true if we are optimizing for size rather than speed. */
5300 : :
5301 : : bool
5302 : 219307 : slpg_layout_cost::is_better_than (const slpg_layout_cost &other,
5303 : : bool is_for_size) const
5304 : : {
5305 : 219307 : if (is_for_size)
5306 : : {
5307 : 337 : if (total != other.total)
5308 : 136 : return total < other.total;
5309 : 201 : return depth < other.depth;
5310 : : }
5311 : : else
5312 : : {
5313 : 218970 : if (depth != other.depth)
5314 : 100807 : return depth < other.depth;
5315 : 118163 : return total < other.total;
5316 : : }
5317 : : }
5318 : :
5319 : : /* Increase the costs to account for something with cost INPUT_COST
5320 : : happening in parallel with the current costs. */
5321 : :
5322 : : void
5323 : 235173 : slpg_layout_cost::add_parallel_cost (const slpg_layout_cost &input_cost)
5324 : : {
5325 : 235173 : depth = std::max (depth, input_cost.depth);
5326 : 235173 : total += input_cost.total;
5327 : 235173 : }
5328 : :
5329 : : /* Increase the costs to account for something with cost INPUT_COST
5330 : : happening in series with the current costs. */
5331 : :
5332 : : void
5333 : 1017607 : slpg_layout_cost::add_serial_cost (const slpg_layout_cost &other)
5334 : : {
5335 : 1017607 : depth += other.depth;
5336 : 1017607 : total += other.total;
5337 : 1017607 : }
5338 : :
5339 : : /* Split the total cost among TIMES successors or predecessors. */
5340 : :
5341 : : void
5342 : 780509 : slpg_layout_cost::split (unsigned int times)
5343 : : {
5344 : 780509 : if (times > 1)
5345 : 254188 : total /= times;
5346 : 780509 : }
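 : :
 : : /* A worked example of the accounting above (hypothetical numbers):
 : : combining { depth 2, total 3 } with a parallel { depth 1, total 1 }
 : : gives { depth 2, total 4 }; adding a serial { depth 1, total 1 }
 : : then gives { depth 3, total 5 }; split (2) halves the total to
 : : 2.5 and leaves the depth at 3. */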
5347 : :
5348 : : /* Information about one node in the SLP graph, for use during
5349 : : vect_optimize_slp_pass. */
5350 : :
5351 : : struct slpg_vertex
5352 : : {
5353 : 10342164 : slpg_vertex (slp_tree node_) : node (node_) {}
5354 : :
5355 : : /* The node itself. */
5356 : : slp_tree node;
5357 : :
5358 : : /* Which partition the node belongs to, or -1 if none. Nodes outside of
5359 : : partitions are flexible; they can have whichever layout consumers
5360 : : want them to have. */
5361 : : int partition = -1;
5362 : :
5363 : : /* The number of nodes that directly use the result of this one
5364 : : (i.e. the number of nodes that count this one as a child). */
5365 : : unsigned int out_degree = 0;
5366 : :
5367 : : /* The execution frequency of the node. */
5368 : : sreal weight = 0;
5369 : :
5370 : : /* The total execution frequency of all nodes that directly use the
5371 : : result of this one. */
5372 : : sreal out_weight = 0;
5373 : : };
5374 : :
5375 : : /* Information about one partition of the SLP graph, for use during
5376 : : vect_optimize_slp_pass. */
5377 : :
5378 : 3508932 : struct slpg_partition_info
5379 : : {
5380 : : /* The nodes in the partition occupy indices [NODE_BEGIN, NODE_END)
5381 : : of m_partitioned_nodes. */
5382 : : unsigned int node_begin = 0;
5383 : : unsigned int node_end = 0;
5384 : :
5385 : : /* Which layout we've chosen to use for this partition, or -1 if
5386 : : we haven't picked one yet. */
5387 : : int layout = -1;
5388 : :
5389 : : /* The number of predecessors and successors in the partition dag.
5390 : : The predecessors always have lower partition numbers and the
5391 : : successors always have higher partition numbers.
5392 : :
5393 : : Note that the directions of these edges are not necessarily the
5394 : : same as in the data flow graph. For example, if an SCC has separate
5395 : : partitions for an inner loop and an outer loop, the inner loop's
5396 : : partition will have at least two incoming edges from the outer loop's
5397 : : partition: one for a live-in value and one for a live-out value.
5398 : : In data flow terms, one of these edges would also be from the outer loop
5399 : : to the inner loop, but the other would be in the opposite direction. */
5400 : : unsigned int in_degree = 0;
5401 : : unsigned int out_degree = 0;
5402 : : };
5403 : :
5404 : : /* Information about the costs of using a particular layout for a
5405 : : particular partition. It can also say that the combination is
5406 : : impossible. */
5407 : :
5408 : : struct slpg_partition_layout_costs
5409 : : {
5410 : 1059495 : bool is_possible () const { return internal_cost.is_possible (); }
5411 : 38220 : void mark_impossible () { internal_cost = slpg_layout_cost::impossible (); }
5412 : :
5413 : : /* The costs inherited from predecessor partitions. */
5414 : : slpg_layout_cost in_cost;
5415 : :
5416 : : /* The inherent cost of the layout within the node itself. For example,
5417 : : this is nonzero for a load if choosing a particular layout would require
5418 : : the load to permute the loaded elements. It is nonzero for a
5419 : : VEC_PERM_EXPR if the permutation cannot be eliminated or converted
5420 : : to full-vector moves. */
5421 : : slpg_layout_cost internal_cost;
5422 : :
5423 : : /* The costs inherited from successor partitions. */
5424 : : slpg_layout_cost out_cost;
5425 : : };
5426 : :
5427 : : /* This class tries to optimize the layout of vectors in order to avoid
5428 : : unnecessary shuffling. At the moment, the set of possible layouts is
5429 : : restricted to bijective permutations.
5430 : :
5431 : : The goal of the pass depends on whether we're optimizing for size or
5432 : : for speed. When optimizing for size, the goal is to reduce the overall
5433 : : number of layout changes (including layout changes implied by things
5434 : : like load permutations). When optimizing for speed, the goal is to
5435 : : reduce the maximum latency attributable to layout changes on any
5436 : : non-cyclical path through the data flow graph.
5437 : :
5438 : : For example, when optimizing a loop nest for speed, we will prefer
5439 : : to make layout changes outside of a loop rather than inside of a loop,
5440 : : and will prefer to make layout changes in parallel rather than serially,
5441 : : even if that increases the overall number of layout changes.
5442 : :
5443 : : The high-level procedure is:
5444 : :
5445 : : (1) Build a graph in which edges go from uses (parents) to definitions
5446 : : (children).
5447 : :
5448 : : (2) Divide the graph into a dag of strongly-connected components (SCCs).
5449 : :
5450 : : (3) When optimizing for speed, partition the nodes in each SCC based
5451 : : on their containing cfg loop. When optimizing for size, treat
5452 : : each SCC as a single partition.
5453 : :
5454 : : This gives us a dag of partitions. The goal is now to assign a
5455 : : layout to each partition.
5456 : :
5457 : : (4) Construct a set of vector layouts that are worth considering.
5458 : : Record which nodes must keep their current layout.
5459 : :
5460 : : (5) Perform a forward walk over the partition dag (from loads to stores)
5461 : : accumulating the "forward" cost of using each layout. When visiting
5462 : : each partition, assign a tentative choice of layout to the partition
5463 : : and use that choice when calculating the cost of using a different
5464 : : layout in successor partitions.
5465 : :
5466 : : (6) Perform a backward walk over the partition dag (from stores to loads),
5467 : : accumulating the "backward" cost of using each layout. When visiting
5468 : : each partition, make a final choice of layout for that partition based
5469 : : on the accumulated forward costs (from (5)) and backward costs
5470 : : (from (6)).
5471 : :
5472 : : (7) Apply the chosen layouts to the SLP graph.
5473 : :
5474 : : For example, consider the SLP statements:
5475 : :
5476 : : S1: a_1 = load
5477 : : loop:
5478 : : S2: a_2 = PHI<a_1, a_3>
5479 : : S3: b_1 = load
5480 : : S4: a_3 = a_2 + b_1
5481 : : exit:
5482 : : S5: a_4 = PHI<a_3>
5483 : : S6: store a_4
5484 : :
5485 : : S2 and S4 form an SCC and are part of the same loop. Every other
5486 : : statement is in a singleton SCC. In this example there is a one-to-one
5487 : : mapping between SCCs and partitions and the partition dag looks like this;
5488 : :
5489 : : S1 S3
5490 : : \ /
5491 : : S2+S4
5492 : : |
5493 : : S5
5494 : : |
5495 : : S6
5496 : :
5497 : : S2, S3 and S4 will have a higher execution frequency than the other
5498 : : statements, so when optimizing for speed, the goal is to avoid any
5499 : : layout changes:
5500 : :
5501 : : - within S3
5502 : : - within S2+S4
5503 : : - on the S3->S2+S4 edge
5504 : :
5505 : : For example, if S3 was originally a reversing load, the goal of the
5506 : : pass is to make it an unreversed load and change the layout on the
5507 : : S1->S2+S4 and S2+S4->S5 edges to compensate. (Changing the layout
5508 : : on S1->S2+S4 and S5->S6 would also be acceptable.)
5509 : :
5510 : : The difference between SCCs and partitions becomes important if we
5511 : : add an outer loop:
5512 : :
5513 : : S1: a_1 = ...
5514 : : loop1:
5515 : : S2: a_2 = PHI<a_1, a_6>
5516 : : S3: b_1 = load
5517 : : S4: a_3 = a_2 + b_1
5518 : : loop2:
5519 : : S5: a_4 = PHI<a_3, a_5>
5520 : : S6: c_1 = load
5521 : : S7: a_5 = a_4 + c_1
5522 : : exit2:
5523 : : S8: a_6 = PHI<a_5>
5524 : : S9: store a_6
5525 : : exit1:
5526 : :
5527 : : Here, S2, S4, S5, S7 and S8 form a single SCC. However, when optimizing
5528 : : for speed, we usually do not want restrictions in the outer loop to "infect"
5529 : : the decision for the inner loop. For example, if an outer-loop node
5530 : : in the SCC contains a statement with a fixed layout, that should not
5531 : : prevent the inner loop from using a different layout. Conversely,
5532 : : the inner loop should not dictate a layout to the outer loop: if the
5533 : : outer loop does a lot of computation, then it may not be efficient to
5534 : : do all of that computation in the inner loop's preferred layout.
5535 : :
5536 : : So when optimizing for speed, we partition the SCC into S2+S4+S8 (outer)
5537 : : and S5+S7 (inner). We also try to arrange partitions so that:
5538 : :
5539 : : - the partition for an outer loop comes before the partition for
5540 : : an inner loop
5541 : :
5542 : : - if a sibling loop A dominates a sibling loop B, A's partition
5543 : : comes before B's
5544 : :
5545 : : This gives the following partition dag for the example above:
5546 : :
5547 : : S1 S3
5548 : : \ /
5549 : : S2+S4+S8 S6
5550 : : | \\ /
5551 : : | S5+S7
5552 : : |
5553 : : S9
5554 : :
5555 : : There are two edges from S2+S4+S8 to S5+S7: one for the edge S4->S5 and
5556 : : one for a reversal of the edge S7->S8.
5557 : :
5558 : : The backward walk picks a layout for S5+S7 before S2+S4+S8. The choice
5559 : : for S2+S4+S8 therefore has to balance the cost of using the outer loop's
5560 : : preferred layout against the cost of changing the layout on entry to the
5561 : : inner loop (S4->S5) and on exit from the inner loop (S7->S8 reversed).
5562 : :
5563 : : Although this works well when optimizing for speed, it has the downside
5564 : : when optimizing for size that the choice of layout for S5+S7 is completely
5565 : : independent of S9, which lessens the chance of reducing the overall number
5566 : : of permutations. We therefore do not partition SCCs when optimizing
5567 : : for size.
5568 : :
5569 : : To give a concrete example of the difference between optimizing
5570 : : for size and speed, consider:
5571 : :
5572 : : a[0] = (b[1] << c[3]) - d[1];
5573 : : a[1] = (b[0] << c[2]) - d[0];
5574 : : a[2] = (b[3] << c[1]) - d[3];
5575 : : a[3] = (b[2] << c[0]) - d[2];
5576 : :
5577 : : There are three different layouts here: one for a, one for b and d,
5578 : : and one for c. When optimizing for speed it is better to permute each
5579 : : of b, c and d into the order required by a, since those permutations
5580 : : happen in parallel. But when optimizing for size, it is better to:
5581 : :
5582 : : - permute c into the same order as b
5583 : : - do the arithmetic
5584 : : - permute the result into the order required by a
5585 : :
5586 : : This gives 2 permutations rather than 3. */
5587 : :
5588 : : class vect_optimize_slp_pass
5589 : : {
5590 : : public:
5591 : 658393 : vect_optimize_slp_pass (vec_info *vinfo) : m_vinfo (vinfo) {}
5592 : : void run ();
5593 : :
5594 : : private:
5595 : : /* Graph building. */
5596 : : struct loop *containing_loop (slp_tree);
5597 : : bool is_cfg_latch_edge (graph_edge *);
5598 : : void build_vertices (hash_set<slp_tree> &, slp_tree);
5599 : : void build_vertices ();
5600 : : void build_graph ();
5601 : :
5602 : : /* Partitioning. */
5603 : : void create_partitions ();
5604 : : template<typename T> void for_each_partition_edge (unsigned int, T);
5605 : :
5606 : : /* Layout selection. */
5607 : : bool is_compatible_layout (slp_tree, unsigned int);
5608 : : int change_layout_cost (slp_tree, unsigned int, unsigned int);
5609 : : slpg_partition_layout_costs &partition_layout_costs (unsigned int,
5610 : : unsigned int);
5611 : : void change_vec_perm_layout (slp_tree, lane_permutation_t &,
5612 : : int, unsigned int);
5613 : : int internal_node_cost (slp_tree, int, unsigned int);
5614 : : void start_choosing_layouts ();
5615 : :
5616 : : /* Cost propagation. */
5617 : : slpg_layout_cost edge_layout_cost (graph_edge *, unsigned int,
5618 : : unsigned int, unsigned int);
5619 : : slpg_layout_cost total_in_cost (unsigned int);
5620 : : slpg_layout_cost forward_cost (graph_edge *, unsigned int, unsigned int);
5621 : : slpg_layout_cost backward_cost (graph_edge *, unsigned int, unsigned int);
5622 : : void forward_pass ();
5623 : : void backward_pass ();
5624 : :
5625 : : /* Rematerialization. */
5626 : : slp_tree get_result_with_layout (slp_tree, unsigned int);
5627 : : void materialize ();
5628 : :
5629 : : /* Clean-up. */
5630 : : void remove_redundant_permutations ();
5631 : :
5632 : : /* Masked load lanes discovery. */
5633 : : void decide_masked_load_lanes ();
5634 : :
5635 : : void dump ();
5636 : :
5637 : : vec_info *m_vinfo;
5638 : :
5639 : : /* True if we should optimize the graph for size, false if we should
5640 : : optimize it for speed. (It wouldn't be easy to make this decision
5641 : : more locally.) */
5642 : : bool m_optimize_size;
5643 : :
5644 : : /* A graph of all SLP nodes, with edges leading from uses to definitions.
5645 : : In other words, a node's predecessors are its slp_tree parents and
5646 : : a node's successors are its slp_tree children. */
5647 : : graph *m_slpg = nullptr;
5648 : :
5649 : : /* The vertices of M_SLPG, indexed by slp_tree::vertex. */
5650 : : auto_vec<slpg_vertex> m_vertices;
5651 : :
5652 : : /* The list of all leaves of M_SLPG, such as external definitions, constants,
5653 : : and loads. */
5654 : : auto_vec<int> m_leafs;
5655 : :
5656 : : /* This array has one entry for every vector layout that we're considering.
5657 : : Element 0 is null and indicates "no change". Other entries describe
5658 : : permutations that are inherent in the current graph and that we would
5659 : : like to reverse if possible.
5660 : :
5661 : : For example, a permutation { 1, 2, 3, 0 } means that something has
5662 : : effectively been permuted in that way, such as a load group
5663 : : { a[1], a[2], a[3], a[0] } (viewed as a permutation of a[0:3]).
5664 : : We'd then like to apply the reverse permutation { 3, 0, 1, 2 }
5665 : : in order to put things "back" in order. */
5666 : : auto_vec<vec<unsigned> > m_perms;
5667 : :
5668 : : /* A partitioning of the nodes for which a layout must be chosen.
5669 : : Each partition represents an <SCC, cfg loop> pair; that is,
5670 : : nodes in different SCCs belong to different partitions, and nodes
5671 : : within an SCC can be further partitioned according to a containing
5672 : : cfg loop. Partition <SCC1, L1> comes before <SCC2, L2> if:
5673 : :
5674 : : - SCC1 != SCC2 and SCC1 is a predecessor of SCC2 in a forward walk
5675 : : from leaves (such as loads) to roots (such as stores).
5676 : :
5677 : : - SCC1 == SCC2 and L1's header strictly dominates L2's header. */
5678 : : auto_vec<slpg_partition_info> m_partitions;
5679 : :
5680 : : /* The list of all nodes for which a layout must be chosen. Nodes for
5681 : : partition P come before the nodes for partition P+1. Nodes within a
5682 : : partition are in reverse postorder. */
5683 : : auto_vec<unsigned int> m_partitioned_nodes;
5684 : :
5685 : : /* Index P * num-layouts + L contains the cost of using layout L
5686 : : for partition P. */
5687 : : auto_vec<slpg_partition_layout_costs> m_partition_layout_costs;
5688 : :
5689 : : /* Index N * num-layouts + L, if nonnull, is a node that provides the
5690 : : original output of node N adjusted to have layout L. */
5691 : : auto_vec<slp_tree> m_node_layouts;
5692 : : };
5693 : :
5694 : : /* Fill the vertices and leafs vectors with all nodes in the SLP graph.
5695 : : Also record whether we should optimize anything for speed rather
5696 : : than size. */
5697 : :
5698 : : void
5699 : 11144708 : vect_optimize_slp_pass::build_vertices (hash_set<slp_tree> &visited,
5700 : : slp_tree node)
5701 : : {
5702 : 11144708 : unsigned i;
5703 : 11144708 : slp_tree child;
5704 : :
5705 : 11144708 : if (visited.add (node))
5706 : 11144708 : return;
5707 : :
5708 : 10342164 : if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
5709 : : {
5710 : 7878735 : basic_block bb = gimple_bb (vect_orig_stmt (rep)->stmt);
5711 : 7185131 : if (optimize_bb_for_speed_p (bb))
5712 : 7056539 : m_optimize_size = false;
5713 : : }
5714 : :
5715 : 10342164 : node->vertex = m_vertices.length ();
5716 : 10342164 : m_vertices.safe_push (slpg_vertex (node));
5717 : :
5718 : 10342164 : bool leaf = true;
5719 : 10342164 : bool force_leaf = false;
5720 : 19766792 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5721 : 9424628 : if (child)
5722 : : {
5723 : 7960884 : leaf = false;
5724 : 7960884 : build_vertices (visited, child);
5725 : : }
5726 : : else
5727 : : force_leaf = true;
5728 : : /* Since SLP discovery works along use-def edges, all cycles have an
5729 : : entry - except for cycles where we do not handle the entry
5730 : : explicitly (but with a NULL SLP node), like some reductions and
5731 : : inductions. Force those SLP PHIs to act as leaves to make them
5732 : : backwards reachable. */
5733 : 10342164 : if (leaf || force_leaf)
5734 : 5207202 : m_leafs.safe_push (node->vertex);
5735 : : }
5736 : :
5737 : : /* Fill the vertices and leafs vectors with all nodes in the SLP graph. */
5738 : :
5739 : : void
5740 : 1316786 : vect_optimize_slp_pass::build_vertices ()
5741 : : {
5742 : 1316786 : hash_set<slp_tree> visited;
5743 : 1316786 : unsigned i;
5744 : 1316786 : slp_instance instance;
5745 : 1316786 : m_vertices.truncate (0);
5746 : 1316786 : m_leafs.truncate (0);
5747 : 7134182 : FOR_EACH_VEC_ELT (m_vinfo->slp_instances, i, instance)
5748 : 3183824 : build_vertices (visited, SLP_INSTANCE_TREE (instance));
5749 : 1316786 : }
5750 : :
5751 : : /* Apply the bijective permutation PERM to VEC, in reverse if REVERSE. */
5752 : :
5753 : : template <class T>
5754 : : static void
5755 : 142318 : vect_slp_permute (vec<unsigned> perm,
5756 : : vec<T> &vec, bool reverse)
5757 : : {
5758 : 142318 : auto_vec<T, 64> saved;
5759 : 142318 : saved.create (vec.length ());
5760 : 486798 : for (unsigned i = 0; i < vec.length (); ++i)
5761 : 344480 : saved.quick_push (vec[i]);
5762 : :
5763 : 142318 : if (reverse)
5764 : : {
5765 : 965261 : for (unsigned i = 0; i < vec.length (); ++i)
5766 : 343168 : vec[perm[i]] = saved[i];
5767 : 484890 : for (unsigned i = 0; i < vec.length (); ++i)
5768 : 584594 : gcc_assert (vec[perm[i]] == saved[i]);
5769 : : }
5770 : : else
5771 : : {
5772 : 3816 : for (unsigned i = 0; i < vec.length (); ++i)
5773 : 1312 : vec[i] = saved[perm[i]];
5774 : 143630 : for (unsigned i = 0; i < vec.length (); ++i)
5775 : 1968 : gcc_assert (vec[i] == saved[perm[i]]);
5776 : : }
5777 : 142318 : }
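 : :
 : : /* For example, with PERM = { 1, 2, 3, 0 } and VEC = { a, b, c, d },
 : :    the forward direction (vec[i] = saved[perm[i]]) produces
 : :    { b, c, d, a }, while the reverse direction (vec[perm[i]] = saved[i])
 : :    produces { d, a, b, c }; applying one after the other restores the
 : :    original order. */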
5778 : :
5779 : : /* Return the cfg loop that contains NODE. */
5780 : :
5781 : : struct loop *
5782 : 3894255 : vect_optimize_slp_pass::containing_loop (slp_tree node)
5783 : : {
5784 : 3894255 : stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
5785 : 3894255 : if (!rep)
5786 : 5485 : return ENTRY_BLOCK_PTR_FOR_FN (cfun)->loop_father;
5787 : 4244922 : return gimple_bb (vect_orig_stmt (rep)->stmt)->loop_father;
5788 : : }
5789 : :
5790 : : /* Return true if UD (an edge from a use to a definition) is associated
5791 : : with a loop latch edge in the cfg. */
5792 : :
5793 : : bool
5794 : 7960884 : vect_optimize_slp_pass::is_cfg_latch_edge (graph_edge *ud)
5795 : : {
5796 : 7960884 : slp_tree use = m_vertices[ud->src].node;
5797 : 7960884 : slp_tree def = m_vertices[ud->dest].node;
5798 : 7960884 : if ((SLP_TREE_DEF_TYPE (use) != vect_internal_def
5799 : 7960884 : || SLP_TREE_CODE (use) == VEC_PERM_EXPR)
5800 : 7506758 : || SLP_TREE_DEF_TYPE (def) != vect_internal_def)
5801 : : return false;
5802 : :
5803 : 4363516 : stmt_vec_info use_rep = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (use));
5804 : 4363516 : return (is_a<gphi *> (use_rep->stmt)
5805 : 294052 : && bb_loop_header_p (gimple_bb (use_rep->stmt))
5806 : 4526488 : && containing_loop (def) == containing_loop (use));
5807 : : }
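 : :
 : : /* For example, the use-to-def edge from the SLP node of a loop-header
 : :    PHI to the node defining the PHI's latch argument satisfies the
 : :    conditions above: both ends are internal definitions, the use is a
 : :    PHI in a loop header, and both nodes belong to the same cfg loop. */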
5808 : :
5809 : : /* Build the graph. Mark edges that correspond to cfg loop latch edges with
5810 : : a nonnull data field. */
5811 : :
5812 : : void
5813 : 1316786 : vect_optimize_slp_pass::build_graph ()
5814 : : {
5815 : 1316786 : m_optimize_size = true;
5816 : 1316786 : build_vertices ();
5817 : :
5818 : 2633572 : m_slpg = new_graph (m_vertices.length ());
5819 : 14292522 : for (slpg_vertex &v : m_vertices)
5820 : 31637508 : for (slp_tree child : SLP_TREE_CHILDREN (v.node))
5821 : 9424628 : if (child)
5822 : : {
5823 : 7960884 : graph_edge *ud = add_edge (m_slpg, v.node->vertex, child->vertex);
5824 : 7960884 : if (is_cfg_latch_edge (ud))
5825 : 154830 : ud->data = this;
5826 : : }
5827 : 1316786 : }
5828 : :
5829 : : /* Return true if E corresponds to a loop latch edge in the cfg. */
5830 : :
5831 : : static bool
5832 : 4057684 : skip_cfg_latch_edges (graph_edge *e)
5833 : : {
5834 : 4057684 : return e->data;
5835 : : }
5836 : :
5837 : : /* Create the node partitions. */
5838 : :
5839 : : void
5840 : 658393 : vect_optimize_slp_pass::create_partitions ()
5841 : : {
5842 : : /* Calculate a postorder of the graph, ignoring edges that correspond
5843 : : to natural latch edges in the cfg. Reading the vector from the end
5844 : : to the beginning gives the reverse postorder. */
5845 : 658393 : auto_vec<int> initial_rpo;
5846 : 1316786 : graphds_dfs (m_slpg, &m_leafs[0], m_leafs.length (), &initial_rpo,
5847 : : false, NULL, skip_cfg_latch_edges);
5848 : 1975179 : gcc_assert (initial_rpo.length () == m_vertices.length ());
5849 : :
5850 : : /* Calculate the strongly connected components of the graph. */
5851 : 658393 : auto_vec<int> scc_grouping;
5852 : 658393 : unsigned int num_sccs = graphds_scc (m_slpg, NULL, NULL, &scc_grouping);
5853 : :
5854 : : /* Create a new index order in which all nodes from the same SCC are
5855 : : consecutive. Use scc_pos to record the index of the first node in
5856 : : each SCC. */
5857 : 658393 : auto_vec<unsigned int> scc_pos (num_sccs);
5858 : 658393 : int last_component = -1;
5859 : 658393 : unsigned int node_count = 0;
5860 : 7146090 : for (unsigned int node_i : scc_grouping)
5861 : : {
5862 : 5170911 : if (last_component != m_slpg->vertices[node_i].component)
5863 : : {
5864 : 5081317 : last_component = m_slpg->vertices[node_i].component;
5865 : 10162634 : gcc_assert (last_component == int (scc_pos.length ()));
5866 : 5081317 : scc_pos.quick_push (node_count);
5867 : : }
5868 : 5170911 : node_count += 1;
5869 : : }
5870 : 1316786 : gcc_assert (node_count == initial_rpo.length ()
5871 : : && last_component + 1 == int (num_sccs));
5872 : :
5873 : : /* Use m_partitioned_nodes to group nodes into SCC order, with the nodes
5874 : : inside each SCC following the RPO we calculated above. The fact that
5875 : : we ignored natural latch edges when calculating the RPO should ensure
5876 : : that, for natural loop nests:
5877 : :
5878 : : - the first node that we encounter in a cfg loop is the loop header phi
5879 : : - the loop header phis are in dominance order
5880 : :
5881 : : Arranging for this is an optimization (see below) rather than a
5882 : : correctness issue. Unnatural loops with a tangled mess of backedges
5883 : : will still work correctly, but might give poorer results.
5884 : :
5885 : : Also update scc_pos so that it gives 1 + the index of the last node
5886 : : in the SCC. */
5887 : 658393 : m_partitioned_nodes.safe_grow (node_count);
5888 : 6487697 : for (unsigned int old_i = initial_rpo.length (); old_i-- > 0;)
5889 : : {
5890 : 5170911 : unsigned int node_i = initial_rpo[old_i];
5891 : 5170911 : unsigned int new_i = scc_pos[m_slpg->vertices[node_i].component]++;
5892 : 5170911 : m_partitioned_nodes[new_i] = node_i;
5893 : : }
5894 : :
5895 : : /* When optimizing for speed, partition each SCC based on the containing
5896 : : cfg loop. The order we constructed above should ensure that, for natural
5897 : : cfg loops, we'll create sub-SCC partitions for outer loops before
5898 : : the corresponding sub-SCC partitions for inner loops. Similarly,
5899 : : when one sibling loop A dominates another sibling loop B, we should
5900 : : create a sub-SCC partition for A before a sub-SCC partition for B.
5901 : :
5902 : : As above, nothing depends for correctness on whether this achieves
5903 : : a natural nesting, but we should get better results when it does. */
5904 : 1316786 : m_partitions.reserve (m_vertices.length ());
5905 : 658393 : unsigned int next_partition_i = 0;
5906 : 658393 : hash_map<struct loop *, int> loop_partitions;
5907 : 658393 : unsigned int rpo_begin = 0;
5908 : 658393 : unsigned int num_partitioned_nodes = 0;
5909 : 7056496 : for (unsigned int rpo_end : scc_pos)
5910 : : {
5911 : 5081317 : loop_partitions.empty ();
5912 : : unsigned int partition_i = next_partition_i;
5913 : 10252228 : for (unsigned int rpo_i = rpo_begin; rpo_i < rpo_end; ++rpo_i)
5914 : : {
5915 : : /* Handle externals and constants optimistically throughout.
5916 : : But treat existing vectors as fixed since we do not handle
5917 : : permuting them. */
5918 : 5170911 : unsigned int node_i = m_partitioned_nodes[rpo_i];
5919 : 5170911 : auto &vertex = m_vertices[node_i];
5920 : 5170911 : if ((SLP_TREE_DEF_TYPE (vertex.node) == vect_external_def
5921 : 462455 : && !SLP_TREE_VEC_DEFS (vertex.node).exists ())
5922 : 5173316 : || SLP_TREE_DEF_TYPE (vertex.node) == vect_constant_def)
5923 : 1573027 : vertex.partition = -1;
5924 : : else
5925 : : {
5926 : 3597884 : bool existed;
5927 : 3597884 : if (m_optimize_size)
5928 : 29573 : existed = next_partition_i > partition_i;
5929 : : else
5930 : : {
5931 : 3568311 : struct loop *loop = containing_loop (vertex.node);
5932 : 3568311 : auto &entry = loop_partitions.get_or_insert (loop, &existed);
5933 : 3568311 : if (!existed)
5934 : 3479455 : entry = next_partition_i;
5935 : 3568311 : partition_i = entry;
5936 : : }
5937 : 3597884 : if (!existed)
5938 : : {
5939 : 3508932 : m_partitions.quick_push (slpg_partition_info ());
5940 : 3508932 : next_partition_i += 1;
5941 : : }
5942 : 3597884 : vertex.partition = partition_i;
5943 : 3597884 : num_partitioned_nodes += 1;
5944 : 3597884 : m_partitions[partition_i].node_end += 1;
5945 : : }
5946 : : }
5947 : 5081317 : rpo_begin = rpo_end;
5948 : : }
5949 : :
5950 : : /* Assign ranges of consecutive node indices to each partition,
5951 : : in partition order. Start with node_end being the same as
5952 : : node_begin so that the next loop can use it as a counter. */
5953 : 658393 : unsigned int node_begin = 0;
5954 : 5484111 : for (auto &partition : m_partitions)
5955 : : {
5956 : 3508932 : partition.node_begin = node_begin;
5957 : 3508932 : node_begin += partition.node_end;
5958 : 3508932 : partition.node_end = partition.node_begin;
5959 : : }
5960 : 658393 : gcc_assert (node_begin == num_partitioned_nodes);
5961 : :
5962 : : /* Finally build the list of nodes in partition order. */
5963 : 658393 : m_partitioned_nodes.truncate (num_partitioned_nodes);
5964 : 5829304 : for (unsigned int node_i = 0; node_i < m_vertices.length (); ++node_i)
5965 : : {
5966 : 5170911 : int partition_i = m_vertices[node_i].partition;
5967 : 5170911 : if (partition_i >= 0)
5968 : : {
5969 : 3597884 : unsigned int order_i = m_partitions[partition_i].node_end++;
5970 : 3597884 : m_partitioned_nodes[order_i] = node_i;
5971 : : }
5972 : : }
5973 : 658393 : }
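 : :
 : : /* As a small illustration: for an acyclic SLP graph
 : :    store <- add <- { load A, load B } in a single basic block, every
 : :    node is its own SCC, so each node gets its own partition, and the
 : :    partitions are ordered leaves-first: the loads, then the add, then
 : :    the store. */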
5974 : :
5975 : : /* Look for edges from earlier partitions into node NODE_I and edges from
5976 : : node NODE_I into later partitions. Call:
5977 : :
5978 : : FN (ud, other_node_i)
5979 : :
5980 : : for each such use-to-def edge ud, where other_node_i is the node at the
5981 : : other end of the edge. */
5982 : :
5983 : : template<typename T>
5984 : : void
5985 : 3886385 : vect_optimize_slp_pass::for_each_partition_edge (unsigned int node_i, T fn)
5986 : : {
5987 : 3886385 : int partition_i = m_vertices[node_i].partition;
5988 : 3886385 : for (graph_edge *pred = m_slpg->vertices[node_i].pred;
5989 : 6537789 : pred; pred = pred->pred_next)
5990 : : {
5991 : 2651404 : int src_partition_i = m_vertices[pred->src].partition;
5992 : 2651404 : if (src_partition_i >= 0 && src_partition_i != partition_i)
5993 : 2473433 : fn (pred, pred->src);
5994 : : }
5995 : 3886385 : for (graph_edge *succ = m_slpg->vertices[node_i].succ;
5996 : 8224528 : succ; succ = succ->succ_next)
5997 : : {
5998 : 4338143 : int dest_partition_i = m_vertices[succ->dest].partition;
5999 : 4338143 : if (dest_partition_i >= 0 && dest_partition_i != partition_i)
6000 : 2479939 : fn (succ, succ->dest);
6001 : : }
6002 : 3886385 : }
6003 : :
6004 : : /* Return true if layout LAYOUT_I is compatible with the number of SLP lanes
6005 : : that NODE would operate on. This test is independent of NODE's actual
6006 : : operation. */
6007 : :
6008 : : bool
6009 : 1068947 : vect_optimize_slp_pass::is_compatible_layout (slp_tree node,
6010 : : unsigned int layout_i)
6011 : : {
6012 : 1068947 : if (layout_i == 0)
6013 : : return true;
6014 : :
6015 : 632682 : if (SLP_TREE_LANES (node) != m_perms[layout_i].length ())
6016 : 8438 : return false;
6017 : :
6018 : : return true;
6019 : : }
6020 : :
6021 : : /* Return the cost (in arbitrary units) of going from layout FROM_LAYOUT_I
6022 : : to layout TO_LAYOUT_I for a node like NODE. Return -1 if either of the
6023 : : layouts is incompatible with NODE or if the change is not possible for
6024 : : some other reason.
6025 : :
6026 : : The properties taken from NODE include the number of lanes and the
6027 : : vector type. The actual operation doesn't matter. */
6028 : :
6029 : : int
6030 : 458402 : vect_optimize_slp_pass::change_layout_cost (slp_tree node,
6031 : : unsigned int from_layout_i,
6032 : : unsigned int to_layout_i)
6033 : : {
6034 : 458402 : if (!is_compatible_layout (node, from_layout_i)
6035 : 458402 : || !is_compatible_layout (node, to_layout_i))
6036 : 662 : return -1;
6037 : :
6038 : 457740 : if (from_layout_i == to_layout_i)
6039 : : return 0;
6040 : :
6041 : 206906 : auto_vec<slp_tree, 1> children (1);
6042 : 206906 : children.quick_push (node);
6043 : 206906 : auto_lane_permutation_t perm (SLP_TREE_LANES (node));
6044 : 206906 : if (from_layout_i > 0)
6045 : 616920 : for (unsigned int i : m_perms[from_layout_i])
6046 : 282756 : perm.quick_push ({ 0, i });
6047 : : else
6048 : 321918 : for (unsigned int i = 0; i < SLP_TREE_LANES (node); ++i)
6049 : 226400 : perm.quick_push ({ 0, i });
6050 : 206906 : if (to_layout_i > 0)
6051 : 95949 : vect_slp_permute (m_perms[to_layout_i], perm, true);
6052 : 206906 : auto count = vectorizable_slp_permutation_1 (m_vinfo, nullptr, node, perm,
6053 : : children, false);
6054 : 206906 : if (count >= 0)
6055 : 202501 : return MAX (count, 1);
6056 : :
6057 : : /* ??? In principle we could try changing via layout 0, giving two
6058 : : layout changes rather than 1. Doing that would require
6059 : : corresponding support in get_result_with_layout. */
6060 : : return -1;
6061 : 206906 : }
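 : :
 : : /* For example, going from FROM_LAYOUT_I = { 1, 2, 3, 0 } to
 : :    TO_LAYOUT_I = { 3, 0, 1, 2 } composes the first permutation with
 : :    the inverse of the second, so the code above costs the single lane
 : :    selection { 2, 3, 0, 1 } as one VEC_PERM_EXPR on NODE's vector
 : :    type. */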
6062 : :
6063 : : /* Return the costs of assigning layout LAYOUT_I to partition PARTITION_I. */
6064 : :
6065 : : inline slpg_partition_layout_costs &
6066 : 710666 : vect_optimize_slp_pass::partition_layout_costs (unsigned int partition_i,
6067 : : unsigned int layout_i)
6068 : : {
6069 : 1421332 : return m_partition_layout_costs[partition_i * m_perms.length () + layout_i];
6070 : : }
6071 : :
6072 : : /* Change PERM in one of two ways:
6073 : :
6074 : : - if IN_LAYOUT_I < 0, accept input operand I in the layout that has been
6075 : : chosen for child I of NODE.
6076 : :
6077 : : - if IN_LAYOUT_I >= 0, accept all input operands with that layout.
6078 : :
6079 : : In both cases, arrange for the output to have layout OUT_LAYOUT_I. */
6080 : :
6081 : : void
6082 : 22565 : vect_optimize_slp_pass::
6083 : : change_vec_perm_layout (slp_tree node, lane_permutation_t &perm,
6084 : : int in_layout_i, unsigned int out_layout_i)
6085 : : {
6086 : 133313 : for (auto &entry : perm)
6087 : : {
6088 : 65618 : int this_in_layout_i = in_layout_i;
6089 : 65618 : if (this_in_layout_i < 0)
6090 : : {
6091 : 49465 : slp_tree in_node = SLP_TREE_CHILDREN (node)[entry.first];
6092 : 49465 : unsigned int in_partition_i = m_vertices[in_node->vertex].partition;
6093 : 49465 : if (in_partition_i == -1u)
6094 : 329 : continue;
6095 : 49136 : this_in_layout_i = m_partitions[in_partition_i].layout;
6096 : : }
6097 : 65289 : if (this_in_layout_i > 0)
6098 : 13335 : entry.second = m_perms[this_in_layout_i][entry.second];
6099 : : }
6100 : 22565 : if (out_layout_i > 0)
6101 : 4638 : vect_slp_permute (m_perms[out_layout_i], perm, true);
6102 : 22565 : }
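 : :
 : : /* For example, a pass-through permutation { (0,0), (0,1) } whose only
 : :    input was given layout { 1, 0 } and whose output must keep layout 0
 : :    becomes { (0,1), (0,0) }: the node absorbs the input's layout change
 : :    by swapping the lanes back explicitly. */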
6103 : :
6104 : : /* Check whether the target allows NODE to be rearranged so that the node's
6105 : : output has layout OUT_LAYOUT_I. Return the cost of the change if so,
6106 : : in the same arbitrary units as for change_layout_cost. Return -1 otherwise.
6107 : :
6108 : : If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I < 0, also check whether
6109 : : NODE can adapt to the layout changes that have (perhaps provisionally)
6110 : : been chosen for NODE's children, so that no extra permutations are
6111 : : needed on either the input or the output of NODE.
6112 : :
6113 : : If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I >= 0, instead assume
6114 : : that all inputs will be forced into layout IN_LAYOUT_I beforehand.
6115 : :
6116 : : IN_LAYOUT_I has no meaning for other types of node.
6117 : :
6118 : : Keeping the node as-is is always valid. If the target doesn't appear
6119 : : to support the node as-is, but might realistically support other layouts,
6120 : : then layout 0 instead has the cost of a worst-case permutation. On the
6121 : : one hand, this ensures that every node has at least one valid layout,
6122 : : avoiding what would otherwise be an awkward special case. On the other,
6123 : : it still encourages the pass to change an invalid pre-existing layout
6124 : : choice into a valid one. */
6125 : :
6126 : : int
6127 : 155893 : vect_optimize_slp_pass::internal_node_cost (slp_tree node, int in_layout_i,
6128 : : unsigned int out_layout_i)
6129 : : {
6130 : 155893 : const int fallback_cost = 1;
6131 : :
6132 : 155893 : if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
6133 : : {
6134 : 19796 : auto_lane_permutation_t tmp_perm;
6135 : 19796 : tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
6136 : :
6137 : : /* Check that the child nodes support the chosen layout. Checking
6138 : : the first child is enough, since any second child would have the
6139 : : same shape. */
6140 : 19796 : auto first_child = SLP_TREE_CHILDREN (node)[0];
6141 : 19796 : if (in_layout_i > 0
6142 : 19796 : && !is_compatible_layout (first_child, in_layout_i))
6143 : : return -1;
6144 : :
6145 : 19137 : change_vec_perm_layout (node, tmp_perm, in_layout_i, out_layout_i);
6146 : 38274 : int count = vectorizable_slp_permutation_1 (m_vinfo, nullptr,
6147 : : node, tmp_perm,
6148 : 19137 : SLP_TREE_CHILDREN (node),
6149 : : false);
6150 : 19137 : if (count < 0)
6151 : : {
6152 : 1600 : if (in_layout_i == 0 && out_layout_i == 0)
6153 : : {
6154 : : /* Use the fallback cost if the node could in principle support
6155 : : some nonzero layout for both the inputs and the outputs.
6156 : : Otherwise assume that the node will be rejected later
6157 : : and rebuilt from scalars. */
6158 : 365 : if (SLP_TREE_LANES (node) == SLP_TREE_LANES (first_child))
6159 : : return fallback_cost;
6160 : 287 : return 0;
6161 : : }
6162 : : return -1;
6163 : : }
6164 : :
6165 : : /* We currently have no way of telling whether the new layout is cheaper
6166 : : or more expensive than the old one. But at least in principle,
6167 : : it should be worth making zero permutations (whole-vector shuffles)
6168 : : cheaper than real permutations, in case the pass is able to remove
6169 : : the latter. */
6170 : 17537 : return count == 0 ? 0 : 1;
6171 : 19796 : }
6172 : :
6173 : 136097 : stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
6174 : 136097 : if (rep
6175 : 135038 : && STMT_VINFO_DATA_REF (rep)
6176 : 53980 : && DR_IS_READ (STMT_VINFO_DATA_REF (rep))
6177 : 173465 : && SLP_TREE_LOAD_PERMUTATION (node).exists ())
6178 : : {
6179 : 33275 : auto_load_permutation_t tmp_perm;
6180 : 33275 : tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
6181 : 33275 : if (out_layout_i > 0)
6182 : 15189 : vect_slp_permute (m_perms[out_layout_i], tmp_perm, true);
6183 : :
6184 : 33275 : poly_uint64 vf = 1;
6185 : 33275 : if (auto loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo))
6186 : 2201 : vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6187 : 33275 : unsigned int n_perms;
6188 : 33275 : if (!vect_transform_slp_perm_load_1 (m_vinfo, node, tmp_perm, vNULL,
6189 : : nullptr, vf, true, false, &n_perms))
6190 : : {
6191 : 2064 : auto rep = SLP_TREE_REPRESENTATIVE (node);
6192 : 2064 : if (out_layout_i == 0)
6193 : : {
6194 : : /* Use the fallback cost if the load is an N-to-N permutation.
6195 : : Otherwise assume that the node will be rejected later
6196 : : and rebuilt from scalars. */
6197 : 1581 : if (STMT_VINFO_GROUPED_ACCESS (rep)
6198 : 3162 : && (DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (rep))
6199 : 1581 : == SLP_TREE_LANES (node)))
6200 : 1004 : return fallback_cost;
6201 : : return 0;
6202 : : }
6203 : : return -1;
6204 : : }
6205 : :
6206 : : /* See the comment above the corresponding VEC_PERM_EXPR handling. */
6207 : 31211 : return n_perms == 0 ? 0 : 1;
6208 : 33275 : }
6209 : :
6210 : : return 0;
6211 : : }
6212 : :
6213 : : /* Decide which element layouts we should consider using. Calculate the
6214 : : weights associated with inserting layout changes on partition edges.
6215 : : Also mark partitions that cannot change layout, by setting their
6216 : : layout to zero. */
6217 : :
6218 : : void
6219 : 658393 : vect_optimize_slp_pass::start_choosing_layouts ()
6220 : : {
6221 : : /* Used to assign unique permutation indices. */
6222 : 658393 : using perm_hash = unbounded_hashmap_traits<
6223 : : vec_free_hash_base<int_hash_base<unsigned>>,
6224 : : int_hash<int, -1, -2>
6225 : : >;
6226 : 658393 : hash_map<vec<unsigned>, int, perm_hash> layout_ids;
6227 : :
6228 : : /* Layout 0 is "no change". */
6229 : 658393 : m_perms.safe_push (vNULL);
6230 : :
6231 : : /* Create layouts from existing permutations. */
6232 : 658393 : auto_load_permutation_t tmp_perm;
6233 : 5573063 : for (unsigned int node_i : m_partitioned_nodes)
6234 : : {
6235 : : /* Leaves also double as entries to the reverse graph. Allow the
6236 : : layout of those to be changed. */
6237 : 3597884 : auto &vertex = m_vertices[node_i];
6238 : 3597884 : auto &partition = m_partitions[vertex.partition];
6239 : 3597884 : if (!m_slpg->vertices[node_i].succ)
6240 : 962050 : partition.layout = 0;
6241 : :
6242 : : /* Loads and VEC_PERM_EXPRs are the only things generating permutes. */
6243 : 3597884 : slp_tree node = vertex.node;
6244 : 3597884 : stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
6245 : 3597884 : slp_tree child;
6246 : 3597884 : unsigned HOST_WIDE_INT imin, imax = 0;
6247 : 3597884 : bool any_permute = false;
6248 : 3597884 : tmp_perm.truncate (0);
6249 : 3597884 : if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
6250 : : {
6251 : : /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the node
6252 : : unpermuted, record a layout that reverses this permutation.
6253 : :
6254 : : We would need more work to cope with loads that are internally
6255 : : permuted and also have inputs (such as masks for
6256 : : IFN_MASK_LOADs). */
6257 : 463262 : gcc_assert (partition.layout == 0 && !m_slpg->vertices[node_i].succ);
6258 : 463262 : if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
6259 : : {
6260 : 335677 : partition.layout = -1;
6261 : 3583887 : continue;
6262 : : }
6263 : 127585 : dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
6264 : 127585 : imin = DR_GROUP_SIZE (dr_stmt) + 1;
6265 : 127585 : tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
6266 : : }
6267 : 6073228 : else if (SLP_TREE_CODE (node) == VEC_PERM_EXPR
6268 : 211453 : && SLP_TREE_CHILDREN (node).length () == 1
6269 : 196016 : && (child = SLP_TREE_CHILDREN (node)[0])
6270 : 3330638 : && (TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (child))
6271 : 196016 : .is_constant (&imin)))
6272 : : {
6273 : : /* If the child has the same vector size as this node,
6274 : : reversing the permutation can make the permutation a no-op.
6275 : : In other cases it can change a true permutation into a
6276 : : full-vector extract. */
6277 : 196016 : tmp_perm.reserve (SLP_TREE_LANES (node));
6278 : 539233 : for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
6279 : 343217 : tmp_perm.quick_push (SLP_TREE_LANE_PERMUTATION (node)[j].second);
6280 : : }
6281 : : else
6282 : 2938606 : continue;
6283 : :
6284 : 887711 : for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
6285 : : {
6286 : 564110 : unsigned idx = tmp_perm[j];
6287 : 564110 : imin = MIN (imin, idx);
6288 : 564110 : imax = MAX (imax, idx);
6289 : 564110 : if (idx - tmp_perm[0] != j)
6290 : 195570 : any_permute = true;
6291 : : }
6292 : : /* If the span doesn't match, we'd disrupt VF computation;
6293 : : avoid that for now. */
6294 : 323601 : if (imax - imin + 1 != SLP_TREE_LANES (node))
6295 : 116373 : continue;
6296 : : /* If there's no permute, there's no need to split one out. In this case
6297 : : we can consider turning a load into a permuted load, if that
6298 : : turns out to be cheaper than alternatives. */
6299 : 207228 : if (!any_permute)
6300 : : {
6301 : 193048 : partition.layout = -1;
6302 : 193048 : continue;
6303 : : }
6304 : :
6305 : : /* For now only handle true permutes, like
6306 : : vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
6307 : : when permuting constants and invariants, keeping the permute
6308 : : bijective. */
6309 : 14180 : auto_sbitmap load_index (SLP_TREE_LANES (node));
6310 : 14180 : bitmap_clear (load_index);
6311 : 57110 : for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
6312 : 42930 : bitmap_set_bit (load_index, tmp_perm[j] - imin);
6313 : : unsigned j;
6314 : 56193 : for (j = 0; j < SLP_TREE_LANES (node); ++j)
6315 : 42196 : if (!bitmap_bit_p (load_index, j))
6316 : : break;
6317 : 14180 : if (j != SLP_TREE_LANES (node))
6318 : 183 : continue;
6319 : :
6320 : 13997 : vec<unsigned> perm = vNULL;
6321 : 13997 : perm.safe_grow (SLP_TREE_LANES (node), true);
6322 : 55883 : for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
6323 : 41886 : perm[j] = tmp_perm[j] - imin;
6324 : :
6325 : 27994 : if (int (m_perms.length ()) >= param_vect_max_layout_candidates)
6326 : : {
6327 : : /* Continue to use existing layouts, but don't add any more. */
6328 : 0 : int *entry = layout_ids.get (perm);
6329 : 0 : partition.layout = entry ? *entry : 0;
6330 : 0 : perm.release ();
6331 : : }
6332 : : else
6333 : : {
6334 : 13997 : bool existed;
6335 : 13997 : int &layout_i = layout_ids.get_or_insert (perm, &existed);
6336 : 13997 : if (existed)
6337 : 3476 : perm.release ();
6338 : : else
6339 : : {
6340 : 10521 : layout_i = m_perms.length ();
6341 : 10521 : m_perms.safe_push (perm);
6342 : : }
6343 : 13997 : partition.layout = layout_i;
6344 : : }
6345 : 14180 : }
6346 : :
6347 : : /* Initially assume that every layout is possible and has zero cost
6348 : : in every partition. */
6349 : 658393 : m_partition_layout_costs.safe_grow_cleared (m_partitions.length ()
6350 : 1316786 : * m_perms.length ());
6351 : :
6352 : : /* We have to mark, as to-be-materialized, outgoing permutations facing
6353 : : graph entries for non-associating reductions that are not explicitly
6354 : : represented. slp_inst_kind_bb_reduc currently covers only associatable reductions. */
6355 : 3567091 : for (slp_instance instance : m_vinfo->slp_instances)
6356 : 1591912 : if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor)
6357 : : {
6358 : 1414 : unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
6359 : 1414 : m_partitions[m_vertices[node_i].partition].layout = 0;
6360 : : }
6361 : 1590498 : else if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_reduc_chain)
6362 : : {
6363 : 345 : stmt_vec_info stmt_info
6364 : 345 : = SLP_TREE_REPRESENTATIVE (SLP_INSTANCE_TREE (instance));
6365 : 345 : stmt_vec_info reduc_info = info_for_reduction (m_vinfo, stmt_info);
6366 : 345 : if (needs_fold_left_reduction_p (TREE_TYPE
6367 : : (gimple_get_lhs (stmt_info->stmt)),
6368 : : STMT_VINFO_REDUC_CODE (reduc_info)))
6369 : : {
6370 : 71 : unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
6371 : 71 : m_partitions[m_vertices[node_i].partition].layout = 0;
6372 : : }
6373 : : }
6374 : :
6375 : : /* Check which layouts each node and partition can handle. Calculate the
6376 : : weights associated with inserting layout changes on edges. */
6377 : 5573063 : for (unsigned int node_i : m_partitioned_nodes)
6378 : : {
6379 : 3597884 : auto &vertex = m_vertices[node_i];
6380 : 3597884 : auto &partition = m_partitions[vertex.partition];
6381 : 3597884 : slp_tree node = vertex.node;
6382 : :
6383 : 3597884 : if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
6384 : : {
6385 : 3592395 : vertex.weight = vect_slp_node_weight (node);
6386 : :
6387 : : /* We do not handle stores with a permutation, so all
6388 : : incoming permutations must have been materialized.
6389 : :
6390 : : We also don't handle masked grouped loads, which lack a
6391 : : permutation vector. In this case the memory locations
6392 : : form an implicit second input to the loads, on top of the
6393 : : explicit mask input, and the memory input's layout cannot
6394 : : be changed.
6395 : :
6396 : : On the other hand, we do support permuting gather loads and
6397 : : masked gather loads, where each scalar load is independent
6398 : : of the others. This can be useful if the address/index input
6399 : : benefits from permutation. */
6400 : 3592395 : if (STMT_VINFO_DATA_REF (rep)
6401 : 1696795 : && STMT_VINFO_GROUPED_ACCESS (rep)
6402 : 4752480 : && !SLP_TREE_LOAD_PERMUTATION (node).exists ())
6403 : 1032500 : partition.layout = 0;
6404 : :
6405 : : /* We cannot change the layout of an operation that does not
6406 : : operate independently on each lane. Note this is an explicit
6407 : : negative list since it's much shorter than the corresponding
6408 : : positive one, but it's critical to keep maintaining it. */
6409 : 3592395 : if (is_gimple_call (STMT_VINFO_STMT (rep)))
6410 : 22634 : switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep)))
6411 : : {
6412 : 438 : case CFN_COMPLEX_ADD_ROT90:
6413 : 438 : case CFN_COMPLEX_ADD_ROT270:
6414 : 438 : case CFN_COMPLEX_MUL:
6415 : 438 : case CFN_COMPLEX_MUL_CONJ:
6416 : 438 : case CFN_VEC_ADDSUB:
6417 : 438 : case CFN_VEC_FMADDSUB:
6418 : 438 : case CFN_VEC_FMSUBADD:
6419 : 438 : partition.layout = 0;
6420 : : default:;
6421 : : }
6422 : : }
6423 : :
6424 : 8067480 : auto process_edge = [&](graph_edge *ud, unsigned int other_node_i)
6425 : : {
6426 : 4469596 : auto &other_vertex = m_vertices[other_node_i];
6427 : :
6428 : : /* Count the number of edges from earlier partitions and the number
6429 : : of edges to later partitions. */
6430 : 4469596 : if (other_vertex.partition < vertex.partition)
6431 : 2234798 : partition.in_degree += 1;
6432 : : else
6433 : 2234798 : partition.out_degree += 1;
6434 : :
6435 : : /* If the current node uses the result of OTHER_NODE_I, accumulate
6436 : : the effects of that. */
6437 : 4469596 : if (ud->src == int (node_i))
6438 : : {
6439 : 2234798 : other_vertex.out_weight += vertex.weight;
6440 : 2234798 : other_vertex.out_degree += 1;
6441 : : }
6442 : 8067480 : };
6443 : 3597884 : for_each_partition_edge (node_i, process_edge);
6444 : : }
6445 : 658393 : }
6446 : :
6447 : : /* Return the incoming costs for node NODE_I, assuming that each input keeps
6448 : : its current (provisional) choice of layout. The inputs do not necessarily
6449 : : have the same layout as each other. */
6450 : :
6451 : : slpg_layout_cost
6452 : 3019 : vect_optimize_slp_pass::total_in_cost (unsigned int node_i)
6453 : : {
6454 : 3019 : auto &vertex = m_vertices[node_i];
6455 : 3019 : slpg_layout_cost cost;
6456 : 10808 : auto add_cost = [&](graph_edge *, unsigned int other_node_i)
6457 : : {
6458 : 7789 : auto &other_vertex = m_vertices[other_node_i];
6459 : 7789 : if (other_vertex.partition < vertex.partition)
6460 : : {
6461 : 4942 : auto &other_partition = m_partitions[other_vertex.partition];
6462 : 9884 : auto &other_costs = partition_layout_costs (other_vertex.partition,
6463 : 4942 : other_partition.layout);
6464 : 4942 : slpg_layout_cost this_cost = other_costs.in_cost;
6465 : 4942 : this_cost.add_serial_cost (other_costs.internal_cost);
6466 : 4942 : this_cost.split (other_partition.out_degree);
6467 : 4942 : cost.add_parallel_cost (this_cost);
6468 : : }
6469 : 10808 : };
6470 : 3019 : for_each_partition_edge (node_i, add_cost);
6471 : 3019 : return cost;
6472 : : }
6473 : :
6474 : : /* Return the cost of switching between layout LAYOUT1_I (at node NODE1_I)
6475 : : and layout LAYOUT2_I on cross-partition use-to-def edge UD. Return
6476 : : slpg_layout_cost::impossible () if the change isn't possible. */
6477 : :
6478 : : slpg_layout_cost
6479 : 458402 : vect_optimize_slp_pass::
6480 : : edge_layout_cost (graph_edge *ud, unsigned int node1_i, unsigned int layout1_i,
6481 : : unsigned int layout2_i)
6482 : : {
6483 : 458402 : auto &def_vertex = m_vertices[ud->dest];
6484 : 458402 : auto &use_vertex = m_vertices[ud->src];
6485 : 458402 : auto def_layout_i = ud->dest == int (node1_i) ? layout1_i : layout2_i;
6486 : 458402 : auto use_layout_i = ud->dest == int (node1_i) ? layout2_i : layout1_i;
6487 : 458402 : auto factor = change_layout_cost (def_vertex.node, def_layout_i,
6488 : : use_layout_i);
6489 : 458402 : if (factor < 0)
6490 : 5067 : return slpg_layout_cost::impossible ();
6491 : :
6492 : : /* We have a choice of putting the layout change at the site of the
6493 : : definition or at the site of the use. Prefer the former when
6494 : : optimizing for size or when the execution frequency of the
6495 : : definition is no greater than the combined execution frequencies of
6496 : : the uses. When putting the layout change at the site of the definition,
6497 : : divvy up the cost among all consumers. */
6498 : 453335 : if (m_optimize_size || def_vertex.weight <= def_vertex.out_weight)
6499 : : {
6500 : 436827 : slpg_layout_cost cost = { def_vertex.weight * factor, m_optimize_size };
6501 : 436827 : cost.split (def_vertex.out_degree);
6502 : 436827 : return cost;
6503 : : }
6504 : 16508 : return { use_vertex.weight * factor, m_optimize_size };
6505 : : }
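 : :
 : : /* For example, if the definition has weight 8 and four consumers and
 : :    the change factor is 1, putting the layout change at the definition
 : :    splits a cost of 8 across the four use-to-def edges (2 each),
 : :    whereas putting it at a use of weight 16 charges that single edge
 : :    the full 16. */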
6506 : :
6507 : : /* UD represents a use-def link between FROM_NODE_I and a node in a later
6508 : : partition; FROM_NODE_I could be the definition node or the use node.
6509 : : The node at the other end of the link wants to use layout TO_LAYOUT_I.
6510 : : Return the cost of any necessary fix-ups on edge UD, or return
6511 : : slpg_layout_cost::impossible () if the change isn't possible.
6512 : :
6513 : : At this point, FROM_NODE_I's partition has chosen the cheapest
6514 : : layout based on the information available so far, but this choice
6515 : : is only provisional. */
6516 : :
6517 : : slpg_layout_cost
6518 : 119117 : vect_optimize_slp_pass::forward_cost (graph_edge *ud, unsigned int from_node_i,
6519 : : unsigned int to_layout_i)
6520 : : {
6521 : 119117 : auto &from_vertex = m_vertices[from_node_i];
6522 : 119117 : unsigned int from_partition_i = from_vertex.partition;
6523 : 119117 : slpg_partition_info &from_partition = m_partitions[from_partition_i];
6524 : 119117 : gcc_assert (from_partition.layout >= 0);
6525 : :
6526 : : /* First calculate the cost on the assumption that FROM_PARTITION sticks
6527 : : with its current layout preference. */
6528 : 119117 : slpg_layout_cost cost = slpg_layout_cost::impossible ();
6529 : 119117 : auto edge_cost = edge_layout_cost (ud, from_node_i,
6530 : 119117 : from_partition.layout, to_layout_i);
6531 : 119117 : if (edge_cost.is_possible ())
6532 : : {
6533 : 232720 : auto &from_costs = partition_layout_costs (from_partition_i,
6534 : 116360 : from_partition.layout);
6535 : 116360 : cost = from_costs.in_cost;
6536 : 116360 : cost.add_serial_cost (from_costs.internal_cost);
6537 : 116360 : cost.split (from_partition.out_degree);
6538 : 116360 : cost.add_serial_cost (edge_cost);
6539 : : }
6540 : 2757 : else if (from_partition.layout == 0)
6541 : : /* We must allow the source partition to have layout 0 as a fallback,
6542 : : in case all other options turn out to be impossible. */
6543 : 2757 : return cost;
6544 : :
6545 : : /* Take the minimum of that cost and the cost that applies if
6546 : : FROM_PARTITION instead switches to TO_LAYOUT_I. */
6547 : 116360 : auto &direct_layout_costs = partition_layout_costs (from_partition_i,
6548 : : to_layout_i);
6549 : 116360 : if (direct_layout_costs.is_possible ())
6550 : : {
6551 : 108509 : slpg_layout_cost direct_cost = direct_layout_costs.in_cost;
6552 : 108509 : direct_cost.add_serial_cost (direct_layout_costs.internal_cost);
6553 : 108509 : direct_cost.split (from_partition.out_degree);
6554 : 108509 : if (!cost.is_possible ()
6555 : 108509 : || direct_cost.is_better_than (cost, m_optimize_size))
6556 : 33369 : cost = direct_cost;
6557 : : }
6558 : :
6559 : 116360 : return cost;
6560 : : }
6561 : :
6562 : : /* UD represents a use-def link between TO_NODE_I and a node in an earlier
6563 : : partition; TO_NODE_I could be the definition node or the use node.
6564 : : The node at the other end of the link wants to use layout FROM_LAYOUT_I;
6565 : : return the cost of any necessary fix-ups on edge UD, or
6566 : : slpg_layout_cost::impossible () if the choice cannot be made.
6567 : :
6568 : : At this point, TO_NODE_I's partition has a fixed choice of layout. */
6569 : :
6570 : : slpg_layout_cost
6571 : 113871 : vect_optimize_slp_pass::backward_cost (graph_edge *ud, unsigned int to_node_i,
6572 : : unsigned int from_layout_i)
6573 : : {
6574 : 113871 : auto &to_vertex = m_vertices[to_node_i];
6575 : 113871 : unsigned int to_partition_i = to_vertex.partition;
6576 : 113871 : slpg_partition_info &to_partition = m_partitions[to_partition_i];
6577 : 113871 : gcc_assert (to_partition.layout >= 0);
6578 : :
6579 : : /* If TO_NODE_I is a VEC_PERM_EXPR consumer, see whether it can be
6580 : : adjusted for this input having layout FROM_LAYOUT_I. Assume that
6581 : : any other inputs keep their current choice of layout. */
6582 : 113871 : auto &to_costs = partition_layout_costs (to_partition_i,
6583 : : to_partition.layout);
6584 : 113871 : if (ud->src == int (to_node_i)
6585 : 113695 : && SLP_TREE_CODE (to_vertex.node) == VEC_PERM_EXPR)
6586 : : {
6587 : 8633 : auto &from_partition = m_partitions[m_vertices[ud->dest].partition];
6588 : 8633 : auto old_layout = from_partition.layout;
6589 : 8633 : from_partition.layout = from_layout_i;
6590 : 17266 : int factor = internal_node_cost (to_vertex.node, -1,
6591 : 8633 : to_partition.layout);
6592 : 8633 : from_partition.layout = old_layout;
6593 : 8633 : if (factor >= 0)
6594 : : {
6595 : 7941 : slpg_layout_cost cost = to_costs.out_cost;
6596 : 15882 : cost.add_serial_cost ({ to_vertex.weight * factor,
6597 : 7941 : m_optimize_size });
6598 : 7941 : cost.split (to_partition.in_degree);
6599 : 7941 : return cost;
6600 : : }
6601 : : }
6602 : :
6603 : : /* Compute the cost if we insert any necessary layout change on edge UD. */
6604 : 105930 : auto edge_cost = edge_layout_cost (ud, to_node_i,
6605 : 105930 : to_partition.layout, from_layout_i);
6606 : 105930 : if (edge_cost.is_possible ())
6607 : : {
6608 : 105930 : slpg_layout_cost cost = to_costs.out_cost;
6609 : 105930 : cost.add_serial_cost (to_costs.internal_cost);
6610 : 105930 : cost.split (to_partition.in_degree);
6611 : 105930 : cost.add_serial_cost (edge_cost);
6612 : 105930 : return cost;
6613 : : }
6614 : :
6615 : 0 : return slpg_layout_cost::impossible ();
6616 : : }
6617 : :
6618 : : /* Make a forward pass through the partitions, accumulating input costs.
6619 : : Make a tentative (provisional) choice of layout for each partition,
6620 : : ensuring that this choice still allows later partitions to keep
6621 : : their original layout. */
6622 : :
6623 : : void
6624 : 9999 : vect_optimize_slp_pass::forward_pass ()
6625 : : {
6626 : 93602 : for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
6627 : : ++partition_i)
6628 : : {
6629 : 83603 : auto &partition = m_partitions[partition_i];
6630 : :
6631 : : /* If the partition consists of a single VEC_PERM_EXPR, precompute
6632 : : the incoming cost that would apply if every predecessor partition
6633 : : keeps its current layout. This is used within the loop below. */
6634 : 83603 : slpg_layout_cost in_cost;
6635 : 83603 : slp_tree single_node = nullptr;
6636 : 83603 : if (partition.node_end == partition.node_begin + 1)
6637 : : {
6638 : 83092 : unsigned int node_i = m_partitioned_nodes[partition.node_begin];
6639 : 83092 : single_node = m_vertices[node_i].node;
6640 : 83092 : if (SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
6641 : 3019 : in_cost = total_in_cost (node_i);
6642 : : }
6643 : :
6644 : : /* Go through the possible layouts. Decide which ones are valid
6645 : : for this partition and record which of the valid layouts has
6646 : : the lowest cost. */
6647 : 83603 : unsigned int min_layout_i = 0;
6648 : 83603 : slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
6649 : 258474 : for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
6650 : : {
6651 : 174871 : auto &layout_costs = partition_layout_costs (partition_i, layout_i);
6652 : 174871 : if (!layout_costs.is_possible ())
6653 : 38220 : continue;
6654 : :
6655 : : /* If the recorded layout is already 0 then the layout cannot
6656 : : change. */
6657 : 174871 : if (partition.layout == 0 && layout_i != 0)
6658 : : {
6659 : 27659 : layout_costs.mark_impossible ();
6660 : 27659 : continue;
6661 : : }
6662 : :
6663 : 147212 : bool is_possible = true;
6664 : 288085 : for (unsigned int order_i = partition.node_begin;
6665 : 288085 : order_i < partition.node_end; ++order_i)
6666 : : {
6667 : 149210 : unsigned int node_i = m_partitioned_nodes[order_i];
6668 : 149210 : auto &vertex = m_vertices[node_i];
6669 : :
6670 : : /* Reject the layout if it is individually incompatible
6671 : : with any node in the partition. */
6672 : 149210 : if (!is_compatible_layout (vertex.node, layout_i))
6673 : : {
6674 : 7117 : is_possible = false;
6675 : 8337 : break;
6676 : : }
6677 : :
6678 : 378305 : auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
6679 : : {
6680 : 236212 : auto &other_vertex = m_vertices[other_node_i];
6681 : 236212 : if (other_vertex.partition < vertex.partition)
6682 : : {
6683 : : /* Accumulate the incoming costs from earlier
6684 : : partitions, plus the cost of any layout changes
6685 : : on UD itself. */
6686 : 119117 : auto cost = forward_cost (ud, other_node_i, layout_i);
6687 : 119117 : if (!cost.is_possible ())
6688 : 2757 : is_possible = false;
6689 : : else
6690 : 116360 : layout_costs.in_cost.add_parallel_cost (cost);
6691 : : }
6692 : : else
6693 : : /* Reject the layout if it would make layout 0 impossible
6694 : : for later partitions. This amounts to testing that the
6695 : : target supports reversing the layout change on edges
6696 : : to later partitions.
6697 : :
6698 : : In principle, it might be possible to push a layout
6699 : : change all the way down a graph, so that it never
6700 : : needs to be reversed and so that the target doesn't
6701 : : need to support the reverse operation. But it would
6702 : : be awkward to bail out if we hit a partition that
6703 : : does not support the new layout, especially since
6704 : : we are not dealing with a lattice. */
6705 : 117095 : is_possible &= edge_layout_cost (ud, other_node_i, 0,
6706 : 117095 : layout_i).is_possible ();
6707 : 378305 : };
6708 : 142093 : for_each_partition_edge (node_i, add_cost);
6709 : :
6710 : : /* Accumulate the cost of using LAYOUT_I within NODE,
6711 : : both for the inputs and the outputs. */
6712 : 142093 : int factor = internal_node_cost (vertex.node, layout_i,
6713 : : layout_i);
6714 : 142093 : if (factor < 0)
6715 : : {
6716 : 1220 : is_possible = false;
6717 : 1220 : break;
6718 : : }
6719 : 140873 : else if (factor)
6720 : 21840 : layout_costs.internal_cost.add_serial_cost
6721 : 21840 : ({ vertex.weight * factor, m_optimize_size });
6722 : : }
6723 : 147212 : if (!is_possible)
6724 : : {
6725 : 10561 : layout_costs.mark_impossible ();
6726 : 10561 : continue;
6727 : : }
6728 : :
6729 : : /* Combine the incoming and partition-internal costs. */
6730 : 136651 : slpg_layout_cost combined_cost = layout_costs.in_cost;
6731 : 136651 : combined_cost.add_serial_cost (layout_costs.internal_cost);
6732 : :
6733 : : /* If this partition consists of a single VEC_PERM_EXPR, see
6734 : : if the VEC_PERM_EXPR can be changed to support output layout
6735 : : LAYOUT_I while keeping all the provisional choices of input
6736 : : layout. */
6737 : 136651 : if (single_node
6738 : 135664 : && SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
6739 : : {
6740 : 5167 : int factor = internal_node_cost (single_node, -1, layout_i);
6741 : 5167 : if (factor >= 0)
6742 : : {
6743 : 4702 : auto weight = m_vertices[single_node->vertex].weight;
6744 : 4702 : slpg_layout_cost internal_cost
6745 : 4702 : = { weight * factor, m_optimize_size };
6746 : :
6747 : 4702 : slpg_layout_cost alt_cost = in_cost;
6748 : 4702 : alt_cost.add_serial_cost (internal_cost);
6749 : 4702 : if (alt_cost.is_better_than (combined_cost, m_optimize_size))
6750 : : {
6751 : 1383 : combined_cost = alt_cost;
6752 : 1383 : layout_costs.in_cost = in_cost;
6753 : 1383 : layout_costs.internal_cost = internal_cost;
6754 : : }
6755 : : }
6756 : : }
6757 : :
6758 : : /* Record the layout with the lowest cost. Prefer layout 0 in
6759 : : the event of a tie between it and another layout. */
6760 : 136651 : if (!min_layout_cost.is_possible ()
6761 : 53048 : || combined_cost.is_better_than (min_layout_cost,
6762 : 53048 : m_optimize_size))
6763 : : {
6764 : 98026 : min_layout_i = layout_i;
6765 : 98026 : min_layout_cost = combined_cost;
6766 : : }
6767 : : }
6768 : :
6769 : : /* This loop's handling of earlier partitions should ensure that
6770 : : choosing the original layout for the current partition is no
6771 : : less valid than it was in the original graph, even with the
6772 : : provisional layout choices for those earlier partitions. */
6773 : 83603 : gcc_assert (min_layout_cost.is_possible ());
6774 : 83603 : partition.layout = min_layout_i;
6775 : : }
6776 : 9999 : }
6777 : :
6778 : : /* Make a backward pass through the partitions, accumulating output costs.
6779 : : Make a final choice of layout for each partition. */
6780 : :
6781 : : void
6782 : 9999 : vect_optimize_slp_pass::backward_pass ()
6783 : : {
6784 : 103601 : for (unsigned int partition_i = m_partitions.length (); partition_i-- > 0;)
6785 : : {
6786 : 83603 : auto &partition = m_partitions[partition_i];
6787 : :
6788 : 83603 : unsigned int min_layout_i = 0;
6789 : 83603 : slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
6790 : 258474 : for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
6791 : : {
6792 : 174871 : auto &layout_costs = partition_layout_costs (partition_i, layout_i);
6793 : 174871 : if (!layout_costs.is_possible ())
6794 : 38220 : continue;
6795 : :
6796 : : /* Accumulate the costs from successor partitions. */
6797 : 136651 : bool is_possible = true;
6798 : 275271 : for (unsigned int order_i = partition.node_begin;
6799 : 275271 : order_i < partition.node_end; ++order_i)
6800 : : {
6801 : 138620 : unsigned int node_i = m_partitioned_nodes[order_i];
6802 : 138620 : auto &vertex = m_vertices[node_i];
6803 : 368751 : auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
6804 : : {
6805 : 230131 : auto &other_vertex = m_vertices[other_node_i];
6806 : 230131 : auto &other_partition = m_partitions[other_vertex.partition];
6807 : 230131 : if (other_vertex.partition > vertex.partition)
6808 : : {
6809 : : /* Accumulate the incoming costs from later
6810 : : partitions, plus the cost of any layout changes
6811 : : on UD itself. */
6812 : 113871 : auto cost = backward_cost (ud, other_node_i, layout_i);
6813 : 113871 : if (!cost.is_possible ())
6814 : 0 : is_possible = false;
6815 : : else
6816 : 113871 : layout_costs.out_cost.add_parallel_cost (cost);
6817 : : }
6818 : : else
6819 : : /* Make sure that earlier partitions can (if necessary
6820 : : or beneficial) keep the layout that they chose in
6821 : : the forward pass. This ensures that there is at
6822 : : least one valid choice of layout. */
6823 : 116260 : is_possible &= edge_layout_cost (ud, other_node_i,
6824 : 116260 : other_partition.layout,
6825 : 116260 : layout_i).is_possible ();
6826 : 368751 : };
6827 : 138620 : for_each_partition_edge (node_i, add_cost);
6828 : : }
6829 : 136651 : if (!is_possible)
6830 : : {
6831 : 0 : layout_costs.mark_impossible ();
6832 : 0 : continue;
6833 : : }
6834 : :
6835 : : /* Locally combine the costs from the forward and backward passes.
6836 : : (This combined cost is not passed on, since that would lead
6837 : : to double counting.) */
6838 : 136651 : slpg_layout_cost combined_cost = layout_costs.in_cost;
6839 : 136651 : combined_cost.add_serial_cost (layout_costs.internal_cost);
6840 : 136651 : combined_cost.add_serial_cost (layout_costs.out_cost);
6841 : :
6842 : : /* Record the layout with the lowest cost. Prefer layout 0 in
6843 : : the event of a tie between it and another layout. */
6844 : 136651 : if (!min_layout_cost.is_possible ()
6845 : 53048 : || combined_cost.is_better_than (min_layout_cost,
6846 : 53048 : m_optimize_size))
6847 : : {
6848 : 95602 : min_layout_i = layout_i;
6849 : 95602 : min_layout_cost = combined_cost;
6850 : : }
6851 : : }
6852 : :
6853 : 83603 : gcc_assert (min_layout_cost.is_possible ());
6854 : 83603 : partition.layout = min_layout_i;
6855 : : }
6856 : 9999 : }
6857 : :
6858 : : /* Return a node that applies layout TO_LAYOUT_I to the original form of NODE.
6859 : : NODE already has the layout that was selected for its partition. */
6860 : :
6861 : : slp_tree
6862 : 93536 : vect_optimize_slp_pass::get_result_with_layout (slp_tree node,
6863 : : unsigned int to_layout_i)
6864 : : {
6865 : 93536 : unsigned int result_i = node->vertex * m_perms.length () + to_layout_i;
6866 : 93536 : slp_tree result = m_node_layouts[result_i];
6867 : 93536 : if (result)
6868 : : return result;
6869 : :
6870 : 93097 : if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
6871 : 93097 : || (SLP_TREE_DEF_TYPE (node) == vect_external_def
6872 : : /* We can't permute vector defs in place. */
6873 : 15855 : && SLP_TREE_VEC_DEFS (node).is_empty ()))
6874 : : {
6875 : : /* If the vector is uniform or unchanged, there's nothing to do. */
6876 : 31646 : if (to_layout_i == 0 || vect_slp_tree_uniform_p (node))
6877 : : result = node;
6878 : : else
6879 : : {
6880 : 1520 : auto scalar_ops = SLP_TREE_SCALAR_OPS (node).copy ();
6881 : 1520 : result = vect_create_new_slp_node (scalar_ops);
6882 : 1520 : vect_slp_permute (m_perms[to_layout_i], scalar_ops, true);
6883 : : }
6884 : : }
6885 : : else
6886 : : {
6887 : 61451 : unsigned int partition_i = m_vertices[node->vertex].partition;
6888 : 61451 : unsigned int from_layout_i = m_partitions[partition_i].layout;
6889 : 61451 : if (from_layout_i == to_layout_i)
6890 : 61104 : return node;
6891 : :
6892 : : /* If NODE is itself a VEC_PERM_EXPR, try to create a parallel
6893 : : permutation instead of a serial one. Leave the new permutation
6894 : : in TMP_PERM on success. */
6895 : 347 : auto_lane_permutation_t tmp_perm;
6896 : 347 : unsigned int num_inputs = 1;
6897 : 347 : if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
6898 : : {
6899 : 9 : tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
6900 : 9 : if (from_layout_i != 0)
6901 : 9 : vect_slp_permute (m_perms[from_layout_i], tmp_perm, false);
6902 : 9 : if (to_layout_i != 0)
6903 : 4 : vect_slp_permute (m_perms[to_layout_i], tmp_perm, true);
6904 : 9 : if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
6905 : : tmp_perm,
6906 : 9 : SLP_TREE_CHILDREN (node),
6907 : : false) >= 0)
6908 : 9 : num_inputs = SLP_TREE_CHILDREN (node).length ();
6909 : : else
6910 : 0 : tmp_perm.truncate (0);
6911 : : }
6912 : :
6913 : 347 : if (dump_enabled_p ())
6914 : : {
6915 : 58 : if (tmp_perm.length () > 0)
6916 : 6 : dump_printf_loc (MSG_NOTE, vect_location,
6917 : : "duplicating permutation node %p with"
6918 : : " layout %d\n",
6919 : : (void *) node, to_layout_i);
6920 : : else
6921 : 52 : dump_printf_loc (MSG_NOTE, vect_location,
6922 : : "inserting permutation node in place of %p\n",
6923 : : (void *) node);
6924 : : }
6925 : :
6926 : 347 : unsigned int num_lanes = SLP_TREE_LANES (node);
6927 : 347 : result = vect_create_new_slp_node (num_inputs, VEC_PERM_EXPR);
6928 : 347 : if (SLP_TREE_SCALAR_STMTS (node).length ())
6929 : : {
6930 : 346 : auto &stmts = SLP_TREE_SCALAR_STMTS (result);
6931 : 346 : stmts.safe_splice (SLP_TREE_SCALAR_STMTS (node));
6932 : 346 : if (from_layout_i != 0)
6933 : 298 : vect_slp_permute (m_perms[from_layout_i], stmts, false);
6934 : 346 : if (to_layout_i != 0)
6935 : 52 : vect_slp_permute (m_perms[to_layout_i], stmts, true);
6936 : : }
6937 : 347 : SLP_TREE_REPRESENTATIVE (result) = SLP_TREE_REPRESENTATIVE (node);
6938 : 347 : SLP_TREE_LANES (result) = num_lanes;
6939 : 347 : SLP_TREE_VECTYPE (result) = SLP_TREE_VECTYPE (node);
6940 : 347 : result->vertex = -1;
6941 : :
6942 : 347 : auto &lane_perm = SLP_TREE_LANE_PERMUTATION (result);
6943 : 347 : if (tmp_perm.length ())
6944 : : {
6945 : 9 : lane_perm.safe_splice (tmp_perm);
6946 : 9 : SLP_TREE_CHILDREN (result).safe_splice (SLP_TREE_CHILDREN (node));
6947 : : }
6948 : : else
6949 : : {
6950 : 338 : lane_perm.create (num_lanes);
6951 : 1072 : for (unsigned j = 0; j < num_lanes; ++j)
6952 : 734 : lane_perm.quick_push ({ 0, j });
6953 : 338 : if (from_layout_i != 0)
6954 : 289 : vect_slp_permute (m_perms[from_layout_i], lane_perm, false);
6955 : 338 : if (to_layout_i != 0)
6956 : 49 : vect_slp_permute (m_perms[to_layout_i], lane_perm, true);
6957 : 338 : SLP_TREE_CHILDREN (result).safe_push (node);
6958 : : }
6959 : 1392 : for (slp_tree child : SLP_TREE_CHILDREN (result))
6960 : 351 : child->refcnt++;
6961 : 347 : }
6962 : 31993 : m_node_layouts[result_i] = result;
6963 : 31993 : return result;
6964 : : }
6965 : :
6966 : : /* Apply the chosen vector layouts to the SLP graph. */
6967 : :
6968 : : void
6969 : 9999 : vect_optimize_slp_pass::materialize ()
6970 : : {
6971 : : /* We no longer need the costs, so avoid having two O(N * P) arrays
6972 : : live at the same time. */
6973 : 9999 : m_partition_layout_costs.release ();
6974 : 29997 : m_node_layouts.safe_grow_cleared (m_vertices.length () * m_perms.length ());
6975 : :
6976 : 19998 : auto_sbitmap fully_folded (m_vertices.length ());
6977 : 9999 : bitmap_clear (fully_folded);
6978 : 114616 : for (unsigned int node_i : m_partitioned_nodes)
6979 : : {
6980 : 84619 : auto &vertex = m_vertices[node_i];
6981 : 84619 : slp_tree node = vertex.node;
6982 : 84619 : int layout_i = m_partitions[vertex.partition].layout;
6983 : 84619 : gcc_assert (layout_i >= 0);
6984 : :
6985 : : /* Rearrange the scalar statements to match the chosen layout. */
6986 : 84619 : if (layout_i > 0)
6987 : 12207 : vect_slp_permute (m_perms[layout_i],
6988 : 12207 : SLP_TREE_SCALAR_STMTS (node), true);
6989 : :
6990 : : /* Update load and lane permutations. */
6991 : 84619 : if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
6992 : : {
6993 : : /* First try to absorb the input vector layouts. If that fails,
 6994 : : force the inputs to have layout LAYOUT_I too. We checked earlier
 6995 : : that this was possible before deciding to use nonzero output layouts.
6996 : : (Note that at this stage we don't really have any guarantee that
6997 : : the target supports the original VEC_PERM_EXPR.) */
6998 : 3063 : auto &perm = SLP_TREE_LANE_PERMUTATION (node);
6999 : 3063 : auto_lane_permutation_t tmp_perm;
7000 : 3063 : tmp_perm.safe_splice (perm);
7001 : 3063 : change_vec_perm_layout (node, tmp_perm, -1, layout_i);
7002 : 3063 : if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
7003 : : tmp_perm,
7004 : 3063 : SLP_TREE_CHILDREN (node),
7005 : : false) >= 0)
7006 : : {
7007 : 2698 : if (dump_enabled_p ()
7008 : 3530 : && !std::equal (tmp_perm.begin (), tmp_perm.end (),
7009 : : perm.begin ()))
7010 : 54 : dump_printf_loc (MSG_NOTE, vect_location,
7011 : : "absorbing input layouts into %p\n",
7012 : : (void *) node);
7013 : 8094 : std::copy (tmp_perm.begin (), tmp_perm.end (), perm.begin ());
7014 : 2698 : bitmap_set_bit (fully_folded, node_i);
7015 : : }
7016 : : else
7017 : : {
7018 : : /* Not MSG_MISSED because it would make no sense to users. */
7019 : 365 : if (dump_enabled_p ())
7020 : 46 : dump_printf_loc (MSG_NOTE, vect_location,
7021 : : "failed to absorb input layouts into %p\n",
7022 : : (void *) node);
7023 : 365 : change_vec_perm_layout (nullptr, perm, layout_i, layout_i);
7024 : : }
7025 : 3063 : }
7026 : : else
7027 : : {
7028 : 81556 : gcc_assert (!SLP_TREE_LANE_PERMUTATION (node).exists ());
7029 : 81556 : auto &load_perm = SLP_TREE_LOAD_PERMUTATION (node);
7030 : 81556 : if (layout_i > 0)
7031 : : /* ??? When we handle non-bijective permutes the idea
7032 : : is that we can force the load-permutation to be
7033 : : { min, min + 1, min + 2, ... max }. But then the
 7034 : : scalar defs might no longer match the lane content,
 7035 : : which means wrong-code with live lane vectorization.
 7036 : : So we may need NULL entries for those. */
7037 : 12114 : vect_slp_permute (m_perms[layout_i], load_perm, true);
7038 : : }
7039 : : }
7040 : :
7041 : : /* Do this before any nodes disappear, since it involves a walk
7042 : : over the leaves. */
7043 : 9999 : remove_redundant_permutations ();
7044 : :
7045 : : /* Replace each child with a correctly laid-out version. */
7046 : 114616 : for (unsigned int node_i : m_partitioned_nodes)
7047 : : {
7048 : : /* Skip nodes that have already been handled above. */
7049 : 84619 : if (bitmap_bit_p (fully_folded, node_i))
7050 : 2698 : continue;
7051 : :
7052 : 81921 : auto &vertex = m_vertices[node_i];
7053 : 81921 : int in_layout_i = m_partitions[vertex.partition].layout;
7054 : 81921 : gcc_assert (in_layout_i >= 0);
7055 : :
7056 : : unsigned j;
7057 : : slp_tree child;
7058 : 237036 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (vertex.node), j, child)
7059 : : {
7060 : 93712 : if (!child)
7061 : 176 : continue;
7062 : :
7063 : 93536 : slp_tree new_child = get_result_with_layout (child, in_layout_i);
7064 : 93536 : if (new_child != child)
7065 : : {
7066 : 2029 : vect_free_slp_tree (child);
7067 : 2029 : SLP_TREE_CHILDREN (vertex.node)[j] = new_child;
7068 : 2029 : new_child->refcnt += 1;
7069 : : }
7070 : : }
7071 : : }
7072 : 9999 : }
7073 : :
7074 : : /* Elide load permutations that are not necessary. Such permutations might
7075 : : be pre-existing, rather than created by the layout optimizations. */
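 : : /* Illustration (editor's note, not part of the original source): a
 : : basic-block SLP node loading the contiguous subchain
 : :
 : : x[4], x[5], x[6], x[7]
 : :
 : : of a larger interleaving group needs no load permutation;
 : : releasing it lets the node be code-generated as one vector load. */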
7076 : :
7077 : : void
7078 : 658393 : vect_optimize_slp_pass::remove_redundant_permutations ()
7079 : : {
7080 : 4578780 : for (unsigned int node_i : m_leafs)
7081 : : {
7082 : 2603601 : slp_tree node = m_vertices[node_i].node;
7083 : 2603601 : if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
7084 : 2140339 : continue;
7085 : :
7086 : : /* In basic block vectorization we allow any subchain of an interleaving
7087 : : chain.
7088 : : FORNOW: not in loop SLP because of realignment complications. */
7089 : 463262 : if (is_a <bb_vec_info> (m_vinfo))
7090 : : {
7091 : 146833 : bool subchain_p = true;
7092 : : stmt_vec_info next_load_info = NULL;
7093 : : stmt_vec_info load_info;
7094 : : unsigned j;
7095 : 146833 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
7096 : : {
7097 : 117245 : if (j != 0
7098 : 117245 : && (next_load_info != load_info
7099 : 54010 : || ! load_info
7100 : 54010 : || DR_GROUP_GAP (load_info) != 1))
7101 : : {
7102 : : subchain_p = false;
7103 : : break;
7104 : : }
7105 : 99128 : next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
7106 : : }
7107 : 47705 : if (subchain_p)
7108 : : {
7109 : 29588 : SLP_TREE_LOAD_PERMUTATION (node).release ();
7110 : 29588 : continue;
7111 : : }
7112 : : }
7113 : : else
7114 : : {
7115 : 415557 : loop_vec_info loop_vinfo = as_a<loop_vec_info> (m_vinfo);
7116 : 415557 : stmt_vec_info load_info;
7117 : 415557 : bool this_load_permuted = false;
7118 : 415557 : unsigned j;
7119 : 1234924 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
7120 : 419389 : if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
7121 : : {
7122 : : this_load_permuted = true;
7123 : : break;
7124 : : }
 7125 : : /* When this isn't a grouped access we know it's a single-element,
 7126 : : contiguous access. */
7127 : 415557 : if (!STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (node)[0]))
7128 : : {
7129 : 335677 : if (!this_load_permuted
7130 : 335677 : && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
7131 : 335133 : || SLP_TREE_LANES (node) == 1))
7132 : 335133 : SLP_TREE_LOAD_PERMUTATION (node).release ();
7133 : 335677 : continue;
7134 : : }
7135 : 79880 : stmt_vec_info first_stmt_info
7136 : 79880 : = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
7137 : 80198 : if (!this_load_permuted
7138 : : /* The load requires permutation when unrolling exposes
7139 : : a gap either because the group is larger than the SLP
7140 : : group-size or because there is a gap between the groups. */
7141 : 79880 : && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
7142 : 64617 : || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
7143 : 90 : && DR_GROUP_GAP (first_stmt_info) == 0)))
7144 : : {
7145 : 318 : SLP_TREE_LOAD_PERMUTATION (node).release ();
7146 : 318 : continue;
7147 : : }
7148 : : }
7149 : : }
7150 : 658393 : }
7151 : :
7152 : : /* Print the partition graph and layout information to the dump file. */
7153 : :
7154 : : void
7155 : 607 : vect_optimize_slp_pass::dump ()
7156 : : {
7157 : 607 : dump_printf_loc (MSG_NOTE, vect_location,
7158 : : "SLP optimize permutations:\n");
7159 : 1230 : for (unsigned int layout_i = 1; layout_i < m_perms.length (); ++layout_i)
7160 : : {
7161 : 623 : dump_printf_loc (MSG_NOTE, vect_location, " %d: { ", layout_i);
7162 : 623 : const char *sep = "";
7163 : 5329 : for (unsigned int idx : m_perms[layout_i])
7164 : : {
7165 : 3460 : dump_printf (MSG_NOTE, "%s%d", sep, idx);
7166 : 3460 : sep = ", ";
7167 : : }
7168 : 623 : dump_printf (MSG_NOTE, " }\n");
7169 : : }
7170 : 607 : dump_printf_loc (MSG_NOTE, vect_location,
7171 : : "SLP optimize partitions:\n");
7172 : 5176 : for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
7173 : : ++partition_i)
7174 : : {
7175 : 4569 : auto &partition = m_partitions[partition_i];
7176 : 4569 : dump_printf_loc (MSG_NOTE, vect_location, " -------------\n");
7177 : 4569 : dump_printf_loc (MSG_NOTE, vect_location,
7178 : : " partition %d (layout %d):\n",
7179 : : partition_i, partition.layout);
7180 : 4569 : dump_printf_loc (MSG_NOTE, vect_location, " nodes:\n");
7181 : 9338 : for (unsigned int order_i = partition.node_begin;
7182 : 9338 : order_i < partition.node_end; ++order_i)
7183 : : {
7184 : 4769 : auto &vertex = m_vertices[m_partitioned_nodes[order_i]];
7185 : 9538 : dump_printf_loc (MSG_NOTE, vect_location, " - %p:\n",
7186 : 4769 : (void *) vertex.node);
7187 : 4769 : dump_printf_loc (MSG_NOTE, vect_location,
7188 : : " weight: %f\n",
7189 : : vertex.weight.to_double ());
7190 : 4769 : if (vertex.out_degree)
7191 : 3754 : dump_printf_loc (MSG_NOTE, vect_location,
7192 : : " out weight: %f (degree %d)\n",
7193 : : vertex.out_weight.to_double (),
7194 : : vertex.out_degree);
7195 : 4769 : if (SLP_TREE_CODE (vertex.node) == VEC_PERM_EXPR)
7196 : 462 : dump_printf_loc (MSG_NOTE, vect_location,
7197 : : " op: VEC_PERM_EXPR\n");
7198 : 4307 : else if (auto rep = SLP_TREE_REPRESENTATIVE (vertex.node))
7199 : 4289 : dump_printf_loc (MSG_NOTE, vect_location,
7200 : : " op template: %G", rep->stmt);
7201 : : }
7202 : 4569 : dump_printf_loc (MSG_NOTE, vect_location, " edges:\n");
7203 : 9338 : for (unsigned int order_i = partition.node_begin;
7204 : 9338 : order_i < partition.node_end; ++order_i)
7205 : : {
7206 : 4769 : unsigned int node_i = m_partitioned_nodes[order_i];
7207 : 4769 : auto &vertex = m_vertices[node_i];
7208 : 14413 : auto print_edge = [&](graph_edge *, unsigned int other_node_i)
7209 : : {
7210 : 9644 : auto &other_vertex = m_vertices[other_node_i];
7211 : 9644 : if (other_vertex.partition < vertex.partition)
7212 : 4822 : dump_printf_loc (MSG_NOTE, vect_location,
7213 : : " - %p [%d] --> %p\n",
7214 : 4822 : (void *) other_vertex.node,
7215 : : other_vertex.partition,
7216 : 4822 : (void *) vertex.node);
7217 : : else
7218 : 4822 : dump_printf_loc (MSG_NOTE, vect_location,
7219 : : " - %p --> [%d] %p\n",
7220 : 4822 : (void *) vertex.node,
7221 : : other_vertex.partition,
7222 : 4822 : (void *) other_vertex.node);
7223 : 14413 : };
7224 : 4769 : for_each_partition_edge (node_i, print_edge);
7225 : : }
7226 : :
7227 : 13960 : for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
7228 : : {
7229 : 9391 : auto &layout_costs = partition_layout_costs (partition_i, layout_i);
7230 : 9391 : if (layout_costs.is_possible ())
7231 : : {
7232 : 7570 : dump_printf_loc (MSG_NOTE, vect_location,
7233 : : " layout %d:%s\n", layout_i,
7234 : 7570 : partition.layout == int (layout_i)
7235 : : ? " (*)" : "");
7236 : 7570 : slpg_layout_cost combined_cost = layout_costs.in_cost;
7237 : 7570 : combined_cost.add_serial_cost (layout_costs.internal_cost);
7238 : 7570 : combined_cost.add_serial_cost (layout_costs.out_cost);
7239 : : #define TEMPLATE "{depth: %f, total: %f}"
7240 : 7570 : dump_printf_loc (MSG_NOTE, vect_location,
7241 : : " " TEMPLATE "\n",
7242 : : layout_costs.in_cost.depth.to_double (),
7243 : : layout_costs.in_cost.total.to_double ());
7244 : 7570 : dump_printf_loc (MSG_NOTE, vect_location,
7245 : : " + " TEMPLATE "\n",
7246 : : layout_costs.internal_cost.depth.to_double (),
7247 : : layout_costs.internal_cost.total.to_double ());
7248 : 7570 : dump_printf_loc (MSG_NOTE, vect_location,
7249 : : " + " TEMPLATE "\n",
7250 : : layout_costs.out_cost.depth.to_double (),
7251 : : layout_costs.out_cost.total.to_double ());
7252 : 7570 : dump_printf_loc (MSG_NOTE, vect_location,
7253 : : " = " TEMPLATE "\n",
7254 : : combined_cost.depth.to_double (),
7255 : : combined_cost.total.to_double ());
7256 : : #undef TEMPLATE
7257 : : }
7258 : : else
7259 : 1821 : dump_printf_loc (MSG_NOTE, vect_location,
7260 : : " layout %d: rejected\n", layout_i);
7261 : : }
7262 : : }
7263 : 607 : }
7264 : :
7265 : : /* Masked load lanes discovery. */
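 : : /* Illustration (editor's note, not part of the original source): the
 : : case handled below is a grouped conditional load such as
 : :
 : : if (c) { a0 = p[0]; a1 = p[1]; }
 : :
 : : where SLP discovery represents the uniform mask { c, c } as a splat
 : : VEC_PERM_EXPR of a single-lane node. When the target supports masked
 : : load-lanes the splat can be elided and the single-lane mask used
 : : directly. */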
7266 : :
7267 : : void
7268 : 658393 : vect_optimize_slp_pass::decide_masked_load_lanes ()
7269 : : {
7270 : 7146432 : for (auto v : m_vertices)
7271 : : {
7272 : 5171253 : slp_tree node = v.node;
7273 : 5171253 : if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
7274 : 3595803 : || SLP_TREE_CODE (node) == VEC_PERM_EXPR)
7275 : 1787245 : continue;
7276 : 3384008 : stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
7277 : 1500148 : if (! STMT_VINFO_GROUPED_ACCESS (stmt_info)
7278 : : /* The mask has to be uniform. */
7279 : 963968 : || STMT_VINFO_SLP_VECT_ONLY (stmt_info)
7280 : 963822 : || ! is_a <gcall *> (STMT_VINFO_STMT (stmt_info))
7281 : 3384059 : || ! gimple_call_internal_p (STMT_VINFO_STMT (stmt_info),
7282 : : IFN_MASK_LOAD))
7283 : 3384005 : continue;
7284 : 3 : stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
7285 : 6 : if (STMT_VINFO_STRIDED_P (stmt_info)
7286 : 3 : || compare_step_with_zero (m_vinfo, stmt_info) <= 0
7287 : 3 : || vect_load_lanes_supported (SLP_TREE_VECTYPE (node),
7288 : 0 : DR_GROUP_SIZE (stmt_info),
7289 : : true) == IFN_LAST)
7290 : 3 : continue;
7291 : :
7292 : : /* Uniform masks need to be suitably represented. */
7293 : 0 : slp_tree mask = SLP_TREE_CHILDREN (node)[0];
7294 : 0 : if (SLP_TREE_CODE (mask) != VEC_PERM_EXPR
7295 : 0 : || SLP_TREE_CHILDREN (mask).length () != 1)
7296 : 0 : continue;
7297 : 0 : bool match = true;
7298 : 0 : for (auto perm : SLP_TREE_LANE_PERMUTATION (mask))
7299 : 0 : if (perm.first != 0 || perm.second != 0)
7300 : : {
7301 : : match = false;
7302 : : break;
7303 : : }
7304 : 0 : if (!match)
7305 : 0 : continue;
7306 : :
7307 : : /* Now see if the consumer side matches. */
7308 : 0 : for (graph_edge *pred = m_slpg->vertices[node->vertex].pred;
7309 : 0 : pred; pred = pred->pred_next)
7310 : : {
7311 : 0 : slp_tree pred_node = m_vertices[pred->src].node;
7312 : : /* All consumers should be a permute with a single outgoing lane. */
7313 : 0 : if (SLP_TREE_CODE (pred_node) != VEC_PERM_EXPR
7314 : 0 : || SLP_TREE_LANES (pred_node) != 1)
7315 : : {
7316 : : match = false;
7317 : : break;
7318 : : }
7319 : 0 : gcc_assert (SLP_TREE_CHILDREN (pred_node).length () == 1);
7320 : : }
7321 : 0 : if (!match)
7322 : 0 : continue;
7323 : : /* Now we can mark the nodes as to use load lanes. */
7324 : 0 : node->ldst_lanes = true;
7325 : 0 : for (graph_edge *pred = m_slpg->vertices[node->vertex].pred;
7326 : 0 : pred; pred = pred->pred_next)
7327 : 0 : m_vertices[pred->src].node->ldst_lanes = true;
 7328 : : /* The catch is that we have to massage the mask. Analysis arranged
 7329 : : for uniform masks to be represented by a splat VEC_PERM,
 7330 : : which we can now simply elide as we cannot easily re-do SLP
 7331 : : discovery here. */
7332 : 0 : slp_tree new_mask = SLP_TREE_CHILDREN (mask)[0];
7333 : 0 : SLP_TREE_REF_COUNT (new_mask)++;
7334 : 0 : SLP_TREE_CHILDREN (node)[0] = new_mask;
7335 : 0 : vect_free_slp_tree (mask);
7336 : : }
7337 : 658393 : }
7338 : :
7339 : : /* Main entry point for the SLP graph optimization pass. */
7340 : :
7341 : : void
7342 : 658393 : vect_optimize_slp_pass::run ()
7343 : : {
7344 : 658393 : build_graph ();
7345 : 658393 : create_partitions ();
7346 : 658393 : start_choosing_layouts ();
7347 : 658393 : if (m_perms.length () > 1)
7348 : : {
7349 : 9999 : forward_pass ();
7350 : 9999 : backward_pass ();
7351 : 9999 : if (dump_enabled_p ())
7352 : 607 : dump ();
7353 : 9999 : materialize ();
7354 : 40518 : while (!m_perms.is_empty ())
7355 : 20520 : m_perms.pop ().release ();
7356 : : }
7357 : : else
7358 : 648394 : remove_redundant_permutations ();
7359 : 658393 : free_graph (m_slpg);
7360 : 658393 : build_graph ();
7361 : 658393 : decide_masked_load_lanes ();
7362 : 658393 : free_graph (m_slpg);
7363 : 658393 : }
7364 : :
7365 : : /* Apply CSE to NODE and its children using BST_MAP. */
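 : : /* Illustration (editor's note, not part of the original source): if two
 : : children of different parents were discovered for the same scalar
 : : stmts { s1, s2 }, the second is freed and replaced by the first via
 : : BST_MAP, so vector stmts are generated only once. The nullptr
 : : placeholder stored before recursing keeps a cyclic graph from CSEing
 : : a node against itself. */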
7366 : :
7367 : : static void
7368 : 5569450 : vect_cse_slp_nodes (scalar_stmts_to_slp_tree_map_t *bst_map, slp_tree& node)
7369 : : {
7370 : 5569450 : bool put_p = false;
7371 : 5569450 : if (SLP_TREE_DEF_TYPE (node) == vect_internal_def
7372 : : /* Besides some VEC_PERM_EXPR, two-operator nodes also
7373 : : lack scalar stmts and thus CSE doesn't work via bst_map. Ideally
 7374 : : we'd have something that works for all internal and external nodes. */
7375 : 5569450 : && !SLP_TREE_SCALAR_STMTS (node).is_empty ())
7376 : : {
7377 : 3973584 : slp_tree *leader = bst_map->get (SLP_TREE_SCALAR_STMTS (node));
7378 : 3973584 : if (leader)
7379 : : {
7380 : : /* We've visited this node already. */
7381 : 400026 : if (!*leader || *leader == node)
7382 : : return;
7383 : :
7384 : 2139 : if (dump_enabled_p ())
7385 : 937 : dump_printf_loc (MSG_NOTE, vect_location,
7386 : : "re-using SLP tree %p for %p\n",
7387 : : (void *)*leader, (void *)node);
7388 : 2139 : vect_free_slp_tree (node);
7389 : 2139 : (*leader)->refcnt += 1;
7390 : 2139 : node = *leader;
7391 : 2139 : return;
7392 : : }
7393 : :
7394 : : /* Avoid creating a cycle by populating the map only after recursion. */
7395 : 3573558 : bst_map->put (SLP_TREE_SCALAR_STMTS (node).copy (), nullptr);
7396 : 3573558 : node->refcnt += 1;
7397 : 3573558 : put_p = true;
7398 : : /* And recurse. */
7399 : : }
7400 : :
7401 : 15808562 : for (slp_tree &child : SLP_TREE_CHILDREN (node))
7402 : 4709410 : if (child)
7403 : 3977538 : vect_cse_slp_nodes (bst_map, child);
7404 : :
7405 : : /* Now record the node for CSE in other siblings. */
7406 : 5169424 : if (put_p)
7407 : 3573558 : *bst_map->get (SLP_TREE_SCALAR_STMTS (node)) = node;
7408 : : }
7409 : :
7410 : : /* Optimize the SLP graph of VINFO. */
7411 : :
7412 : : void
7413 : 939015 : vect_optimize_slp (vec_info *vinfo)
7414 : : {
7415 : 939015 : if (vinfo->slp_instances.is_empty ())
7416 : : return;
7417 : 658393 : vect_optimize_slp_pass (vinfo).run ();
7418 : :
7419 : : /* Apply CSE again to nodes after permute optimization. */
7420 : 658393 : scalar_stmts_to_slp_tree_map_t *bst_map
7421 : 658393 : = new scalar_stmts_to_slp_tree_map_t ();
7422 : :
7423 : 3567091 : for (auto inst : vinfo->slp_instances)
7424 : 1591912 : vect_cse_slp_nodes (bst_map, SLP_INSTANCE_TREE (inst));
7425 : :
7426 : 658393 : release_scalar_stmts_to_slp_tree_map (bst_map);
7427 : : }
7428 : :
7429 : : /* Gather loads reachable from the individual SLP graph entries. */
7430 : :
7431 : : void
7432 : 939015 : vect_gather_slp_loads (vec_info *vinfo)
7433 : : {
7434 : 939015 : unsigned i;
7435 : 939015 : slp_instance instance;
7436 : 2530927 : FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
7437 : : {
7438 : 1591912 : hash_set<slp_tree> visited;
7439 : 1591912 : vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
7440 : : SLP_INSTANCE_TREE (instance), visited);
7441 : 1591912 : }
7442 : 939015 : }
7443 : :
7444 : : /* For NODE update VF based on the number of lanes and the vector types
7445 : : used. */
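 : : /* Worked example (editor's note, not part of the original source):
 : : a node with max_nunits 4 (e.g. V4SI) and 2 lanes contributes an
 : : unrolling factor of 4 / 2 = 2; combined via force_common_multiple
 : : with another node requiring 4, the overall VF becomes 4. */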
7446 : :
7447 : : static void
7448 : 4351217 : vect_update_slp_vf_for_node (slp_tree node, poly_uint64 &vf,
7449 : : hash_set<slp_tree> &visited)
7450 : : {
7451 : 4351217 : if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
7452 : 1841016 : return;
7453 : 2859602 : if (visited.add (node))
7454 : : return;
7455 : :
7456 : 10105774 : for (slp_tree child : SLP_TREE_CHILDREN (node))
7457 : 3530239 : vect_update_slp_vf_for_node (child, vf, visited);
7458 : :
7459 : : /* We do not visit SLP nodes for constants or externals - those neither
7460 : : have a vector type set yet (vectorizable_* does this) nor do they
 7461 : : have max_nunits set. Instead we rely on internal nodes' max_nunits
 7462 : : to cover constant/external operands.
 7463 : : Note that when we stop using fixed-size vectors, externs and constants
 7464 : : shouldn't influence the (minimum) vectorization factor; instead
7465 : : vectorizable_* should honor the vectorization factor when trying to
7466 : : assign vector types to constants and externals and cause iteration
7467 : : to a higher vectorization factor when required. */
7468 : 2510201 : poly_uint64 node_vf
7469 : 2510201 : = calculate_unrolling_factor (node->max_nunits, SLP_TREE_LANES (node));
7470 : 2510201 : vf = force_common_multiple (vf, node_vf);
7471 : :
7472 : : /* For permute nodes that are fed from externs or constants we have to
7473 : : consider their number of lanes as well. Likewise for store-lanes. */
7474 : 2510201 : if (SLP_TREE_CODE (node) == VEC_PERM_EXPR
7475 : 2303957 : || node->ldst_lanes)
7476 : 837609 : for (slp_tree child : SLP_TREE_CHILDREN (node))
7477 : 218877 : if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
7478 : : {
7479 : 2835 : poly_uint64 child_vf
7480 : 2835 : = calculate_unrolling_factor (node->max_nunits,
7481 : : SLP_TREE_LANES (child));
7482 : 2835 : vf = force_common_multiple (vf, child_vf);
7483 : : }
7484 : : }
7485 : :
7486 : : /* For each possible SLP instance decide whether to SLP it and calculate overall
7487 : : unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
7488 : : least one instance. */
7489 : :
7490 : : bool
7491 : 342434 : vect_make_slp_decision (loop_vec_info loop_vinfo)
7492 : : {
7493 : 342434 : unsigned int i;
7494 : 342434 : poly_uint64 unrolling_factor = 1;
7495 : 342434 : const vec<slp_instance> &slp_instances
7496 : : = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
7497 : 342434 : slp_instance instance;
7498 : 342434 : int decided_to_slp = 0;
7499 : :
7500 : 342434 : DUMP_VECT_SCOPE ("vect_make_slp_decision");
7501 : :
7502 : 342434 : hash_set<slp_tree> visited;
7503 : 1505846 : FOR_EACH_VEC_ELT (slp_instances, i, instance)
7504 : : {
7505 : : /* FORNOW: SLP if you can. */
7506 : : /* All unroll factors have the form:
7507 : :
7508 : : GET_MODE_SIZE (vinfo->vector_mode) * X
7509 : :
7510 : : for some rational X, so they must have a common multiple. */
7511 : 820978 : vect_update_slp_vf_for_node (SLP_INSTANCE_TREE (instance),
7512 : : unrolling_factor, visited);
7513 : :
7514 : : /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
7515 : : call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
7516 : : loop-based vectorization. Such stmts will be marked as HYBRID. */
7517 : 820978 : vect_mark_slp_stmts (loop_vinfo, SLP_INSTANCE_TREE (instance));
7518 : 820978 : decided_to_slp++;
7519 : : }
7520 : :
7521 : 342434 : LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
7522 : :
7523 : 342434 : if (decided_to_slp && dump_enabled_p ())
7524 : : {
7525 : 20080 : dump_printf_loc (MSG_NOTE, vect_location,
7526 : : "Decided to SLP %d instances. Unrolling factor ",
7527 : : decided_to_slp);
7528 : 20080 : dump_dec (MSG_NOTE, unrolling_factor);
7529 : 20080 : dump_printf (MSG_NOTE, "\n");
7530 : : }
7531 : :
7532 : 342434 : return (decided_to_slp > 0);
7533 : 342434 : }
7534 : :
7535 : : /* Private data for vect_detect_hybrid_slp. */
7536 : : struct vdhs_data
7537 : : {
7538 : : loop_vec_info loop_vinfo;
7539 : : vec<stmt_vec_info> *worklist;
7540 : : };
7541 : :
7542 : : /* Walker for walk_gimple_op. */
7543 : :
7544 : : static tree
7545 : 104801 : vect_detect_hybrid_slp (tree *tp, int *, void *data)
7546 : : {
7547 : 104801 : walk_stmt_info *wi = (walk_stmt_info *)data;
7548 : 104801 : vdhs_data *dat = (vdhs_data *)wi->info;
7549 : :
7550 : 104801 : if (wi->is_lhs)
7551 : : return NULL_TREE;
7552 : :
7553 : 69436 : stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
7554 : 69436 : if (!def_stmt_info)
7555 : : return NULL_TREE;
7556 : 31523 : def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
7557 : 31523 : if (PURE_SLP_STMT (def_stmt_info))
7558 : : {
7559 : 15936 : if (dump_enabled_p ())
7560 : 624 : dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
7561 : : def_stmt_info->stmt);
7562 : 15936 : STMT_SLP_TYPE (def_stmt_info) = hybrid;
7563 : 15936 : dat->worklist->safe_push (def_stmt_info);
7564 : : }
7565 : :
7566 : : return NULL_TREE;
7567 : : }
7568 : :
 7569 : : /* Check whether STMT_INFO is consumed by SLP indirectly and mark it
 7570 : : pure_slp if so; otherwise push it to WORKLIST. */
7571 : :
7572 : : static void
7573 : 223115 : maybe_push_to_hybrid_worklist (vec_info *vinfo,
7574 : : vec<stmt_vec_info> &worklist,
7575 : : stmt_vec_info stmt_info)
7576 : : {
7577 : 223115 : if (dump_enabled_p ())
7578 : 4526 : dump_printf_loc (MSG_NOTE, vect_location,
7579 : : "Processing hybrid candidate : %G", stmt_info->stmt);
7580 : 223115 : stmt_vec_info orig_info = vect_orig_stmt (stmt_info);
7581 : 223115 : imm_use_iterator iter2;
7582 : 223115 : ssa_op_iter iter1;
7583 : 223115 : use_operand_p use_p;
7584 : 223115 : def_operand_p def_p;
7585 : 223115 : bool any_def = false;
7586 : 455074 : FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_info->stmt, iter1, SSA_OP_DEF)
7587 : : {
7588 : 19771 : any_def = true;
7589 : 30166 : FOR_EACH_IMM_USE_FAST (use_p, iter2, DEF_FROM_PTR (def_p))
7590 : : {
7591 : 21322 : if (is_gimple_debug (USE_STMT (use_p)))
7592 : 1401 : continue;
7593 : 19921 : stmt_vec_info use_info = vinfo->lookup_stmt (USE_STMT (use_p));
7594 : : /* An out-of loop use means this is a loop_vect sink. */
7595 : 19921 : if (!use_info)
7596 : : {
7597 : 4220 : if (dump_enabled_p ())
7598 : 174 : dump_printf_loc (MSG_NOTE, vect_location,
7599 : : "Found loop_vect sink: %G", stmt_info->stmt);
7600 : 4220 : worklist.safe_push (stmt_info);
7601 : 18077 : return;
7602 : : }
7603 : 20034 : else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info)))
7604 : : {
7605 : 6707 : if (dump_enabled_p ())
7606 : 974 : dump_printf_loc (MSG_NOTE, vect_location,
7607 : : "Found loop_vect use: %G", use_info->stmt);
7608 : 6707 : worklist.safe_push (stmt_info);
7609 : 6707 : return;
7610 : : }
7611 : : }
7612 : : }
7613 : : /* No def means this is a loop_vect sink. Gimple conditionals also don't have a
7614 : : def but shouldn't be considered sinks. */
7615 : 212188 : if (!any_def && STMT_VINFO_DEF_TYPE (stmt_info) != vect_condition_def)
7616 : : {
7617 : 2930 : if (dump_enabled_p ())
7618 : 192 : dump_printf_loc (MSG_NOTE, vect_location,
7619 : : "Found loop_vect sink: %G", stmt_info->stmt);
7620 : 2930 : worklist.safe_push (stmt_info);
7621 : 2930 : return;
7622 : : }
7623 : 209258 : if (dump_enabled_p ())
7624 : 3186 : dump_printf_loc (MSG_NOTE, vect_location,
7625 : : "Marked SLP consumed stmt pure: %G", stmt_info->stmt);
7626 : 209258 : STMT_SLP_TYPE (stmt_info) = pure_slp;
7627 : : }
7628 : :
7629 : : /* Find stmts that must be both vectorized and SLPed. */
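 : : /* Illustration (editor's note, not part of the original source):
 : :
 : : x = a[i] + b[i]; // pure_slp, part of an SLP instance
 : : sum += x; // not SLP, handled by loop vectorization
 : :
 : : Here x's definition is marked hybrid: it is SLP-vectorized but its
 : : scalar result must also stay available for the loop_vect use. */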
7630 : :
7631 : : void
7632 : 337427 : vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
7633 : : {
7634 : 337427 : DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
7635 : :
7636 : : /* All stmts participating in SLP are marked pure_slp, all other
7637 : : stmts are loop_vect.
7638 : : First collect all loop_vect stmts into a worklist.
7639 : : SLP patterns cause not all original scalar stmts to appear in
7640 : : SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp.
 7641 : : Rectify this here and do a backward walk over the IL, only considering
 7642 : : stmts as loop_vect when they are used by a loop_vect stmt, and otherwise
 7643 : : marking them as pure_slp. */
7644 : 337427 : auto_vec<stmt_vec_info> worklist;
7645 : 1217957 : for (int i = LOOP_VINFO_LOOP (loop_vinfo)->num_nodes - 1; i >= 0; --i)
7646 : : {
7647 : 880530 : basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
7648 : 1724229 : for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
7649 : 843699 : gsi_next (&gsi))
7650 : : {
7651 : 843699 : gphi *phi = gsi.phi ();
7652 : 843699 : stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
7653 : 843699 : if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
7654 : 4750 : maybe_push_to_hybrid_worklist (loop_vinfo,
7655 : : worklist, stmt_info);
7656 : : }
7657 : 880530 : for (gimple_stmt_iterator gsi = gsi_last_bb (bb); !gsi_end_p (gsi);
7658 : 10463564 : gsi_prev (&gsi))
7659 : : {
7660 : 4791517 : gimple *stmt = gsi_stmt (gsi);
7661 : 4791517 : if (is_gimple_debug (stmt))
7662 : 1405415 : continue;
7663 : 3386102 : stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
7664 : 3386102 : if (STMT_VINFO_IN_PATTERN_P (stmt_info))
7665 : : {
7666 : 448450 : for (gimple_stmt_iterator gsi2
7667 : 448450 : = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
7668 : 919828 : !gsi_end_p (gsi2); gsi_next (&gsi2))
7669 : : {
7670 : 471378 : stmt_vec_info patt_info
7671 : 471378 : = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
7672 : 471378 : if (!STMT_SLP_TYPE (patt_info)
7673 : 187106 : && STMT_VINFO_RELEVANT (patt_info))
7674 : 4113 : maybe_push_to_hybrid_worklist (loop_vinfo,
7675 : : worklist, patt_info);
7676 : : }
7677 : 448450 : stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
7678 : : }
7679 : 3386102 : if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
7680 : 214252 : maybe_push_to_hybrid_worklist (loop_vinfo,
7681 : : worklist, stmt_info);
7682 : : }
7683 : : }
7684 : :
 7685 : : /* Now that we have a worklist of non-SLP stmts, follow use->def chains and
7686 : : mark any SLP vectorized stmt as hybrid.
7687 : : ??? We're visiting def stmts N times (once for each non-SLP and
7688 : : once for each hybrid-SLP use). */
7689 : 337427 : walk_stmt_info wi;
7690 : 337427 : vdhs_data dat;
7691 : 337427 : dat.worklist = &worklist;
7692 : 337427 : dat.loop_vinfo = loop_vinfo;
7693 : 337427 : memset (&wi, 0, sizeof (wi));
7694 : 337427 : wi.info = (void *)&dat;
7695 : 372156 : while (!worklist.is_empty ())
7696 : : {
7697 : 29793 : stmt_vec_info stmt_info = worklist.pop ();
7698 : : /* Since SSA operands are not set up for pattern stmts we need
7699 : : to use walk_gimple_op. */
7700 : 29793 : wi.is_lhs = 0;
7701 : 29793 : walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
 7702 : : /* For gather/scatter make sure to walk the offset operand, which
7703 : : can be a scaling and conversion away. */
7704 : 29793 : gather_scatter_info gs_info;
7705 : 29793 : if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
7706 : 29793 : && vect_check_gather_scatter (stmt_info, loop_vinfo, &gs_info))
7707 : : {
7708 : 970 : int dummy;
7709 : 970 : vect_detect_hybrid_slp (&gs_info.offset, &dummy, &wi);
7710 : : }
7711 : : }
7712 : 337427 : }
7713 : :
7714 : :
7715 : : /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
7716 : :
7717 : 2333511 : _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
7718 : : : vec_info (vec_info::bb, shared),
7719 : 2333511 : roots (vNULL)
7720 : : {
7721 : : /* The region we are operating on. bbs[0] is the entry, excluding
7722 : : its PHI nodes. In the future we might want to track an explicit
7723 : : entry edge to cover bbs[0] PHI nodes and have a region entry
7724 : : insert location. */
7725 : 2333511 : bbs = _bbs.address ();
7726 : 2333511 : nbbs = _bbs.length ();
7727 : :
7728 : 17406859 : for (unsigned i = 0; i < nbbs; ++i)
7729 : : {
7730 : 15073348 : if (i != 0)
7731 : 19818653 : for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
7732 : 7078816 : gsi_next (&si))
7733 : : {
7734 : 7078816 : gphi *phi = si.phi ();
7735 : 7078816 : gimple_set_uid (phi, 0);
7736 : 7078816 : add_stmt (phi);
7737 : : }
7738 : 30146696 : for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
7739 : 119640556 : !gsi_end_p (gsi); gsi_next (&gsi))
7740 : : {
7741 : 104567208 : gimple *stmt = gsi_stmt (gsi);
7742 : 104567208 : gimple_set_uid (stmt, 0);
7743 : 104567208 : if (is_gimple_debug (stmt))
7744 : 61728203 : continue;
7745 : 42839005 : add_stmt (stmt);
7746 : : }
7747 : : }
7748 : 2333511 : }
7749 : :
7750 : :
7751 : : /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
7752 : : stmts in the basic block. */
7753 : :
7754 : 2333511 : _bb_vec_info::~_bb_vec_info ()
7755 : : {
7756 : : /* Reset region marker. */
7757 : 17406859 : for (unsigned i = 0; i < nbbs; ++i)
7758 : : {
7759 : 15073348 : if (i != 0)
7760 : 19833834 : for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
7761 : 7093997 : gsi_next (&si))
7762 : : {
7763 : 7093997 : gphi *phi = si.phi ();
7764 : 7093997 : gimple_set_uid (phi, -1);
7765 : : }
7766 : 30146696 : for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
7767 : 119584377 : !gsi_end_p (gsi); gsi_next (&gsi))
7768 : : {
7769 : 104511029 : gimple *stmt = gsi_stmt (gsi);
7770 : 104511029 : gimple_set_uid (stmt, -1);
7771 : : }
7772 : : }
7773 : :
7774 : 3464458 : for (unsigned i = 0; i < roots.length (); ++i)
7775 : : {
7776 : 1130947 : roots[i].stmts.release ();
7777 : 1130947 : roots[i].roots.release ();
7778 : 1130947 : roots[i].remain.release ();
7779 : : }
7780 : 2333511 : roots.release ();
7781 : 2333511 : }
7782 : :
7783 : : /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
7784 : : given then that child nodes have already been processed, and that
7785 : : their def types currently match their SLP node's def type. */
7786 : :
7787 : : static bool
7788 : 2405713 : vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
7789 : : slp_instance node_instance,
7790 : : stmt_vector_for_cost *cost_vec)
7791 : : {
7792 : 2405713 : stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
7793 : :
7794 : : /* Calculate the number of vector statements to be created for the scalar
7795 : : stmts in this node. It is the number of scalar elements in one scalar
7796 : : iteration (DR_GROUP_SIZE) multiplied by VF divided by the number of
 7797 : : elements in a vector. For a single-defuse-cycle, a lane-reducing op, or a
 7798 : : PHI statement that starts a reduction comprised of only lane-reducing ops,
 7799 : : the number is larger than the number of effective vector statements actually required. */
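 : : /* Worked example (editor's note, not part of the original source):
 : : with DR_GROUP_SIZE 4, VF 2 and 8-element vectors this gives
 : : 4 * 2 / 8 = 1 vector statement for the node. */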
7800 : 2405713 : SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vect_get_num_copies (vinfo, node);
7801 : :
7802 : : /* Handle purely internal nodes. */
7803 : 2405713 : if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
7804 : : {
7805 : 100307 : if (!vectorizable_slp_permutation (vinfo, NULL, node, cost_vec))
7806 : : return false;
7807 : :
7808 : : stmt_vec_info slp_stmt_info;
7809 : : unsigned int i;
7810 : 257552 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
7811 : : {
7812 : 158526 : if (slp_stmt_info
7813 : 152970 : && STMT_VINFO_LIVE_P (slp_stmt_info)
7814 : 158544 : && !vectorizable_live_operation (vinfo, slp_stmt_info, node,
7815 : : node_instance, i,
7816 : : false, cost_vec))
7817 : : return false;
7818 : : }
7819 : : return true;
7820 : : }
7821 : :
7822 : 2305406 : bool dummy;
7823 : 2305406 : return vect_analyze_stmt (vinfo, stmt_info, &dummy,
7824 : : node, node_instance, cost_vec);
7825 : : }
7826 : :
7827 : : /* Verify if we can externalize a set of internal defs. */
7828 : :
7829 : : static bool
7830 : 400466 : vect_slp_can_convert_to_external (const vec<stmt_vec_info> &stmts)
7831 : : {
7832 : 400466 : basic_block bb = NULL;
7833 : 1972657 : for (stmt_vec_info stmt : stmts)
7834 : 892288 : if (!stmt)
7835 : : return false;
7836 : : /* Constant generation uses get_later_stmt which can only handle
7837 : : defs from the same BB. */
7838 : 892288 : else if (!bb)
7839 : 400466 : bb = gimple_bb (stmt->stmt);
7840 : 491822 : else if (gimple_bb (stmt->stmt) != bb)
7841 : : return false;
7842 : : return true;
7843 : : }
7844 : :
7845 : : /* Try to build NODE from scalars, returning true on success.
7846 : : NODE_INSTANCE is the SLP instance that contains NODE. */
7847 : :
7848 : : static bool
7849 : 594355 : vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
7850 : : slp_instance node_instance)
7851 : : {
7852 : 594355 : stmt_vec_info stmt_info;
7853 : 594355 : unsigned int i;
7854 : :
7855 : 594355 : if (!is_a <bb_vec_info> (vinfo)
7856 : 76430 : || node == SLP_INSTANCE_TREE (node_instance)
7857 : 23970 : || !SLP_TREE_SCALAR_STMTS (node).exists ()
7858 : 23915 : || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node))
7859 : : /* Force the mask use to be built from scalars instead. */
7860 : 21868 : || VECTOR_BOOLEAN_TYPE_P (SLP_TREE_VECTYPE (node))
7861 : 616084 : || !vect_slp_can_convert_to_external (SLP_TREE_SCALAR_STMTS (node)))
7862 : : return false;
7863 : :
7864 : 19197 : if (dump_enabled_p ())
7865 : 80 : dump_printf_loc (MSG_NOTE, vect_location,
7866 : : "Building vector operands of %p from scalars instead\n",
7867 : : (void *) node);
7868 : :
7869 : : /* Don't remove and free the child nodes here, since they could be
7870 : : referenced by other structures. The analysis and scheduling phases
7871 : : (need to) ignore child nodes of anything that isn't vect_internal_def. */
7872 : 19197 : unsigned int group_size = SLP_TREE_LANES (node);
7873 : 19197 : SLP_TREE_DEF_TYPE (node) = vect_external_def;
7874 : : /* Invariants get their vector type from the uses. */
7875 : 19197 : SLP_TREE_VECTYPE (node) = NULL_TREE;
7876 : 19197 : SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
7877 : 19197 : SLP_TREE_LOAD_PERMUTATION (node).release ();
7878 : 67936 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
7879 : : {
7880 : 48739 : tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
7881 : 48739 : SLP_TREE_SCALAR_OPS (node)[i] = lhs;
7882 : : }
7883 : : return true;
7884 : : }
7885 : :
7886 : : /* Return true if all elements of the slice are the same. */
7887 : : bool
7888 : 477713 : vect_scalar_ops_slice::all_same_p () const
7889 : : {
7890 : 520520 : for (unsigned int i = 1; i < length; ++i)
7891 : 346475 : if (!operand_equal_p (op (0), op (i)))
7892 : : return false;
7893 : : return true;
7894 : : }
7895 : :
7896 : : hashval_t
7897 : 646016 : vect_scalar_ops_slice_hash::hash (const value_type &s)
7898 : : {
7899 : 646016 : hashval_t hash = 0;
7900 : 2023806 : for (unsigned i = 0; i < s.length; ++i)
7901 : 1377790 : hash = iterative_hash_expr (s.op (i), hash);
7902 : 646016 : return hash;
7903 : : }
7904 : :
7905 : : bool
7906 : 356611 : vect_scalar_ops_slice_hash::equal (const value_type &s1,
7907 : : const compare_type &s2)
7908 : : {
7909 : 356611 : if (s1.length != s2.length)
7910 : : return false;
7911 : 554416 : for (unsigned i = 0; i < s1.length; ++i)
7912 : 475606 : if (!operand_equal_p (s1.op (i), s2.op (i)))
7913 : : return false;
7914 : : return true;
7915 : : }
7916 : :
7917 : : /* Compute the prologue cost for invariant or constant operands represented
7918 : : by NODE. */
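 : : /* Illustration (editor's note, not part of the original source): an
 : : operand vector { 1, 2, 3, 4 } of constants is costed as a
 : : constant-pool vector_load, an external { x, x, x, x } as a
 : : scalar_to_vec splat, and an external { x, y, z, w } as a
 : : vec_construct. */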
7919 : :
7920 : : static void
7921 : 977261 : vect_prologue_cost_for_slp (slp_tree node,
7922 : : stmt_vector_for_cost *cost_vec)
7923 : : {
7924 : : /* There's a special case of an existing vector, that costs nothing. */
7925 : 977261 : if (SLP_TREE_SCALAR_OPS (node).length () == 0
7926 : 977261 : && !SLP_TREE_VEC_DEFS (node).is_empty ())
7927 : 1835 : return;
7928 : : /* Without looking at the actual initializer a vector of
7929 : : constants can be implemented as load from the constant pool.
7930 : : When all elements are the same we can use a splat. */
7931 : 975426 : tree vectype = SLP_TREE_VECTYPE (node);
7932 : 975426 : unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
7933 : 975426 : unsigned HOST_WIDE_INT const_nunits;
7934 : 975426 : unsigned nelt_limit;
7935 : 975426 : auto ops = &SLP_TREE_SCALAR_OPS (node);
7936 : 975426 : auto_vec<unsigned int> starts (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
7937 : 975426 : if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
7938 : 975426 : && ! multiple_p (const_nunits, group_size))
7939 : : {
7940 : 108664 : nelt_limit = const_nunits;
7941 : 108664 : hash_set<vect_scalar_ops_slice_hash> vector_ops;
7942 : 430541 : for (unsigned int i = 0; i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); ++i)
7943 : 321877 : if (!vector_ops.add ({ ops, i * nelt_limit, nelt_limit }))
7944 : 243067 : starts.quick_push (i * nelt_limit);
7945 : 108664 : }
7946 : : else
7947 : : {
7948 : : /* If either the vector has variable length or the vectors
7949 : : are composed of repeated whole groups we only need to
7950 : : cost construction once. All vectors will be the same. */
7951 : 866762 : nelt_limit = group_size;
7952 : 866762 : starts.quick_push (0);
7953 : : }
7954 : : /* ??? We're just tracking whether vectors in a single node are the same.
7955 : : Ideally we'd do something more global. */
7956 : 975426 : bool passed = false;
7957 : 4036107 : for (unsigned int start : starts)
7958 : : {
7959 : 1109829 : vect_cost_for_stmt kind;
7960 : 1109829 : if (SLP_TREE_DEF_TYPE (node) == vect_constant_def)
7961 : : kind = vector_load;
7962 : 477713 : else if (vect_scalar_ops_slice { ops, start, nelt_limit }.all_same_p ())
7963 : : kind = scalar_to_vec;
7964 : : else
7965 : 303668 : kind = vec_construct;
7966 : : /* The target cost hook has no idea which part of the SLP node
7967 : : we are costing so avoid passing it down more than once. Pass
7968 : : it to the first vec_construct or scalar_to_vec part since for those
7969 : : the x86 backend tries to account for GPR to XMM register moves. */
7970 : 1109829 : record_stmt_cost (cost_vec, 1, kind,
7971 : 1109829 : (kind != vector_load && !passed) ? node : nullptr,
7972 : : vectype, 0, vect_prologue);
7973 : 1109829 : if (kind != vector_load)
7974 : 477713 : passed = true;
7975 : : }
7976 : 975426 : }
7977 : :
7978 : : /* Analyze statements contained in SLP tree NODE after recursively analyzing
7979 : : the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
7980 : :
7981 : : Return true if the operations are supported. */
7982 : :
7983 : : static bool
7984 : 4529327 : vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
7985 : : slp_instance node_instance,
7986 : : hash_set<slp_tree> &visited_set,
7987 : : vec<slp_tree> &visited_vec,
7988 : : stmt_vector_for_cost *cost_vec)
7989 : : {
7990 : 4529327 : int i, j;
7991 : 4529327 : slp_tree child;
7992 : :
7993 : : /* Assume we can code-generate all invariants. */
7994 : 4529327 : if (!node
7995 : 4225226 : || SLP_TREE_DEF_TYPE (node) == vect_constant_def
7996 : 3504426 : || SLP_TREE_DEF_TYPE (node) == vect_external_def)
7997 : : return true;
7998 : :
7999 : 3026030 : if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
8000 : : {
8001 : 18 : if (dump_enabled_p ())
8002 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
8003 : : "Failed cyclic SLP reference in %p\n", (void *) node);
8004 : 18 : return false;
8005 : : }
8006 : 3026012 : gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
8007 : :
8008 : : /* If we already analyzed the exact same set of scalar stmts we're done.
8009 : : We share the generated vector stmts for those. */
8010 : 3026012 : if (visited_set.add (node))
8011 : : return true;
8012 : 2767598 : visited_vec.safe_push (node);
8013 : :
8014 : 2767598 : bool res = true;
8015 : 2767598 : unsigned visited_rec_start = visited_vec.length ();
8016 : 2767598 : unsigned cost_vec_rec_start = cost_vec->length ();
8017 : 2767598 : bool seen_non_constant_child = false;
8018 : 5742784 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
8019 : : {
8020 : 3336920 : res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
8021 : : visited_set, visited_vec,
8022 : : cost_vec);
8023 : 3336920 : if (!res)
8024 : : break;
8025 : 2975186 : if (child && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
8026 : 2975186 : seen_non_constant_child = true;
8027 : : }
8028 : : /* We're having difficulties scheduling nodes with just constant
8029 : : operands and no scalar stmts since we then cannot compute a stmt
8030 : : insertion place. */
8031 : 2767598 : if (res
8032 : 2767598 : && !seen_non_constant_child
8033 : 2767598 : && SLP_TREE_SCALAR_STMTS (node).is_empty ())
8034 : : {
8035 : 151 : if (dump_enabled_p ())
8036 : 6 : dump_printf_loc (MSG_NOTE, vect_location,
8037 : : "Cannot vectorize all-constant op node %p\n",
8038 : : (void *) node);
8039 : : res = false;
8040 : : }
8041 : :
8042 : 2767447 : if (res)
8043 : 2405713 : res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
8044 : : cost_vec);
8045 : : /* If analysis failed we have to pop all recursive visited nodes
8046 : : plus ourselves. */
8047 : 2767598 : if (!res)
8048 : : {
8049 : 2965160 : while (visited_vec.length () >= visited_rec_start)
8050 : 888225 : visited_set.remove (visited_vec.pop ());
8051 : 594355 : cost_vec->truncate (cost_vec_rec_start);
8052 : : }
8053 : :
 8054 : : /* When the node can be vectorized, cost the invariant nodes it references.
 8055 : : This is not done in DFS order, to allow the referring node's
 8056 : : vectorizable_* calls to nail down the invariant node's vector type
8057 : : and possibly unshare it if it needs a different vector type than
8058 : : other referrers. */
8059 : 2767598 : if (res)
8060 : 4795135 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
8061 : 2621892 : if (child
8062 : 2371423 : && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
8063 : 2371423 : || SLP_TREE_DEF_TYPE (child) == vect_external_def)
8064 : : /* Perform usual caching, note code-generation still
8065 : : code-gens these nodes multiple times but we expect
8066 : : to CSE them later. */
8067 : 3657472 : && !visited_set.add (child))
8068 : : {
8069 : 1011397 : visited_vec.safe_push (child);
8070 : : /* ??? After auditing more code paths make a "default"
8071 : : and push the vector type from NODE to all children
8072 : : if it is not already set. */
8073 : : /* Compute the number of vectors to be generated. */
8074 : 1011397 : tree vector_type = SLP_TREE_VECTYPE (child);
8075 : 1011397 : if (!vector_type)
8076 : : {
8077 : : /* Masked loads can have an undefined (default SSA definition)
8078 : : else operand. We do not need to cost it. */
8079 : 34136 : vec<tree> ops = SLP_TREE_SCALAR_OPS (child);
8080 : 35307 : if ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
8081 : : == load_vec_info_type)
8082 : 35307 : && ((ops.length ()
8083 : 1171 : && TREE_CODE (ops[0]) == SSA_NAME
8084 : 0 : && SSA_NAME_IS_DEFAULT_DEF (ops[0])
8085 : 0 : && VAR_P (SSA_NAME_VAR (ops[0])))
8086 : 1171 : || SLP_TREE_DEF_TYPE (child) == vect_constant_def))
8087 : 1171 : continue;
8088 : :
8089 : : /* For shifts with a scalar argument we don't need
8090 : : to cost or code-generate anything.
 8091 : : ??? Represent this more explicitly. */
8092 : 32965 : gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
8093 : : == shift_vec_info_type)
8094 : : && j == 1);
8095 : 32965 : continue;
8096 : 32965 : }
8097 : :
8098 : 977261 : SLP_TREE_NUMBER_OF_VEC_STMTS (child)
8099 : 977261 : = vect_get_num_copies (vinfo, child);
8100 : : /* And cost them. */
8101 : 977261 : vect_prologue_cost_for_slp (child, cost_vec);
8102 : : }
8103 : :
8104 : : /* If this node or any of its children can't be vectorized, try pruning
8105 : : the tree here rather than felling the whole thing. */
8106 : 594355 : if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
8107 : : {
8108 : : /* We'll need to revisit this for invariant costing and number
8109 : : of vectorized stmt setting. */
8110 : : res = true;
8111 : : }
8112 : :
8113 : : return res;
8114 : : }
8115 : :
8116 : : /* Given a definition DEF, analyze if it will have any live scalar use after
8117 : : performing SLP vectorization whose information is represented by BB_VINFO,
8118 : : and record result into hash map SCALAR_USE_MAP as cache for later fast
8119 : : check. If recursion DEPTH exceeds a limit, stop analysis and make a
8120 : : conservative assumption. Return 0 if no scalar use, 1 if there is, -1
8121 : : means recursion is limited. */
8122 : :
8123 : : static int
8124 : 546247 : vec_slp_has_scalar_use (bb_vec_info bb_vinfo, tree def,
8125 : : hash_map<tree, int> &scalar_use_map,
8126 : : int depth = 0)
8127 : : {
8128 : 546247 : const int depth_limit = 2;
8129 : 546247 : imm_use_iterator use_iter;
8130 : 546247 : gimple *use_stmt;
8131 : :
8132 : 546247 : if (int *res = scalar_use_map.get (def))
8133 : 15430 : return *res;
8134 : :
8135 : 530817 : int scalar_use = 1;
8136 : :
8137 : 1224789 : FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
8138 : : {
8139 : 802748 : if (is_gimple_debug (use_stmt))
8140 : 199438 : continue;
8141 : :
8142 : 603310 : stmt_vec_info use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
8143 : :
8144 : 603310 : if (!use_stmt_info)
8145 : : break;
8146 : :
8147 : 604935 : if (PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
8148 : 492782 : continue;
8149 : :
 8150 : : /* Do not step forward when encountering a PHI statement, since it may
 8151 : : involve a cyclic reference and cause infinite recursion. */
8152 : 104328 : if (gimple_code (use_stmt) == GIMPLE_PHI)
8153 : : break;
8154 : :
8155 : : /* When pattern recognition is involved, a statement whose definition is
 8156 : : consumed in some pattern may not be included in the final replacement
 8157 : : pattern statements, and so would be skipped when building the SLP graph.
8158 : :
8159 : : * Original
8160 : : char a_c = *(char *) a;
8161 : : char b_c = *(char *) b;
8162 : : unsigned short a_s = (unsigned short) a_c;
8163 : : int a_i = (int) a_s;
8164 : : int b_i = (int) b_c;
8165 : : int r_i = a_i - b_i;
8166 : :
8167 : : * After pattern replacement
8168 : : a_s = (unsigned short) a_c;
8169 : : a_i = (int) a_s;
8170 : :
8171 : : patt_b_s = (unsigned short) b_c; // b_i = (int) b_c
8172 : : patt_b_i = (int) patt_b_s; // b_i = (int) b_c
8173 : :
8174 : : patt_r_s = widen_minus(a_c, b_c); // r_i = a_i - b_i
8175 : : patt_r_i = (int) patt_r_s; // r_i = a_i - b_i
8176 : :
 8177 : : The definitions of a_i (original statement) and b_i (pattern statement)
 8178 : : are related to, but not actually part of, the widen_minus pattern.
8179 : : Vectorizing the pattern does not cause these definition statements to
8180 : : be marked as PURE_SLP. For this case, we need to recursively check
8181 : : whether their uses are all absorbed into vectorized code. But there
 8182 : : is an exception: some use may participate in a vectorized
 8183 : : operation via an external SLP node containing that use as an element.
 8184 : : The parameter "scalar_use_map" tags such SSA names as having a scalar
 8185 : : use in advance. */
8186 : 83744 : tree lhs = gimple_get_lhs (use_stmt);
8187 : :
8188 : 83744 : if (!lhs || TREE_CODE (lhs) != SSA_NAME)
8189 : : break;
8190 : :
8191 : 50905 : if (depth_limit && depth >= depth_limit)
8192 : 9534 : return -1;
8193 : :
8194 : 41371 : if ((scalar_use = vec_slp_has_scalar_use (bb_vinfo, lhs, scalar_use_map,
8195 : : depth + 1)))
8196 : : break;
8197 : 530817 : }
8198 : :
8199 : 521283 : if (end_imm_use_stmt_p (&use_iter))
8200 : 422041 : scalar_use = 0;
8201 : :
8202 : : /* If recursion is limited, do not cache result for non-root defs. */
8203 : 521283 : if (!depth || scalar_use >= 0)
8204 : : {
8205 : 511749 : bool added = scalar_use_map.put (def, scalar_use);
8206 : 511749 : gcc_assert (!added);
8207 : : }
8208 : :
8209 : 521283 : return scalar_use;
8210 : : }
8211 : :
8212 : : /* Mark lanes of NODE that are live outside of the basic-block vectorized
8213 : : region and that can be vectorized using vectorizable_live_operation
8214 : : with STMT_VINFO_LIVE_P. Not handled live operations will cause the
8215 : : scalar code computing it to be retained. */
8216 : :
8217 : : static void
8218 : 892638 : vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
8219 : : slp_instance instance,
8220 : : stmt_vector_for_cost *cost_vec,
8221 : : hash_map<tree, int> &scalar_use_map,
8222 : : hash_set<stmt_vec_info> &svisited,
8223 : : hash_set<slp_tree> &visited)
8224 : : {
8225 : 892638 : if (visited.add (node))
8226 : 27015 : return;
8227 : :
8228 : 865623 : unsigned i;
8229 : 865623 : stmt_vec_info stmt_info;
8230 : 865623 : stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
8231 : 3098507 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
8232 : : {
8233 : 2232884 : if (!stmt_info || svisited.contains (stmt_info))
8234 : 24073 : continue;
8235 : 2217260 : stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
8236 : 2217260 : if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
8237 : 11257 : && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
8238 : : /* Only the pattern root stmt computes the original scalar value. */
8239 : 8449 : continue;
8240 : 2208811 : bool mark_visited = true;
8241 : 2208811 : gimple *orig_stmt = orig_stmt_info->stmt;
8242 : 2208811 : ssa_op_iter op_iter;
8243 : 2208811 : def_operand_p def_p;
8244 : 4922498 : FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
8245 : : {
8246 : 504876 : if (vec_slp_has_scalar_use (bb_vinfo, DEF_FROM_PTR (def_p),
8247 : : scalar_use_map))
8248 : : {
8249 : 84333 : STMT_VINFO_LIVE_P (stmt_info) = true;
8250 : 84333 : if (vectorizable_live_operation (bb_vinfo, stmt_info, node,
8251 : : instance, i, false, cost_vec))
8252 : : /* ??? So we know we can vectorize the live stmt from one SLP
8253 : : node. If we cannot do so from all or none consistently
8254 : : we'd have to record which SLP node (and lane) we want to
8255 : : use for the live operation. So make sure we can
8256 : : code-generate from all nodes. */
8257 : : mark_visited = false;
8258 : : else
8259 : 0 : STMT_VINFO_LIVE_P (stmt_info) = false;
8260 : : }
8261 : :
8262 : : /* We have to verify whether we can insert the lane extract
8263 : : before all uses. The following is a conservative approximation.
8264 : : We cannot put this into vectorizable_live_operation because
8265 : : iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
8266 : : doesn't work.
 8267 : : Note that while the fact that we emit code for loads at the
 8268 : : first load should make this a non-problem, leaves we construct
 8269 : : from scalars are vectorized after the last scalar def.
8270 : : ??? If we'd actually compute the insert location during
8271 : : analysis we could use sth less conservative than the last
8272 : : scalar stmt in the node for the dominance check. */
8273 : : /* ??? What remains is "live" uses in vector CTORs in the same
8274 : : SLP graph which is where those uses can end up code-generated
8275 : : right after their definition instead of close to their original
8276 : : use. But that would restrict us to code-generate lane-extracts
8277 : : from the latest stmt in a node. So we compensate for this
8278 : : during code-generation, simply not replacing uses for those
8279 : : hopefully rare cases. */
8280 : 504876 : imm_use_iterator use_iter;
8281 : 504876 : gimple *use_stmt;
8282 : 504876 : stmt_vec_info use_stmt_info;
8283 : :
8284 : 504876 : if (STMT_VINFO_LIVE_P (stmt_info))
8285 : 466617 : FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
8286 : 382284 : if (!is_gimple_debug (use_stmt)
8287 : 290560 : && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
8288 : 279661 : || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
8289 : 549155 : && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
8290 : : {
8291 : 12345 : if (dump_enabled_p ())
8292 : 284 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8293 : : "Cannot determine insertion place for "
8294 : : "lane extract\n");
8295 : 12345 : STMT_VINFO_LIVE_P (stmt_info) = false;
8296 : 12345 : mark_visited = true;
8297 : 84333 : }
8298 : : }
8299 : 2208811 : if (mark_visited)
8300 : 2135747 : svisited.add (stmt_info);
8301 : : }
8302 : :
8303 : : slp_tree child;
8304 : 2455078 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
8305 : 841537 : if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
8306 : 227043 : vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance, cost_vec,
8307 : : scalar_use_map, svisited, visited);
8308 : : }
8309 : :
8310 : : /* Traverse all slp instances of BB_VINFO, and mark lanes of every node that
8311 : : are live outside of the basic-block vectorized region and that can be
8312 : : vectorized using vectorizable_live_operation with STMT_VINFO_LIVE_P. */
8313 : :
8314 : : static void
8315 : 318775 : vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo)
8316 : : {
8317 : 318775 : if (bb_vinfo->slp_instances.is_empty ())
8318 : 36254 : return;
8319 : :
8320 : 282521 : hash_set<stmt_vec_info> svisited;
8321 : 282521 : hash_set<slp_tree> visited;
8322 : 282521 : hash_map<tree, int> scalar_use_map;
8323 : 282521 : auto_vec<slp_tree> worklist;
8324 : :
8325 : 1513158 : for (slp_instance instance : bb_vinfo->slp_instances)
8326 : : {
8327 : 665595 : if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc)
8328 : 42272 : for (tree op : SLP_INSTANCE_REMAIN_DEFS (instance))
8329 : 13764 : if (TREE_CODE (op) == SSA_NAME)
8330 : 12419 : scalar_use_map.put (op, 1);
8331 : 665595 : if (!visited.add (SLP_INSTANCE_TREE (instance)))
8332 : 664165 : worklist.safe_push (SLP_INSTANCE_TREE (instance));
8333 : : }
8334 : :
8335 : 1479196 : do
8336 : : {
8337 : 1479196 : slp_tree node = worklist.pop ();
8338 : :
8339 : 1479196 : if (SLP_TREE_DEF_TYPE (node) == vect_external_def)
8340 : : {
8341 : 1489419 : for (tree op : SLP_TREE_SCALAR_OPS (node))
8342 : 655811 : if (TREE_CODE (op) == SSA_NAME)
8343 : 421840 : scalar_use_map.put (op, 1);
8344 : : }
8345 : : else
8346 : : {
8347 : 3538207 : for (slp_tree child : SLP_TREE_CHILDREN (node))
8348 : 841513 : if (child && !visited.add (child))
8349 : 815031 : worklist.safe_push (child);
8350 : : }
8351 : : }
8352 : 2958392 : while (!worklist.is_empty ());
8353 : :
8354 : 282521 : visited.empty ();
8355 : :
8356 : 1513158 : for (slp_instance instance : bb_vinfo->slp_instances)
8357 : : {
8358 : 665595 : vect_location = instance->location ();
8359 : 665595 : vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
8360 : : instance, &instance->cost_vec,
8361 : : scalar_use_map, svisited, visited);
8362 : : }
8363 : 282521 : }
8364 : :
8365 : : /* Determine whether we can vectorize the reduction epilogue for INSTANCE. */
8366 : :
8367 : : static bool
8368 : 61290 : vectorizable_bb_reduc_epilogue (slp_instance instance,
8369 : : stmt_vector_for_cost *cost_vec)
8370 : : {
8371 : 61290 : gassign *stmt = as_a <gassign *> (instance->root_stmts[0]->stmt);
8372 : 61290 : enum tree_code reduc_code = gimple_assign_rhs_code (stmt);
8373 : 61290 : if (reduc_code == MINUS_EXPR)
8374 : 0 : reduc_code = PLUS_EXPR;
8375 : 61290 : internal_fn reduc_fn;
8376 : 61290 : tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
8377 : 61290 : if (!vectype
8378 : 61197 : || !reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
8379 : 61197 : || reduc_fn == IFN_LAST
8380 : 61197 : || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH)
8381 : 87591 : || !useless_type_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
8382 : 26301 : TREE_TYPE (vectype)))
8383 : : {
8384 : 45277 : if (dump_enabled_p ())
8385 : 254 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8386 : : "not vectorized: basic block reduction epilogue "
8387 : : "operation unsupported.\n");
8388 : 45277 : return false;
8389 : : }
8390 : :
8391 : :   /* There's no way to cost a horizontal vector reduction via REDUC_FN,
8392 : :      so cost log2 vector operations plus shuffles and one extraction.  */
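     : :   /* For example, for a four-lane vector STEPS is floor_log2 (4) == 2,
     : :      so we account two vector_stmt and two vec_perm operations plus a
     : :      single vec_to_scalar extraction of the final result.  */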
8393 : 16013 : unsigned steps = floor_log2 (vect_nunits_for_cost (vectype));
8394 : 16013 : record_stmt_cost (cost_vec, steps, vector_stmt, instance->root_stmts[0],
8395 : : vectype, 0, vect_body);
8396 : 16013 : record_stmt_cost (cost_vec, steps, vec_perm, instance->root_stmts[0],
8397 : : vectype, 0, vect_body);
8398 : 16013 : record_stmt_cost (cost_vec, 1, vec_to_scalar, instance->root_stmts[0],
8399 : : vectype, 0, vect_body);
8400 : :
8401 : :   /* Since we replace all stmts of a possibly longer scalar reduction
8402 : :      chain, account for the extra scalar stmts for that.  */
8403 : 16013 : record_stmt_cost (cost_vec, instance->remain_defs.length (), scalar_stmt,
8404 : 16013 : instance->root_stmts[0], 0, vect_body);
8405 : 16013 : return true;
8406 : : }
8407 : :
8408 : : /* Prune from ROOTS all stmts that are computed as part of lanes of NODE
8409 : : and recurse to children. */
8410 : :
8411 : : static void
8412 : 124215 : vect_slp_prune_covered_roots (slp_tree node, hash_set<stmt_vec_info> &roots,
8413 : : hash_set<slp_tree> &visited)
8414 : : {
8415 : 124215 : if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
8416 : 124215 : || visited.add (node))
8417 : 55205 : return;
8418 : :
8419 : : stmt_vec_info stmt;
8420 : : unsigned i;
8421 : 223865 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
8422 : 154855 : if (stmt)
8423 : 158322 : roots.remove (vect_orig_stmt (stmt));
8424 : :
8425 : : slp_tree child;
8426 : 162721 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
8427 : 93711 : if (child)
8428 : 87295 : vect_slp_prune_covered_roots (child, roots, visited);
8429 : : }
8430 : :
8431 : : /* Analyze statements in SLP instances of VINFO. Return true if the
8432 : : operations are supported. */
8433 : :
8434 : : bool
8435 : 649155 : vect_slp_analyze_operations (vec_info *vinfo)
8436 : : {
8437 : 649155 : slp_instance instance;
8438 : 649155 : int i;
8439 : :
8440 : 649155 : DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
8441 : :
8442 : 649155 : hash_set<slp_tree> visited;
8443 : 1630074 : for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
8444 : : {
8445 : 1192407 : auto_vec<slp_tree> visited_vec;
8446 : 1192407 : stmt_vector_for_cost cost_vec;
8447 : 1192407 : cost_vec.create (2);
8448 : 1192407 : if (is_a <bb_vec_info> (vinfo))
8449 : 764768 : vect_location = instance->location ();
8450 : 1192407 : if (!vect_slp_analyze_node_operations (vinfo,
8451 : : SLP_INSTANCE_TREE (instance),
8452 : : instance, visited, visited_vec,
8453 : : &cost_vec)
8454 : : /* CTOR instances require vectorized defs for the SLP tree root. */
8455 : 978965 : || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor
8456 : 656 : && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
8457 : : != vect_internal_def
8458 : : /* Make sure we vectorized with the expected type. */
8459 : 652 : || !useless_type_conversion_p
8460 : 652 : (TREE_TYPE (TREE_TYPE (gimple_assign_rhs1
8461 : : (instance->root_stmts[0]->stmt))),
8462 : 652 : TREE_TYPE (SLP_TREE_VECTYPE
8463 : : (SLP_INSTANCE_TREE (instance))))))
8464 : : /* Check we can vectorize the reduction. */
8465 : 978958 : || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc
8466 : 61290 : && !vectorizable_bb_reduc_epilogue (instance, &cost_vec))
8467 : : /* Check we can vectorize the gcond. */
8468 : 2126088 : || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_gcond
8469 : 52529 : && !vectorizable_early_exit (vinfo,
8470 : 52529 : SLP_INSTANCE_ROOT_STMTS (instance)[0],
8471 : : NULL, NULL,
8472 : : SLP_INSTANCE_TREE (instance),
8473 : : &cost_vec)))
8474 : : {
8475 : 309232 : cost_vec.release ();
8476 : 309232 : slp_tree node = SLP_INSTANCE_TREE (instance);
8477 : 309232 : stmt_vec_info stmt_info;
8478 : 309232 : if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
8479 : 223920 : stmt_info = SLP_INSTANCE_ROOT_STMTS (instance)[0];
8480 : : else
8481 : 85312 : stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
8482 : 309232 : if (is_a <loop_vec_info> (vinfo))
8483 : : {
8484 : 211488 : if (dump_enabled_p ())
8485 : 7030 : dump_printf_loc (MSG_NOTE, vect_location,
8486 : : "unsupported SLP instance starting from: %G",
8487 : : stmt_info->stmt);
8488 : 211488 : return false;
8489 : : }
8490 : 97744 : if (dump_enabled_p ())
8491 : 321 : dump_printf_loc (MSG_NOTE, vect_location,
8492 : : "removing SLP instance operations starting from: %G",
8493 : : stmt_info->stmt);
8494 : 97744 : vect_free_slp_instance (instance);
8495 : 97744 : vinfo->slp_instances.ordered_remove (i);
8496 : 1466603 : while (!visited_vec.is_empty ())
8497 : 290318 : visited.remove (visited_vec.pop ());
8498 : : }
8499 : : else
8500 : : {
8501 : 883175 : i++;
8502 : 883175 : if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
8503 : : {
8504 : 216151 : add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
8505 : 216151 : cost_vec.release ();
8506 : : }
8507 : : else
8508 : : /* For BB vectorization remember the SLP graph entry
8509 : : cost for later. */
8510 : 667024 : instance->cost_vec = cost_vec;
8511 : : }
8512 : 1192407 : }
8513 : :
8514 : : /* Now look for SLP instances with a root that are covered by other
8515 : : instances and remove them. */
8516 : 437667 : hash_set<stmt_vec_info> roots;
8517 : 1710597 : for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
8518 : 853798 : if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
8519 : 18535 : roots.add (SLP_INSTANCE_ROOT_STMTS (instance)[0]);
8520 : 437667 : if (!roots.is_empty ())
8521 : : {
8522 : 10094 : visited.empty ();
8523 : 47014 : for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
8524 : 36920 : vect_slp_prune_covered_roots (SLP_INSTANCE_TREE (instance), roots,
8525 : : visited);
8526 : 47014 : for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
8527 : 36920 : if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()
8528 : 18535 : && !roots.contains (SLP_INSTANCE_ROOT_STMTS (instance)[0]))
8529 : : {
8530 : 1429 : stmt_vec_info root = SLP_INSTANCE_ROOT_STMTS (instance)[0];
8531 : 1429 : if (dump_enabled_p ())
8532 : 20 : dump_printf_loc (MSG_NOTE, vect_location,
8533 : : "removing SLP instance operations starting "
8534 : : "from: %G", root->stmt);
8535 : 1429 : vect_free_slp_instance (instance);
8536 : 1429 : vinfo->slp_instances.ordered_remove (i);
8537 : : }
8538 : : else
8539 : 35491 : ++i;
8540 : : }
8541 : :
8542 : : /* Compute vectorizable live stmts. */
8543 : 437667 : if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
8544 : 318775 : vect_bb_slp_mark_live_stmts (bb_vinfo);
8545 : :
8546 : 875334 : return !vinfo->slp_instances.is_empty ();
8547 : 1086822 : }
8548 : :
8549 : : /* Get the SLP instance leader from INSTANCE_LEADER, transitively
8550 : :    compressing any chain of leaders encountered on the way.  */
8551 : :
8552 : : static slp_instance
8553 : 708217 : get_ultimate_leader (slp_instance instance,
8554 : : hash_map<slp_instance, slp_instance> &instance_leader)
8555 : : {
8556 : 708217 : auto_vec<slp_instance *, 8> chain;
8557 : 708217 : slp_instance *tem;
8558 : 757278 : while (*(tem = instance_leader.get (instance)) != instance)
8559 : : {
8560 : 49061 : chain.safe_push (tem);
8561 : 49061 : instance = *tem;
8562 : : }
8563 : 757278 : while (!chain.is_empty ())
8564 : 49061 : *chain.pop () = instance;
8565 : 708217 : return instance;
8566 : 708217 : }
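     : : /* For example, with the leader map A -> B, B -> C, C -> C, a query
     : :    for A returns C and rewrites the map to A -> C, B -> C, C -> C,
     : :    so later queries resolve in a single step.  */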
8567 : :
8568 : : namespace {
8569 : : /* Subroutine of vect_bb_partition_graph_r. Map KEY to INSTANCE in
8570 : : KEY_TO_INSTANCE, making INSTANCE the leader of any previous mapping
8571 : : for KEY. Return true if KEY was already in KEY_TO_INSTANCE.
8572 : :
8573 : : INSTANCE_LEADER is as for get_ultimate_leader. */
8574 : :
8575 : : template<typename T>
8576 : : bool
8577 : 3193105 : vect_map_to_instance (slp_instance instance, T key,
8578 : : hash_map<T, slp_instance> &key_to_instance,
8579 : : hash_map<slp_instance, slp_instance> &instance_leader)
8580 : : {
8581 : : bool existed_p;
8582 : 3193105 : slp_instance &key_instance = key_to_instance.get_or_insert (key, &existed_p);
8583 : 3193105 : if (!existed_p)
8584 : : ;
8585 : 117836 : else if (key_instance != instance)
8586 : : {
8587 : : /* If we're running into a previously marked key make us the
8588 : : leader of the current ultimate leader. This keeps the
8589 : : leader chain acyclic and works even when the current instance
8590 : : connects two previously independent graph parts. */
8591 : 42622 : slp_instance key_leader
8592 : 42622 : = get_ultimate_leader (key_instance, instance_leader);
8593 : 42622 : if (key_leader != instance)
8594 : 11745 : instance_leader.put (key_leader, instance);
8595 : : }
8596 : 3193105 : key_instance = instance;
8597 : 3193105 : return existed_p;
8598 : : }
8599 : : }
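     : : /* For example, when instance I2 visits a stmt KEY already mapped to
     : :    instance I1, I1's ultimate leader is redirected to I2, merging two
     : :    previously independent graph parts under I2.  */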
8600 : :
8601 : : /* Worker of vect_bb_partition_graph, recurse on NODE. */
8602 : :
8603 : : static void
8604 : 892638 : vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
8605 : : slp_instance instance, slp_tree node,
8606 : : hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
8607 : : hash_map<slp_tree, slp_instance> &node_to_instance,
8608 : : hash_map<slp_instance, slp_instance> &instance_leader)
8609 : : {
8610 : 892638 : stmt_vec_info stmt_info;
8611 : 892638 : unsigned i;
8612 : :
8613 : 3193105 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
8614 : 2300467 : if (stmt_info)
8615 : 2300467 : vect_map_to_instance (instance, stmt_info, stmt_to_instance,
8616 : : instance_leader);
8617 : :
8618 : 892638 : if (vect_map_to_instance (instance, node, node_to_instance,
8619 : : instance_leader))
8620 : 892638 : return;
8621 : :
8622 : : slp_tree child;
8623 : 1707160 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
8624 : 841537 : if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
8625 : 227043 : vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
8626 : : node_to_instance, instance_leader);
8627 : : }
8628 : :
8629 : : /* Partition the SLP graph into pieces that can be costed independently. */
8630 : :
8631 : : static void
8632 : 282521 : vect_bb_partition_graph (bb_vec_info bb_vinfo)
8633 : : {
8634 : 282521 : DUMP_VECT_SCOPE ("vect_bb_partition_graph");
8635 : :
8636 : :   /* First walk the SLP graph assigning each involved scalar stmt a
8637 : :      corresponding SLP graph entry and, upon visiting a previously
8638 : :      marked stmt, make the stmt's leader the current SLP graph entry.  */
8639 : 282521 : hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
8640 : 282521 : hash_map<slp_tree, slp_instance> node_to_instance;
8641 : 282521 : hash_map<slp_instance, slp_instance> instance_leader;
8642 : 282521 : slp_instance instance;
8643 : 948116 : for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
8644 : : {
8645 : 665595 : instance_leader.put (instance, instance);
8646 : 665595 : vect_bb_partition_graph_r (bb_vinfo,
8647 : : instance, SLP_INSTANCE_TREE (instance),
8648 : : stmt_to_instance, node_to_instance,
8649 : : instance_leader);
8650 : : }
8651 : :
8652 : : /* Then collect entries to each independent subgraph. */
8653 : 1230637 : for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
8654 : : {
8655 : 665595 : slp_instance leader = get_ultimate_leader (instance, instance_leader);
8656 : 665595 : leader->subgraph_entries.safe_push (instance);
8657 : 665595 : if (dump_enabled_p ()
8658 : 665595 : && leader != instance)
8659 : 67 : dump_printf_loc (MSG_NOTE, vect_location,
8660 : : "instance %p is leader of %p\n",
8661 : : (void *) leader, (void *) instance);
8662 : : }
8663 : 282521 : }
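     : : /* For example, two instances that share no scalar stmts each remain
     : :    their own leader and are costed independently, while instances that
     : :    touch a common scalar stmt are collected as subgraph entries of a
     : :    single leader and costed as one subgraph.  */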
8664 : :
8665 : : /* Compute the set of scalar stmts participating in internal and external
8666 : : nodes. */
8667 : :
8668 : : static void
8669 : 1493930 : vect_slp_gather_vectorized_scalar_stmts (vec_info *vinfo, slp_tree node,
8670 : : hash_set<slp_tree> &visited,
8671 : : hash_set<stmt_vec_info> &vstmts,
8672 : : hash_set<stmt_vec_info> &estmts)
8673 : : {
8674 : 1493930 : int i;
8675 : 1493930 : stmt_vec_info stmt_info;
8676 : 1493930 : slp_tree child;
8677 : :
8678 : 1493930 : if (visited.add (node))
8679 : 26846 : return;
8680 : :
8681 : 1467084 : if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
8682 : : {
8683 : 3042057 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
8684 : 2185100 : if (stmt_info)
8685 : 2185100 : vstmts.add (stmt_info);
8686 : :
8687 : 3040053 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
8688 : 831936 : if (child)
8689 : 831936 : vect_slp_gather_vectorized_scalar_stmts (vinfo, child, visited,
8690 : : vstmts, estmts);
8691 : : }
8692 : : else
8693 : 3412886 : for (tree def : SLP_TREE_SCALAR_OPS (node))
8694 : : {
8695 : 1583491 : stmt_vec_info def_stmt = vinfo->lookup_def (def);
8696 : 1583491 : if (def_stmt)
8697 : 253452 : estmts.add (def_stmt);
8698 : : }
8699 : : }
8700 : :
8701 : :
8702 : : /* Compute the scalar cost of the SLP node NODE and its children,
8703 : :    recording it in COST_VEC.  Do not account defs that are marked in
8704 : :    LIFE, and update LIFE according to uses of NODE.  */
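     : : /* For example, if lane 1 of NODE has a use outside the vectorized
     : :    region then (*life)[1] gets set, the lane's scalar stmt is kept
     : :    live and not accounted, which makes vectorization comparatively
     : :    more expensive.  */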
8705 : :
8706 : : static void
8707 : 883220 : vect_bb_slp_scalar_cost (vec_info *vinfo,
8708 : : slp_tree node, vec<bool, va_heap> *life,
8709 : : stmt_vector_for_cost *cost_vec,
8710 : : hash_set<stmt_vec_info> &vectorized_scalar_stmts,
8711 : : hash_set<stmt_vec_info> &scalar_stmts_in_externs,
8712 : : hash_set<slp_tree> &visited)
8713 : : {
8714 : 883220 : unsigned i;
8715 : 883220 : stmt_vec_info stmt_info;
8716 : 883220 : slp_tree child;
8717 : :
8718 : 883220 : if (visited.add (node))
8719 : 26246 : return;
8720 : :
8721 : 3042108 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
8722 : : {
8723 : 2185134 : ssa_op_iter op_iter;
8724 : 2185134 : def_operand_p def_p;
8725 : :
8726 : 2206895 : if (!stmt_info
8727 : 2185134 : || (*life)[i]
8728 : : /* Defs also used in external nodes are not in the
8729 : : vectorized_scalar_stmts set as they need to be preserved.
8730 : : Honor that. */
8731 : 4350132 : || scalar_stmts_in_externs.contains (stmt_info))
8732 : 80678 : continue;
8733 : :
8734 : 2163373 : stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
8735 : 2163373 : gimple *orig_stmt = orig_stmt_info->stmt;
8736 : :
8737 : :       /* If there is a non-vectorized use of the defs then the scalar
8738 : :          stmt is kept live, in which case we do not account it or any
8739 : :          required defs in the SLP children in the scalar cost.  This
8740 : :          way we make the vectorization more costly when compared to
8741 : :          the scalar cost.  */
8742 : 2163373 : if (!STMT_VINFO_LIVE_P (stmt_info))
8743 : : {
8744 : 2097026 : auto_vec<gimple *, 8> worklist;
8745 : 2097026 : hash_set<gimple *> *worklist_visited = NULL;
8746 : 2097026 : worklist.quick_push (orig_stmt);
8747 : 2101847 : do
8748 : : {
8749 : 2101847 : gimple *work_stmt = worklist.pop ();
8750 : 4609960 : FOR_EACH_PHI_OR_STMT_DEF (def_p, work_stmt, op_iter, SSA_OP_DEF)
8751 : : {
8752 : 419312 : imm_use_iterator use_iter;
8753 : 419312 : gimple *use_stmt;
8754 : 1052499 : FOR_EACH_IMM_USE_STMT (use_stmt, use_iter,
8755 : : DEF_FROM_PTR (def_p))
8756 : 646233 : if (!is_gimple_debug (use_stmt))
8757 : : {
8758 : 462390 : stmt_vec_info use_stmt_info
8759 : 462390 : = vinfo->lookup_stmt (use_stmt);
8760 : 462390 : if (!use_stmt_info
8761 : 462390 : || !vectorized_scalar_stmts.contains (use_stmt_info))
8762 : : {
8763 : 17937 : if (use_stmt_info
8764 : 16174 : && STMT_VINFO_IN_PATTERN_P (use_stmt_info))
8765 : : {
8766 : :                         /* For stmts participating in patterns we have
8767 : :                            to check their uses recursively.  */
8768 : 4891 : if (!worklist_visited)
8769 : 3783 : worklist_visited = new hash_set<gimple *> ();
8770 : 4891 : if (!worklist_visited->add (use_stmt))
8771 : 4891 : worklist.safe_push (use_stmt);
8772 : 4891 : continue;
8773 : : }
8774 : 13046 : (*life)[i] = true;
8775 : 13046 : goto next_lane;
8776 : : }
8777 : 419312 : }
8778 : : }
8779 : : }
8780 : 4177602 : while (!worklist.is_empty ());
8781 : 2083980 : next_lane:
8782 : 2097026 : if (worklist_visited)
8783 : 3783 : delete worklist_visited;
8784 : 2097026 : if ((*life)[i])
8785 : 13046 : continue;
8786 : 2097026 : }
8787 : :
8788 : : /* Count scalar stmts only once. */
8789 : 2150327 : if (gimple_visited_p (orig_stmt))
8790 : 21933 : continue;
8791 : 2128394 : gimple_set_visited (orig_stmt, true);
8792 : :
8793 : 2128394 : vect_cost_for_stmt kind;
8794 : 2128394 : if (STMT_VINFO_DATA_REF (orig_stmt_info))
8795 : : {
8796 : 1950135 : data_reference_p dr = STMT_VINFO_DATA_REF (orig_stmt_info);
8797 : 1950135 : tree base = get_base_address (DR_REF (dr));
8798 : :           /* When the scalar access is to a non-global, not address-taken
8799 : :              decl that is not BLKmode, assume we can access it with a single
8800 : :              non-load/store instruction.  */
8801 : 1950135 : if (DECL_P (base)
8802 : 1565344 : && !is_global_var (base)
8803 : 1499028 : && !TREE_ADDRESSABLE (base)
8804 : 2643931 : && DECL_MODE (base) != BLKmode)
8805 : : kind = scalar_stmt;
8806 : 1692437 : else if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
8807 : : kind = scalar_load;
8808 : : else
8809 : 1475303 : kind = scalar_store;
8810 : : }
8811 : 178259 : else if (vect_nop_conversion_p (orig_stmt_info))
8812 : 14819 : continue;
8813 : :       /* For single-argument PHIs assume coalescing, which means zero cost
8814 : :          for the scalar and the vector PHIs.  This avoids artificially
8815 : :          favoring the vector path (but may pessimize it in some cases).  */
8816 : 163440 : else if (is_a <gphi *> (orig_stmt_info->stmt)
8817 : 163440 : && gimple_phi_num_args
8818 : 89303 : (as_a <gphi *> (orig_stmt_info->stmt)) == 1)
8819 : 9119 : continue;
8820 : : else
8821 : : kind = scalar_stmt;
8822 : 2104456 : record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
8823 : : SLP_TREE_VECTYPE (node), 0, vect_body);
8824 : : }
8825 : :
8826 : 1713948 : auto_vec<bool, 20> subtree_life;
8827 : 2429976 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
8828 : : {
8829 : 831960 : if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
8830 : : {
8831 : :           /* Do not directly pass LIFE to the recursive call; copy it to
8832 : :              confine changes in the callee to the current child/subtree.  */
8833 : 221226 : if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
8834 : : {
8835 : 3204 : subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
8836 : 11338 : for (unsigned j = 0;
8837 : 11338 : j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
8838 : : {
8839 : 8134 : auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
8840 : 8134 : if (perm.first == i)
8841 : 4318 : subtree_life[perm.second] = (*life)[j];
8842 : : }
8843 : : }
8844 : : else
8845 : : {
8846 : 218022 : gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
8847 : 218022 : subtree_life.safe_splice (*life);
8848 : : }
8849 : 221226 : vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
8850 : : vectorized_scalar_stmts,
8851 : : scalar_stmts_in_externs, visited);
8852 : 221226 : subtree_life.truncate (0);
8853 : : }
8854 : : }
8855 : : }
8856 : :
8857 : : /* Comparator for the loop-index sorted cost vectors. */
8858 : :
8859 : : static int
8860 : 17200562 : li_cost_vec_cmp (const void *a_, const void *b_)
8861 : : {
8862 : 17200562 : auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
8863 : 17200562 : auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
8864 : 17200562 : if (a->first < b->first)
8865 : : return -1;
8866 : 16574238 : else if (a->first == b->first)
8867 : 16022968 : return 0;
8868 : : return 1;
8869 : : }
8870 : :
8871 : : /* Check if vectorization of the basic block is profitable for the
8872 : : subgraph denoted by SLP_INSTANCES. */
8873 : :
8874 : : static bool
8875 : 650380 : vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
8876 : : vec<slp_instance> slp_instances,
8877 : : loop_p orig_loop)
8878 : : {
8879 : 650380 : slp_instance instance;
8880 : 650380 : int i;
8881 : 650380 : unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
8882 : 650380 : unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
8883 : :
8884 : 650380 : if (dump_enabled_p ())
8885 : : {
8886 : 80 : dump_printf_loc (MSG_NOTE, vect_location, "Costing subgraph: \n");
8887 : 80 : hash_set<slp_tree> visited;
8888 : 323 : FOR_EACH_VEC_ELT (slp_instances, i, instance)
8889 : 83 : vect_print_slp_graph (MSG_NOTE, vect_location,
8890 : : SLP_INSTANCE_TREE (instance), visited);
8891 : 80 : }
8892 : :
8893 : : /* Compute the set of scalar stmts we know will go away 'locally' when
8894 : : vectorizing. This used to be tracked with just PURE_SLP_STMT but that's
8895 : : not accurate for nodes promoted extern late or for scalar stmts that
8896 : : are used both in extern defs and in vectorized defs. */
8897 : 650380 : hash_set<stmt_vec_info> vectorized_scalar_stmts;
8898 : 650380 : hash_set<stmt_vec_info> scalar_stmts_in_externs;
8899 : 650380 : hash_set<slp_tree> visited;
8900 : 1312374 : FOR_EACH_VEC_ELT (slp_instances, i, instance)
8901 : : {
8902 : 661994 : vect_slp_gather_vectorized_scalar_stmts (bb_vinfo,
8903 : : SLP_INSTANCE_TREE (instance),
8904 : : visited,
8905 : : vectorized_scalar_stmts,
8906 : : scalar_stmts_in_externs);
8907 : 726875 : for (stmt_vec_info rstmt : SLP_INSTANCE_ROOT_STMTS (instance))
8908 : 34877 : vectorized_scalar_stmts.add (rstmt);
8909 : : }
8910 : :   /* Scalar stmts used as defs in external nodes need to be preserved, so
8911 : :      remove them from vectorized_scalar_stmts.  */
8912 : 876491 : for (stmt_vec_info stmt : scalar_stmts_in_externs)
8913 : 226111 : vectorized_scalar_stmts.remove (stmt);
8914 : :
8915 : : /* Calculate scalar cost and sum the cost for the vector stmts
8916 : : previously collected. */
8917 : 650380 : stmt_vector_for_cost scalar_costs = vNULL;
8918 : 650380 : stmt_vector_for_cost vector_costs = vNULL;
8919 : 650380 : visited.empty ();
8920 : 1312374 : FOR_EACH_VEC_ELT (slp_instances, i, instance)
8921 : : {
8922 : 661994 : auto_vec<bool, 20> life;
8923 : 661994 : life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
8924 : : true);
8925 : 661994 : if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
8926 : 30004 : record_stmt_cost (&scalar_costs,
8927 : 15002 : SLP_INSTANCE_ROOT_STMTS (instance).length (),
8928 : : scalar_stmt,
8929 : 15002 : SLP_INSTANCE_ROOT_STMTS (instance)[0], 0, vect_body);
8930 : 661994 : vect_bb_slp_scalar_cost (bb_vinfo,
8931 : : SLP_INSTANCE_TREE (instance),
8932 : : &life, &scalar_costs, vectorized_scalar_stmts,
8933 : : scalar_stmts_in_externs, visited);
8934 : 661994 : vector_costs.safe_splice (instance->cost_vec);
8935 : 661994 : instance->cost_vec.release ();
8936 : 661994 : }
8937 : :
8938 : 650380 : if (dump_enabled_p ())
8939 : 80 : dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
8940 : :
8941 : :   /* When costing non-loop vectorization we need to consider each covered
8942 : :      loop independently and make sure vectorization is profitable.  For
8943 : :      now we assume a loop may not be entered or may execute an arbitrary
8944 : :      number of iterations (??? static information can provide more
8945 : :      precise info here) which means we can simply cost each containing
8946 : :      loop's stmts separately.  */
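     : :   /* For example, a region spanning a loop body and its continuation
     : :      block is costed as two parts keyed by loop number, and each part
     : :      has to be profitable on its own.  */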
8947 : :
8948 : : /* First produce cost vectors sorted by loop index. */
8949 : 650380 : auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
8950 : 650380 : li_scalar_costs (scalar_costs.length ());
8951 : 650380 : auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
8952 : 650380 : li_vector_costs (vector_costs.length ());
8953 : 650380 : stmt_info_for_cost *cost;
8954 : 2769838 : FOR_EACH_VEC_ELT (scalar_costs, i, cost)
8955 : : {
8956 : 2119458 : unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
8957 : 2119458 : li_scalar_costs.quick_push (std::make_pair (l, cost));
8958 : : }
8959 : :   /* Use an arbitrary loop from the scalar costs as a fallback in case
8960 : :      the first vector_costs entry does not have a stmt_info.  */
8961 : 650380 : unsigned l = li_scalar_costs[0].first;
8962 : 2409982 : FOR_EACH_VEC_ELT (vector_costs, i, cost)
8963 : : {
8964 : :       /* We inherit from the previous COST; invariants, externals and
8965 : :          extracts immediately follow the cost for the related stmt.  */
8966 : 1759602 : if (cost->stmt_info)
8967 : 1031487 : l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
8968 : 1759602 : li_vector_costs.quick_push (std::make_pair (l, cost));
8969 : : }
8970 : 650380 : li_scalar_costs.qsort (li_cost_vec_cmp);
8971 : 650380 : li_vector_costs.qsort (li_cost_vec_cmp);
8972 : :
8973 : : /* Now cost the portions individually. */
8974 : : unsigned vi = 0;
8975 : : unsigned si = 0;
8976 : 1127103 : bool profitable = true;
8977 : 1127103 : while (si < li_scalar_costs.length ()
8978 : 1781787 : && vi < li_vector_costs.length ())
8979 : : {
8980 : 654684 : unsigned sl = li_scalar_costs[si].first;
8981 : 654684 : unsigned vl = li_vector_costs[vi].first;
8982 : 654684 : if (sl != vl)
8983 : : {
8984 : 141 : if (dump_enabled_p ())
8985 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
8986 : : "Scalar %d and vector %d loop part do not "
8987 : : "match up, skipping scalar part\n", sl, vl);
8988 : : /* Skip the scalar part, assuming zero cost on the vector side. */
8989 : 433 : do
8990 : : {
8991 : 433 : si++;
8992 : : }
8993 : 433 : while (si < li_scalar_costs.length ()
8994 : 576 : && li_scalar_costs[si].first == sl);
8995 : 141 : continue;
8996 : : }
8997 : :
8998 : 654543 : class vector_costs *scalar_target_cost_data = init_cost (bb_vinfo, true);
8999 : 2105400 : do
9000 : : {
9001 : 2105400 : add_stmt_cost (scalar_target_cost_data, li_scalar_costs[si].second);
9002 : 2105400 : si++;
9003 : : }
9004 : 2105400 : while (si < li_scalar_costs.length ()
9005 : 4218252 : && li_scalar_costs[si].first == sl);
9006 : 654543 : scalar_target_cost_data->finish_cost (nullptr);
9007 : 654543 : scalar_cost = scalar_target_cost_data->body_cost ();
9008 : :
9009 : : /* Complete the target-specific vector cost calculation. */
9010 : 654543 : class vector_costs *vect_target_cost_data = init_cost (bb_vinfo, false);
9011 : 1739799 : do
9012 : : {
9013 : 1739799 : add_stmt_cost (vect_target_cost_data, li_vector_costs[vi].second);
9014 : 1739799 : vi++;
9015 : : }
9016 : 1739799 : while (vi < li_vector_costs.length ()
9017 : 3487603 : && li_vector_costs[vi].first == vl);
9018 : 654543 : vect_target_cost_data->finish_cost (scalar_target_cost_data);
9019 : 654543 : vec_prologue_cost = vect_target_cost_data->prologue_cost ();
9020 : 654543 : vec_inside_cost = vect_target_cost_data->body_cost ();
9021 : 654543 : vec_epilogue_cost = vect_target_cost_data->epilogue_cost ();
9022 : 654543 : delete scalar_target_cost_data;
9023 : 654543 : delete vect_target_cost_data;
9024 : :
9025 : 654543 : vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
9026 : :
9027 : 654543 : if (dump_enabled_p ())
9028 : : {
9029 : 80 : dump_printf_loc (MSG_NOTE, vect_location,
9030 : : "Cost model analysis for part in loop %d:\n", sl);
9031 : 80 : dump_printf (MSG_NOTE, " Vector cost: %d\n",
9032 : : vec_inside_cost + vec_outside_cost);
9033 : 80 : dump_printf (MSG_NOTE, " Scalar cost: %d\n", scalar_cost);
9034 : : }
9035 : :
9036 : :       /* Vectorization is profitable if its cost is no more than the cost
9037 : :          of the scalar version.  Note that we err on the vector side for
9038 : :          equal cost because the cost estimate is otherwise quite pessimistic
9039 : :          (constant uses are free on the scalar side but cost a load on the
9040 : :          vector side for example).  */
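     : :       /* For example, with vec_inside_cost 6, vec_prologue_cost 2,
     : :          vec_epilogue_cost 2 and scalar_cost 10, the subgraph is still
     : :          considered profitable since 6 + 2 + 2 does not exceed 10.  */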
9041 : 654543 : if (vec_outside_cost + vec_inside_cost > scalar_cost)
9042 : : {
9043 : : profitable = false;
9044 : : break;
9045 : : }
9046 : : }
9047 : 1122799 : if (profitable && vi < li_vector_costs.length ())
9048 : : {
9049 : 588 : if (dump_enabled_p ())
9050 : 12 : dump_printf_loc (MSG_NOTE, vect_location,
9051 : : "Excess vector cost for part in loop %d:\n",
9052 : 6 : li_vector_costs[vi].first);
9053 : : profitable = false;
9054 : : }
9055 : :
9056 : :   /* Unset the visited flag.  This is delayed when the subgraph is profitable
9057 : :      and we process the loop for remaining unvectorized if-converted code.  */
9058 : 650380 : if (!orig_loop || !profitable)
9059 : 2768381 : FOR_EACH_VEC_ELT (scalar_costs, i, cost)
9060 : 2118128 : gimple_set_visited (cost->stmt_info->stmt, false);
9061 : :
9062 : 650380 : scalar_costs.release ();
9063 : 650380 : vector_costs.release ();
9064 : :
9065 : 650380 : return profitable;
9066 : 650380 : }
9067 : :
9068 : : /* qsort comparator for lane defs. */
9069 : :
9070 : : static int
9071 : 40 : vld_cmp (const void *a_, const void *b_)
9072 : : {
9073 : 40 : auto *a = (const std::pair<unsigned, tree> *)a_;
9074 : 40 : auto *b = (const std::pair<unsigned, tree> *)b_;
9075 : 40 : return a->first - b->first;
9076 : : }
9077 : :
9078 : : /* Return true if USE_STMT is a vector lane insert into VEC and set
9079 : : *THIS_LANE to the lane number that is set. */
9080 : :
9081 : : static bool
9082 : 240 : vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
9083 : : {
9084 : 240 : gassign *use_ass = dyn_cast <gassign *> (use_stmt);
9085 : 91 : if (!use_ass
9086 : 91 : || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
9087 : 22 : || (vec
9088 : 22 : ? gimple_assign_rhs1 (use_ass) != vec
9089 : 24 : : ((vec = gimple_assign_rhs1 (use_ass)), false))
9090 : 46 : || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
9091 : 46 : TREE_TYPE (gimple_assign_rhs2 (use_ass)))
9092 : 46 : || !constant_multiple_p
9093 : 46 : (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
9094 : 92 : tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
9095 : : this_lane))
9096 : 194 : return false;
9097 : : return true;
9098 : : }
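     : : /* Schematically (with illustrative SSA names), with 32-bit elements
     : :      _5 = BIT_INSERT_EXPR <vec_3, val_4, 64>;
     : :    is an insert of val_4 into lane 64 / 32 == 2 of vec_3.  */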
9099 : :
9100 : : /* Find any vectorizable constructors and add them to the grouped_store
9101 : : array. */
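     : : /* For example (with illustrative SSA names), for a two-lane vector
     : :      tmp_5 = a_1 + b_2;
     : :      tmp_6 = c_3 + d_4;
     : :      v_7 = {tmp_5, tmp_6};
     : :    the CONSTRUCTOR stmt is recorded as an slp_inst_kind_ctor root with
     : :    the two additions as the scalar stmts to start discovery from.  */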
9102 : :
9103 : : static void
9104 : 2333511 : vect_slp_check_for_roots (bb_vec_info bb_vinfo)
9105 : : {
9106 : 17406859 : for (unsigned i = 0; i < bb_vinfo->nbbs; ++i)
9107 : 30146696 : for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
9108 : 119640556 : !gsi_end_p (gsi); gsi_next (&gsi))
9109 : : {
9110 : 104567208 : gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
9111 : : /* This can be used to start SLP discovery for early breaks for BB early breaks
9112 : : when we get that far. */
9113 : 104567208 : if (!assign)
9114 : 152490743 : continue;
9115 : :
9116 : 29285330 : tree rhs = gimple_assign_rhs1 (assign);
9117 : 29285330 : enum tree_code code = gimple_assign_rhs_code (assign);
9118 : 29285330 : use_operand_p use_p;
9119 : 29285330 : gimple *use_stmt;
9120 : 29285330 : if (code == CONSTRUCTOR)
9121 : : {
9122 : 1663281 : if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
9123 : 50241 : || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
9124 : 77806 : CONSTRUCTOR_NELTS (rhs))
9125 : 35782 : || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
9126 : 1699062 : || uniform_vector_p (rhs))
9127 : 1655566 : continue;
9128 : :
9129 : : unsigned j;
9130 : : tree val;
9131 : 45798 : FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
9132 : 38083 : if (TREE_CODE (val) != SSA_NAME
9133 : 38083 : || !bb_vinfo->lookup_def (val))
9134 : : break;
9135 : 19798 : if (j != CONSTRUCTOR_NELTS (rhs))
9136 : 2184 : continue;
9137 : :
9138 : 7715 : vec<stmt_vec_info> roots = vNULL;
9139 : 7715 : roots.safe_push (bb_vinfo->lookup_stmt (assign));
9140 : 7715 : vec<stmt_vec_info> stmts;
9141 : 7715 : stmts.create (CONSTRUCTOR_NELTS (rhs));
9142 : 50070 : FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
9143 : 34640 : stmts.quick_push
9144 : 34640 : (vect_stmt_to_vectorize (bb_vinfo->lookup_def (val)));
9145 : 7715 : bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
9146 : 7715 : stmts, roots));
9147 : : }
9148 : 27622049 : else if (code == BIT_INSERT_EXPR
9149 : 829 : && VECTOR_TYPE_P (TREE_TYPE (rhs))
9150 : 556 : && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
9151 : 556 : && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
9152 : 556 : && integer_zerop (gimple_assign_rhs3 (assign))
9153 : 312 : && useless_type_conversion_p
9154 : 312 : (TREE_TYPE (TREE_TYPE (rhs)),
9155 : 312 : TREE_TYPE (gimple_assign_rhs2 (assign)))
9156 : 27622613 : && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
9157 : : {
9158 : :           /* We start to match on an insert to lane zero, but since the
9159 : :              inserts need not be ordered we have to search both
9160 : :              the def and the use chains.  */
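     : :           /* Schematically (with illustrative SSA names and 32-bit
     : :              elements), for a two-lane vector the chain
     : :                v1_2 = BIT_INSERT_EXPR <v0_1, a_3, 0>;
     : :                v2_4 = BIT_INSERT_EXPR <v1_2, b_5, 32>;
     : :              is found by following the single use of the lane zero
     : :              insert forward and, if lanes remain, the rhs1 def chain
     : :              backward.  */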
9161 : 211 : tree vectype = TREE_TYPE (rhs);
9162 : 211 : unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
9163 : 211 : auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
9164 : 211 : auto_sbitmap lanes (nlanes);
9165 : 211 : bitmap_clear (lanes);
9166 : 211 : bitmap_set_bit (lanes, 0);
9167 : 211 : tree def = gimple_assign_lhs (assign);
9168 : 211 : lane_defs.quick_push
9169 : 211 : (std::make_pair (0, gimple_assign_rhs2 (assign)));
9170 : 211 : unsigned lanes_found = 1;
9171 : : /* Start with the use chains, the last stmt will be the root. */
9172 : 211 : stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
9173 : 211 : vec<stmt_vec_info> roots = vNULL;
9174 : 211 : roots.safe_push (last);
9175 : 213 : do
9176 : : {
9177 : 213 : use_operand_p use_p;
9178 : 213 : gimple *use_stmt;
9179 : 213 : if (!single_imm_use (def, &use_p, &use_stmt))
9180 : : break;
9181 : 207 : unsigned this_lane;
9182 : 207 : if (!bb_vinfo->lookup_stmt (use_stmt)
9183 : 207 : || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
9184 : 229 : || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
9185 : : break;
9186 : 22 : if (bitmap_bit_p (lanes, this_lane))
9187 : : break;
9188 : 2 : lanes_found++;
9189 : 2 : bitmap_set_bit (lanes, this_lane);
9190 : 2 : gassign *use_ass = as_a <gassign *> (use_stmt);
9191 : 2 : lane_defs.quick_push (std::make_pair
9192 : 2 : (this_lane, gimple_assign_rhs2 (use_ass)));
9193 : 2 : last = bb_vinfo->lookup_stmt (use_ass);
9194 : 2 : roots.safe_push (last);
9195 : 2 : def = gimple_assign_lhs (use_ass);
9196 : : }
9197 : 2 : while (lanes_found < nlanes);
9198 : 211 : if (roots.length () > 1)
9199 : 2 : std::swap(roots[0], roots[roots.length () - 1]);
9200 : 211 : if (lanes_found < nlanes)
9201 : : {
9202 : : /* Now search the def chain. */
9203 : 211 : def = gimple_assign_rhs1 (assign);
9204 : 213 : do
9205 : : {
9206 : 213 : if (TREE_CODE (def) != SSA_NAME
9207 : 213 : || !has_single_use (def))
9208 : : break;
9209 : 56 : gimple *def_stmt = SSA_NAME_DEF_STMT (def);
9210 : 56 : unsigned this_lane;
9211 : 56 : if (!bb_vinfo->lookup_stmt (def_stmt)
9212 : 33 : || !vect_slp_is_lane_insert (def_stmt,
9213 : : NULL_TREE, &this_lane)
9214 : 80 : || !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
9215 : : break;
9216 : 24 : if (bitmap_bit_p (lanes, this_lane))
9217 : : break;
9218 : 4 : lanes_found++;
9219 : 4 : bitmap_set_bit (lanes, this_lane);
9220 : 8 : lane_defs.quick_push (std::make_pair
9221 : 4 : (this_lane,
9222 : 4 : gimple_assign_rhs2 (def_stmt)));
9223 : 4 : roots.safe_push (bb_vinfo->lookup_stmt (def_stmt));
9224 : 4 : def = gimple_assign_rhs1 (def_stmt);
9225 : : }
9226 : 4 : while (lanes_found < nlanes);
9227 : : }
9228 : 211 : if (lanes_found == nlanes)
9229 : : {
9230 : : /* Sort lane_defs after the lane index and register the root. */
9231 : 2 : lane_defs.qsort (vld_cmp);
9232 : 2 : vec<stmt_vec_info> stmts;
9233 : 2 : stmts.create (nlanes);
9234 : 10 : for (unsigned i = 0; i < nlanes; ++i)
9235 : 8 : stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
9236 : 2 : bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
9237 : 2 : stmts, roots));
9238 : : }
9239 : : else
9240 : 209 : roots.release ();
9241 : 211 : }
9242 : 27621838 : else if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
9243 : 26758562 : && (associative_tree_code (code) || code == MINUS_EXPR)
9244 : : /* ??? This pessimizes a two-element reduction. PR54400.
9245 : : ??? In-order reduction could be handled if we only
9246 : : traverse one operand chain in vect_slp_linearize_chain. */
9247 : 31211813 : && !needs_fold_left_reduction_p (TREE_TYPE (rhs), code)
9248 : : /* Ops with constants at the tail can be stripped here. */
9249 : 5346065 : && TREE_CODE (rhs) == SSA_NAME
9250 : 5293499 : && TREE_CODE (gimple_assign_rhs2 (assign)) == SSA_NAME
9251 : : /* Should be the chain end. */
9252 : 29757406 : && (!single_imm_use (gimple_assign_lhs (assign),
9253 : : &use_p, &use_stmt)
9254 : 1652597 : || !is_gimple_assign (use_stmt)
9255 : 1101895 : || (gimple_assign_rhs_code (use_stmt) != code
9256 : 802673 : && ((code != PLUS_EXPR && code != MINUS_EXPR)
9257 : 437265 : || (gimple_assign_rhs_code (use_stmt)
9258 : 437265 : != (code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR))))))
9259 : : {
9260 : : /* We start the match at the end of a possible association
9261 : : chain. */
9262 : 1756090 : auto_vec<chain_op_t> chain;
9263 : 1756090 : auto_vec<std::pair<tree_code, gimple *> > worklist;
9264 : 1756090 : auto_vec<gimple *> chain_stmts;
9265 : 1756090 : gimple *code_stmt = NULL, *alt_code_stmt = NULL;
9266 : 1756090 : if (code == MINUS_EXPR)
9267 : 276973 : code = PLUS_EXPR;
9268 : 1756090 : internal_fn reduc_fn;
9269 : 2027511 : if (!reduction_fn_for_scalar_code (code, &reduc_fn)
9270 : 1756090 : || reduc_fn == IFN_LAST)
9271 : 271421 : continue;
9272 : 1484669 : vect_slp_linearize_chain (bb_vinfo, worklist, chain, code, assign,
9273 : : /* ??? */
9274 : : code_stmt, alt_code_stmt, &chain_stmts);
9275 : 2969338 : if (chain.length () > 1)
9276 : : {
9277 : : /* Sort the chain according to def_type and operation. */
9278 : 1484669 : chain.sort (dt_sort_cmp, bb_vinfo);
9279 : : /* ??? Now we'd want to strip externals and constants
9280 : : but record those to be handled in the epilogue. */
9281 : : /* ??? For now do not allow mixing ops or externs/constants. */
9282 : 1484669 : bool invalid = false;
9283 : 1484669 : unsigned remain_cnt = 0;
9284 : 1484669 : unsigned last_idx = 0;
9285 : 4496303 : for (unsigned i = 0; i < chain.length (); ++i)
9286 : : {
9287 : 3309519 : if (chain[i].code != code)
9288 : : {
9289 : : invalid = true;
9290 : : break;
9291 : : }
9292 : 3011634 : if (chain[i].dt != vect_internal_def
9293 : : /* Avoid stmts where the def is not the LHS, like
9294 : : ASMs. */
9295 : 5809713 : || (gimple_get_lhs (bb_vinfo->lookup_def
9296 : 2798079 : (chain[i].op)->stmt)
9297 : 2798079 : != chain[i].op))
9298 : 216447 : remain_cnt++;
9299 : : else
9300 : : last_idx = i;
9301 : : }
9302 : : /* Make sure to have an even number of lanes as we later do
9303 : : all-or-nothing discovery, not trying to split further. */
9304 : 1484669 : if ((chain.length () - remain_cnt) & 1)
9305 : 174909 : remain_cnt++;
9306 : 1484669 : if (!invalid && chain.length () - remain_cnt > 1)
9307 : : {
9308 : 1123230 : vec<stmt_vec_info> stmts;
9309 : 1123230 : vec<tree> remain = vNULL;
9310 : 1123230 : stmts.create (chain.length ());
9311 : 1123230 : if (remain_cnt > 0)
9312 : 100970 : remain.create (remain_cnt);
9313 : 3612680 : for (unsigned i = 0; i < chain.length (); ++i)
9314 : : {
9315 : 2489450 : stmt_vec_info stmt_info;
9316 : 2489450 : if (chain[i].dt == vect_internal_def
9317 : 2456832 : && ((stmt_info = bb_vinfo->lookup_def (chain[i].op)),
9318 : 2456832 : gimple_get_lhs (stmt_info->stmt) == chain[i].op)
9319 : 4946198 : && (i != last_idx
9320 : 1123230 : || (stmts.length () & 1)))
9321 : 2377898 : stmts.quick_push (stmt_info);
9322 : : else
9323 : 111552 : remain.quick_push (chain[i].op);
9324 : : }
9325 : 1123230 : vec<stmt_vec_info> roots;
9326 : 1123230 : roots.create (chain_stmts.length ());
9327 : 2489450 : for (unsigned i = 0; i < chain_stmts.length (); ++i)
9328 : 1366220 : roots.quick_push (bb_vinfo->lookup_stmt (chain_stmts[i]));
9329 : 1123230 : bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_bb_reduc,
9330 : 1123230 : stmts, roots, remain));
9331 : : }
9332 : : }
9333 : 1756090 : }
9334 : : }
9335 : 2333511 : }
9336 : :
9337 : : /* Walk the grouped store chains and replace entries with their
9338 : : pattern variant if any. */
9339 : :
9340 : : static void
9341 : 601588 : vect_fixup_store_groups_with_patterns (vec_info *vinfo)
9342 : : {
9343 : 601588 : stmt_vec_info first_element;
9344 : 601588 : unsigned i;
9345 : :
9346 : 1439552 : FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
9347 : : {
9348 : : /* We also have CTORs in this array. */
9349 : 837964 : if (!STMT_VINFO_GROUPED_ACCESS (first_element))
9350 : 0 : continue;
9351 : 837964 : if (STMT_VINFO_IN_PATTERN_P (first_element))
9352 : : {
9353 : 232 : stmt_vec_info orig = first_element;
9354 : 232 : first_element = STMT_VINFO_RELATED_STMT (first_element);
9355 : 232 : DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
9356 : 232 : DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
9357 : 232 : DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
9358 : 232 : DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
9359 : 232 : vinfo->grouped_stores[i] = first_element;
9360 : : }
9361 : 837964 : stmt_vec_info prev = first_element;
9362 : 2332775 : while (DR_GROUP_NEXT_ELEMENT (prev))
9363 : : {
9364 : 1494811 : stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
9365 : 1494811 : if (STMT_VINFO_IN_PATTERN_P (elt))
9366 : : {
9367 : 843 : stmt_vec_info orig = elt;
9368 : 843 : elt = STMT_VINFO_RELATED_STMT (elt);
9369 : 843 : DR_GROUP_NEXT_ELEMENT (prev) = elt;
9370 : 843 : DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
9371 : 843 : DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
9372 : : }
9373 : 1494811 : DR_GROUP_FIRST_ELEMENT (elt) = first_element;
9374 : 1494811 : prev = elt;
9375 : : }
9376 : : }
9377 : 601588 : }
9378 : :
9379 : : /* Check if the region described by BB_VINFO can be vectorized, returning
9380 : : true if so. When returning false, set FATAL to true if the same failure
9381 : : would prevent vectorization at other vector sizes, false if it is still
9382 : : worth trying other sizes. N_STMTS is the number of statements in the
9383 : : region. */
9384 : :
9385 : : static bool
9386 : 2333511 : vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
9387 : : vec<int> *dataref_groups)
9388 : : {
9389 : 2333511 : DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
9390 : :
9391 : 2333511 : slp_instance instance;
9392 : 2333511 : int i;
9393 : 2333511 : poly_uint64 min_vf = 2;
9394 : :
9395 : : /* The first group of checks is independent of the vector size. */
9396 : 2333511 : fatal = true;
9397 : :
9398 : : /* Analyze the data references. */
9399 : :
9400 : 2333511 : if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
9401 : : {
9402 : 0 : if (dump_enabled_p ())
9403 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9404 : : "not vectorized: unhandled data-ref in basic "
9405 : : "block.\n");
9406 : 0 : return false;
9407 : : }
9408 : :
9409 : 2333511 : if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
9410 : : {
9411 : 0 : if (dump_enabled_p ())
9412 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9413 : : "not vectorized: unhandled data access in "
9414 : : "basic block.\n");
9415 : 0 : return false;
9416 : : }
9417 : :
9418 : 2333511 : vect_slp_check_for_roots (bb_vinfo);
9419 : :
9420 : :   /* If there are no grouped stores and no constructors in the region,
9421 : :      there is no need to continue with pattern recog as vect_analyze_slp
9422 : :      will fail anyway.  */
9423 : 2333511 : if (bb_vinfo->grouped_stores.is_empty ()
9424 : 1990392 : && bb_vinfo->roots.is_empty ())
9425 : : {
9426 : 1731923 : if (dump_enabled_p ())
9427 : 1044 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9428 : : "not vectorized: no grouped stores in "
9429 : : "basic block.\n");
9430 : 1731923 : return false;
9431 : : }
9432 : :
9433 : :   /* The rest of the analysis below depends on the vector size in some way, so failures from here on are no longer fatal.  */
9434 : 601588 : fatal = false;
9435 : :
9436 : 601588 : vect_pattern_recog (bb_vinfo);
9437 : :
9438 : : /* Update store groups from pattern processing. */
9439 : 601588 : vect_fixup_store_groups_with_patterns (bb_vinfo);
9440 : :
9441 : : /* Check the SLP opportunities in the basic block, analyze and build SLP
9442 : : trees. */
9443 : 601588 : if (!vect_analyze_slp (bb_vinfo, n_stmts, false))
9444 : : {
9445 : 0 : if (dump_enabled_p ())
9446 : : {
9447 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9448 : : "Failed to SLP the basic block.\n");
9449 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9450 : : "not vectorized: failed to find SLP opportunities "
9451 : : "in basic block.\n");
9452 : : }
9453 : 0 : return false;
9454 : : }
9455 : :
9456 : : /* Optimize permutations. */
9457 : 601588 : vect_optimize_slp (bb_vinfo);
9458 : :
9459 : : /* Gather the loads reachable from the SLP graph entries. */
9460 : 601588 : vect_gather_slp_loads (bb_vinfo);
9461 : :
9462 : 601588 : vect_record_base_alignments (bb_vinfo);
9463 : :
9464 : : /* Analyze and verify the alignment of data references and the
9465 : : dependence in the SLP instances. */
9466 : 1372522 : for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
9467 : : {
9468 : 770934 : vect_location = instance->location ();
9469 : 770934 : if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
9470 : 770934 : || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
9471 : : {
9472 : 6166 : slp_tree node = SLP_INSTANCE_TREE (instance);
9473 : 6166 : stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
9474 : 6166 : if (dump_enabled_p ())
9475 : 6 : dump_printf_loc (MSG_NOTE, vect_location,
9476 : : "removing SLP instance operations starting from: %G",
9477 : : stmt_info->stmt);
9478 : 6166 : vect_free_slp_instance (instance);
9479 : 6166 : BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
9480 : 6166 : continue;
9481 : 6166 : }
9482 : :
9483 : : /* Mark all the statements that we want to vectorize as pure SLP and
9484 : : relevant. */
9485 : 764768 : vect_mark_slp_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance));
9486 : 764768 : vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
9487 : 764768 : unsigned j;
9488 : 764768 : stmt_vec_info root;
9489 : : /* Likewise consider instance root stmts as vectorized. */
9490 : 1673799 : FOR_EACH_VEC_ELT (SLP_INSTANCE_ROOT_STMTS (instance), j, root)
9491 : 144263 : STMT_SLP_TYPE (root) = pure_slp;
9492 : :
9493 : 764768 : i++;
9494 : : }
9495 : 2369765 : if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
9496 : : return false;
9497 : :
9498 : 318775 : if (!vect_slp_analyze_operations (bb_vinfo))
9499 : : {
9500 : 36254 : if (dump_enabled_p ())
9501 : 93 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9502 : : "not vectorized: bad operation in basic block.\n");
9503 : 36254 : return false;
9504 : : }
9505 : :
9506 : 282521 : vect_bb_partition_graph (bb_vinfo);
9507 : :
9508 : 282521 : return true;
9509 : : }
9510 : :
9511 : : /* Subroutine of vect_slp_bbs.  Try to vectorize the statements for all
9512 : :    basic blocks in BBS, returning true on success.
9513 : :    The region has N_STMTS statements and has the datarefs given by DATAREFS.  */
9514 : :
9515 : : static bool
9516 : 2037852 : vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
9517 : : vec<int> *dataref_groups, unsigned int n_stmts,
9518 : : loop_p orig_loop)
9519 : : {
9520 : 2037852 : bb_vec_info bb_vinfo;
9521 : 2037852 : auto_vector_modes vector_modes;
9522 : :
9523 : : /* Autodetect first vector size we try. */
9524 : 2037852 : machine_mode next_vector_mode = VOIDmode;
9525 : 2037852 : targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
9526 : 2037852 : unsigned int mode_i = 0;
9527 : :
9528 : 2037852 : vec_info_shared shared;
9529 : :
9530 : 2037852 : machine_mode autodetected_vector_mode = VOIDmode;
9531 : 2629170 : while (1)
9532 : : {
9533 : 2333511 : bool vectorized = false;
9534 : 2333511 : bool fatal = false;
9535 : 2333511 : bb_vinfo = new _bb_vec_info (bbs, &shared);
9536 : :
9537 : 2333511 : bool first_time_p = shared.datarefs.is_empty ();
9538 : 2333511 : BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
9539 : 2333511 : if (first_time_p)
9540 : 2060393 : bb_vinfo->shared->save_datarefs ();
9541 : : else
9542 : 273118 : bb_vinfo->shared->check_datarefs ();
9543 : 2333511 : bb_vinfo->vector_mode = next_vector_mode;
9544 : :
9545 : 2333511 : if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
9546 : : {
9547 : 282521 : if (dump_enabled_p ())
9548 : : {
9549 : 1426 : dump_printf_loc (MSG_NOTE, vect_location,
9550 : : "***** Analysis succeeded with vector mode"
9551 : 713 : " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
9552 : 713 : dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
9553 : : }
9554 : :
9555 : 282521 : bb_vinfo->shared->check_datarefs ();
9556 : :
9557 : 282521 : bool force_clear = false;
9558 : 282521 : auto_vec<slp_instance> profitable_subgraphs;
9559 : 1513158 : for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo))
9560 : : {
9561 : 665595 : if (instance->subgraph_entries.is_empty ())
9562 : 202039 : continue;
9563 : :
9564 : 653850 : dump_user_location_t saved_vect_location = vect_location;
9565 : 653850 : vect_location = instance->location ();
9566 : 653850 : if (!unlimited_cost_model (NULL)
9567 : 1304230 : && !vect_bb_vectorization_profitable_p
9568 : 650380 : (bb_vinfo, instance->subgraph_entries, orig_loop))
9569 : : {
9570 : 178549 : if (dump_enabled_p ())
9571 : 14 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9572 : : "not vectorized: vectorization is not "
9573 : : "profitable.\n");
9574 : 178549 : vect_location = saved_vect_location;
9575 : 178549 : continue;
9576 : : }
9577 : :
9578 : 475301 : vect_location = saved_vect_location;
9579 : 475301 : if (!dbg_cnt (vect_slp))
9580 : : {
9581 : 0 : force_clear = true;
9582 : 0 : continue;
9583 : : }
9584 : :
9585 : 475301 : profitable_subgraphs.safe_push (instance);
9586 : : }
9587 : :
9588 : :           /* When we're vectorizing an if-converted loop body, make sure
9589 : :              we vectorized all if-converted code.  */
9590 : 461218 : if ((!profitable_subgraphs.is_empty () || force_clear) && orig_loop)
9591 : : {
9592 : 125 : gcc_assert (bb_vinfo->nbbs == 1);
9593 : 250 : for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]);
9594 : 5244 : !gsi_end_p (gsi); gsi_next (&gsi))
9595 : : {
9596 : : /* The costing above left us with DCEable vectorized scalar
9597 : : stmts having the visited flag set on profitable
9598 : : subgraphs. Do the delayed clearing of the flag here. */
9599 : 5119 : if (gimple_visited_p (gsi_stmt (gsi)))
9600 : : {
9601 : 1304 : gimple_set_visited (gsi_stmt (gsi), false);
9602 : 1304 : continue;
9603 : : }
9604 : 3815 : if (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED)
9605 : 868 : continue;
9606 : :
9607 : 7538 : if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi)))
9608 : 3100 : if (gimple_assign_rhs_code (ass) == COND_EXPR)
9609 : : {
9610 : 142 : if (!profitable_subgraphs.is_empty ()
9611 : 48 : && dump_enabled_p ())
9612 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
9613 : : "not profitable because of "
9614 : : "unprofitable if-converted scalar "
9615 : : "code\n");
9616 : 94 : profitable_subgraphs.truncate (0);
9617 : : }
9618 : : }
9619 : : }
9620 : :
9621 : : /* Finally schedule the profitable subgraphs. */
9622 : 1115146 : for (slp_instance instance : profitable_subgraphs)
9623 : : {
9624 : 475231 : if (!vectorized && dump_enabled_p ())
9625 : 700 : dump_printf_loc (MSG_NOTE, vect_location,
9626 : : "Basic block will be vectorized "
9627 : : "using SLP\n");
9628 : 475231 : vectorized = true;
9629 : :
9630 : :               /* Dump before scheduling as store vectorization will remove
9631 : :                  the original stores and mess with the instance tree,
9632 : :                  so querying its location would eventually ICE.  */
9633 : 475231 : if (flag_checking)
9634 : 1907428 : for (slp_instance sub : instance->subgraph_entries)
9635 : 481735 : gcc_assert (SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub)));
9636 : 475231 : unsigned HOST_WIDE_INT bytes;
9637 : 475231 : if (dump_enabled_p ())
9638 : 3319 : for (slp_instance sub : instance->subgraph_entries)
9639 : : {
9640 : 880 : tree vtype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub));
9641 : 1760 : if (GET_MODE_SIZE (TYPE_MODE (vtype)).is_constant (&bytes))
9642 : 880 : dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
9643 : 880 : sub->location (),
9644 : : "basic block part vectorized using %wu "
9645 : : "byte vectors\n", bytes);
9646 : : else
9647 : : dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
9648 : : sub->location (),
9649 : : "basic block part vectorized using "
9650 : : "variable length vectors\n");
9651 : : }
9652 : :
9653 : 475231 : dump_user_location_t saved_vect_location = vect_location;
9654 : 475231 : vect_location = instance->location ();
9655 : :
9656 : 475231 : vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
9657 : :
9658 : 475231 : vect_location = saved_vect_location;
9659 : : }
9660 : :
9661 : :
9662 : : /* Generate the invariant statements. */
9663 : 282521 : if (!gimple_seq_empty_p (bb_vinfo->inv_pattern_def_seq))
9664 : : {
9665 : 45 : if (dump_enabled_p ())
9666 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
9667 : : "------>generating invariant statements\n");
9668 : :
9669 : 45 : bb_vinfo->insert_seq_on_entry (NULL,
9670 : : bb_vinfo->inv_pattern_def_seq);
9671 : : }
9672 : 282521 : }
9673 : : else
9674 : : {
9675 : 2050990 : if (dump_enabled_p ())
9676 : 1415 : dump_printf_loc (MSG_NOTE, vect_location,
9677 : : "***** Analysis failed with vector mode %s\n",
9678 : 1415 : GET_MODE_NAME (bb_vinfo->vector_mode));
9679 : : }
9680 : :
9681 : 2333511 : if (mode_i == 0)
9682 : 2037852 : autodetected_vector_mode = bb_vinfo->vector_mode;
9683 : :
9684 : 2333511 : if (!fatal)
9685 : 3277643 : while (mode_i < vector_modes.length ()
9686 : 1760890 : && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
9687 : : {
9688 : 342544 : if (dump_enabled_p ())
9689 : 1650 : dump_printf_loc (MSG_NOTE, vect_location,
9690 : : "***** The result for vector mode %s would"
9691 : : " be the same\n",
9692 : 825 : GET_MODE_NAME (vector_modes[mode_i]));
9693 : 342544 : mode_i += 1;
9694 : : }
9695 : :
9696 : 2333511 : delete bb_vinfo;
9697 : :
9698 : 2333511 : if (mode_i < vector_modes.length ()
9699 : 2167502 : && VECTOR_MODE_P (autodetected_vector_mode)
9700 : 2034212 : && (related_vector_mode (vector_modes[mode_i],
9701 : : GET_MODE_INNER (autodetected_vector_mode))
9702 : 1017106 : == autodetected_vector_mode)
9703 : 4501013 : && (related_vector_mode (autodetected_vector_mode,
9704 : 542425 : GET_MODE_INNER (vector_modes[mode_i]))
9705 : 1084850 : == vector_modes[mode_i]))
9706 : : {
9707 : 542425 : if (dump_enabled_p ())
9708 : 210 : dump_printf_loc (MSG_NOTE, vect_location,
9709 : : "***** Skipping vector mode %s, which would"
9710 : : " repeat the analysis for %s\n",
9711 : 210 : GET_MODE_NAME (vector_modes[mode_i]),
9712 : 210 : GET_MODE_NAME (autodetected_vector_mode));
9713 : 542425 : mode_i += 1;
9714 : : }
9715 : :
9716 : 2333511 : if (vectorized
9717 : 2154862 : || mode_i == vector_modes.length ()
9718 : 1988947 : || autodetected_vector_mode == VOIDmode
9719 : : /* If vect_slp_analyze_bb_1 signaled that analysis for all
9720 : : vector sizes will fail, do not bother iterating. */
9721 : 3172062 : || fatal)
9722 : 4075704 : return vectorized;
9723 : :
9724 : : /* Try the next biggest vector size. */
9725 : 295659 : next_vector_mode = vector_modes[mode_i++];
9726 : 295659 : if (dump_enabled_p ())
9727 : 263 : dump_printf_loc (MSG_NOTE, vect_location,
9728 : : "***** Re-trying analysis with vector mode %s\n",
9729 : 263 : GET_MODE_NAME (next_vector_mode));
9730 : 295659 : }
9731 : 2037852 : }
9732 : :
9733 : :
9734 : : /* Main entry for the BB vectorizer. Analyze and transform the basic
9735 : : blocks in BBS, returning true if anything was vectorized. */
9736 : :
9737 : : static bool
9738 : 2037852 : vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop)
9739 : : {
9740 : 2037852 : vec<data_reference_p> datarefs = vNULL;
9741 : 2037852 : auto_vec<int> dataref_groups;
9742 : 2037852 : int insns = 0;
9743 : 2037852 : int current_group = 0;
9744 : :
9745 : 12526187 : for (unsigned i = 0; i < bbs.length (); i++)
9746 : : {
9747 : 10488335 : basic_block bb = bbs[i];
9748 : 81234253 : for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
9749 : 70745918 : gsi_next (&gsi))
9750 : : {
9751 : 70745918 : gimple *stmt = gsi_stmt (gsi);
9752 : 70745918 : if (is_gimple_debug (stmt))
9753 : 42224491 : continue;
9754 : :
9755 : 28521427 : insns++;
9756 : :
9757 : 28521427 : if (gimple_location (stmt) != UNKNOWN_LOCATION)
9758 : 25959090 : vect_location = stmt;
9759 : :
9760 : 28521427 : if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
9761 : 28521427 : &dataref_groups, current_group))
9762 : 4942476 : ++current_group;
9763 : : }
9764 : : /* New BBs always start a new DR group. */
9765 : 10488335 : ++current_group;
9766 : : }
9767 : :
9768 : 2037852 : return vect_slp_region (bbs, datarefs, &dataref_groups, insns, orig_loop);
9769 : 2037852 : }
9770 : :
9771 : : /* Special entry for the BB vectorizer. Analyze and transform a single
9772 : : if-converted BB, with ORIG_LOOP's body being the non-if-converted
9773 : : representation. Returns true if anything in the basic block was
9774 : : vectorized. */
9775 : :
9776 : : bool
9777 : 17479 : vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop)
9778 : : {
9779 : 17479 : auto_vec<basic_block> bbs;
9780 : 17479 : bbs.safe_push (bb);
9781 : 17479 : return vect_slp_bbs (bbs, orig_loop);
9782 : 17479 : }
9783 : :
9784 : : /* Main entry for the BB vectorizer. Analyze and transform the function
9785 : : FUN, returning true if anything in it was vectorized. */
9786 : :
9787 : : bool
9788 : 881291 : vect_slp_function (function *fun)
9789 : : {
9790 : 881291 : bool r = false;
9791 : 881291 : int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
9792 : 881291 : auto_bitmap exit_bbs;
9793 : 881291 : bitmap_set_bit (exit_bbs, EXIT_BLOCK);
9794 : 881291 : edge entry = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (fun));
9795 : 881291 : unsigned n = rev_post_order_and_mark_dfs_back_seme (fun, entry, exit_bbs,
9796 : 881291 : true, rpo, NULL);
9797 : :
9798 : : /* For the moment split the function into pieces to avoid making
9799 : : the iteration on the vector mode moot. Split at points we know
9800 : : we do not handle well, namely CFG merges (SLP discovery doesn't
9801 : : handle non-loop-header PHIs) and loop exits. Since pattern
9802 : : recog requires reverse iteration to visit uses before defs,
9803 : : simply chop the RPO into pieces. */
9804 : 881291 : auto_vec<basic_block> bbs;
9805 : 11378260 : for (unsigned i = 0; i < n; i++)
9806 : : {
9807 : 10496969 : basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
9808 : 10496969 : bool split = false;
9809 : :
9810 : : /* Split when a BB is not dominated by the first block. */
9811 : 19757458 : if (!bbs.is_empty ()
9812 : 9260489 : && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
9813 : : {
9814 : 799141 : if (dump_enabled_p ())
9815 : 162 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9816 : : "splitting region at dominance boundary bb%d\n",
9817 : : bb->index);
9818 : : split = true;
9819 : : }
9820 : : /* Split when the loop determined by the first block
9821 : : is exited. This is because we eventually insert
9822 : : invariants at the start of the region. */
9823 : 18159176 : else if (!bbs.is_empty ()
9824 : 8461348 : && bbs[0]->loop_father != bb->loop_father
9825 : 2136320 : && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
9826 : : {
9827 : 4903 : if (dump_enabled_p ())
9828 : 6 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9829 : : "splitting region at loop %d exit at bb%d\n",
9830 : 3 : bbs[0]->loop_father->num, bb->index);
9831 : : split = true;
9832 : : }
9833 : 9692925 : else if (!bbs.is_empty ()
9834 : 8456445 : && bb->loop_father->header == bb
9835 : 448156 : && bb->loop_father->dont_vectorize)
9836 : : {
9837 : 5962 : if (dump_enabled_p ())
9838 : 72 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9839 : : "splitting region at dont-vectorize loop %d "
9840 : : "entry at bb%d\n",
9841 : : bb->loop_father->num, bb->index);
9842 : : split = true;
9843 : : }
9844 : :
9845 : 11306975 : if (split && !bbs.is_empty ())
9846 : : {
9847 : 810006 : r |= vect_slp_bbs (bbs, NULL);
9848 : 810006 : bbs.truncate (0);
9849 : : }
9850 : :
9851 : 10496969 : if (bbs.is_empty ())
9852 : : {
9853 : : /* We need to be able to insert at the head of the region, which
9854 : : we cannot do for a region starting with a returns-twice call. */
9855 : 2046486 : if (gcall *first = safe_dyn_cast <gcall *> (first_stmt (bb)))
9856 : 384505 : if (gimple_call_flags (first) & ECF_RETURNS_TWICE)
9857 : : {
9858 : 290 : if (dump_enabled_p ())
9859 : 2 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9860 : : "skipping bb%d as start of region as it "
9861 : : "starts with returns-twice call\n",
9862 : : bb->index);
9863 : 26113 : continue;
9864 : : }
9865 : : /* If the loop this BB belongs to is marked as not to be vectorized
9866 : : honor that also for BB vectorization. */
9867 : 2046196 : if (bb->loop_father->dont_vectorize)
9868 : 25823 : continue;
9869 : : }
9870 : :
9871 : 10470856 : bbs.safe_push (bb);
9872 : :
9873 : : /* When a stmt ends this block and defines a value, inserting
9874 : : after it for a vector containing its definition would require
9875 : : inserting on edges. Avoid this for now. */
9876 : 20941712 : if (gimple *last = *gsi_last_bb (bb))
9877 : 8312428 : if (gimple_get_lhs (last)
9878 : 8312428 : && is_ctrl_altering_stmt (last))
9879 : : {
9880 : 329083 : if (dump_enabled_p ())
9881 : 2 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9882 : : "splitting region at control altering "
9883 : : "definition %G", last);
9884 : 329083 : r |= vect_slp_bbs (bbs, NULL);
9885 : 329083 : bbs.truncate (0);
9886 : : }
9887 : : }
9888 : :
9889 : 881291 : if (!bbs.is_empty ())
9890 : 881284 : r |= vect_slp_bbs (bbs, NULL);
9891 : :
9892 : 881291 : free (rpo);
9893 : :
9894 : 881291 : return r;
9895 : 881291 : }
9896 : :
9897 : : /* Build a variable-length vector in which the elements in ELTS are repeated
9898 : : to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
9899 : : RESULTS and add any new instructions to SEQ.
9900 : :
9901 : : The approach we use is:
9902 : :
9903 : : (1) Find a vector mode VM with integer elements of mode IM.
9904 : :
9905 : : (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
9906 : : ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
9907 : : from small vectors to IM.
9908 : :
9909 : : (3) Duplicate each ELTS'[I] into a vector of mode VM.
9910 : :
9911 : : (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
9912 : : correct byte contents.
9913 : :
9914 : : (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
9915 : :
9916 : : We try to find the largest IM for which this sequence works, in order
9917 : : to cut down on the number of interleaves. */
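 : :
 : : An illustrative example (assumed values, not from the sources): with
 : : four QImode elements {a, b, c, d} and IM == HImode, step (2)
 : : view-converts the pairs {a, b} and {c, d} into two HImode values
 : : AB and CD, step (3) duplicates each into a vector of mode VM,
 : : step (4) interleaves the two duplicates into the repeating
 : : sequence {AB, CD, AB, CD, ...} and step (5) view-converts that
 : : back to the required vector type, giving the byte contents of
 : : {a, b, c, d, a, b, c, d, ...}. */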
9918 : :
9919 : : void
9920 : 0 : duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
9921 : : const vec<tree> &elts, unsigned int nresults,
9922 : : vec<tree> &results)
9923 : : {
9924 : 0 : unsigned int nelts = elts.length ();
9925 : 0 : tree element_type = TREE_TYPE (vector_type);
9926 : :
9927 : : /* (1) Find a vector mode VM with integer elements of mode IM. */
9928 : 0 : unsigned int nvectors = 1;
9929 : 0 : tree new_vector_type;
9930 : 0 : tree permutes[2];
9931 : 0 : if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
9932 : : &nvectors, &new_vector_type,
9933 : : permutes))
9934 : 0 : gcc_unreachable ();
9935 : :
9936 : : /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
9937 : 0 : unsigned int partial_nelts = nelts / nvectors;
9938 : 0 : tree partial_vector_type = build_vector_type (element_type, partial_nelts);
9939 : :
9940 : 0 : tree_vector_builder partial_elts;
9941 : 0 : auto_vec<tree, 32> pieces (nvectors * 2);
9942 : 0 : pieces.quick_grow_cleared (nvectors * 2);
9943 : 0 : for (unsigned int i = 0; i < nvectors; ++i)
9944 : : {
9945 : : /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
9946 : : ELTS' has mode IM. */
9947 : 0 : partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
9948 : 0 : for (unsigned int j = 0; j < partial_nelts; ++j)
9949 : 0 : partial_elts.quick_push (elts[i * partial_nelts + j]);
9950 : 0 : tree t = gimple_build_vector (seq, &partial_elts);
9951 : 0 : t = gimple_build (seq, VIEW_CONVERT_EXPR,
9952 : 0 : TREE_TYPE (new_vector_type), t);
9953 : :
9954 : : /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
9955 : 0 : pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
9956 : : }
9957 : :
9958 : : /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
9959 : : correct byte contents.
9960 : :
9961 : : Conceptually, we need to repeat the following operation log2(nvectors)
9962 : : times, where hi_start = nvectors / 2:
9963 : :
9964 : : out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
9965 : : out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
9966 : :
9967 : : However, if each input repeats every N elements and the VF is
9968 : : a multiple of N * 2, the HI result is the same as the LO result.
9969 : : This will be true for the first N1 iterations of the outer loop,
9970 : : followed by N2 iterations for which both the LO and HI results
9971 : : are needed. I.e.:
9972 : :
9973 : : N1 + N2 = log2(nvectors)
9974 : :
9975 : : Each "N1 iteration" doubles the number of redundant vectors and the
9976 : : effect of the process as a whole is to have a sequence of nvectors/2**N1
9977 : : vectors that repeats 2**N1 times. Rather than generate these redundant
9978 : : vectors, we halve the number of vectors for each N1 iteration. */
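 : :
 : : /* A worked illustration (assumed figures, not from the sources):
 : : with nvectors == 4 the inputs are splats, so if the element count
 : : of VM is known to be a multiple of 4, both outer iterations are
 : : "N1 iterations": NEW_NVECTORS halves from 4 to 2 to 1, leaving a
 : : single vector repeating { ELTS'[0], ELTS'[1], ELTS'[2], ELTS'[3] }
 : : which step (5) then reuses for all NRESULTS outputs. */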
9979 : : unsigned int in_start = 0;
9980 : : unsigned int out_start = nvectors;
9981 : : unsigned int new_nvectors = nvectors;
9982 : 0 : for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
9983 : : {
9984 : 0 : unsigned int hi_start = new_nvectors / 2;
9985 : 0 : unsigned int out_i = 0;
9986 : 0 : for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i)
9987 : : {
9988 : 0 : if ((in_i & 1) != 0
9989 : 0 : && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
9990 : : 2 * in_repeat))
9991 : 0 : continue;
9992 : :
9993 : 0 : tree output = make_ssa_name (new_vector_type);
9994 : 0 : tree input1 = pieces[in_start + (in_i / 2)];
9995 : 0 : tree input2 = pieces[in_start + (in_i / 2) + hi_start];
9996 : 0 : gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
9997 : : input1, input2,
9998 : : permutes[in_i & 1]);
9999 : 0 : gimple_seq_add_stmt (seq, stmt);
10000 : 0 : pieces[out_start + out_i] = output;
10001 : 0 : out_i += 1;
10002 : : }
10003 : 0 : std::swap (in_start, out_start);
10004 : 0 : new_nvectors = out_i;
10005 : : }
10006 : :
10007 : : /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
10008 : 0 : results.reserve (nresults);
10009 : 0 : for (unsigned int i = 0; i < nresults; ++i)
10010 : 0 : if (i < new_nvectors)
10011 : 0 : results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
10012 : 0 : pieces[in_start + i]));
10013 : : else
10014 : 0 : results.quick_push (results[i - new_nvectors]);
10015 : 0 : }
10016 : :
10017 : :
10018 : : /* For constant and loop invariant defs in OP_NODE this function creates
10019 : : vector defs that will be used in the vectorized stmts and stores them
10020 : : to SLP_TREE_VEC_DEFS of OP_NODE. */
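 : :
 : : /* For example (illustrative, assumed values): an OP_NODE holding
 : : the constant operands {1, 2, 3, 4} with vector type V4SI and a
 : : single vector statement results in one vector constant
 : : { 1, 2, 3, 4 } pushed to SLP_TREE_VEC_DEFS. */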
10021 : :
10022 : : static void
10023 : 476474 : vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
10024 : : {
10025 : 476474 : unsigned HOST_WIDE_INT nunits;
10026 : 476474 : tree vec_cst;
10027 : 476474 : unsigned j, number_of_places_left_in_vector;
10028 : 476474 : tree vector_type;
10029 : 476474 : tree vop;
10030 : 476474 : int group_size = op_node->ops.length ();
10031 : 476474 : unsigned int vec_num, i;
10032 : 476474 : unsigned number_of_copies = 1;
10033 : 476474 : bool constant_p;
10034 : 476474 : gimple_seq ctor_seq = NULL;
10035 : 476474 : auto_vec<tree, 16> permute_results;
10036 : :
10037 : : /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
10038 : 476474 : vector_type = SLP_TREE_VECTYPE (op_node);
10039 : :
10040 : 476474 : unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
10041 : 476474 : SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
10042 : 476474 : auto_vec<tree> voprnds (number_of_vectors);
10043 : :
10044 : : /* NUMBER_OF_COPIES is the number of times we need to use the same values in
10045 : : created vectors. It is greater than 1 if unrolling is performed.
10046 : :
10047 : : For example, we have two scalar operands, s1 and s2 (e.g., group of
10048 : : strided accesses of size two), while NUNITS is four (i.e., four scalars
10049 : : of this type can be packed in a vector). The output vector will contain
10050 : : two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
10051 : : will be 2).
10052 : :
10053 : : If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
10054 : : containing the operands.
10055 : :
10056 : : For example, NUNITS is four as before, and the group size is 8
10057 : : (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
10058 : : {s5, s6, s7, s8}. */
10059 : :
10060 : : /* When using duplicate_and_interleave, we just need one element for
10061 : : each scalar statement. */
10062 : 476474 : if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
10063 : : nunits = group_size;
10064 : :
10065 : 476474 : number_of_copies = nunits * number_of_vectors / group_size;
10066 : :
10067 : 476474 : number_of_places_left_in_vector = nunits;
10068 : 476474 : constant_p = true;
10069 : 476474 : tree uniform_elt = NULL_TREE;
10070 : 476474 : tree_vector_builder elts (vector_type, nunits, 1);
10071 : 476474 : elts.quick_grow (nunits);
10072 : 476474 : stmt_vec_info insert_after = NULL;
10073 : 1325397 : for (j = 0; j < number_of_copies; j++)
10074 : : {
10075 : 848923 : tree op;
10076 : 3333484 : for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
10077 : : {
10078 : : /* Create 'vect_ = {op0,op1,...,opn}'. */
10079 : 1635638 : tree orig_op = op;
10080 : 1635638 : if (number_of_places_left_in_vector == nunits)
10081 : : uniform_elt = op;
10082 : 1041609 : else if (uniform_elt && operand_equal_p (uniform_elt, op))
10083 : 626273 : op = elts[number_of_places_left_in_vector];
10084 : : else
10085 : : uniform_elt = NULL_TREE;
10086 : 1635638 : number_of_places_left_in_vector--;
10087 : 1635638 : if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
10088 : : {
10089 : 281531 : if (CONSTANT_CLASS_P (op))
10090 : : {
10091 : 98370 : if (VECTOR_BOOLEAN_TYPE_P (vector_type))
10092 : : {
10093 : : /* Can't use VIEW_CONVERT_EXPR for booleans because
10094 : : of possibly different sizes of scalar value and
10095 : : vector element. */
10096 : 64 : if (integer_zerop (op))
10097 : 64 : op = build_int_cst (TREE_TYPE (vector_type), 0);
10098 : 0 : else if (integer_onep (op))
10099 : 0 : op = build_all_ones_cst (TREE_TYPE (vector_type));
10100 : : else
10101 : 0 : gcc_unreachable ();
10102 : : }
10103 : : else
10104 : 98306 : op = fold_unary (VIEW_CONVERT_EXPR,
10105 : : TREE_TYPE (vector_type), op);
10106 : 98370 : gcc_assert (op && CONSTANT_CLASS_P (op));
10107 : : }
10108 : : else
10109 : : {
10110 : 183161 : tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
10111 : 183161 : gimple *init_stmt;
10112 : 183161 : if (VECTOR_BOOLEAN_TYPE_P (vector_type))
10113 : : {
10114 : 391 : tree true_val
10115 : 391 : = build_all_ones_cst (TREE_TYPE (vector_type));
10116 : 391 : tree false_val
10117 : 391 : = build_zero_cst (TREE_TYPE (vector_type));
10118 : 391 : gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
10119 : 391 : init_stmt = gimple_build_assign (new_temp, COND_EXPR,
10120 : : op, true_val,
10121 : : false_val);
10122 : : }
10123 : : else
10124 : : {
10125 : 182770 : op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
10126 : : op);
10127 : 182770 : init_stmt
10128 : 182770 : = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
10129 : : op);
10130 : : }
10131 : 183161 : gimple_seq_add_stmt (&ctor_seq, init_stmt);
10132 : 183161 : op = new_temp;
10133 : : }
10134 : : }
10135 : 1635638 : elts[number_of_places_left_in_vector] = op;
10136 : 1635638 : if (!CONSTANT_CLASS_P (op))
10137 : 333189 : constant_p = false;
10138 : : /* For BB vectorization we have to compute an insert location
10139 : : when a def is inside the analyzed region since we cannot
10140 : : simply insert at the BB start in this case. */
10141 : 1635638 : stmt_vec_info opdef;
10142 : 1635638 : if (TREE_CODE (orig_op) == SSA_NAME
10143 : 205419 : && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
10144 : 182129 : && is_a <bb_vec_info> (vinfo)
10145 : 1765394 : && (opdef = vinfo->lookup_def (orig_op)))
10146 : : {
10147 : 83296 : if (!insert_after)
10148 : : insert_after = opdef;
10149 : : else
10150 : 46537 : insert_after = get_later_stmt (insert_after, opdef);
10151 : : }
10152 : :
10153 : 1635638 : if (number_of_places_left_in_vector == 0)
10154 : : {
10155 : 594029 : auto type_nunits = TYPE_VECTOR_SUBPARTS (vector_type);
10156 : 594029 : if (uniform_elt)
10157 : 614216 : vec_cst = gimple_build_vector_from_val (&ctor_seq, vector_type,
10158 : 307108 : elts[0]);
10159 : 573842 : else if (constant_p
10160 : 573842 : ? multiple_p (type_nunits, nunits)
10161 : 111949 : : known_eq (type_nunits, nunits))
10162 : 286921 : vec_cst = gimple_build_vector (&ctor_seq, &elts);
10163 : : else
10164 : : {
10165 : 0 : if (permute_results.is_empty ())
10166 : 0 : duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
10167 : : elts, number_of_vectors,
10168 : : permute_results);
10169 : 0 : vec_cst = permute_results[number_of_vectors - j - 1];
10170 : : }
10171 : 594029 : if (!gimple_seq_empty_p (ctor_seq))
10172 : : {
10173 : 147597 : if (insert_after)
10174 : : {
10175 : 36759 : gimple_stmt_iterator gsi;
10176 : 36759 : if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
10177 : : {
10178 : 3254 : gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
10179 : 3254 : gsi_insert_seq_before (&gsi, ctor_seq,
10180 : : GSI_CONTINUE_LINKING);
10181 : : }
10182 : 33505 : else if (!stmt_ends_bb_p (insert_after->stmt))
10183 : : {
10184 : 33505 : gsi = gsi_for_stmt (insert_after->stmt);
10185 : 33505 : gsi_insert_seq_after (&gsi, ctor_seq,
10186 : : GSI_CONTINUE_LINKING);
10187 : : }
10188 : : else
10189 : : {
10190 : : /* When we want to insert after a def whose
10191 : : defining stmt throws, insert on the fallthru
10192 : : edge. */
10193 : 0 : edge e = find_fallthru_edge
10194 : 0 : (gimple_bb (insert_after->stmt)->succs);
10195 : 0 : basic_block new_bb
10196 : 0 : = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
10197 : 0 : gcc_assert (!new_bb);
10198 : : }
10199 : : }
10200 : : else
10201 : 110838 : vinfo->insert_seq_on_entry (NULL, ctor_seq);
10202 : 147597 : ctor_seq = NULL;
10203 : : }
10204 : 594029 : voprnds.quick_push (vec_cst);
10205 : 594029 : insert_after = NULL;
10206 : 594029 : number_of_places_left_in_vector = nunits;
10207 : 594029 : constant_p = true;
10208 : 594029 : elts.new_vector (vector_type, nunits, 1);
10209 : 594029 : elts.quick_grow (nunits);
10210 : : }
10211 : : }
10212 : : }
10213 : :
10214 : : /* Since the vectors were created in reverse order, reverse them
10215 : : back. */
10216 : 476474 : vec_num = voprnds.length ();
10217 : 1070503 : for (j = vec_num; j != 0; j--)
10218 : : {
10219 : 594029 : vop = voprnds[j - 1];
10220 : 594029 : SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
10221 : : }
10222 : :
10223 : : /* In case the VF is greater than the unrolling factor needed for the SLP
10224 : : group of stmts, NUMBER_OF_VECTORS to be created is greater than
10225 : : NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
10226 : : to replicate the vectors. */
10227 : 476474 : while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
10228 : 476474 : for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
10229 : : i++)
10230 : 0 : SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
10231 : 476474 : }
10232 : :
10233 : : /* Get the scalar definition of the Nth lane from SLP_NODE or NULL_TREE
10234 : : if there is no definition for it in the scalar IL or it is not known. */
10235 : :
10236 : : tree
10237 : 82 : vect_get_slp_scalar_def (slp_tree slp_node, unsigned n)
10238 : : {
10239 : 82 : if (SLP_TREE_DEF_TYPE (slp_node) == vect_internal_def)
10240 : : {
10241 : 82 : if (!SLP_TREE_SCALAR_STMTS (slp_node).exists ())
10242 : : return NULL_TREE;
10243 : 82 : stmt_vec_info def = SLP_TREE_SCALAR_STMTS (slp_node)[n];
10244 : 82 : if (!def)
10245 : : return NULL_TREE;
10246 : 82 : return gimple_get_lhs (STMT_VINFO_STMT (def));
10247 : : }
10248 : : else
10249 : 0 : return SLP_TREE_SCALAR_OPS (slp_node)[n];
10250 : : }
10251 : :
10252 : : /* Get the Ith vectorized definition from SLP_NODE. */
10253 : :
10254 : : tree
10255 : 128448 : vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
10256 : : {
10257 : 128448 : return SLP_TREE_VEC_DEFS (slp_node)[i];
10258 : : }
10259 : :
10260 : : /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
10261 : :
10262 : : void
10263 : 873732 : vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
10264 : : {
10265 : 873732 : vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
10266 : 873732 : vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
10267 : 873732 : }
10268 : :
10269 : : /* Get N vectorized definitions for SLP_NODE. */
10270 : :
10271 : : void
10272 : 2156 : vect_get_slp_defs (vec_info *,
10273 : : slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
10274 : : {
10275 : 2156 : if (n == -1U)
10276 : 2156 : n = SLP_TREE_CHILDREN (slp_node).length ();
10277 : :
10278 : 7839 : for (unsigned i = 0; i < n; ++i)
10279 : : {
10280 : 5683 : slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
10281 : 5683 : vec<tree> vec_defs = vNULL;
10282 : 5683 : vect_get_slp_defs (child, &vec_defs);
10283 : 5683 : vec_oprnds->quick_push (vec_defs);
10284 : : }
10285 : 2156 : }
10286 : :
10287 : : /* A subroutine of vect_transform_slp_perm_load with two extra arguments:
10288 : : - PERM gives the permutation that the caller wants to use for NODE,
10289 : : which might be different from SLP_LOAD_PERMUTATION.
10290 : : - DUMP_P controls whether the function dumps information. */
10291 : :
10292 : : static bool
10293 : 123550 : vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node,
10294 : : load_permutation_t &perm,
10295 : : const vec<tree> &dr_chain,
10296 : : gimple_stmt_iterator *gsi, poly_uint64 vf,
10297 : : bool analyze_only, bool dump_p,
10298 : : unsigned *n_perms, unsigned int *n_loads,
10299 : : bool dce_chain)
10300 : : {
10301 : 123550 : stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
10302 : 123550 : int vec_index = 0;
10303 : 123550 : tree vectype = SLP_TREE_VECTYPE (node);
10304 : 123550 : unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
10305 : 123550 : unsigned int mask_element;
10306 : 123550 : unsigned dr_group_size;
10307 : 123550 : machine_mode mode;
10308 : :
10309 : 123550 : if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
10310 : : dr_group_size = 1;
10311 : : else
10312 : : {
10313 : 121911 : stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
10314 : 121911 : dr_group_size = DR_GROUP_SIZE (stmt_info);
10315 : : }
10316 : :
10317 : 123550 : mode = TYPE_MODE (vectype);
10318 : 123550 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
10319 : 123550 : unsigned int nstmts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
10320 : :
10321 : : /* Initialize the vect stmts of NODE to properly insert the generated
10322 : : stmts later. */
10323 : 123550 : if (! analyze_only)
10324 : 32028 : for (unsigned i = SLP_TREE_VEC_DEFS (node).length (); i < nstmts; i++)
10325 : 12676 : SLP_TREE_VEC_DEFS (node).quick_push (NULL_TREE);
10326 : :
10327 : : /* Generate permutation masks for every NODE. Number of masks for each NODE
10328 : : is equal to GROUP_SIZE.
10329 : : E.g., we have a group of three nodes with three loads from the same
10330 : : location in each node, and the vector size is 4. I.e., we have an
10331 : : a0b0c0a1b1c1... sequence and need to create the following vectors:
10332 : : for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
10333 : : for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
10334 : : ...
10335 : :
10336 : : The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
10337 : : The last mask is illegal since we assume two operands for the permute
10338 : : operation, and the mask element values can't be outside that range.
10339 : : Hence, the last mask must be converted into {2,5,5,5}.
10340 : : For the first two permutations we need the first and the second input
10341 : : vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
10342 : : we need the second and the third vectors: {b1,c1,a2,b2} and
10343 : : {c2,a3,b3,c3}. */
10344 : :
10345 : 123550 : int vect_stmts_counter = 0;
10346 : 123550 : unsigned int index = 0;
10347 : 123550 : int first_vec_index = -1;
10348 : 123550 : int second_vec_index = -1;
10349 : 123550 : bool noop_p = true;
10350 : 123550 : *n_perms = 0;
10351 : :
10352 : 123550 : vec_perm_builder mask;
10353 : 123550 : unsigned int nelts_to_build;
10354 : 123550 : unsigned int nvectors_per_build;
10355 : 123550 : unsigned int in_nlanes;
10356 : 123550 : bool repeating_p = (group_size == dr_group_size
10357 : 158572 : && multiple_p (nunits, group_size));
10358 : 123550 : if (repeating_p)
10359 : : {
10360 : : /* A single vector contains a whole number of copies of the node, so:
10361 : : (a) all permutes can use the same mask; and
10362 : : (b) the permutes only need a single vector input. */
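 : : An illustrative example (assumed values, not from the sources):
 : : with GROUP_SIZE == DR_GROUP_SIZE == 4, NUNITS == 8 and
 : : PERM == {2, 3, 0, 1} the encoded mask is
 : : {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9}, which expands to
 : : {2, 3, 0, 1, 6, 7, 4, 5} for an 8-lane vector: it swaps the two
 : : halves of each four-lane group within the single input vector,
 : : and the same mask serves every vector statement of the node. */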
10363 : 24147 : mask.new_vector (nunits, group_size, 3);
10364 : 24147 : nelts_to_build = mask.encoded_nelts ();
10365 : : /* It's possible to obtain zero nstmts during analyze_only, so make
10366 : : it at least one to ensure that the later computation of n_perms
10367 : : proceeds. */
10368 : 24147 : nvectors_per_build = nstmts > 0 ? nstmts : 1;
10369 : 24147 : in_nlanes = dr_group_size * 3;
10370 : : }
10371 : : else
10372 : : {
10373 : : /* We need to construct a separate mask for each vector statement. */
10374 : 99403 : unsigned HOST_WIDE_INT const_nunits, const_vf;
10375 : 99403 : if (!nunits.is_constant (&const_nunits)
10376 : 99403 : || !vf.is_constant (&const_vf))
10377 : : return false;
10378 : 99403 : mask.new_vector (const_nunits, const_nunits, 1);
10379 : 99403 : nelts_to_build = const_vf * group_size;
10380 : 99403 : nvectors_per_build = 1;
10381 : 99403 : in_nlanes = const_vf * dr_group_size;
10382 : : }
10383 : 123550 : auto_sbitmap used_in_lanes (in_nlanes);
10384 : 123550 : bitmap_clear (used_in_lanes);
10385 : 123550 : auto_bitmap used_defs;
10386 : :
10387 : 123550 : unsigned int count = mask.encoded_nelts ();
10388 : 123550 : mask.quick_grow (count);
10389 : 123550 : vec_perm_indices indices;
10390 : :
10391 : 732097 : for (unsigned int j = 0; j < nelts_to_build; j++)
10392 : : {
10393 : 614452 : unsigned int iter_num = j / group_size;
10394 : 614452 : unsigned int stmt_num = j % group_size;
10395 : 614452 : unsigned int i = (iter_num * dr_group_size + perm[stmt_num]);
10396 : 614452 : bitmap_set_bit (used_in_lanes, i);
10397 : 614452 : if (repeating_p)
10398 : : {
10399 : : first_vec_index = 0;
10400 : : mask_element = i;
10401 : : }
10402 : : else
10403 : : {
10404 : : /* Enforced before the loop when !repeating_p. */
10405 : 448726 : unsigned int const_nunits = nunits.to_constant ();
10406 : 448726 : vec_index = i / const_nunits;
10407 : 448726 : mask_element = i % const_nunits;
10408 : 448726 : if (vec_index == first_vec_index
10409 : 448726 : || first_vec_index == -1)
10410 : : {
10411 : : first_vec_index = vec_index;
10412 : : }
10413 : 161559 : else if (vec_index == second_vec_index
10414 : 161559 : || second_vec_index == -1)
10415 : : {
10416 : 159118 : second_vec_index = vec_index;
10417 : 159118 : mask_element += const_nunits;
10418 : : }
10419 : : else
10420 : : {
10421 : 2441 : if (dump_p)
10422 : 163 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10423 : : "permutation requires at "
10424 : : "least three vectors %G",
10425 : : stmt_info->stmt);
10426 : 2441 : gcc_assert (analyze_only);
10427 : : return false;
10428 : : }
10429 : :
10430 : 446285 : gcc_assert (mask_element < 2 * const_nunits);
10431 : : }
10432 : :
10433 : 612011 : if (mask_element != index)
10434 : 413371 : noop_p = false;
10435 : 612011 : mask[index++] = mask_element;
10436 : :
10437 : 612011 : if (index == count)
10438 : : {
10439 : 170938 : if (!noop_p)
10440 : : {
10441 : 195543 : indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
10442 : 116049 : if (!can_vec_perm_const_p (mode, mode, indices))
10443 : : {
10444 : 3464 : if (dump_p)
10445 : : {
10446 : 81 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10447 : : "unsupported vect permute { ");
10448 : 721 : for (i = 0; i < count; ++i)
10449 : : {
10450 : 640 : dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
10451 : 640 : dump_printf (MSG_MISSED_OPTIMIZATION, " ");
10452 : : }
10453 : 81 : dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
10454 : : }
10455 : 3464 : gcc_assert (analyze_only);
10456 : : return false;
10457 : : }
10458 : :
10459 : 112585 : tree mask_vec = NULL_TREE;
10460 : 112585 : if (!analyze_only)
10461 : 11341 : mask_vec = vect_gen_perm_mask_checked (vectype, indices);
10462 : :
10463 : 112585 : if (second_vec_index == -1)
10464 : 34380 : second_vec_index = first_vec_index;
10465 : :
10466 : 225718 : for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
10467 : : {
10468 : 113133 : ++*n_perms;
10469 : 113133 : if (analyze_only)
10470 : 101680 : continue;
10471 : : /* Generate the permute statement if necessary. */
10472 : 11453 : tree first_vec = dr_chain[first_vec_index + ri];
10473 : 11453 : tree second_vec = dr_chain[second_vec_index + ri];
10474 : 11453 : gassign *stmt = as_a<gassign *> (stmt_info->stmt);
10475 : 11453 : tree perm_dest
10476 : 11453 : = vect_create_destination_var (gimple_assign_lhs (stmt),
10477 : : vectype);
10478 : 11453 : perm_dest = make_ssa_name (perm_dest);
10479 : 11453 : gimple *perm_stmt
10480 : 11453 : = gimple_build_assign (perm_dest, VEC_PERM_EXPR, first_vec,
10481 : : second_vec, mask_vec);
10482 : 11453 : vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
10483 : : gsi);
10484 : 11453 : if (dce_chain)
10485 : : {
10486 : 10830 : bitmap_set_bit (used_defs, first_vec_index + ri);
10487 : 10830 : bitmap_set_bit (used_defs, second_vec_index + ri);
10488 : : }
10489 : :
10490 : : /* Store the vector statement in NODE. */
10491 : 11453 : SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = perm_dest;
10492 : : }
10493 : : }
10494 : 54889 : else if (!analyze_only)
10495 : : {
10496 : 2446 : for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
10497 : : {
10498 : 1223 : tree first_vec = dr_chain[first_vec_index + ri];
10499 : : /* The permutation was a no-op, so no mask was generated;
10500 : : forward the input vector unchanged (identity transform). */
10501 : 1223 : if (dce_chain)
10502 : 1222 : bitmap_set_bit (used_defs, first_vec_index + ri);
10503 : :
10504 : : /* Store the vector statement in NODE. */
10505 : 1223 : SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = first_vec;
10506 : : }
10507 : : }
10508 : :
10509 : : index = 0;
10510 : : first_vec_index = -1;
10511 : : second_vec_index = -1;
10512 : : noop_p = true;
10513 : : }
10514 : : }
10515 : :
10516 : 117645 : if (n_loads)
10517 : : {
10518 : 2374 : if (repeating_p)
10519 : 192 : *n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
10520 : : else
10521 : : {
10522 : : /* Enforced above when !repeating_p. */
10523 : 2182 : unsigned int const_nunits = nunits.to_constant ();
10524 : 2182 : *n_loads = 0;
10525 : 2182 : bool load_seen = false;
10526 : 26307 : for (unsigned i = 0; i < in_nlanes; ++i)
10527 : : {
10528 : 24125 : if (i % const_nunits == 0)
10529 : : {
10530 : 4337 : if (load_seen)
10531 : 2152 : *n_loads += 1;
10532 : : load_seen = false;
10533 : : }
10534 : 24125 : if (bitmap_bit_p (used_in_lanes, i))
10535 : 12092 : load_seen = true;
10536 : : }
10537 : 2182 : if (load_seen)
10538 : 2182 : *n_loads += 1;
10539 : : }
10540 : : }
10541 : :
10542 : 117645 : if (dce_chain)
10543 : 176831 : for (unsigned i = 0; i < dr_chain.length (); ++i)
10544 : 44202 : if (!bitmap_bit_p (used_defs, i))
10545 : : {
10546 : 26109 : tree def = dr_chain[i];
10547 : 26184 : do
10548 : : {
10549 : 26184 : gimple *stmt = SSA_NAME_DEF_STMT (def);
10550 : 26184 : if (is_gimple_assign (stmt)
10551 : 26184 : && (gimple_assign_rhs_code (stmt) == VIEW_CONVERT_EXPR
10552 : 26184 : || gimple_assign_rhs_code (stmt) == CONSTRUCTOR))
10553 : 513 : def = single_ssa_tree_operand (stmt, SSA_OP_USE);
10554 : : else
10555 : : def = NULL;
10556 : 26184 : gimple_stmt_iterator rgsi = gsi_for_stmt (stmt);
10557 : 26184 : gsi_remove (&rgsi, true);
10558 : 26184 : release_defs (stmt);
10559 : : }
10560 : 26184 : while (def);
10561 : : }
10562 : :
10563 : : return true;
10564 : 123550 : }
10565 : :
10566 : : /* Generate vector permute statements from a list of loads in DR_CHAIN.
10567 : : If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
10568 : : permute statements for the SLP node NODE. Store the number of vector
10569 : : permute instructions in *N_PERMS and the number of vector load
10570 : : instructions in *N_LOADS. If DCE_CHAIN is true, remove all definitions
10571 : : that were not needed. */
10572 : :
10573 : : bool
10574 : 90275 : vect_transform_slp_perm_load (vec_info *vinfo,
10575 : : slp_tree node, const vec<tree> &dr_chain,
10576 : : gimple_stmt_iterator *gsi, poly_uint64 vf,
10577 : : bool analyze_only, unsigned *n_perms,
10578 : : unsigned int *n_loads, bool dce_chain)
10579 : : {
10580 : 90275 : return vect_transform_slp_perm_load_1 (vinfo, node,
10581 : 90275 : SLP_TREE_LOAD_PERMUTATION (node),
10582 : : dr_chain, gsi, vf, analyze_only,
10583 : : dump_enabled_p (), n_perms, n_loads,
10584 : 90275 : dce_chain);
10585 : : }
10586 : :
10587 : : /* Produce the next vector result for SLP permutation NODE by adding a vector
10588 : : statement at GSI. If MASK_VEC is nonnull, add:
10589 : :
10590 : : <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC>
10591 : :
10592 : : otherwise add:
10593 : :
10594 : : <new SSA name> = FIRST_DEF. */
10595 : :
10596 : : static void
10597 : 29064 : vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
10598 : : slp_tree node, tree first_def, tree second_def,
10599 : : tree mask_vec, poly_uint64 identity_offset)
10600 : : {
10601 : 29064 : tree vectype = SLP_TREE_VECTYPE (node);
10602 : :
10603 : : /* ??? We SLP match existing vector element extracts but
10604 : : allow punning, which we need to re-instantiate at uses
10605 : : but have no good way of representing explicitly. */
10606 : 29064 : if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)), TYPE_SIZE (vectype))
10607 : 29064 : && !types_compatible_p (TREE_TYPE (first_def), vectype))
10608 : : {
10609 : 26 : gassign *conv_stmt
10610 : 26 : = gimple_build_assign (make_ssa_name (vectype),
10611 : : build1 (VIEW_CONVERT_EXPR, vectype, first_def));
10612 : 26 : vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
10613 : 26 : first_def = gimple_assign_lhs (conv_stmt);
10614 : : }
10615 : 29064 : gassign *perm_stmt;
10616 : 29064 : tree perm_dest = make_ssa_name (vectype);
10617 : 29064 : if (mask_vec)
10618 : : {
10619 : 26072 : if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)),
10620 : 26072 : TYPE_SIZE (vectype))
10621 : 26072 : && !types_compatible_p (TREE_TYPE (second_def), vectype))
10622 : : {
10623 : 9 : gassign *conv_stmt
10624 : 9 : = gimple_build_assign (make_ssa_name (vectype),
10625 : : build1 (VIEW_CONVERT_EXPR,
10626 : : vectype, second_def));
10627 : 9 : vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
10628 : 9 : second_def = gimple_assign_lhs (conv_stmt);
10629 : : }
10630 : 26072 : perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
10631 : : first_def, second_def,
10632 : : mask_vec);
10633 : : }
10634 : 2992 : else if (!types_compatible_p (TREE_TYPE (first_def), vectype))
10635 : : {
10636 : : /* For identity permutes we still need to handle the case
10637 : : of offsetted extracts or concats. */
10638 : 206 : unsigned HOST_WIDE_INT c;
10639 : 206 : auto first_def_nunits
10640 : 206 : = TYPE_VECTOR_SUBPARTS (TREE_TYPE (first_def));
10641 : 206 : if (known_le (TYPE_VECTOR_SUBPARTS (vectype), first_def_nunits))
10642 : : {
10643 : 202 : unsigned HOST_WIDE_INT elsz
10644 : 202 : = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (first_def))));
10645 : 404 : tree lowpart = build3 (BIT_FIELD_REF, vectype, first_def,
10646 : 202 : TYPE_SIZE (vectype),
10647 : 202 : bitsize_int (identity_offset * elsz));
10648 : 202 : perm_stmt = gimple_build_assign (perm_dest, lowpart);
10649 : : }
10650 : 4 : else if (constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype),
10651 : 4 : first_def_nunits, &c) && c == 2)
10652 : : {
10653 : 4 : tree ctor = build_constructor_va (vectype, 2, NULL_TREE, first_def,
10654 : : NULL_TREE, second_def);
10655 : 4 : perm_stmt = gimple_build_assign (perm_dest, ctor);
10656 : : }
10657 : : else
10658 : 0 : gcc_unreachable ();
10659 : : }
10660 : : else
10661 : : {
10662 : : /* We need a copy here in case the def was external. */
10663 : 2786 : perm_stmt = gimple_build_assign (perm_dest, first_def);
10664 : : }
10665 : 29064 : vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
10666 : : /* Store the vector statement in NODE. */
10667 : 29064 : node->push_vec_def (perm_stmt);
10668 : 29064 : }
10669 : :
10670 : : /* Subroutine of vectorizable_slp_permutation. Check whether the target
10671 : : can perform permutation PERM on the (1 or 2) input nodes in CHILDREN.
10672 : : If GSI is nonnull, emit the permutation there.
10673 : :
10674 : : When GSI is null, the only purpose of NODE is to give properties
10675 : : of the result, such as the vector type and number of SLP lanes.
10676 : : The node does not need to be a VEC_PERM_EXPR.
10677 : :
10678 : : If the target supports the operation, return the number of individual
10679 : : VEC_PERM_EXPRs needed, otherwise return -1. Print information to the
10680 : : dump file if DUMP_P is true. */
10681 : :
10682 : : static int
10683 : 344892 : vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
10684 : : slp_tree node, lane_permutation_t &perm,
10685 : : vec<slp_tree> &children, bool dump_p)
10686 : : {
10687 : 344892 : tree vectype = SLP_TREE_VECTYPE (node);
10688 : :
10689 : : /* ??? We currently only support all same vector input types
10690 : : while the SLP IL should really do a concat + select and thus accept
10691 : : arbitrary mismatches. */
10692 : 344892 : slp_tree child;
10693 : 344892 : unsigned i;
10694 : 344892 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
10695 : 344892 : bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node));
10696 : : /* True if we're permuting a single input of 2N vectors down
10697 : : to N vectors. This case doesn't generalize beyond 2 since
10698 : : VEC_PERM_EXPR only takes 2 inputs. */
10699 : 344892 : bool pack_p = false;
10700 : : /* If we're permuting inputs of N vectors each into X*N outputs,
10701 : : this is the value of X, otherwise it is 1. */
10702 : 344892 : unsigned int unpack_factor = 1;
10703 : 344892 : tree op_vectype = NULL_TREE;
10704 : 346636 : FOR_EACH_VEC_ELT (children, i, child)
10705 : 346411 : if (SLP_TREE_VECTYPE (child))
10706 : : {
10707 : : op_vectype = SLP_TREE_VECTYPE (child);
10708 : : break;
10709 : : }
10710 : 344892 : if (!op_vectype)
10711 : 225 : op_vectype = vectype;
10712 : 744211 : FOR_EACH_VEC_ELT (children, i, child)
10713 : : {
10714 : 399325 : if ((SLP_TREE_DEF_TYPE (child) != vect_internal_def
10715 : 11294 : && !vect_maybe_update_slp_op_vectype (child, op_vectype))
10716 : 399325 : || !types_compatible_p (SLP_TREE_VECTYPE (child), op_vectype)
10717 : 798650 : || !types_compatible_p (TREE_TYPE (vectype), TREE_TYPE (op_vectype)))
10718 : : {
10719 : 6 : if (dump_p)
10720 : 6 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10721 : : "Unsupported vector types in lane permutation\n");
10722 : 6 : return -1;
10723 : : }
10724 : 399319 : auto op_nunits = TYPE_VECTOR_SUBPARTS (op_vectype);
10725 : 399319 : unsigned int this_unpack_factor;
10726 : : /* Detect permutations of external, pre-existing vectors. The external
10727 : : node's SLP_TREE_LANES stores the total number of units in the vector,
10728 : : or zero if the vector has variable length.
10729 : :
10730 : : We are expected to keep the original VEC_PERM_EXPR for such cases.
10731 : : There is no repetition to model. */
10732 : 399319 : if (SLP_TREE_DEF_TYPE (child) == vect_external_def
10733 : 399319 : && SLP_TREE_SCALAR_OPS (child).is_empty ())
10734 : : repeating_p = false;
10735 : : /* Check whether the input has twice as many lanes per vector. */
10736 : 390690 : else if (children.length () == 1
10737 : 390690 : && known_eq (SLP_TREE_LANES (child) * nunits,
10738 : : SLP_TREE_LANES (node) * op_nunits * 2))
10739 : : pack_p = true;
10740 : : /* Check whether the output has N times as many lanes per vector. */
10741 : 399319 : else if (constant_multiple_p (SLP_TREE_LANES (node) * op_nunits,
10742 : 348521 : SLP_TREE_LANES (child) * nunits,
10743 : : &this_unpack_factor)
10744 : 314259 : && (i == 0 || unpack_factor == this_unpack_factor))
10745 : : unpack_factor = this_unpack_factor;
10746 : : else
10747 : : repeating_p = false;
10748 : : }
10749 : :
10750 : 689772 : gcc_assert (perm.length () == SLP_TREE_LANES (node));
10751 : :
10752 : : /* Load-lanes permute. This permute only acts as a forwarder to
10753 : : select the correct vector def of the load-lanes load which
10754 : : has the permuted vectors in its vector defs like
10755 : : { v0, w0, r0, v1, w1, r1 ... } for a ld3. All costs are
10756 : : accounted for in the costing for the actual load so we
10757 : : return zero here. */
10758 : 344886 : if (node->ldst_lanes)
10759 : : {
10760 : 0 : gcc_assert (children.length () == 1);
10761 : 0 : if (!gsi)
10762 : : /* This is a trivial op always supported. */
10763 : : return 0;
10764 : 0 : slp_tree child = children[0];
10765 : 0 : unsigned vec_idx = (SLP_TREE_LANE_PERMUTATION (node)[0].second
10766 : 0 : / SLP_TREE_LANES (node));
10767 : 0 : unsigned vec_num = SLP_TREE_LANES (child) / SLP_TREE_LANES (node);
10768 : 0 : for (unsigned i = 0; i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); ++i)
10769 : : {
10770 : 0 : tree def = SLP_TREE_VEC_DEFS (child)[i * vec_num + vec_idx];
10771 : 0 : node->push_vec_def (def);
10772 : : }
10773 : : return 0;
10774 : : }
10775 : :
10776 : : /* Set REPEATING_P to true if the permutations are cyclical wrt UNPACK_FACTOR
10777 : : and if we can generate the vectors in a vector-length agnostic way.
10778 : : This requires UNPACK_STEP == NUNITS / UNPACK_FACTOR to be known at
10779 : : compile time.
10780 : :
10781 : : The significance of UNPACK_STEP is that, when PACK_P is false,
10782 : : output vector I operates on a window of UNPACK_STEP elements from each
10783 : : input, starting at lane UNPACK_STEP * (I % UNPACK_FACTOR). For example,
10784 : : when UNPACK_FACTOR is 2, the first output vector operates on lanes
10785 : : [0, NUNITS / 2 - 1] of each input vector and the second output vector
10786 : : operates on lanes [NUNITS / 2, NUNITS - 1] of each input vector.
10787 : :
10788 : : When REPEATING_P is true, NOUTPUTS holds the total number of outputs
10789 : : that we actually need to generate. */
10790 : 344886 : uint64_t noutputs = 0;
10791 : 344886 : poly_uint64 unpack_step = 0;
10792 : 344886 : loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo);
10793 : 106450 : if (!linfo
10794 : 382417 : || !multiple_p (nunits, unpack_factor, &unpack_step)
10795 : 105394 : || !constant_multiple_p (LOOP_VINFO_VECT_FACTOR (linfo)
10796 : 105394 : * SLP_TREE_LANES (node), nunits, &noutputs))
10797 : : repeating_p = false;
10798 : :
10799 : : /* We can handle the conditions described for REPEATING_P above for
10800 : : both variable- and constant-length vectors. The fallback requires
10801 : : us to generate every element of every permute vector explicitly,
10802 : : which is only possible for constant-length permute vectors.
10803 : :
10804 : : Set:
10805 : :
10806 : : - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
10807 : : mask vectors that we want to build.
10808 : :
10809 : : - NCOPIES to the number of copies of PERM that we need in order
10810 : : to build the necessary permute mask vectors. */
10811 : 105394 : uint64_t npatterns;
10812 : 105394 : unsigned nelts_per_pattern;
10813 : 105394 : uint64_t ncopies;
10814 : 105394 : if (repeating_p)
10815 : : {
10816 : : /* We need permute mask vectors that have the form:
10817 : :
10818 : : { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }
10819 : :
10820 : : In other words, the original n-element permute in PERM is
10821 : : "unrolled" to fill a full vector. The stepped vector encoding
10822 : : that we use for permutes requires 3n elements. */
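 : :
 : : For instance (illustrative, not from the sources): a two-lane
 : : reversal PERM == { op0[1], op0[0] } of a single child is encoded
 : : with npatterns == 2 and nelts_per_pattern == 3 as
 : : { 1, 0, 3, 2, 5, 4 }, which expands to
 : : { 1, 0, 3, 2, 5, 4, 7, 6, ... } for any vector length, swapping
 : : each adjacent pair of lanes. */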
10823 : 67863 : npatterns = SLP_TREE_LANES (node);
10824 : 67863 : nelts_per_pattern = ncopies = 3;
10825 : : }
10826 : : else
10827 : : {
10828 : : /* Calculate every element of every permute mask vector explicitly,
10829 : : instead of relying on the pattern described above. */
10830 : 277023 : if (!nunits.is_constant (&npatterns)
10831 : 277023 : || !TYPE_VECTOR_SUBPARTS (op_vectype).is_constant ())
10832 : : {
10833 : : if (dump_p)
10834 : : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10835 : : "unsupported permutation %p on variable-length"
10836 : : " vectors\n", (void *) node);
10837 : : return -1;
10838 : : }
10839 : 277023 : nelts_per_pattern = ncopies = 1;
10840 : 277023 : if (linfo && !LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
10841 : : {
10842 : : if (dump_p)
10843 : : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10844 : : "unsupported permutation %p for variable VF\n",
10845 : : (void *) node);
10846 : : return -1;
10847 : : }
10848 : : pack_p = false;
10849 : : unpack_factor = 1;
10850 : : }
10851 : 344886 : unsigned olanes = unpack_factor * ncopies * SLP_TREE_LANES (node);
10852 : 344886 : gcc_assert (repeating_p || multiple_p (olanes, nunits));
10853 : :
10854 : : /* Compute the { { SLP operand, vector index }, lane } permutation sequence
10855 : : from the { SLP operand, scalar lane } permutation as recorded in the
10856 : : SLP node as an intermediate step. This part should already work
10857 : : with SLP children with an arbitrary number of lanes. */
10858 : 344886 : auto_vec<std::pair<std::pair<unsigned, unsigned>, poly_uint64>> vperm;
10859 : 344886 : auto_vec<poly_uint64> active_lane;
10860 : 344886 : vperm.create (olanes);
10861 : 344886 : active_lane.safe_grow_cleared (children.length (), true);
10862 : 697696 : for (unsigned int ui = 0; ui < unpack_factor; ++ui)
10863 : : {
10864 : 1535954 : for (unsigned j = 0; j < children.length (); ++j)
10865 : 415167 : active_lane[j] = ui * unpack_step;
10866 : 967663 : for (unsigned i = 0; i < ncopies; ++i)
10867 : : {
10868 : 3905602 : for (unsigned pi = 0; pi < perm.length (); ++pi)
10869 : : {
10870 : 1337948 : std::pair<unsigned, unsigned> p = perm[pi];
10871 : 1337948 : tree vtype = SLP_TREE_VECTYPE (children[p.first]);
10872 : 1337948 : if (repeating_p)
10873 : 391236 : vperm.quick_push ({{p.first, 0},
10874 : 391236 : p.second + active_lane[p.first]});
10875 : : else
10876 : : {
10877 : : /* We checked above that the vectors are constant-length. */
10878 : 946712 : unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype)
10879 : 946712 : .to_constant ();
10880 : 946712 : unsigned lane = active_lane[p.first].to_constant ();
10881 : 946712 : unsigned vi = (lane + p.second) / vnunits;
10882 : 946712 : unsigned vl = (lane + p.second) % vnunits;
10883 : 946712 : vperm.quick_push ({{p.first, vi}, vl});
10884 : : }
10885 : : }
10886 : : /* Advance to the next group. */
10887 : 1341664 : for (unsigned j = 0; j < children.length (); ++j)
10888 : 726811 : active_lane[j] += SLP_TREE_LANES (children[j]);
10889 : : }
10890 : : }
10891 : :
10892 : 344886 : if (dump_p)
10893 : : {
10894 : 9040 : dump_printf_loc (MSG_NOTE, vect_location,
10895 : : "vectorizing permutation %p", (void *)node);
10896 : 32314 : for (unsigned i = 0; i < perm.length (); ++i)
10897 : 23274 : dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
10898 : 9040 : if (repeating_p)
10899 : 7779 : dump_printf (MSG_NOTE, " (repeat %d)", SLP_TREE_LANES (node));
10900 : 9040 : dump_printf (MSG_NOTE, "\n");
10901 : 9040 : dump_printf_loc (MSG_NOTE, vect_location, "as");
10902 : 91395 : for (unsigned i = 0; i < vperm.length (); ++i)
10903 : : {
10904 : 82355 : if (i != 0
10905 : 82355 : && (repeating_p
10906 : 57570 : ? multiple_p (i, npatterns)
10907 : 60662 : : multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype))))
10908 : 24461 : dump_printf (MSG_NOTE, ",");
10909 : 82355 : dump_printf (MSG_NOTE, " vops%u[%u][",
10910 : 82355 : vperm[i].first.first, vperm[i].first.second);
10911 : 82355 : dump_dec (MSG_NOTE, vperm[i].second);
10912 : 82355 : dump_printf (MSG_NOTE, "]");
10913 : : }
10914 : 9040 : dump_printf (MSG_NOTE, "\n");
10915 : : }
10916 : :
10917 : : /* We can only handle two-vector permutes; everything else should
10918 : : be lowered on the SLP level. The following is closely inspired
10919 : : by vect_transform_slp_perm_load and is supposed to eventually
10920 : : replace it.
10921 : : ??? As intermediate step do code-gen in the SLP tree representation
10922 : : somehow? */
10923 : 344886 : std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
10924 : 344886 : std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
10925 : 344886 : unsigned int index = 0;
10926 : 344886 : poly_uint64 mask_element;
10927 : 344886 : vec_perm_builder mask;
10928 : 344886 : mask.new_vector (nunits, npatterns, nelts_per_pattern);
10929 : 344886 : unsigned int count = mask.encoded_nelts ();
10930 : 344886 : mask.quick_grow (count);
10931 : 344886 : vec_perm_indices indices;
10932 : 344886 : unsigned nperms = 0;
10933 : : /* When REPEATING_P is true, we only have UNPACK_FACTOR unique permute
10934 : : vectors to check during analysis, but we need to generate NOUTPUTS
10935 : : vectors during transformation. */
10936 : 344886 : unsigned total_nelts = olanes;
10937 : 344886 : unsigned process_nelts = olanes;
10938 : 344886 : if (repeating_p)
10939 : : {
10940 : 67863 : total_nelts = (total_nelts / unpack_factor) * noutputs;
10941 : 67863 : if (gsi)
10942 : 9384 : process_nelts = total_nelts;
10943 : : }
10944 : 344886 : unsigned last_ei = (total_nelts - 1) % process_nelts;
10945 : 1686901 : for (unsigned i = 0; i < process_nelts; ++i)
10946 : : {
10947 : : /* VI is the input vector index when generating code for REPEATING_P. */
10948 : 1349660 : unsigned vi = i / olanes * (pack_p ? 2 : 1);
10949 : 1349660 : unsigned ei = i % olanes;
10950 : 1349660 : mask_element = vperm[ei].second;
10951 : 1349660 : if (pack_p)
10952 : : {
10953 : : /* In this case, we have N outputs and the single child provides 2N
10954 : : inputs. Output X permutes inputs 2X and 2X+1.
10955 : :
10956 : : The mask indices are taken directly from the SLP permutation node.
10957 : : Index X selects from the first vector if (X / NUNITS) % 2 == 0;
10958 : : X selects from the second vector otherwise. These conditions
10959 : : are only known at compile time for constant-length vectors. */
10960 : : first_vec = std::make_pair (0, 0);
10961 : : second_vec = std::make_pair (0, 1);
10962 : : }
10963 : 1188263 : else if (first_vec.first == -1U
10964 : 1188263 : || first_vec == vperm[ei].first)
10965 : 1003695 : first_vec = vperm[ei].first;
10966 : 184568 : else if (second_vec.first == -1U
10967 : 184568 : || second_vec == vperm[ei].first)
10968 : : {
10969 : 183936 : second_vec = vperm[ei].first;
10970 : 183936 : mask_element += nunits;
10971 : : }
10972 : : else
10973 : : {
10974 : 632 : if (dump_p)
10975 : 24 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10976 : : "permutation requires at "
10977 : : "least three vectors\n");
10978 : 632 : gcc_assert (!gsi);
10979 : : return -1;
10980 : : }
10981 : :
10982 : 1349028 : mask[index++] = mask_element;
10983 : :
10984 : 1349028 : if (index == count)
10985 : : {
10986 : 652820 : indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
10987 : : TYPE_VECTOR_SUBPARTS (op_vectype));
10988 : 516312 : bool identity_p = (indices.series_p (0, 1, mask[0], 1)
10989 : 806487 : && constant_multiple_p (mask[0], nunits));
10990 : 516312 : machine_mode vmode = TYPE_MODE (vectype);
10991 : 516312 : machine_mode op_vmode = TYPE_MODE (op_vectype);
10992 : 516312 : unsigned HOST_WIDE_INT c;
10993 : 516312 : if ((!identity_p
10994 : 422786 : && !can_vec_perm_const_p (vmode, op_vmode, indices))
10995 : 516312 : || (identity_p
10996 : 93526 : && !known_le (nunits,
10997 : : TYPE_VECTOR_SUBPARTS (op_vectype))
10998 : 7021 : && (!constant_multiple_p (nunits,
10999 : 8 : TYPE_VECTOR_SUBPARTS (op_vectype),
11000 : 8 : &c) || c != 2)))
11001 : : {
11002 : 7013 : if (dump_p)
11003 : : {
11004 : 164 : dump_printf_loc (MSG_MISSED_OPTIMIZATION,
11005 : : vect_location,
11006 : : "unsupported vect permute { ");
11007 : 1734 : for (i = 0; i < count; ++i)
11008 : : {
11009 : 1570 : dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
11010 : 1570 : dump_printf (MSG_MISSED_OPTIMIZATION, " ");
11011 : : }
11012 : 164 : dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
11013 : : }
11014 : 7013 : gcc_assert (!gsi);
11015 : 7645 : return -1;
11016 : : }
11017 : :
11018 : 509299 : if (!identity_p)
11019 : 415773 : nperms += CEIL (total_nelts, process_nelts) - (ei > last_ei);
11020 : 509299 : if (gsi)
11021 : : {
11022 : 29064 : if (second_vec.first == -1U)
11023 : 6591 : second_vec = first_vec;
11024 : :
11025 : 29064 : slp_tree
11026 : 29064 : first_node = children[first_vec.first],
11027 : 29064 : second_node = children[second_vec.first];
11028 : :
11029 : 29064 : tree mask_vec = NULL_TREE;
11030 : 29064 : if (!identity_p)
11031 : 26072 : mask_vec = vect_gen_perm_mask_checked (vectype, indices);
11032 : :
11033 : 29064 : tree first_def
11034 : 29064 : = vect_get_slp_vect_def (first_node, first_vec.second + vi);
11035 : 29064 : tree second_def
11036 : 29064 : = vect_get_slp_vect_def (second_node, second_vec.second + vi);
11037 : 29064 : vect_add_slp_permutation (vinfo, gsi, node, first_def,
11038 : 29064 : second_def, mask_vec, mask[0]);
11039 : : }
11040 : :
11041 : : index = 0;
11042 : : first_vec = std::make_pair (-1U, -1U);
11043 : : second_vec = std::make_pair (-1U, -1U);
11044 : : }
11045 : : }
11046 : :
11047 : 337241 : return nperms;
11048 : 344886 : }
11049 : :
11050 : : /* Vectorize the SLP permutations in NODE as specified
11051 : : in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
11052 : : child number and lane number.
11053 : : Interleaving of two two-lane two-child SLP subtrees (not supported):
11054 : : [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
11055 : : A blend of two four-lane two-child SLP subtrees:
11056 : : [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
11057 : : Highpart of a four-lane one-child SLP subtree (not supported):
11058 : : [ { 0, 2 }, { 0, 3 } ]
11059 : : Where currently only a subset is supported by code generating below. */
11060 : :
11061 : : static bool
11062 : 115777 : vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
11063 : : slp_tree node, stmt_vector_for_cost *cost_vec)
11064 : : {
11065 : 115777 : tree vectype = SLP_TREE_VECTYPE (node);
11066 : 115777 : lane_permutation_t &perm = SLP_TREE_LANE_PERMUTATION (node);
11067 : 115777 : int nperms = vectorizable_slp_permutation_1 (vinfo, gsi, node, perm,
11068 : 115777 : SLP_TREE_CHILDREN (node),
11069 : : dump_enabled_p ());
11070 : 115777 : if (nperms < 0)
11071 : : return false;
11072 : :
11073 : 114496 : if (!gsi)
11074 : 99026 : record_stmt_cost (cost_vec, nperms, vec_perm, node, vectype, 0, vect_body);
11075 : :
11076 : : return true;
11077 : : }
11078 : :
11079 : : /* Vectorize SLP NODE. */
11080 : :
11081 : : static void
11082 : 1406939 : vect_schedule_slp_node (vec_info *vinfo,
11083 : : slp_tree node, slp_instance instance)
11084 : : {
11085 : 1406939 : gimple_stmt_iterator si;
11086 : 1406939 : int i;
11087 : 1406939 : slp_tree child;
11088 : :
11089 : : /* Vectorize externals and constants. */
11090 : 1406939 : if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
11091 : 1406939 : || SLP_TREE_DEF_TYPE (node) == vect_external_def)
11092 : : {
11093 : : /* ??? vectorizable_shift can end up using a scalar operand which is
11094 : : currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
11095 : : node in this case. */
11096 : 482739 : if (!SLP_TREE_VECTYPE (node))
11097 : 482739 : return;
11098 : :
11099 : : /* There are two reasons vector defs might already exist. The first
11100 : : is that we are vectorizing an existing vector def. The second is
11101 : : that, when performing BB vectorization, shared constant/external
11102 : : nodes are not split apart during partitioning, so during the
11103 : : code-gen DFS walk we can end up visiting them twice. */
11104 : 477153 : if (! SLP_TREE_VEC_DEFS (node).exists ())
11105 : 476474 : vect_create_constant_vectors (vinfo, node);
11106 : 477153 : return;
11107 : : }
11108 : :
11109 : 924200 : gcc_assert (SLP_TREE_VEC_DEFS (node).is_empty ());
11110 : :
11111 : 924200 : stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
11112 : :
11113 : 924200 : gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
11114 : 924200 : SLP_TREE_VEC_DEFS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
11115 : :
11116 : 924200 : if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
11117 : 908730 : && STMT_VINFO_DATA_REF (stmt_info))
11118 : : {
11119 : : /* Vectorized loads go before the first scalar load to make it
11120 : : ready early; vectorized stores go before the last scalar
11121 : : stmt, which is where all uses are ready. */
11122 : 689190 : stmt_vec_info last_stmt_info = NULL;
11123 : 689190 : if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
11124 : 154699 : last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
11125 : : else /* DR_IS_WRITE */
11126 : 534491 : last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
11127 : 689190 : si = gsi_for_stmt (last_stmt_info->stmt);
11128 : 689190 : }
11129 : 235010 : else if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
11130 : 219540 : && (STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
11131 : : || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
11132 : : || STMT_VINFO_TYPE (stmt_info) == phi_info_type))
11133 : : {
11134 : : /* For PHI node vectorization we do not use the insertion iterator. */
11135 : 51399 : si = gsi_none ();
11136 : : }
11137 : : else
11138 : : {
11139 : : /* Emit other stmts after the children's vectorized defs, which
11140 : : is the earliest possible place. */
11141 : : gimple *last_stmt = NULL;
11142 : : bool seen_vector_def = false;
11143 : 509094 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
11144 : 325483 : if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
11145 : : {
11146 : : /* For fold-left reductions we are retaining the scalar
11147 : : reduction PHI, but we still have SLP_TREE_NUM_VEC_STMTS
11148 : : set, so the representation isn't perfect. Resort to the
11149 : : last scalar def here. */
11150 : 262000 : if (SLP_TREE_VEC_DEFS (child).is_empty ())
11151 : : {
11152 : 804 : gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
11153 : : == cycle_phi_info_type);
11154 : 804 : gphi *phi = as_a <gphi *>
11155 : 804 : (vect_find_last_scalar_stmt_in_slp (child)->stmt);
11156 : 804 : if (!last_stmt
11157 : 804 : || vect_stmt_dominates_stmt_p (last_stmt, phi))
11158 : : last_stmt = phi;
11159 : : }
11160 : : /* We emit all vectorized stmts in the same place, so the
11161 : : last one generated is the latest.
11162 : : ??? Unless a load permutation is applied which happens
11163 : : to re-use an earlier generated load. */
11164 : : unsigned j;
11165 : : tree vdef;
11166 : 603359 : FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
11167 : : {
11168 : 341359 : gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
11169 : 341359 : if (!last_stmt
11170 : 341359 : || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
11171 : : last_stmt = vstmt;
11172 : : }
11173 : : }
11174 : 63483 : else if (!SLP_TREE_VECTYPE (child))
11175 : : {
11176 : : /* For externals without a vector type we use the scalar defs unvectorized. */
11177 : : unsigned j;
11178 : : tree def;
11179 : 12163 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
11180 : 7159 : if (TREE_CODE (def) == SSA_NAME
11181 : 7159 : && !SSA_NAME_IS_DEFAULT_DEF (def))
11182 : : {
11183 : 162 : gimple *stmt = SSA_NAME_DEF_STMT (def);
11184 : 162 : if (!last_stmt
11185 : 162 : || vect_stmt_dominates_stmt_p (last_stmt, stmt))
11186 : : last_stmt = stmt;
11187 : : }
11188 : : }
11189 : : else
11190 : : {
11191 : : /* For externals we have to look at all defs since their
11192 : : insertion place is decided per vector. But beware
11193 : : of pre-existing vectors where we need to make sure
11194 : : we do not insert before the region boundary. */
11195 : 58479 : if (SLP_TREE_SCALAR_OPS (child).is_empty ()
11196 : 556 : && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
11197 : : seen_vector_def = true;
11198 : : else
11199 : : {
11200 : : unsigned j;
11201 : : tree vdef;
11202 : 461413 : FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
11203 : 77566 : if (TREE_CODE (vdef) == SSA_NAME
11204 : 77566 : && !SSA_NAME_IS_DEFAULT_DEF (vdef))
11205 : : {
11206 : 16574 : gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
11207 : 16574 : if (!last_stmt
11208 : 16574 : || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
11209 : : last_stmt = vstmt;
11210 : : }
11211 : : }
11212 : : }
11213 : : /* This can happen when all children are pre-existing vectors or
11214 : : constants. */
11215 : 183611 : if (!last_stmt)
11216 : 1711 : last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
11217 : 1711 : if (!last_stmt)
11218 : : {
11219 : 0 : gcc_assert (seen_vector_def);
11220 : 0 : si = gsi_after_labels (vinfo->bbs[0]);
11221 : : }
11222 : 183611 : else if (is_ctrl_altering_stmt (last_stmt))
11223 : : {
11224 : : /* We split regions to vectorize at control-altering stmts
11225 : : with a definition, so this must be an external, which
11226 : : we can insert at the start of the region. */
11227 : 3 : si = gsi_after_labels (vinfo->bbs[0]);
11228 : : }
11229 : 183608 : else if (is_a <bb_vec_info> (vinfo)
11230 : 13822 : && SLP_TREE_CODE (node) != VEC_PERM_EXPR
11231 : 12699 : && gimple_bb (last_stmt) != gimple_bb (stmt_info->stmt)
11232 : 184532 : && gimple_could_trap_p (stmt_info->stmt))
11233 : : {
11234 : : /* We've constrained possibly trapping operations to all come
11235 : : from the same basic-block; even if vectorized defs would allow
11236 : : earlier scheduling, still force the vectorized stmts into the
11237 : : original block. This is only necessary for BB vectorization,
11238 : : since for loop vect all operations are in a single BB and
11239 : : scalar-stmt-based placement doesn't play well with epilogue vectorization. */
11240 : 48 : gcc_assert (dominated_by_p (CDI_DOMINATORS,
11241 : : gimple_bb (stmt_info->stmt),
11242 : : gimple_bb (last_stmt)));
11243 : 48 : si = gsi_after_labels (gimple_bb (stmt_info->stmt));
11244 : : }
11245 : 183560 : else if (is_a <gphi *> (last_stmt))
11246 : 13769 : si = gsi_after_labels (gimple_bb (last_stmt));
11247 : : else
11248 : : {
11249 : 169791 : si = gsi_for_stmt (last_stmt);
11250 : 169791 : gsi_next (&si);
11251 : :
11252 : : /* Avoid scheduling internal defs outside of the loop when
11253 : : we might have only implicitly tracked loop mask/len defs. */
11254 : 169791 : if (auto loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
11255 : 38 : if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
11256 : 156184 : || LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
11257 : : {
11258 : 38 : gimple_stmt_iterator si2
11259 : 38 : = gsi_after_labels (LOOP_VINFO_LOOP (loop_vinfo)->header);
11260 : 38 : if ((gsi_end_p (si2)
11261 : 0 : && (LOOP_VINFO_LOOP (loop_vinfo)->header
11262 : 0 : != gimple_bb (last_stmt))
11263 : 0 : && dominated_by_p (CDI_DOMINATORS,
11264 : : LOOP_VINFO_LOOP (loop_vinfo)->header,
11265 : 0 : gimple_bb (last_stmt)))
11266 : 38 : || (!gsi_end_p (si2)
11267 : 38 : && last_stmt != *si2
11268 : 37 : && vect_stmt_dominates_stmt_p (last_stmt, *si2)))
11269 : 1 : si = si2;
11270 : : }
11271 : : }
11272 : : }
11273 : :
11274 : : /* Handle purely internal nodes. */
11275 : 924200 : if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
11276 : : {
11277 : 15470 : if (dump_enabled_p ())
11278 : 3021 : dump_printf_loc (MSG_NOTE, vect_location,
11279 : : "------>vectorizing SLP permutation node\n");
11280 : : /* ??? The transform kind is stored in STMT_VINFO_TYPE, which might
11281 : : be shared with different SLP nodes (but usually it's the same
11282 : : operation, apart from the case where the stmt is only there to
11283 : : denote the actual scalar lane defs ...). So do not call
11284 : : vect_transform_stmt but open-code it here (partly). */
11285 : 15470 : bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
11286 : 15470 : gcc_assert (done);
11287 : : stmt_vec_info slp_stmt_info;
11288 : : unsigned int i;
11289 : 959645 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
11290 : 23010 : if (slp_stmt_info && STMT_VINFO_LIVE_P (slp_stmt_info))
11291 : : {
11292 : 680 : done = vectorizable_live_operation (vinfo, slp_stmt_info, node,
11293 : : instance, i, true, NULL);
11294 : 680 : gcc_assert (done);
11295 : : }
11296 : : }
11297 : : else
11298 : : {
11299 : 908730 : if (dump_enabled_p ())
11300 : 62422 : dump_printf_loc (MSG_NOTE, vect_location,
11301 : : "------>vectorizing SLP node starting from: %G",
11302 : : stmt_info->stmt);
11303 : 908730 : vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
11304 : : }
11305 : : }
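: :
: : /* Summary of the insertion points chosen above: vectorized loads go
: : before the first scalar load of the group; vectorized stores before
: : the last scalar store, where all uses are ready; PHIs use no
: : insertion iterator; all other stmts are emitted right after the
: : latest def of any child. */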
11306 : :
11307 : : /* Replace scalar calls from SLP node NODE with assignments of zero to
11308 : : their lhs. For loop vectorization this is done in vectorizable_call,
11309 : : but for SLP it needs to be deferred until the end of vect_schedule_slp,
11310 : : because multiple SLP instances may refer to the same scalar stmt. */
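: :
: : /* For example (illustrative GIMPLE, names assumed): a pure-SLP call
: : x_1 = __builtin_sqrtf (a_2) whose lanes were all vectorized is
: : replaced by x_1 = 0.0, leaving a dead assignment for later cleanup. */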
11311 : :
11312 : : static void
11313 : 552216 : vect_remove_slp_scalar_calls (vec_info *vinfo,
11314 : : slp_tree node, hash_set<slp_tree> &visited)
11315 : : {
11316 : 552216 : gimple *new_stmt;
11317 : 552216 : gimple_stmt_iterator gsi;
11318 : 552216 : int i;
11319 : 552216 : slp_tree child;
11320 : 552216 : tree lhs;
11321 : 552216 : stmt_vec_info stmt_info;
11322 : :
11323 : 552216 : if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
11324 : 173326 : return;
11325 : :
11326 : 418896 : if (visited.add (node))
11327 : : return;
11328 : :
11329 : 845971 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
11330 : 467081 : vect_remove_slp_scalar_calls (vinfo, child, visited);
11331 : :
11332 : 1194744 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
11333 : : {
11334 : 440919 : if (!stmt_info)
11335 : 3597 : continue;
11336 : 437322 : gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
11337 : 4796 : if (!stmt || gimple_bb (stmt) == NULL)
11338 : 432532 : continue;
11339 : 4790 : if (is_pattern_stmt_p (stmt_info)
11340 : 4790 : || !PURE_SLP_STMT (stmt_info))
11341 : 730 : continue;
11342 : 4060 : lhs = gimple_call_lhs (stmt);
11343 : 4060 : if (lhs)
11344 : 3617 : new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
11345 : : else
11346 : : {
11347 : 443 : new_stmt = gimple_build_nop ();
11348 : 443 : unlink_stmt_vdef (stmt_info->stmt);
11349 : : }
11350 : 4060 : gsi = gsi_for_stmt (stmt);
11351 : 4060 : vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
11352 : 4060 : if (lhs)
11353 : 3617 : SSA_NAME_DEF_STMT (lhs) = new_stmt;
11354 : : }
11355 : : }
11356 : :
11357 : : static void
11358 : 85135 : vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
11359 : : {
11360 : 85135 : hash_set<slp_tree> visited;
11361 : 85135 : vect_remove_slp_scalar_calls (vinfo, node, visited);
11362 : 85135 : }
11363 : :
11364 : : /* Vectorize the instance root. */
11365 : :
11366 : : void
11367 : 5845 : vectorize_slp_instance_root_stmt (vec_info *vinfo, slp_tree node, slp_instance instance)
11368 : : {
11369 : 5845 : gassign *rstmt = NULL;
11370 : :
11371 : 5845 : if (instance->kind == slp_inst_kind_ctor)
11372 : : {
11373 : 251 : if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
11374 : : {
11375 : 220 : tree vect_lhs = SLP_TREE_VEC_DEFS (node)[0];
11376 : 220 : tree root_lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
11377 : 220 : if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
11378 : 220 : TREE_TYPE (vect_lhs)))
11379 : 0 : vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
11380 : : vect_lhs);
11381 : 220 : rstmt = gimple_build_assign (root_lhs, vect_lhs);
11382 : : }
11383 : 31 : else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
11384 : : {
11385 : 31 : int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
11386 : 31 : tree child_def;
11387 : 31 : int j;
11388 : 31 : vec<constructor_elt, va_gc> *v;
11389 : 31 : vec_alloc (v, nelts);
11390 : :
11391 : : /* A CTOR can handle V16HI composition from VNx8HI so we
11392 : : do not need to convert vector elements if the types
11393 : : do not match. */
11394 : 125 : FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (node), j, child_def)
11395 : 94 : CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, child_def);
11396 : 31 : tree lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
11397 : 31 : tree rtype
11398 : 31 : = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmts[0]->stmt));
11399 : 31 : tree r_constructor = build_constructor (rtype, v);
11400 : 31 : rstmt = gimple_build_assign (lhs, r_constructor);
11401 : : }
11402 : : }
11403 : 5594 : else if (instance->kind == slp_inst_kind_bb_reduc)
11404 : : {
11405 : : /* Largely inspired by reduction chain epilogue handling in
11406 : : vect_create_epilog_for_reduction. */
11407 : 4066 : vec<tree> vec_defs = vNULL;
11408 : 4066 : vect_get_slp_defs (node, &vec_defs);
11409 : 4066 : enum tree_code reduc_code
11410 : 4066 : = gimple_assign_rhs_code (instance->root_stmts[0]->stmt);
11411 : : /* ??? We actually have to reflect signs somewhere. */
11412 : 4066 : if (reduc_code == MINUS_EXPR)
11413 : 0 : reduc_code = PLUS_EXPR;
11414 : 4066 : gimple_seq epilogue = NULL;
11415 : : /* We may end up with more than one vector result; reduce them
11416 : : to one vector. */
11417 : 4066 : tree vec_def = vec_defs[0];
11418 : 4066 : tree vectype = TREE_TYPE (vec_def);
11419 : 4066 : tree compute_vectype = vectype;
11420 : 4066 : bool pun_for_overflow_p = (ANY_INTEGRAL_TYPE_P (vectype)
11421 : 3998 : && TYPE_OVERFLOW_UNDEFINED (vectype)
11422 : 6978 : && operation_can_overflow (reduc_code));
11423 : 2788 : if (pun_for_overflow_p)
11424 : : {
11425 : 2788 : compute_vectype = unsigned_type_for (vectype);
11426 : 2788 : vec_def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
11427 : : compute_vectype, vec_def);
11428 : : }
11429 : 6435 : for (unsigned i = 1; i < vec_defs.length (); ++i)
11430 : : {
11431 : 2369 : tree def = vec_defs[i];
11432 : 2369 : if (pun_for_overflow_p)
11433 : 2273 : def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
11434 : : compute_vectype, def);
11435 : 2369 : vec_def = gimple_build (&epilogue, reduc_code, compute_vectype,
11436 : : vec_def, def);
11437 : : }
11438 : 4066 : vec_defs.release ();
11439 : : /* ??? Support other schemes than direct internal fn. */
11440 : 4066 : internal_fn reduc_fn;
11441 : 4066 : if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
11442 : 4066 : || reduc_fn == IFN_LAST)
11443 : 0 : gcc_unreachable ();
11444 : 4066 : tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
11445 : 4066 : TREE_TYPE (compute_vectype), vec_def);
11446 : 4066 : if (!SLP_INSTANCE_REMAIN_DEFS (instance).is_empty ())
11447 : : {
11448 : 2644 : tree rem_def = NULL_TREE;
11449 : 12065 : for (auto def : SLP_INSTANCE_REMAIN_DEFS (instance))
11450 : : {
11451 : 9421 : def = gimple_convert (&epilogue, TREE_TYPE (scalar_def), def);
11452 : 9421 : if (!rem_def)
11453 : : rem_def = def;
11454 : : else
11455 : 6777 : rem_def = gimple_build (&epilogue, reduc_code,
11456 : 6777 : TREE_TYPE (scalar_def),
11457 : : rem_def, def);
11458 : : }
11459 : 2644 : scalar_def = gimple_build (&epilogue, reduc_code,
11460 : 2644 : TREE_TYPE (scalar_def),
11461 : : scalar_def, rem_def);
11462 : : }
11463 : 4066 : scalar_def = gimple_convert (&epilogue,
11464 : 4066 : TREE_TYPE (vectype), scalar_def);
11465 : 4066 : gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
11466 : 4066 : gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT);
11467 : 4066 : gimple_assign_set_rhs_from_tree (&rgsi, scalar_def);
11468 : 4066 : update_stmt (gsi_stmt (rgsi));
11469 : 4066 : return;
11470 : : }
11471 : 1528 : else if (instance->kind == slp_inst_kind_gcond)
11472 : : {
11473 : : /* Only support a single root for now, as we can't codegen a CFG
11474 : : yet and so can't support lanes > 1 at this time. */
11475 : 1528 : gcc_assert (instance->root_stmts.length () == 1);
11476 : 1528 : auto root_stmt_info = instance->root_stmts[0];
11477 : 1528 : auto last_stmt = STMT_VINFO_STMT (vect_orig_stmt (root_stmt_info));
11478 : 1528 : gimple_stmt_iterator rgsi = gsi_for_stmt (last_stmt);
11479 : 1528 : gimple *vec_stmt = NULL;
11480 : 1528 : gcc_assert (!SLP_TREE_VEC_DEFS (node).is_empty ());
11481 : 1528 : bool res = vectorizable_early_exit (vinfo, root_stmt_info, &rgsi,
11482 : : &vec_stmt, node, NULL);
11483 : 1528 : gcc_assert (res);
11484 : 1528 : return;
11485 : : }
11486 : : else
11487 : 0 : gcc_unreachable ();
11488 : :
11489 : 251 : gcc_assert (rstmt);
11490 : :
11491 : 251 : gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
11492 : 251 : gsi_replace (&rgsi, rstmt, true);
11493 : : }
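: :
: : /* Illustrative epilogue for a slp_inst_kind_bb_reduc PLUS reduction
: : over two signed V4SI defs (names and types assumed, not from the
: : sources):
: : _u0 = VIEW_CONVERT_EXPR<vector(4) unsigned int>(_v0);
: : _u1 = VIEW_CONVERT_EXPR<vector(4) unsigned int>(_v1);
: : _sum = _u0 + _u1;
: : _s = .REDUC_PLUS (_sum);
: : _res = (int) _s;
: : The punning to an unsigned vector type avoids introducing signed
: : overflow where the scalar code had none. */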
11494 : :
11495 : : struct slp_scc_info
11496 : : {
11497 : : bool on_stack;
11498 : : int dfs;
11499 : : int lowlink;
11500 : : };
11501 : :
11502 : : /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */
11503 : :
11504 : : static void
11505 : 1406939 : vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
11506 : : hash_map<slp_tree, slp_scc_info> &scc_info,
11507 : : int &maxdfs, vec<slp_tree> &stack)
11508 : : {
11509 : 1406939 : bool existed_p;
11510 : 1406939 : slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
11511 : 1406939 : gcc_assert (!existed_p);
11512 : 1406939 : info->dfs = maxdfs;
11513 : 1406939 : info->lowlink = maxdfs;
11514 : 1406939 : maxdfs++;
11515 : :
11516 : : /* Leaf. */
11517 : 1406939 : if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
11518 : : {
11519 : 482739 : info->on_stack = false;
11520 : 482739 : vect_schedule_slp_node (vinfo, node, instance);
11521 : 992845 : return;
11522 : : }
11523 : :
11524 : 924200 : info->on_stack = true;
11525 : 924200 : stack.safe_push (node);
11526 : :
11527 : 924200 : unsigned i;
11528 : 924200 : slp_tree child;
11529 : : /* DFS recurse. */
11530 : 1894851 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
11531 : : {
11532 : 970651 : if (!child)
11533 : 52845 : continue;
11534 : 917806 : slp_scc_info *child_info = scc_info.get (child);
11535 : 917806 : if (!child_info)
11536 : : {
11537 : 840248 : vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
11538 : : /* Recursion might have re-allocated the hash map entries. */
11539 : 840248 : info = scc_info.get (node);
11540 : 840248 : child_info = scc_info.get (child);
11541 : 840248 : info->lowlink = MIN (info->lowlink, child_info->lowlink);
11542 : : }
11543 : 77558 : else if (child_info->on_stack)
11544 : 23383 : info->lowlink = MIN (info->lowlink, child_info->dfs);
11545 : : }
11546 : 924200 : if (info->lowlink != info->dfs)
11547 : : return;
11548 : :
11549 : 896833 : auto_vec<slp_tree, 4> phis_to_fixup;
11550 : :
11551 : : /* Singleton. */
11552 : 896833 : if (stack.last () == node)
11553 : : {
11554 : 874511 : stack.pop ();
11555 : 874511 : info->on_stack = false;
11556 : 874511 : vect_schedule_slp_node (vinfo, node, instance);
11557 : 874511 : if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
11558 : 874511 : && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
11559 : 29009 : phis_to_fixup.quick_push (node);
11560 : : }
11561 : : else
11562 : : {
11563 : : /* SCC. */
11564 : 22322 : int last_idx = stack.length () - 1;
11565 : 49689 : while (stack[last_idx] != node)
11566 : 27367 : last_idx--;
11567 : : /* We can break the cycle at PHIs that have at least one child
11568 : : already code-generated. Then we could re-start the DFS walk
11569 : : until all nodes in the SCC are covered (we might have new
11570 : : entries for only back-reachable nodes). But it's simpler to
11571 : : just iterate and schedule those that are ready. */
11572 : 22322 : unsigned todo = stack.length () - last_idx;
11573 : 22475 : do
11574 : : {
11575 : 95335 : for (int idx = stack.length () - 1; idx >= last_idx; --idx)
11576 : : {
11577 : 50385 : slp_tree entry = stack[idx];
11578 : 50385 : if (!entry)
11579 : 378 : continue;
11580 : 50007 : bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
11581 : 50007 : && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
11582 : 50007 : bool ready = !phi;
11583 : 125451 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
11584 : 98500 : if (!child)
11585 : : {
11586 : 21688 : gcc_assert (phi);
11587 : : ready = true;
11588 : : break;
11589 : : }
11590 : 76812 : else if (scc_info.get (child)->on_stack)
11591 : : {
11592 : 22015 : if (!phi)
11593 : : {
11594 : : ready = false;
11595 : : break;
11596 : : }
11597 : : }
11598 : : else
11599 : : {
11600 : 54797 : if (phi)
11601 : : {
11602 : : ready = true;
11603 : : break;
11604 : : }
11605 : : }
11606 : 28319 : if (ready)
11607 : : {
11608 : 49689 : vect_schedule_slp_node (vinfo, entry, instance);
11609 : 49689 : scc_info.get (entry)->on_stack = false;
11610 : 49689 : stack[idx] = NULL;
11611 : 49689 : todo--;
11612 : 49689 : if (phi)
11613 : 22762 : phis_to_fixup.safe_push (entry);
11614 : : }
11615 : : }
11616 : : }
11617 : 22475 : while (todo != 0);
11618 : :
11619 : : /* Pop the SCC. */
11620 : 22322 : stack.truncate (last_idx);
11621 : : }
11622 : :
11623 : : /* Now fixup the backedge def of the vectorized PHIs in this SCC. */
11624 : : slp_tree phi_node;
11625 : 1845437 : FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
11626 : : {
11627 : 51771 : gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
11628 : 51771 : edge_iterator ei;
11629 : 51771 : edge e;
11630 : 159975 : FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
11631 : : {
11632 : 108204 : unsigned dest_idx = e->dest_idx;
11633 : 108204 : child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
11634 : 108204 : if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
11635 : 62310 : continue;
11636 : 45894 : unsigned n = SLP_TREE_VEC_DEFS (phi_node).length ();
11637 : : /* Simply fill all args. */
11638 : 45894 : if (STMT_VINFO_DEF_TYPE (SLP_TREE_REPRESENTATIVE (phi_node))
11639 : : != vect_first_order_recurrence)
11640 : 94425 : for (unsigned i = 0; i < n; ++i)
11641 : : {
11642 : 48563 : tree phidef = SLP_TREE_VEC_DEFS (phi_node)[i];
11643 : 48563 : gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
11644 : 48563 : add_phi_arg (phi, vect_get_slp_vect_def (child, i),
11645 : : e, gimple_phi_arg_location (phi, dest_idx));
11646 : : }
11647 : : else
11648 : : {
11649 : : /* Unless it is a first-order recurrence, which needs
11650 : : args filled in for both the PHI node and the permutes. */
11651 : 32 : gimple *perm
11652 : 32 : = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[0]);
11653 : 32 : gimple *rphi = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (perm));
11654 : 32 : add_phi_arg (as_a <gphi *> (rphi),
11655 : : vect_get_slp_vect_def (child, n - 1),
11656 : : e, gimple_phi_arg_location (phi, dest_idx));
11657 : 84 : for (unsigned i = 0; i < n; ++i)
11658 : : {
11659 : 52 : gimple *perm
11660 : 52 : = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[i]);
11661 : 52 : if (i > 0)
11662 : 20 : gimple_assign_set_rhs1 (perm,
11663 : : vect_get_slp_vect_def (child, i - 1));
11664 : 52 : gimple_assign_set_rhs2 (perm,
11665 : : vect_get_slp_vect_def (child, i));
11666 : 52 : update_stmt (perm);
11667 : : }
11668 : : }
11669 : : }
11670 : : }
11671 : 896833 : }
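: :
: : /* For example (illustrative): the reduction cycle
: : x_1 = PHI <x_0(preheader), x_2(latch)>;
: : x_2 = x_1 + _3;
: : forms a two-node SCC. The PHI is scheduled first (a PHI may be
: : scheduled while its latch child is still on the stack), the addition
: : becomes ready once the PHI def exists, and the PHI's backedge
: : argument is filled in by the fixup loop above. */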
11672 : :
11673 : : /* Generate vector code for SLP_INSTANCES in the loop/basic block. */
11674 : :
11675 : : void
11676 : 530376 : vect_schedule_slp (vec_info *vinfo, const vec<slp_instance> &slp_instances)
11677 : : {
11678 : 530376 : slp_instance instance;
11679 : 530376 : unsigned int i;
11680 : :
11681 : 530376 : hash_map<slp_tree, slp_scc_info> scc_info;
11682 : 530376 : int maxdfs = 0;
11683 : 1097246 : FOR_EACH_VEC_ELT (slp_instances, i, instance)
11684 : : {
11685 : 566870 : slp_tree node = SLP_INSTANCE_TREE (instance);
11686 : 566870 : if (dump_enabled_p ())
11687 : : {
11688 : 15253 : dump_printf_loc (MSG_NOTE, vect_location,
11689 : : "Vectorizing SLP tree:\n");
11690 : : /* ??? Dump all? */
11691 : 15253 : if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
11692 : 396 : dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
11693 : 396 : SLP_INSTANCE_ROOT_STMTS (instance)[0]->stmt);
11694 : 15253 : vect_print_slp_graph (MSG_NOTE, vect_location,
11695 : : SLP_INSTANCE_TREE (instance));
11696 : : }
11697 : : /* Schedule the tree of INSTANCE, scheduling SCCs in a way to
11698 : : have a PHI be the node breaking the cycle. */
11699 : 566870 : auto_vec<slp_tree> stack;
11700 : 566870 : if (!scc_info.get (node))
11701 : 566691 : vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
11702 : :
11703 : 566870 : if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
11704 : 5845 : vectorize_slp_instance_root_stmt (vinfo, node, instance);
11705 : :
11706 : 566870 : if (dump_enabled_p ())
11707 : 15253 : dump_printf_loc (MSG_NOTE, vect_location,
11708 : : "vectorizing stmts using SLP.\n");
11709 : 566870 : }
11710 : :
11711 : 1627622 : FOR_EACH_VEC_ELT (slp_instances, i, instance)
11712 : : {
11713 : 566870 : slp_tree root = SLP_INSTANCE_TREE (instance);
11714 : 566870 : stmt_vec_info store_info;
11715 : 566870 : unsigned int j;
11716 : :
11717 : : /* Remove scalar call stmts. Do not do this for basic-block
11718 : : vectorization, as not all uses may be vectorized.
11719 : : ??? Why should this be necessary? DCE should be able to
11720 : : remove the stmts itself.
11721 : : ??? For BB vectorization we could just as well remove scalar
11722 : : stmts starting from the SLP tree root if they have no
11723 : : uses. */
11724 : 566870 : if (is_a <loop_vec_info> (vinfo))
11725 : 85135 : vect_remove_slp_scalar_calls (vinfo, root);
11726 : :
11727 : : /* Remove the vectorized stores' original scalar stmts. */
11728 : 2533331 : for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
11729 : : {
11730 : 1431970 : if (!STMT_VINFO_DATA_REF (store_info)
11731 : 1403512 : || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
11732 : : break;
11733 : :
11734 : 1399591 : store_info = vect_orig_stmt (store_info);
11735 : : /* Free the attached stmt_vec_info and remove the stmt. */
11736 : 1399591 : vinfo->remove_stmt (store_info);
11737 : :
11738 : : /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it
11739 : : to not crash in vect_free_slp_tree later. */
11740 : 1399591 : if (SLP_TREE_REPRESENTATIVE (root) == store_info)
11741 : 534256 : SLP_TREE_REPRESENTATIVE (root) = NULL;
11742 : : }
11743 : : }
11744 : 530376 : }
|