Branch data Line data Source code
1 : : /* SLP - Basic Block Vectorization
2 : : Copyright (C) 2007-2025 Free Software Foundation, Inc.
3 : : Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 : : and Ira Rosen <irar@il.ibm.com>
5 : :
6 : : This file is part of GCC.
7 : :
8 : : GCC is free software; you can redistribute it and/or modify it under
9 : : the terms of the GNU General Public License as published by the Free
10 : : Software Foundation; either version 3, or (at your option) any later
11 : : version.
12 : :
13 : : GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 : : WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 : : FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 : : for more details.
17 : :
18 : : You should have received a copy of the GNU General Public License
19 : : along with GCC; see the file COPYING3. If not see
20 : : <http://www.gnu.org/licenses/>. */
21 : :
22 : : #include "config.h"
23 : : #define INCLUDE_ALGORITHM
24 : : #include "system.h"
25 : : #include "coretypes.h"
26 : : #include "backend.h"
27 : : #include "target.h"
28 : : #include "rtl.h"
29 : : #include "tree.h"
30 : : #include "gimple.h"
31 : : #include "tree-pass.h"
32 : : #include "ssa.h"
33 : : #include "optabs-tree.h"
34 : : #include "insn-config.h"
35 : : #include "recog.h" /* FIXME: for insn_data */
36 : : #include "fold-const.h"
37 : : #include "stor-layout.h"
38 : : #include "gimple-iterator.h"
39 : : #include "cfgloop.h"
40 : : #include "tree-vectorizer.h"
41 : : #include "langhooks.h"
42 : : #include "gimple-walk.h"
43 : : #include "dbgcnt.h"
44 : : #include "tree-vector-builder.h"
45 : : #include "vec-perm-indices.h"
46 : : #include "gimple-fold.h"
47 : : #include "internal-fn.h"
48 : : #include "dump-context.h"
49 : : #include "cfganal.h"
50 : : #include "tree-eh.h"
51 : : #include "tree-cfg.h"
52 : : #include "alloc-pool.h"
53 : : #include "sreal.h"
54 : : #include "predict.h"
55 : :
56 : : static bool vect_transform_slp_perm_load_1 (vec_info *, slp_tree,
57 : : load_permutation_t &,
58 : : const vec<tree> &,
59 : : gimple_stmt_iterator *,
60 : : poly_uint64, bool, bool,
61 : : unsigned *,
62 : : unsigned * = nullptr,
63 : : bool = false);
64 : : static int vectorizable_slp_permutation_1 (vec_info *, gimple_stmt_iterator *,
65 : : slp_tree, lane_permutation_t &,
66 : : vec<slp_tree> &, bool);
67 : : static void vect_print_slp_tree (dump_flags_t, dump_location_t, slp_tree);
68 : : static bool vect_slp_can_convert_to_external (const vec<stmt_vec_info> &);
69 : :
70 : : static object_allocator<_slp_tree> *slp_tree_pool;
71 : : static slp_tree slp_first_node;
72 : :
73 : : void
74 : 1121940 : vect_slp_init (void)
75 : : {
76 : 1121940 : slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
77 : 1121940 : }
78 : :
79 : : void
80 : 1121940 : vect_slp_fini (void)
81 : : {
82 : 1689034 : while (slp_first_node)
83 : 567094 : delete slp_first_node;
84 : 2243880 : delete slp_tree_pool;
85 : 1121940 : slp_tree_pool = NULL;
86 : 1121940 : }
87 : :
88 : : void *
89 : 7891906 : _slp_tree::operator new (size_t n)
90 : : {
91 : 7891906 : gcc_assert (n == sizeof (_slp_tree));
92 : 7891906 : return slp_tree_pool->allocate_raw ();
93 : : }
94 : :
95 : : void
96 : 7891906 : _slp_tree::operator delete (void *node, size_t n)
97 : : {
98 : 7891906 : gcc_assert (n == sizeof (_slp_tree));
99 : 7891906 : slp_tree_pool->remove_raw (node);
100 : 7891906 : }
101 : :
102 : :
103 : : /* Initialize a SLP node. */
104 : :
105 : 7891906 : _slp_tree::_slp_tree ()
106 : : {
107 : 7891906 : this->prev_node = NULL;
108 : 7891906 : if (slp_first_node)
109 : 6984814 : slp_first_node->prev_node = this;
110 : 7891906 : this->next_node = slp_first_node;
111 : 7891906 : slp_first_node = this;
112 : 7891906 : SLP_TREE_SCALAR_STMTS (this) = vNULL;
113 : 7891906 : SLP_TREE_SCALAR_OPS (this) = vNULL;
114 : 7891906 : SLP_TREE_VEC_DEFS (this) = vNULL;
115 : 7891906 : SLP_TREE_CHILDREN (this) = vNULL;
116 : 7891906 : SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
117 : 7891906 : SLP_TREE_LANE_PERMUTATION (this) = vNULL;
118 : 7891906 : SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
119 : 7891906 : SLP_TREE_CODE (this) = ERROR_MARK;
120 : 7891906 : SLP_TREE_GS_SCALE (this) = 0;
121 : 7891906 : SLP_TREE_GS_BASE (this) = NULL_TREE;
122 : 7891906 : this->ldst_lanes = false;
123 : 7891906 : this->avoid_stlf_fail = false;
124 : 7891906 : SLP_TREE_VECTYPE (this) = NULL_TREE;
125 : 7891906 : SLP_TREE_REPRESENTATIVE (this) = NULL;
126 : 7891906 : this->cycle_info.id = -1;
127 : 7891906 : this->cycle_info.reduc_idx = -1;
128 : 7891906 : SLP_TREE_REF_COUNT (this) = 1;
129 : 7891906 : this->failed = NULL;
130 : 7891906 : this->max_nunits = 1;
131 : 7891906 : this->lanes = 0;
132 : 7891906 : SLP_TREE_TYPE (this) = undef_vec_info_type;
133 : 7891906 : this->data = NULL;
134 : 7891906 : }
135 : :
136 : : /* Tear down a SLP node. */
137 : :
138 : 7891906 : _slp_tree::~_slp_tree ()
139 : : {
140 : 7891906 : if (this->prev_node)
141 : 5124392 : this->prev_node->next_node = this->next_node;
142 : : else
143 : 2767514 : slp_first_node = this->next_node;
144 : 7891906 : if (this->next_node)
145 : 5924909 : this->next_node->prev_node = this->prev_node;
146 : 7891906 : SLP_TREE_CHILDREN (this).release ();
147 : 7891906 : SLP_TREE_SCALAR_STMTS (this).release ();
148 : 7891906 : SLP_TREE_SCALAR_OPS (this).release ();
149 : 7891906 : SLP_TREE_VEC_DEFS (this).release ();
150 : 7891906 : SLP_TREE_LOAD_PERMUTATION (this).release ();
151 : 7891906 : SLP_TREE_LANE_PERMUTATION (this).release ();
152 : 7891906 : if (this->failed)
153 : 1929525 : free (failed);
154 : 7891906 : if (this->data)
155 : 1148189 : delete this->data;
156 : 7891906 : }
157 : :
158 : : /* Push the single SSA definition in DEF to the vector of vector defs. */
159 : :
160 : : void
161 : 494925 : _slp_tree::push_vec_def (gimple *def)
162 : : {
163 : 494925 : if (gphi *phi = dyn_cast <gphi *> (def))
164 : 60455 : vec_defs.quick_push (gimple_phi_result (phi));
165 : : else
166 : : {
167 : 434470 : def_operand_p defop = single_ssa_def_operand (def, SSA_OP_ALL_DEFS);
168 : 434470 : vec_defs.quick_push (get_def_from_ptr (defop));
169 : : }
170 : 494925 : }
171 : :
172 : : /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
173 : :
174 : : void
175 : 15111635 : vect_free_slp_tree (slp_tree node)
176 : : {
177 : 15111635 : int i;
178 : 15111635 : slp_tree child;
179 : :
180 : 15111635 : if (--SLP_TREE_REF_COUNT (node) != 0)
181 : 15111635 : return;
182 : :
183 : 11894848 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
184 : 4570036 : if (child)
185 : 3804911 : vect_free_slp_tree (child);
186 : :
187 : : /* If the node defines any SLP only patterns then those patterns are no
188 : : longer valid and should be removed. */
189 : 7324812 : stmt_vec_info rep_stmt_info = SLP_TREE_REPRESENTATIVE (node);
190 : 7324812 : if (rep_stmt_info && STMT_VINFO_SLP_VECT_ONLY_PATTERN (rep_stmt_info))
191 : : {
192 : 1051 : stmt_vec_info stmt_info = vect_orig_stmt (rep_stmt_info);
193 : 1051 : STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
194 : 1051 : STMT_SLP_TYPE (stmt_info) = STMT_SLP_TYPE (rep_stmt_info);
195 : : }
196 : :
197 : 7324812 : delete node;
198 : : }
199 : :
200 : : /* Return a location suitable for dumpings related to the SLP instance. */
201 : :
202 : : dump_user_location_t
203 : 3419175 : _slp_instance::location () const
204 : : {
205 : 3419175 : if (!root_stmts.is_empty ())
206 : 318383 : return root_stmts[0]->stmt;
207 : : else
208 : 3100792 : return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
209 : : }
210 : :
211 : :
212 : : /* Free the memory allocated for the SLP instance. */
213 : :
214 : : void
215 : 1759845 : vect_free_slp_instance (slp_instance instance)
216 : : {
217 : 1759845 : vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
218 : 1759845 : SLP_INSTANCE_LOADS (instance).release ();
219 : 1759845 : SLP_INSTANCE_ROOT_STMTS (instance).release ();
220 : 1759845 : SLP_INSTANCE_REMAIN_DEFS (instance).release ();
221 : 1759845 : instance->subgraph_entries.release ();
222 : 1759845 : instance->cost_vec.release ();
223 : 1759845 : free (instance);
224 : 1759845 : }
225 : :
226 : :
227 : : /* Create an SLP node for SCALAR_STMTS. */
228 : :
229 : : slp_tree
230 : 107603 : vect_create_new_slp_node (unsigned nops, tree_code code)
231 : : {
232 : 107603 : slp_tree node = new _slp_tree;
233 : 107603 : SLP_TREE_SCALAR_STMTS (node) = vNULL;
234 : 107603 : SLP_TREE_CHILDREN (node).create (nops);
235 : 107603 : SLP_TREE_DEF_TYPE (node) = vect_internal_def;
236 : 107603 : SLP_TREE_CODE (node) = code;
237 : 107603 : return node;
238 : : }
239 : : /* Create an SLP node for SCALAR_STMTS. */
240 : :
241 : : static slp_tree
242 : 3818146 : vect_create_new_slp_node (slp_tree node,
243 : : vec<stmt_vec_info> scalar_stmts, unsigned nops)
244 : : {
245 : 3818146 : SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
246 : 3818146 : SLP_TREE_CHILDREN (node).create (nops);
247 : 3818146 : SLP_TREE_DEF_TYPE (node) = vect_internal_def;
248 : 3818146 : SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
249 : 3818146 : SLP_TREE_LANES (node) = scalar_stmts.length ();
250 : 3818146 : return node;
251 : : }
252 : :
253 : : /* Create an SLP node for SCALAR_STMTS. */
254 : :
255 : : static slp_tree
256 : 6497 : vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
257 : : {
258 : 6497 : return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
259 : : }
260 : :
261 : : /* Create an SLP node for OPS. */
262 : :
263 : : static slp_tree
264 : 2029346 : vect_create_new_slp_node (slp_tree node, vec<tree> ops)
265 : : {
266 : 2029346 : SLP_TREE_SCALAR_OPS (node) = ops;
267 : 2029346 : SLP_TREE_DEF_TYPE (node) = vect_external_def;
268 : 0 : SLP_TREE_LANES (node) = ops.length ();
269 : 2029346 : return node;
270 : : }
271 : :
272 : : /* Create an SLP node for OPS. */
273 : :
274 : : static slp_tree
275 : 2029346 : vect_create_new_slp_node (vec<tree> ops)
276 : : {
277 : 2029346 : return vect_create_new_slp_node (new _slp_tree, ops);
278 : : }
279 : :
280 : :
281 : : /* This structure is used in creation of an SLP tree. Each instance
282 : : corresponds to the same operand in a group of scalar stmts in an SLP
283 : : node. */
284 : : typedef struct _slp_oprnd_info
285 : : {
286 : : /* Def-stmts for the operands. */
287 : : vec<stmt_vec_info> def_stmts;
288 : : /* Operands. */
289 : : vec<tree> ops;
290 : : /* Information about the first statement, its vector def-type, type, the
291 : : operand itself in case it's constant, and an indication if it's a pattern
292 : : stmt and gather/scatter info. */
293 : : tree first_op_type;
294 : : enum vect_def_type first_dt;
295 : : bool any_pattern;
296 : : bool first_gs_p;
297 : : gather_scatter_info first_gs_info;
298 : : } *slp_oprnd_info;
299 : :
300 : :
301 : : /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
302 : : operand. */
303 : : static vec<slp_oprnd_info>
304 : 3453084 : vect_create_oprnd_info (int nops, int group_size)
305 : : {
306 : 3453084 : int i;
307 : 3453084 : slp_oprnd_info oprnd_info;
308 : 3453084 : vec<slp_oprnd_info> oprnds_info;
309 : :
310 : 3453084 : oprnds_info.create (nops);
311 : 12517685 : for (i = 0; i < nops; i++)
312 : : {
313 : 5611517 : oprnd_info = XNEW (struct _slp_oprnd_info);
314 : 5611517 : oprnd_info->def_stmts.create (group_size);
315 : 5611517 : oprnd_info->ops.create (group_size);
316 : 5611517 : oprnd_info->first_dt = vect_uninitialized_def;
317 : 5611517 : oprnd_info->first_op_type = NULL_TREE;
318 : 5611517 : oprnd_info->any_pattern = false;
319 : 5611517 : oprnd_info->first_gs_p = false;
320 : 5611517 : oprnds_info.quick_push (oprnd_info);
321 : : }
322 : :
323 : 3453084 : return oprnds_info;
324 : : }
325 : :
326 : :
327 : : /* Free operands info. */
328 : :
329 : : static void
330 : 3453084 : vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
331 : : {
332 : 3453084 : int i;
333 : 3453084 : slp_oprnd_info oprnd_info;
334 : :
335 : 9064601 : FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
336 : : {
337 : 5611517 : oprnd_info->def_stmts.release ();
338 : 5611517 : oprnd_info->ops.release ();
339 : 5611517 : XDELETE (oprnd_info);
340 : : }
341 : :
342 : 3453084 : oprnds_info.release ();
343 : 3453084 : }
344 : :
345 : : /* Return the execution frequency of NODE (so that a higher value indicates
346 : : a "more important" node when optimizing for speed). */
347 : :
348 : : static sreal
349 : 3605793 : vect_slp_node_weight (slp_tree node)
350 : : {
351 : 3605793 : stmt_vec_info stmt_info = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (node));
352 : 3605793 : basic_block bb = gimple_bb (stmt_info->stmt);
353 : 3605793 : return bb->count.to_sreal_scale (ENTRY_BLOCK_PTR_FOR_FN (cfun)->count);
354 : : }
355 : :
356 : : /* Return true if STMTS contains a pattern statement. */
357 : :
358 : : static bool
359 : 22796 : vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
360 : : {
361 : 22796 : stmt_vec_info stmt_info;
362 : 22796 : unsigned int i;
363 : 74614 : FOR_EACH_VEC_ELT (stmts, i, stmt_info)
364 : 54065 : if (stmt_info && is_pattern_stmt_p (stmt_info))
365 : : return true;
366 : : return false;
367 : : }
368 : :
369 : : /* Return true when all lanes in the external or constant NODE have
370 : : the same value. */
371 : :
372 : : static bool
373 : 601350 : vect_slp_tree_uniform_p (slp_tree node)
374 : : {
375 : 601350 : gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
376 : : || SLP_TREE_DEF_TYPE (node) == vect_external_def);
377 : :
378 : : /* Pre-exsting vectors. */
379 : 1062986 : if (SLP_TREE_SCALAR_OPS (node).is_empty ())
380 : : return false;
381 : :
382 : : unsigned i;
383 : : tree op, first = NULL_TREE;
384 : 1370888 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
385 : 1231174 : if (!first)
386 : : first = op;
387 : 629824 : else if (!operand_equal_p (first, op, 0))
388 : : return false;
389 : :
390 : : return true;
391 : : }
392 : :
393 : : /* Find the place of the data-ref in STMT_INFO in the interleaving chain
394 : : that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
395 : : of the chain. */
396 : :
397 : : int
398 : 682675 : vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
399 : : stmt_vec_info first_stmt_info)
400 : : {
401 : 682675 : stmt_vec_info next_stmt_info = first_stmt_info;
402 : 682675 : int result = 0;
403 : :
404 : 682675 : if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
405 : : return -1;
406 : :
407 : 1724101 : do
408 : : {
409 : 1724101 : if (next_stmt_info == stmt_info)
410 : : return result;
411 : 1041426 : next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
412 : 1041426 : if (next_stmt_info)
413 : 1041426 : result += DR_GROUP_GAP (next_stmt_info);
414 : : }
415 : 1041426 : while (next_stmt_info);
416 : :
417 : : return -1;
418 : : }
419 : :
420 : : /* Check whether it is possible to load COUNT elements of type ELT_TYPE
421 : : using the method implemented by duplicate_and_interleave. Return true
422 : : if so, returning the number of intermediate vectors in *NVECTORS_OUT
423 : : (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
424 : : (if nonnull). */
425 : :
426 : : bool
427 : 0 : can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
428 : : tree elt_type, unsigned int *nvectors_out,
429 : : tree *vector_type_out,
430 : : tree *permutes)
431 : : {
432 : 0 : tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
433 : 0 : if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
434 : 0 : return false;
435 : :
436 : 0 : machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
437 : 0 : poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
438 : 0 : unsigned int nvectors = 1;
439 : 0 : for (;;)
440 : : {
441 : 0 : scalar_int_mode int_mode;
442 : : poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
443 : 0 : if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
444 : : {
445 : : /* Get the natural vector type for this SLP group size. */
446 : 0 : tree int_type = build_nonstandard_integer_type
447 : 0 : (GET_MODE_BITSIZE (int_mode), 1);
448 : 0 : tree vector_type
449 : 0 : = get_vectype_for_scalar_type (vinfo, int_type, count);
450 : 0 : poly_int64 half_nelts;
451 : 0 : if (vector_type
452 : 0 : && VECTOR_MODE_P (TYPE_MODE (vector_type))
453 : 0 : && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
454 : : GET_MODE_SIZE (base_vector_mode))
455 : 0 : && multiple_p (GET_MODE_NUNITS (TYPE_MODE (vector_type)),
456 : : 2, &half_nelts))
457 : : {
458 : : /* Try fusing consecutive sequences of COUNT / NVECTORS elements
459 : : together into elements of type INT_TYPE and using the result
460 : : to build NVECTORS vectors. */
461 : 0 : poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
462 : 0 : vec_perm_builder sel1 (nelts, 2, 3);
463 : 0 : vec_perm_builder sel2 (nelts, 2, 3);
464 : :
465 : 0 : for (unsigned int i = 0; i < 3; ++i)
466 : : {
467 : 0 : sel1.quick_push (i);
468 : 0 : sel1.quick_push (i + nelts);
469 : 0 : sel2.quick_push (half_nelts + i);
470 : 0 : sel2.quick_push (half_nelts + i + nelts);
471 : : }
472 : 0 : vec_perm_indices indices1 (sel1, 2, nelts);
473 : 0 : vec_perm_indices indices2 (sel2, 2, nelts);
474 : 0 : machine_mode vmode = TYPE_MODE (vector_type);
475 : 0 : if (can_vec_perm_const_p (vmode, vmode, indices1)
476 : 0 : && can_vec_perm_const_p (vmode, vmode, indices2))
477 : : {
478 : 0 : if (nvectors_out)
479 : 0 : *nvectors_out = nvectors;
480 : 0 : if (vector_type_out)
481 : 0 : *vector_type_out = vector_type;
482 : 0 : if (permutes)
483 : : {
484 : 0 : permutes[0] = vect_gen_perm_mask_checked (vector_type,
485 : : indices1);
486 : 0 : permutes[1] = vect_gen_perm_mask_checked (vector_type,
487 : : indices2);
488 : : }
489 : 0 : return true;
490 : : }
491 : 0 : }
492 : : }
493 : 0 : if (!multiple_p (elt_bytes, 2, &elt_bytes))
494 : : return false;
495 : 0 : nvectors *= 2;
496 : : /* We need to be able to fuse COUNT / NVECTORS elements together. */
497 : 0 : if (!multiple_p (count, nvectors))
498 : : return false;
499 : : }
500 : : }
501 : :
502 : : /* Return true if DTA and DTB match. */
503 : :
504 : : static bool
505 : 16623882 : vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
506 : : {
507 : 16623882 : return (dta == dtb
508 : 329475 : || ((dta == vect_external_def || dta == vect_constant_def)
509 : 208109 : && (dtb == vect_external_def || dtb == vect_constant_def)));
510 : : }
511 : :
512 : : #define GATHER_SCATTER_OFFSET (-3)
513 : :
514 : : static const int no_arg_map[] = { 0 };
515 : : static const int arg0_map[] = { 1, 0 };
516 : : static const int arg2_map[] = { 1, 2 };
517 : : static const int arg2_arg3_map[] = { 2, 2, 3 };
518 : : static const int arg2_arg4_map[] = { 2, 2, 4 };
519 : : static const int arg2_arg5_arg6_map[] = { 3, 2, 5, 6 };
520 : : static const int arg2_arg4_arg5_map[] = { 3, 2, 4, 5 };
521 : : static const int arg3_arg2_map[] = { 2, 3, 2 };
522 : : static const int op1_op0_map[] = { 2, 1, 0 };
523 : : static const int off_map[] = { 1, GATHER_SCATTER_OFFSET };
524 : : static const int off_op0_map[] = { 2, GATHER_SCATTER_OFFSET, 0 };
525 : : static const int off_arg2_arg3_map[] = { 3, GATHER_SCATTER_OFFSET, 2, 3 };
526 : : static const int off_arg3_arg2_map[] = { 3, GATHER_SCATTER_OFFSET, 3, 2 };
527 : : static const int mask_call_maps[6][7] = {
528 : : { 1, 1, },
529 : : { 2, 1, 2, },
530 : : { 3, 1, 2, 3, },
531 : : { 4, 1, 2, 3, 4, },
532 : : { 5, 1, 2, 3, 4, 5, },
533 : : { 6, 1, 2, 3, 4, 5, 6 },
534 : : };
535 : :
536 : : /* For most SLP statements, there is a one-to-one mapping between
537 : : gimple arguments and child nodes. If that is not true for STMT,
538 : : return an array that contains:
539 : :
540 : : - the number of child nodes, followed by
541 : : - for each child node, the index of the argument associated with that node.
542 : : The special index -1 is the first operand of an embedded comparison and
543 : : the special index -2 is the second operand of an embedded comparison.
544 : : The special indes -3 is the offset of a gather as analyzed by
545 : : vect_check_gather_scatter.
546 : :
547 : : SWAP is as for vect_get_and_check_slp_defs. */
548 : :
549 : : static const int *
550 : 19708871 : vect_get_operand_map (const gimple *stmt, bool gather_scatter_p = false,
551 : : unsigned char swap = 0)
552 : : {
553 : 19708871 : if (auto assign = dyn_cast<const gassign *> (stmt))
554 : : {
555 : 18111362 : if (gimple_assign_rhs_code (assign) == COND_EXPR
556 : 18111362 : && COMPARISON_CLASS_P (gimple_assign_rhs1 (assign)))
557 : 0 : gcc_unreachable ();
558 : 18111362 : if (TREE_CODE_CLASS (gimple_assign_rhs_code (assign)) == tcc_comparison
559 : 18111362 : && swap)
560 : : return op1_op0_map;
561 : 18069376 : if (gather_scatter_p)
562 : 33088 : return (TREE_CODE (gimple_assign_lhs (assign)) != SSA_NAME
563 : 33088 : ? off_op0_map : off_map);
564 : : }
565 : 19633797 : gcc_assert (!swap);
566 : 19633797 : if (auto call = dyn_cast<const gcall *> (stmt))
567 : : {
568 : 132566 : if (gimple_call_internal_p (call))
569 : 70571 : switch (gimple_call_internal_fn (call))
570 : : {
571 : 11321 : case IFN_MASK_LOAD:
572 : 18422 : return gather_scatter_p ? off_arg2_arg3_map : arg2_arg3_map;
573 : :
574 : 0 : case IFN_GATHER_LOAD:
575 : 0 : return arg2_map;
576 : :
577 : 0 : case IFN_MASK_GATHER_LOAD:
578 : 0 : case IFN_MASK_LEN_GATHER_LOAD:
579 : 0 : return arg2_arg5_arg6_map;
580 : :
581 : 0 : case IFN_SCATTER_STORE:
582 : 0 : return arg2_arg4_map;
583 : :
584 : 0 : case IFN_MASK_SCATTER_STORE:
585 : 0 : case IFN_MASK_LEN_SCATTER_STORE:
586 : 0 : return arg2_arg4_arg5_map;
587 : :
588 : 5912 : case IFN_MASK_STORE:
589 : 10526 : return gather_scatter_p ? off_arg3_arg2_map : arg3_arg2_map;
590 : :
591 : 842 : case IFN_MASK_CALL:
592 : 842 : {
593 : 842 : unsigned nargs = gimple_call_num_args (call);
594 : 842 : if (nargs >= 2 && nargs <= 7)
595 : 842 : return mask_call_maps[nargs-2];
596 : : else
597 : : return nullptr;
598 : : }
599 : :
600 : 140 : case IFN_CLZ:
601 : 140 : case IFN_CTZ:
602 : 140 : return arg0_map;
603 : :
604 : 6294 : case IFN_GOMP_SIMD_LANE:
605 : 6294 : return no_arg_map;
606 : :
607 : : default:
608 : : break;
609 : : }
610 : : }
611 : : return nullptr;
612 : : }
613 : :
614 : : /* Return the SLP node child index for operand OP of STMT. */
615 : :
616 : : int
617 : 1331935 : vect_slp_child_index_for_operand (const gimple *stmt, int op,
618 : : bool gather_scatter_p)
619 : : {
620 : 1331935 : const int *opmap = vect_get_operand_map (stmt, gather_scatter_p);
621 : 1331935 : if (!opmap)
622 : : return op;
623 : 19381 : for (int i = 1; i < 1 + opmap[0]; ++i)
624 : 19381 : if (opmap[i] == op)
625 : 10449 : return i - 1;
626 : 0 : gcc_unreachable ();
627 : : }
628 : :
629 : : /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
630 : : they are of a valid type and that they match the defs of the first stmt of
631 : : the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
632 : : by swapping operands of STMTS[STMT_NUM] when possible. Non-zero SWAP
633 : : indicates swap is required for cond_expr stmts. Specifically, SWAP
634 : : is 1 if STMT is cond and operands of comparison need to be swapped;
635 : : SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
636 : :
637 : : If there was a fatal error return -1; if the error could be corrected by
638 : : swapping operands of father node of this one, return 1; if everything is
639 : : ok return 0. */
640 : : static int
641 : 12622571 : vect_get_and_check_slp_defs (vec_info *vinfo, tree vectype, unsigned char swap,
642 : : bool *skip_args,
643 : : vec<stmt_vec_info> stmts, unsigned stmt_num,
644 : : vec<slp_oprnd_info> *oprnds_info)
645 : : {
646 : 12622571 : stmt_vec_info stmt_info = stmts[stmt_num];
647 : 12622571 : tree oprnd;
648 : 12622571 : unsigned int i, number_of_oprnds;
649 : 12622571 : enum vect_def_type dt = vect_uninitialized_def;
650 : 12622571 : slp_oprnd_info oprnd_info;
651 : 12622571 : gather_scatter_info gs_info;
652 : 12622571 : unsigned int gs_op = -1u;
653 : 12622571 : unsigned int commutative_op = -1U;
654 : 12622571 : bool first = stmt_num == 0;
655 : :
656 : 12622571 : if (!stmt_info)
657 : : {
658 : 0 : for (auto oi : *oprnds_info)
659 : : {
660 : 0 : oi->def_stmts.quick_push (NULL);
661 : 0 : oi->ops.quick_push (NULL_TREE);
662 : : }
663 : : return 0;
664 : : }
665 : :
666 : 12622571 : if (!is_a<gcall *> (stmt_info->stmt)
667 : : && !is_a<gassign *> (stmt_info->stmt)
668 : : && !is_a<gphi *> (stmt_info->stmt))
669 : : return -1;
670 : :
671 : 12622571 : number_of_oprnds = gimple_num_args (stmt_info->stmt);
672 : 12622571 : const int *map
673 : 25245142 : = vect_get_operand_map (stmt_info->stmt,
674 : 12622571 : STMT_VINFO_GATHER_SCATTER_P (stmt_info), swap);
675 : 12622571 : if (map)
676 : 65663 : number_of_oprnds = *map++;
677 : 12622571 : if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
678 : : {
679 : 38435 : if (gimple_call_internal_p (stmt))
680 : : {
681 : 22910 : internal_fn ifn = gimple_call_internal_fn (stmt);
682 : 22910 : commutative_op = first_commutative_argument (ifn);
683 : 22910 : if (internal_gather_scatter_fn_p (ifn))
684 : : {
685 : 0 : vect_describe_gather_scatter_call
686 : 0 : (stmt_info,
687 : 0 : first ? &(*oprnds_info)[0]->first_gs_info : &gs_info);
688 : 0 : if (first)
689 : 0 : (*oprnds_info)[0]->first_gs_p = true;
690 : : gs_op = 0;
691 : : }
692 : : }
693 : : }
694 : 12584136 : else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
695 : : {
696 : 14422412 : if (commutative_tree_code (gimple_assign_rhs_code (stmt)))
697 : 8216672 : commutative_op = 0;
698 : : }
699 : :
700 : 12622571 : bool swapped = (swap != 0);
701 : 12622571 : bool backedge = false;
702 : 12622571 : enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
703 : 34985357 : for (i = 0; i < number_of_oprnds; i++)
704 : : {
705 : 22365251 : oprnd_info = (*oprnds_info)[i];
706 : 22365251 : int opno = map ? map[i] : int (i);
707 : 22365251 : if (opno == GATHER_SCATTER_OFFSET)
708 : : {
709 : 16976 : gcc_assert (STMT_VINFO_GATHER_SCATTER_P (stmt_info));
710 : 16976 : if (!is_a <loop_vec_info> (vinfo)
711 : 16976 : || !vect_check_gather_scatter (stmt_info, vectype,
712 : : as_a <loop_vec_info> (vinfo),
713 : : first ? &oprnd_info->first_gs_info
714 : : : &gs_info))
715 : 2465 : return -1;
716 : :
717 : 16976 : if (first)
718 : : {
719 : 16739 : oprnd_info->first_gs_p = true;
720 : 16739 : oprnd = oprnd_info->first_gs_info.offset;
721 : : }
722 : : else
723 : : {
724 : 237 : gs_op = i;
725 : 237 : oprnd = gs_info.offset;
726 : : }
727 : : }
728 : 22348275 : else if (opno < 0)
729 : 0 : oprnd = TREE_OPERAND (gimple_arg (stmt_info->stmt, 0), -1 - opno);
730 : : else
731 : : {
732 : 22348275 : oprnd = gimple_arg (stmt_info->stmt, opno);
733 : 22348275 : if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
734 : : {
735 : 1481072 : edge e = gimple_phi_arg_edge (stmt, opno);
736 : 2962144 : backedge = (is_a <bb_vec_info> (vinfo)
737 : 2426230 : ? e->flags & EDGE_DFS_BACK
738 : 945158 : : dominated_by_p (CDI_DOMINATORS, e->src,
739 : 945158 : gimple_bb (stmt_info->stmt)));
740 : : }
741 : : }
742 : 22365251 : if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
743 : 2390 : oprnd = TREE_OPERAND (oprnd, 0);
744 : :
745 : 22365251 : stmt_vec_info def_stmt_info;
746 : 22365251 : if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
747 : : {
748 : 1117 : if (dump_enabled_p ())
749 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
750 : : "Build SLP failed: can't analyze def for %T\n",
751 : : oprnd);
752 : :
753 : 1117 : return -1;
754 : : }
755 : :
756 : 22364134 : if (skip_args[i])
757 : : {
758 : 872318 : oprnd_info->def_stmts.quick_push (NULL);
759 : 872318 : oprnd_info->ops.quick_push (NULL_TREE);
760 : 872318 : oprnd_info->first_dt = vect_uninitialized_def;
761 : 872318 : continue;
762 : : }
763 : :
764 : 21491816 : oprnd_info->def_stmts.quick_push (def_stmt_info);
765 : 21491816 : oprnd_info->ops.quick_push (oprnd);
766 : :
767 : 21491816 : if (def_stmt_info
768 : 21491816 : && is_pattern_stmt_p (def_stmt_info))
769 : : {
770 : 330935 : if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
771 : : != def_stmt_info)
772 : 237673 : oprnd_info->any_pattern = true;
773 : : else
774 : : /* If we promote this to external use the original stmt def. */
775 : 93262 : oprnd_info->ops.last ()
776 : 186524 : = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
777 : : }
778 : :
779 : : /* If there's a extern def on a backedge make sure we can
780 : : code-generate at the region start.
781 : : ??? This is another case that could be fixed by adjusting
782 : : how we split the function but at the moment we'd have conflicting
783 : : goals there. */
784 : 21491816 : if (backedge
785 : 107939 : && dts[i] == vect_external_def
786 : 1369 : && is_a <bb_vec_info> (vinfo)
787 : 1369 : && TREE_CODE (oprnd) == SSA_NAME
788 : 1348 : && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
789 : 21493164 : && !dominated_by_p (CDI_DOMINATORS, vinfo->bbs[0],
790 : 1348 : gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
791 : : {
792 : 1348 : if (dump_enabled_p ())
793 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
794 : : "Build SLP failed: extern def %T only defined "
795 : : "on backedge\n", oprnd);
796 : 1348 : return -1;
797 : : }
798 : :
799 : 21490468 : if (first)
800 : : {
801 : 4752491 : tree type = TREE_TYPE (oprnd);
802 : 4752491 : dt = dts[i];
803 : :
804 : : /* For the swapping logic below force vect_reduction_def
805 : : for the reduction op in a SLP reduction group. */
806 : 4752491 : if (!STMT_VINFO_DATA_REF (stmt_info)
807 : 3665933 : && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
808 : 6101 : && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
809 : 4755528 : && def_stmt_info)
810 : 3037 : dts[i] = dt = vect_reduction_def;
811 : :
812 : : /* Check the types of the definition. */
813 : 4752491 : switch (dt)
814 : : {
815 : 4752491 : case vect_external_def:
816 : 4752491 : case vect_constant_def:
817 : 4752491 : case vect_internal_def:
818 : 4752491 : case vect_reduction_def:
819 : 4752491 : case vect_double_reduction_def:
820 : 4752491 : case vect_induction_def:
821 : 4752491 : case vect_nested_cycle:
822 : 4752491 : case vect_first_order_recurrence:
823 : 4752491 : break;
824 : :
825 : 0 : default:
826 : : /* FORNOW: Not supported. */
827 : 0 : if (dump_enabled_p ())
828 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
829 : : "Build SLP failed: illegal type of def %T\n",
830 : : oprnd);
831 : 0 : return -1;
832 : : }
833 : :
834 : 4752491 : oprnd_info->first_dt = dt;
835 : 4752491 : oprnd_info->first_op_type = type;
836 : : }
837 : : }
838 : 12620106 : if (first)
839 : : return 0;
840 : :
841 : : /* Now match the operand definition types to that of the first stmt. */
842 : 25641306 : for (i = 0; i < number_of_oprnds;)
843 : : {
844 : 16724830 : if (skip_args[i])
845 : : {
846 : 17381 : ++i;
847 : 17381 : continue;
848 : : }
849 : :
850 : 16707449 : oprnd_info = (*oprnds_info)[i];
851 : 16707449 : dt = dts[i];
852 : 16707449 : stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
853 : 16707449 : oprnd = oprnd_info->ops[stmt_num];
854 : 16707449 : tree type = TREE_TYPE (oprnd);
855 : :
856 : 16707449 : if (!types_compatible_p (oprnd_info->first_op_type, type))
857 : : {
858 : 88979 : if (dump_enabled_p ())
859 : 107 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
860 : : "Build SLP failed: different operand types\n");
861 : 88979 : return 1;
862 : : }
863 : :
864 : 16618470 : if ((gs_op == i) != oprnd_info->first_gs_p)
865 : : {
866 : 0 : if (dump_enabled_p ())
867 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
868 : : "Build SLP failed: mixed gather and non-gather\n");
869 : 0 : return 1;
870 : : }
871 : 16618470 : else if (gs_op == i)
872 : : {
873 : 207 : if (!operand_equal_p (oprnd_info->first_gs_info.base,
874 : 207 : gs_info.base))
875 : : {
876 : 16 : if (dump_enabled_p ())
877 : 6 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
878 : : "Build SLP failed: different gather base\n");
879 : 16 : return 1;
880 : : }
881 : 191 : if (oprnd_info->first_gs_info.scale != gs_info.scale)
882 : : {
883 : 8 : if (dump_enabled_p ())
884 : 2 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
885 : : "Build SLP failed: different gather scale\n");
886 : 8 : return 1;
887 : : }
888 : : }
889 : :
890 : : /* Not first stmt of the group, check that the def-stmt/s match
891 : : the def-stmt/s of the first stmt. Allow different definition
892 : : types for reduction chains: the first stmt must be a
893 : : vect_reduction_def (a phi node), and the rest
894 : : end in the reduction chain. */
895 : 16618446 : if ((!vect_def_types_match (oprnd_info->first_dt, dt)
896 : 269961 : && !(oprnd_info->first_dt == vect_reduction_def
897 : 1553 : && !STMT_VINFO_DATA_REF (stmt_info)
898 : 1553 : && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
899 : 1553 : && def_stmt_info
900 : 1553 : && !STMT_VINFO_DATA_REF (def_stmt_info)
901 : 1547 : && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
902 : : == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
903 : 16349877 : || (!STMT_VINFO_DATA_REF (stmt_info)
904 : 15055585 : && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
905 : 9103 : && ((!def_stmt_info
906 : 8982 : || STMT_VINFO_DATA_REF (def_stmt_info)
907 : 16996 : || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
908 : : != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
909 : 9103 : != (oprnd_info->first_dt != vect_reduction_def))))
910 : : {
911 : : /* Try swapping operands if we got a mismatch. For BB
912 : : vectorization only in case it will clearly improve things. */
913 : 271248 : if (i == commutative_op && !swapped
914 : 268893 : && (!is_a <bb_vec_info> (vinfo)
915 : 4565 : || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
916 : 4565 : dts[i+1])
917 : 680 : && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
918 : : || vect_def_types_match
919 : 191 : ((*oprnds_info)[i+1]->first_dt, dts[i])))))
920 : : {
921 : 2355 : if (dump_enabled_p ())
922 : 285 : dump_printf_loc (MSG_NOTE, vect_location,
923 : : "trying swapped operands\n");
924 : 2355 : std::swap (dts[i], dts[i+1]);
925 : 2355 : std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
926 : 2355 : (*oprnds_info)[i+1]->def_stmts[stmt_num]);
927 : 2355 : std::swap ((*oprnds_info)[i]->ops[stmt_num],
928 : 2355 : (*oprnds_info)[i+1]->ops[stmt_num]);
929 : : /* After swapping some operands we lost track whether an
930 : : operand has any pattern defs so be conservative here. */
931 : 2355 : if ((*oprnds_info)[i]->any_pattern
932 : 2355 : || (*oprnds_info)[i+1]->any_pattern)
933 : 4 : (*oprnds_info)[i]->any_pattern
934 : 2 : = (*oprnds_info)[i+1]->any_pattern = true;
935 : 2355 : swapped = true;
936 : 2355 : continue;
937 : : }
938 : :
939 : 266538 : if (is_a <bb_vec_info> (vinfo)
940 : 255986 : && !oprnd_info->any_pattern
941 : 522287 : && number_of_oprnds > 1)
942 : : {
943 : : /* Now for commutative ops we should see whether we can
944 : : make the other operand matching. */
945 : 101306 : if (dump_enabled_p ())
946 : 149 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
947 : : "treating operand as external\n");
948 : 101306 : oprnd_info->first_dt = dt = vect_external_def;
949 : : }
950 : : else
951 : : {
952 : 165232 : if (dump_enabled_p ())
953 : 393 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
954 : : "Build SLP failed: different types\n");
955 : 165232 : return 1;
956 : : }
957 : : }
958 : :
959 : : /* Make sure to demote the overall operand to external. */
960 : 16450859 : if (dt == vect_external_def)
961 : 339715 : oprnd_info->first_dt = vect_external_def;
962 : : /* For a SLP reduction chain we want to duplicate the reduction to
963 : : each of the chain members. That gets us a sane SLP graph (still
964 : : the stmts are not 100% correct wrt the initial values). */
965 : 16111144 : else if ((dt == vect_internal_def
966 : 16111144 : || dt == vect_reduction_def)
967 : 15207538 : && oprnd_info->first_dt == vect_reduction_def
968 : 16229 : && !STMT_VINFO_DATA_REF (stmt_info)
969 : 16229 : && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
970 : 4280 : && !STMT_VINFO_DATA_REF (def_stmt_info)
971 : 16115424 : && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
972 : : == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
973 : : {
974 : 4280 : oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
975 : 4280 : oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
976 : : }
977 : :
978 : 16450859 : ++i;
979 : : }
980 : :
981 : : /* Swap operands. */
982 : 8916476 : if (swapped)
983 : : {
984 : 41681 : if (dump_enabled_p ())
985 : 356 : dump_printf_loc (MSG_NOTE, vect_location,
986 : : "swapped operands to match def types in %G",
987 : : stmt_info->stmt);
988 : : }
989 : :
990 : : return 0;
991 : : }
992 : :
993 : : /* Return true if call statements CALL1 and CALL2 are similar enough
994 : : to be combined into the same SLP group. */
995 : :
996 : : bool
997 : 20620 : compatible_calls_p (gcall *call1, gcall *call2, bool allow_two_operators)
998 : : {
999 : 20620 : unsigned int nargs = gimple_call_num_args (call1);
1000 : 20620 : if (nargs != gimple_call_num_args (call2))
1001 : : return false;
1002 : :
1003 : 18733 : auto cfn1 = gimple_call_combined_fn (call1);
1004 : 18733 : auto cfn2 = gimple_call_combined_fn (call2);
1005 : 18733 : if (cfn1 != cfn2
1006 : 2 : && (!allow_two_operators
1007 : 2 : || !((cfn1 == CFN_FMA || cfn1 == CFN_FMS)
1008 : 2 : && (cfn2 == CFN_FMA || cfn2 == CFN_FMS))))
1009 : : return false;
1010 : :
1011 : 18733 : if (gimple_call_internal_p (call1))
1012 : : {
1013 : 6762 : if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
1014 : 6762 : TREE_TYPE (gimple_call_lhs (call2))))
1015 : : return false;
1016 : 13731 : for (unsigned int i = 0; i < nargs; ++i)
1017 : 6969 : if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
1018 : 6969 : TREE_TYPE (gimple_call_arg (call2, i))))
1019 : : return false;
1020 : : }
1021 : : else
1022 : : {
1023 : 11971 : if (!operand_equal_p (gimple_call_fn (call1),
1024 : 11971 : gimple_call_fn (call2), 0))
1025 : : return false;
1026 : :
1027 : 25944 : if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
1028 : : return false;
1029 : : }
1030 : :
1031 : : /* Check that any unvectorized arguments are equal. */
1032 : 15410 : if (const int *map = vect_get_operand_map (call1))
1033 : : {
1034 : 15 : unsigned int nkept = *map++;
1035 : 15 : unsigned int mapi = 0;
1036 : 57 : for (unsigned int i = 0; i < nargs; ++i)
1037 : 42 : if (mapi < nkept && map[mapi] == int (i))
1038 : 27 : mapi += 1;
1039 : 15 : else if (!operand_equal_p (gimple_call_arg (call1, i),
1040 : 15 : gimple_call_arg (call2, i)))
1041 : : return false;
1042 : : }
1043 : :
1044 : : return true;
1045 : : }
1046 : :
1047 : : /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
1048 : : caller's attempt to find the vector type in STMT_INFO with the narrowest
1049 : : element type. Return true if VECTYPE is nonnull and if it is valid
1050 : : for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
1051 : : number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
1052 : : vect_build_slp_tree. */
1053 : :
1054 : : static bool
1055 : 5646107 : vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
1056 : : unsigned int group_size,
1057 : : tree vectype, poly_uint64 *max_nunits)
1058 : : {
1059 : 5646107 : if (!vectype)
1060 : : {
1061 : 4030 : if (dump_enabled_p ())
1062 : 11 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1063 : : "Build SLP failed: unsupported data-type in %G\n",
1064 : : stmt_info->stmt);
1065 : : /* Fatal mismatch. */
1066 : 4030 : return false;
1067 : : }
1068 : :
1069 : : /* If populating the vector type requires unrolling then fail
1070 : : before adjusting *max_nunits for basic-block vectorization. */
1071 : 5642077 : if (is_a <bb_vec_info> (vinfo)
1072 : 5642077 : && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
1073 : : {
1074 : 142503 : if (dump_enabled_p ())
1075 : 34 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1076 : : "Build SLP failed: unrolling required "
1077 : : "in basic block SLP\n");
1078 : : /* Fatal mismatch. */
1079 : 142503 : return false;
1080 : : }
1081 : :
1082 : : /* In case of multiple types we need to detect the smallest type. */
1083 : 5499574 : vect_update_max_nunits (max_nunits, vectype);
1084 : 5499574 : return true;
1085 : : }
1086 : :
/* Verify if the scalar stmts STMTS are isomorphic, require data
   permutation or are of unsupported types of operation.  Return
   true if they are, otherwise return false and indicate in *MATCHES
   which stmts are not isomorphic to the first one.  If MATCHES[0]
   is false then this indicates the comparison could not be
   carried out or the stmts will never be vectorized by SLP.

   On success *NODE_VECTYPE is set to the vector type of the node,
   *MAX_NUNITS is updated and *TWO_OPERATORS is set when a mixed
   plus/minus (or FMA/FMS) node was accepted.

   Note COND_EXPR is possibly isomorphic to another one after swapping its
   operands.  Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
   the first stmt by swapping the two operands of comparison; set SWAP[i]
   to 2 if stmt I is isomorphic to the first stmt by inverting the code
   of comparison.  Take A1 >= B1 ? X1 : Y1 as an example, it can be swapped
   to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1.  */

static bool
vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
		       vec<stmt_vec_info> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits, bool *matches,
		       bool *two_operators, tree *node_vectype)
{
  unsigned int i;
  stmt_vec_info first_stmt_info = stmts[0];
  code_helper first_stmt_code = ERROR_MARK;
  code_helper alt_stmt_code = ERROR_MARK;
  code_helper first_cond_code = ERROR_MARK;
  bool need_same_oprnds = false;
  tree first_lhs = NULL_TREE;
  tree first_op1 = NULL_TREE;
  stmt_vec_info first_load = NULL, prev_first_load = NULL;
  bool first_stmt_ldst_p = false, first_stmt_ldst_masklen_p = false;
  bool first_stmt_phi_p = false;
  int first_reduc_idx = -1;
  bool maybe_soft_fail = false;
  tree soft_fail_nunits_vectype = NULL_TREE;

  /* Determine the vector type of the node from the first stmt.  */
  tree vectype, nunits_vectype;
  if (!vect_get_vector_types_for_stmt (vinfo, first_stmt_info, &vectype,
				       &nunits_vectype, group_size))
    {
      /* Fatal mismatch.  */
      matches[0] = false;
      return false;
    }
  if (is_a <bb_vec_info> (vinfo)
      && known_le (TYPE_VECTOR_SUBPARTS (vectype), 1U))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "Build SLP failed: not using single lane "
			 "vector type %T\n", vectype);
      matches[0] = false;
      return false;
    }
  /* Record nunits required but continue analysis, producing matches[]
     as if nunits was not an issue.  This allows splitting of groups
     to happen.  */
  if (nunits_vectype
      && !vect_record_max_nunits (vinfo, first_stmt_info, group_size,
				  nunits_vectype, max_nunits))
    {
      gcc_assert (is_a <bb_vec_info> (vinfo));
      maybe_soft_fail = true;
      soft_fail_nunits_vectype = nunits_vectype;
    }

  gcc_assert (vectype || !gimple_get_lhs (first_stmt_info->stmt));
  *node_vectype = vectype;

  /* For every stmt in NODE find its def stmt/s.  */
  stmt_vec_info stmt_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    {
      bool ldst_p = false;
      bool ldst_masklen_p = false;
      bool phi_p = false;
      code_helper rhs_code = ERROR_MARK;

      swap[i] = 0;
      matches[i] = false;
      /* A NULL stmt is a gap in the group and trivially matches.  */
      if (!stmt_info)
	{
	  matches[i] = true;
	  continue;
	}

      gimple *stmt = stmt_info->stmt;
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);

      /* Fail to vectorize statements marked as unvectorizable, throw
	 or are volatile.  */
      if (!STMT_VINFO_VECTORIZABLE (stmt_info)
	  || stmt_can_throw_internal (cfun, stmt)
	  || gimple_has_volatile_ops (stmt))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: unvectorizable statement %G",
			     stmt);
	  /* ??? For BB vectorization we want to commutate operands in a way
	     to shuffle all unvectorizable defs into one operand and have
	     the other still vectorized.  The following doesn't reliably
	     work for this though but it's the easiest we can do here.  */
	  if (is_a <bb_vec_info> (vinfo) && i != 0)
	    continue;
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      gcall *call_stmt = dyn_cast <gcall *> (stmt);
      tree lhs = gimple_get_lhs (stmt);
      if (lhs == NULL_TREE && !call_stmt)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: not GIMPLE_ASSIGN nor "
			     "GIMPLE_CALL %G", stmt);
	  if (is_a <bb_vec_info> (vinfo) && i != 0)
	    continue;
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      /* Classify the stmt: compute RHS_CODE and whether it is a
	 load/store (LDST_P), a masked/len load/store (LDST_MASKLEN_P)
	 or a PHI (PHI_P).  */
      if (call_stmt)
	{
	  combined_fn cfn = gimple_call_combined_fn (call_stmt);
	  if (cfn != CFN_LAST && cfn != CFN_MASK_CALL)
	    rhs_code = cfn;
	  else
	    rhs_code = CALL_EXPR;

	  if (cfn == CFN_GATHER_LOAD
	      || cfn == CFN_SCATTER_STORE)
	    ldst_p = true;
	  else if (cfn == CFN_MASK_LOAD
		   || cfn == CFN_MASK_GATHER_LOAD
		   || cfn == CFN_MASK_LEN_GATHER_LOAD
		   || cfn == CFN_MASK_SCATTER_STORE
		   || cfn == CFN_MASK_LEN_SCATTER_STORE)
	    {
	      ldst_p = true;
	      ldst_masklen_p = true;
	    }
	  else if (cfn == CFN_MASK_STORE)
	    {
	      ldst_p = true;
	      ldst_masklen_p = true;
	      rhs_code = CFN_MASK_STORE;
	    }
	  else if (cfn == CFN_GOMP_SIMD_LANE)
	    ;
	  else if ((cfn != CFN_LAST
		    && cfn != CFN_MASK_CALL
		    && internal_fn_p (cfn)
		    && !vectorizable_internal_fn_p (as_internal_fn (cfn)))
		   || gimple_call_tail_p (call_stmt)
		   || gimple_call_noreturn_p (call_stmt)
		   || gimple_call_chain (call_stmt))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: unsupported call type %G",
				 (gimple *) call_stmt);
	      if (is_a <bb_vec_info> (vinfo) && i != 0)
		continue;
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }
	}
      else if (gimple_code (stmt) == GIMPLE_PHI)
	{
	  rhs_code = ERROR_MARK;
	  phi_p = true;
	}
      else
	{
	  rhs_code = gimple_assign_rhs_code (stmt);
	  ldst_p = STMT_VINFO_DATA_REF (stmt_info) != nullptr;
	}

      /* Check the operation.  */
      if (i == 0)
	{
	  /* The first stmt establishes the reference everything else
	     has to match against.  */
	  first_lhs = lhs;
	  first_stmt_code = rhs_code;
	  first_stmt_ldst_p = ldst_p;
	  first_stmt_ldst_masklen_p = ldst_masklen_p;
	  first_stmt_phi_p = phi_p;
	  first_reduc_idx = STMT_VINFO_REDUC_IDX (stmt_info);

	  /* Shift arguments should be equal in all the packed stmts for a
	     vector shift with scalar shift operand.  */
	  if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
	      || rhs_code == LROTATE_EXPR
	      || rhs_code == RROTATE_EXPR)
	    {
	      /* First see if we have a vector/vector shift.  */
	      if (!directly_supported_p (rhs_code, vectype, optab_vector))
		{
		  /* No vector/vector shift, try for a vector/scalar shift.  */
		  if (!directly_supported_p (rhs_code, vectype, optab_scalar))
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
					 "Build SLP failed: "
					 "op not supported by target.\n");
		      if (is_a <bb_vec_info> (vinfo) && i != 0)
			continue;
		      /* Fatal mismatch.  */
		      matches[0] = false;
		      return false;
		    }
		  need_same_oprnds = true;
		  first_op1 = gimple_assign_rhs2 (stmt);
		}
	    }
	  else if (rhs_code == WIDEN_LSHIFT_EXPR)
	    {
	      need_same_oprnds = true;
	      first_op1 = gimple_assign_rhs2 (stmt);
	    }
	  else if (!ldst_p
		   && rhs_code == BIT_FIELD_REF)
	    {
	      tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
	      if (!is_a <bb_vec_info> (vinfo)
		  || TREE_CODE (vec) != SSA_NAME
		  /* When the element types are not compatible we pun the
		     source to the target vectype which requires equal size.  */
		  || ((!VECTOR_TYPE_P (TREE_TYPE (vec))
		       || !types_compatible_p (TREE_TYPE (vectype),
					       TREE_TYPE (TREE_TYPE (vec))))
		      && !operand_equal_p (TYPE_SIZE (vectype),
					   TYPE_SIZE (TREE_TYPE (vec)))))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: "
				     "BIT_FIELD_REF not supported\n");
		  /* Fatal mismatch.  */
		  matches[0] = false;
		  return false;
		}
	    }
	  else if (rhs_code == CFN_DIV_POW2)
	    {
	      need_same_oprnds = true;
	      first_op1 = gimple_call_arg (call_stmt, 1);
	    }
	  else if (rhs_code == CFN_GOMP_SIMD_LANE)
	    {
	      need_same_oprnds = true;
	      first_op1 = gimple_call_arg (call_stmt, 1);
	    }
	}
      else
	{
	  if (first_reduc_idx != STMT_VINFO_REDUC_IDX (stmt_info)
	      /* For SLP reduction groups the index isn't necessarily
		 uniform but only that of the first stmt matters.  */
	      && !(first_reduc_idx != -1
		   && STMT_VINFO_REDUC_IDX (stmt_info) != -1
		   && REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: different reduc_idx "
				   "%d instead of %d in %G",
				   STMT_VINFO_REDUC_IDX (stmt_info),
				   first_reduc_idx, stmt);
		}
	      /* Mismatch.  */
	      continue;
	    }
	  /* Remember the first differing operation; a second different
	     one below counts as a mismatch.  */
	  if (!ldst_p
	      && first_stmt_code != rhs_code
	      && alt_stmt_code == ERROR_MARK)
	    alt_stmt_code = rhs_code;
	  if ((!ldst_p
	       && first_stmt_code != rhs_code
	       && (first_stmt_code != IMAGPART_EXPR
		   || rhs_code != REALPART_EXPR)
	       && (first_stmt_code != REALPART_EXPR
		   || rhs_code != IMAGPART_EXPR)
	       /* Handle mismatches in plus/minus by computing both
		  and merging the results.  */
	       && !((((first_stmt_code == PLUS_EXPR
		       || first_stmt_code == MINUS_EXPR)
		      && (alt_stmt_code == PLUS_EXPR
			  || alt_stmt_code == MINUS_EXPR))
		     || ((first_stmt_code == CFN_FMA
			  || first_stmt_code == CFN_FMS)
			 && (alt_stmt_code == CFN_FMA
			     || alt_stmt_code == CFN_FMS)))
		    && rhs_code == alt_stmt_code)
	       && !(first_stmt_code.is_tree_code ()
		    && rhs_code.is_tree_code ()
		    && (TREE_CODE_CLASS (tree_code (first_stmt_code))
			== tcc_comparison)
		    && (swap_tree_comparison (tree_code (first_stmt_code))
			== tree_code (rhs_code))
		    && (first_reduc_idx == -1
			|| REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
	      || (ldst_p
		  && (STMT_VINFO_GROUPED_ACCESS (stmt_info)
		      != STMT_VINFO_GROUPED_ACCESS (first_stmt_info)))
	      || (ldst_p
		  && (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
		      != STMT_VINFO_GATHER_SCATTER_P (first_stmt_info)))
	      || first_stmt_ldst_p != ldst_p
	      || (ldst_p && first_stmt_ldst_masklen_p != ldst_masklen_p)
	      || first_stmt_phi_p != phi_p)
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: different operation "
				   "in stmt %G", stmt);
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "original stmt %G", first_stmt_info->stmt);
		}
	      /* Mismatch.  */
	      continue;
	    }

	  /* All BIT_FIELD_REFs in the node have to select from the
	     same source vector.  */
	  if (!ldst_p
	      && first_stmt_code == BIT_FIELD_REF
	      && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
		  != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different BIT_FIELD_REF "
				 "arguments in %G", stmt);
	      /* Mismatch.  */
	      continue;
	    }

	  if (call_stmt
	      && first_stmt_code != CFN_MASK_LOAD
	      && first_stmt_code != CFN_MASK_STORE)
	    {
	      if (!is_a <gcall *> (stmts[0]->stmt)
		  || !compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
					  call_stmt, true))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different calls in %G",
				     stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }

	  /* PHIs and possibly trapping stmts cannot be merged across
	     basic blocks.  */
	  if ((phi_p || gimple_could_trap_p (stmt_info->stmt))
	      && (gimple_bb (first_stmt_info->stmt)
		  != gimple_bb (stmt_info->stmt)))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different BB for PHI "
				 "or possibly trapping operation in %G", stmt);
	      /* Mismatch.  */
	      continue;
	    }

	  if (need_same_oprnds)
	    {
	      tree other_op1 = gimple_arg (stmt, 1);
	      if (!operand_equal_p (first_op1, other_op1, 0))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different shift "
				     "arguments in %G", stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }

	  if (first_lhs
	      && lhs
	      && !types_compatible_p (TREE_TYPE (lhs), TREE_TYPE (first_lhs)))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different vector type "
				 "in %G", stmt);
	      /* Mismatch.  */
	      continue;
	    }
	}

      /* Grouped store or load.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
	{
	  gcc_assert (ldst_p);
	  if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
	    {
	      /* Store.  */
	      gcc_assert (rhs_code == CFN_MASK_STORE
			  || REFERENCE_CLASS_P (lhs)
			  || DECL_P (lhs));
	    }
	  else
	    {
	      /* Load.  */
	      first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
	      if (prev_first_load)
		{
		  /* Check that there are no loads from different interleaving
		     chains in the same node.  */
		  if (prev_first_load != first_load)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION,
					 vect_location,
					 "Build SLP failed: different "
					 "interleaving chains in one node %G",
					 stmt);
		      /* Mismatch.  */
		      continue;
		    }
		}
	      else
		prev_first_load = first_load;
	    }
	}
      /* Non-grouped store or load.  */
      else if (ldst_p)
	{
	  if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))
	      && rhs_code != CFN_GATHER_LOAD
	      && rhs_code != CFN_MASK_GATHER_LOAD
	      && rhs_code != CFN_MASK_LEN_GATHER_LOAD
	      && rhs_code != CFN_SCATTER_STORE
	      && rhs_code != CFN_MASK_SCATTER_STORE
	      && rhs_code != CFN_MASK_LEN_SCATTER_STORE
	      && !STMT_VINFO_GATHER_SCATTER_P (stmt_info)
	      /* Not grouped loads are handled as externals for BB
		 vectorization.  For loop vectorization we can handle
		 splats the same we handle single element interleaving.  */
	      && (is_a <bb_vec_info> (vinfo)
		  || stmt_info != first_stmt_info))
	    {
	      /* Not grouped load.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: not grouped load %G", stmt);

	      if (i != 0)
		continue;
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }
	}
      /* Not memory operation.  */
      else
	{
	  if (!phi_p
	      && rhs_code.is_tree_code ()
	      && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_binary
	      && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_unary
	      && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_expression
	      && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_comparison
	      && rhs_code != VIEW_CONVERT_EXPR
	      && rhs_code != CALL_EXPR
	      && rhs_code != BIT_FIELD_REF
	      && rhs_code != SSA_NAME)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: operation unsupported %G",
				 stmt);
	      if (is_a <bb_vec_info> (vinfo) && i != 0)
		continue;
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }

	  if (rhs_code == COND_EXPR)
	    {
	      tree cond_expr = gimple_assign_rhs1 (stmt);
	      enum tree_code cond_code = TREE_CODE (cond_expr);
	      enum tree_code swap_code = ERROR_MARK;
	      enum tree_code invert_code = ERROR_MARK;

	      if (i == 0)
		first_cond_code = TREE_CODE (cond_expr);
	      else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
		{
		  bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
		  swap_code = swap_tree_comparison (cond_code);
		  invert_code = invert_tree_comparison (cond_code, honor_nans);
		}

	      if (first_cond_code == cond_code)
		;
	      /* Isomorphic can be achieved by swapping.  */
	      else if (first_cond_code == swap_code)
		swap[i] = 1;
	      /* Isomorphic can be achieved by inverting.  */
	      else if (first_cond_code == invert_code)
		swap[i] = 2;
	      else
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different"
				     " operation %G", stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }

	  /* A swapped comparison was accepted by the operation check
	     above; record that this lane's operands must be swapped.  */
	  if (i != 0
	      && first_stmt_code != rhs_code
	      && first_stmt_code.is_tree_code ()
	      && rhs_code.is_tree_code ()
	      && TREE_CODE_CLASS ((tree_code)first_stmt_code) == tcc_comparison
	      && (swap_tree_comparison ((tree_code)first_stmt_code)
		  == (tree_code)rhs_code))
	    swap[i] = 1;
	}

      /* Stmt I is isomorphic to the first stmt.  */
      matches[i] = true;
    }

  /* Fail if any lane did not match.  */
  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      return false;

  /* If we allowed a two-operation SLP node verify the target can cope
     with the permute we are going to use.  */
  if (alt_stmt_code != ERROR_MARK
      && (!alt_stmt_code.is_tree_code ()
	  || (TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_reference
	      && TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_comparison)))
    {
      *two_operators = true;
    }

  /* A vector type requiring unrolling was recorded above (BB SLP only);
     fail now, after matches[] was fully produced.  */
  if (maybe_soft_fail)
    {
      unsigned HOST_WIDE_INT const_nunits;
      if (!TYPE_VECTOR_SUBPARTS
	    (soft_fail_nunits_vectype).is_constant (&const_nunits)
	  || const_nunits > group_size)
	matches[0] = false;
      else
	{
	  /* With constant vector elements simulate a mismatch at the
	     point we need to split.  */
	  unsigned tail = group_size & (const_nunits - 1);
	  memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
	}
      return false;
    }

  return true;
}
1655 : :
/* Traits for the hash_set to record failed SLP builds for a stmt set.
   Note we never remove apart from at destruction time so we do not
   need a special value for deleted that differs from empty.  */
struct bst_traits
{
  typedef vec <stmt_vec_info> value_type;
  typedef vec <stmt_vec_info> compare_type;
  static inline hashval_t hash (value_type);
  static inline bool equal (value_type existing, value_type candidate);
  /* An unallocated vec doubles as both the empty and the deleted marker.  */
  static inline bool is_empty (value_type x) { return !x.exists (); }
  static inline bool is_deleted (value_type x) { return !x.exists (); }
  static const bool empty_zero_p = true;
  /* Marking and removal release the key vector's storage.  */
  static inline void mark_empty (value_type &x) { x.release (); }
  static inline void mark_deleted (value_type &x) { x.release (); }
  static inline void remove (value_type &x) { x.release (); }
};
1672 : : inline hashval_t
1673 : 99000267 : bst_traits::hash (value_type x)
1674 : : {
1675 : 99000267 : inchash::hash h;
1676 : 425767410 : for (unsigned i = 0; i < x.length (); ++i)
1677 : 326767143 : h.add_int (x[i] ? gimple_uid (x[i]->stmt) : -1);
1678 : 99000267 : return h.end ();
1679 : : }
1680 : : inline bool
1681 : 88832032 : bst_traits::equal (value_type existing, value_type candidate)
1682 : : {
1683 : 266496096 : if (existing.length () != candidate.length ())
1684 : : return false;
1685 : 91084043 : for (unsigned i = 0; i < existing.length (); ++i)
1686 : 86308558 : if (existing[i] != candidate[i])
1687 : : return false;
1688 : : return true;
1689 : : }
1690 : :
/* Map from a set of scalar stmts to the SLP node discovered for it.
   Successful builds record the node (and hold one reference on it),
   failed builds record a stub node with its ->failed array set so the
   failure can be replayed cheaply.  */
typedef hash_map <vec <stmt_vec_info>, slp_tree,
		  simple_hashmap_traits <bst_traits, slp_tree> >
  scalar_stmts_to_slp_tree_map_t;
1694 : :
1695 : : /* Release BST_MAP. */
1696 : :
1697 : : static void
1698 : 1644096 : release_scalar_stmts_to_slp_tree_map (scalar_stmts_to_slp_tree_map_t *bst_map)
1699 : : {
1700 : : /* The map keeps a reference on SLP nodes built, release that. */
1701 : 10875945 : for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
1702 : 20107794 : it != bst_map->end (); ++it)
1703 : 9231849 : if ((*it).second)
1704 : 9231849 : vect_free_slp_tree ((*it).second);
1705 : 1644096 : delete bst_map;
1706 : 1644096 : }
1707 : :
/* ??? This was std::pair<std::pair<tree_code, vect_def_type>, tree>
   but then vec::insert does memmove and that's not compatible with
   std::pair.  */
/* One element of a linearized associatable chain: the operation CODE
   applying the operand, the vect def kind DT of the operand and the
   operand OP itself.  */
struct chain_op_t
{
  chain_op_t (tree_code code_, vect_def_type dt_, tree op_)
      : code (code_), dt (dt_), op (op_) {}
  tree_code code;
  vect_def_type dt;
  tree op;
};
1719 : :
1720 : : /* Comparator for sorting associatable chains. */
1721 : :
1722 : : static int
1723 : 8518002 : dt_sort_cmp (const void *op1_, const void *op2_, void *)
1724 : : {
1725 : 8518002 : auto *op1 = (const chain_op_t *) op1_;
1726 : 8518002 : auto *op2 = (const chain_op_t *) op2_;
1727 : 8518002 : if (op1->dt != op2->dt)
1728 : 1040638 : return (int)op1->dt - (int)op2->dt;
1729 : 7477364 : return (int)op1->code - (int)op2->code;
1730 : : }
1731 : :
/* Linearize the associatable expression chain at START with the
   associatable operation CODE (where PLUS_EXPR also allows MINUS_EXPR),
   filling CHAIN with the result and using WORKLIST as intermediate storage.
   CODE_STMT and ALT_CODE_STMT are filled with the first stmt using CODE
   or MINUS_EXPR.  *CHAIN_STMTS if not NULL is filled with all computation
   stmts, starting with START.  */

static void
vect_slp_linearize_chain (vec_info *vinfo,
			  vec<std::pair<tree_code, gimple *> > &worklist,
			  vec<chain_op_t> &chain,
			  enum tree_code code, gimple *start,
			  gimple *&code_stmt, gimple *&alt_code_stmt,
			  vec<gimple *> *chain_stmts)
{
  /* For each lane linearize the addition/subtraction (or other
     uniform associatable operation) expression tree.  Each worklist
     entry pairs a stmt with the effective operation (sign) it is
     reached with from the chain root.  */
  worklist.safe_push (std::make_pair (code, start));
  while (!worklist.is_empty ())
    {
      auto entry = worklist.pop ();
      gassign *stmt = as_a <gassign *> (entry.second);
      enum tree_code in_code = entry.first;
      enum tree_code this_code = gimple_assign_rhs_code (stmt);
      /* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE.  */
      if (!code_stmt
	  && gimple_assign_rhs_code (stmt) == code)
	code_stmt = stmt;
      else if (!alt_code_stmt
	       && gimple_assign_rhs_code (stmt) == MINUS_EXPR)
	alt_code_stmt = stmt;
      if (chain_stmts)
	chain_stmts->safe_push (stmt);
      /* Process both operands of the binary stmt.  */
      for (unsigned opnum = 1; opnum <= 2; ++opnum)
	{
	  tree op = gimple_op (stmt, opnum);
	  vect_def_type dt;
	  stmt_vec_info def_stmt_info;
	  bool res = vect_is_simple_use (op, vinfo, &dt, &def_stmt_info);
	  gcc_assert (res);
	  /* For pattern stmts refer to the pattern def's lhs.  */
	  if (dt == vect_internal_def
	      && is_pattern_stmt_p (def_stmt_info))
	    op = gimple_get_lhs (def_stmt_info->stmt);
	  gimple *use_stmt;
	  use_operand_p use_p;
	  /* Continue the chain through a single-used internal def
	     computed by the same (or a sign-compatible) operation;
	     otherwise record OP as a chain leaf.  */
	  if (dt == vect_internal_def
	      && single_imm_use (op, &use_p, &use_stmt)
	      && is_gimple_assign (def_stmt_info->stmt)
	      && (gimple_assign_rhs_code (def_stmt_info->stmt) == code
		  || (code == PLUS_EXPR
		      && (gimple_assign_rhs_code (def_stmt_info->stmt)
			  == MINUS_EXPR))))
	    {
	      /* The first operand of a MINUS_EXPR is effectively added,
	         only the second is subtracted.  */
	      tree_code op_def_code = this_code;
	      if (op_def_code == MINUS_EXPR && opnum == 1)
		op_def_code = PLUS_EXPR;
	      /* Reaching this stmt through a subtraction flips the sign
		 of everything below it.  */
	      if (in_code == MINUS_EXPR)
		op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
	      worklist.safe_push (std::make_pair (op_def_code,
						  def_stmt_info->stmt));
	    }
	  else
	    {
	      /* Same sign computation as above for a chain leaf.  */
	      tree_code op_def_code = this_code;
	      if (op_def_code == MINUS_EXPR && opnum == 1)
		op_def_code = PLUS_EXPR;
	      if (in_code == MINUS_EXPR)
		op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
	      chain.safe_push (chain_op_t (op_def_code, dt, op));
	    }
	}
    }
}
1805 : :
/* Forward declaration of the discovery worker called (and cached) by
   vect_build_slp_tree below.  */
static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
		       vec<stmt_vec_info> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits,
		       bool *matches, unsigned *limit, unsigned *tree_size,
		       scalar_stmts_to_slp_tree_map_t *bst_map);
1812 : :
/* Caching wrapper around vect_build_slp_tree_2: look up STMTS in BST_MAP
   and re-use a previous (successful or failed) discovery result, otherwise
   seed the map with a stub node, run the worker and record the outcome.
   Returns the SLP node (with a reference for the caller) or NULL on
   failure with MATCHES filled with the per-lane match state.  Takes
   ownership of STMTS.  */
static slp_tree
vect_build_slp_tree (vec_info *vinfo,
		     vec<stmt_vec_info> stmts, unsigned int group_size,
		     poly_uint64 *max_nunits,
		     bool *matches, unsigned *limit, unsigned *tree_size,
		     scalar_stmts_to_slp_tree_map_t *bst_map)
{
  if (slp_tree *leader = bst_map->get (stmts))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
			 !(*leader)->failed ? "" : "failed ",
			 (void *) *leader);
      if (!(*leader)->failed)
	{
	  SLP_TREE_REF_COUNT (*leader)++;
	  vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
	  /* The cached node keeps its own copy of the stmts.  */
	  stmts.release ();
	  return *leader;
	}
      /* Replay the recorded failure without re-discovering.  */
      memcpy (matches, (*leader)->failed, sizeof (bool) * group_size);
      return NULL;
    }

  /* Single-lane SLP doesn't have the chance of run-away, do not account
     it to the limit.  */
  if (stmts.length () > 1)
    {
      if (*limit == 0)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "SLP discovery limit exceeded\n");
	  memset (matches, 0, sizeof (bool) * group_size);
	  return NULL;
	}
      --*limit;
    }

  /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
     so we can pick up backedge destinations during discovery.  */
  slp_tree res = new _slp_tree;
  SLP_TREE_DEF_TYPE (res) = vect_internal_def;
  SLP_TREE_SCALAR_STMTS (res) = stmts;
  bst_map->put (stmts.copy (), res);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "starting SLP discovery for node %p\n", (void *) res);

  poly_uint64 this_max_nunits = 1;
  slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
					 &this_max_nunits,
					 matches, limit, tree_size, bst_map);
  if (!res_)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "SLP discovery for node %p failed\n", (void *) res);
      /* Mark the node invalid so we can detect those when still in use
	 as backedge destinations.  */
      SLP_TREE_SCALAR_STMTS (res) = vNULL;
      SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
      res->failed = XNEWVEC (bool, group_size);
      if (flag_checking)
	{
	  /* A failure must report at least one mismatched lane.  */
	  unsigned i;
	  for (i = 0; i < group_size; ++i)
	    if (!matches[i])
	      break;
	  gcc_assert (i < group_size);
	}
      memcpy (res->failed, matches, sizeof (bool) * group_size);
    }
  else
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "SLP discovery for node %p succeeded\n",
			 (void *) res);
      gcc_assert (res_ == res);
      res->max_nunits = this_max_nunits;
      vect_update_max_nunits (max_nunits, this_max_nunits);
      /* Keep a reference for the bst_map use.  */
      SLP_TREE_REF_COUNT (res)++;
    }
  return res_;
}
1901 : :
1902 : : /* Helper for building an associated SLP node chain. */
1903 : :
1904 : : static void
1905 : 123 : vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype,
1906 : : slp_tree op0, slp_tree op1,
1907 : : stmt_vec_info oper1, stmt_vec_info oper2,
1908 : : vec<std::pair<unsigned, unsigned> > lperm)
1909 : : {
1910 : 123 : unsigned group_size = SLP_TREE_LANES (op1);
1911 : :
1912 : 123 : slp_tree child1 = new _slp_tree;
1913 : 123 : SLP_TREE_DEF_TYPE (child1) = vect_internal_def;
1914 : 123 : SLP_TREE_VECTYPE (child1) = vectype;
1915 : 123 : SLP_TREE_LANES (child1) = group_size;
1916 : 123 : SLP_TREE_CHILDREN (child1).create (2);
1917 : 123 : SLP_TREE_CHILDREN (child1).quick_push (op0);
1918 : 123 : SLP_TREE_CHILDREN (child1).quick_push (op1);
1919 : 123 : SLP_TREE_REPRESENTATIVE (child1) = oper1;
1920 : :
1921 : 123 : slp_tree child2 = new _slp_tree;
1922 : 123 : SLP_TREE_DEF_TYPE (child2) = vect_internal_def;
1923 : 123 : SLP_TREE_VECTYPE (child2) = vectype;
1924 : 123 : SLP_TREE_LANES (child2) = group_size;
1925 : 123 : SLP_TREE_CHILDREN (child2).create (2);
1926 : 123 : SLP_TREE_CHILDREN (child2).quick_push (op0);
1927 : 123 : SLP_TREE_REF_COUNT (op0)++;
1928 : 123 : SLP_TREE_CHILDREN (child2).quick_push (op1);
1929 : 123 : SLP_TREE_REF_COUNT (op1)++;
1930 : 123 : SLP_TREE_REPRESENTATIVE (child2) = oper2;
1931 : :
1932 : 123 : SLP_TREE_DEF_TYPE (perm) = vect_internal_def;
1933 : 123 : SLP_TREE_CODE (perm) = VEC_PERM_EXPR;
1934 : 123 : SLP_TREE_VECTYPE (perm) = vectype;
1935 : 123 : SLP_TREE_LANES (perm) = group_size;
1936 : : /* ??? We should set this NULL but that's not expected. */
1937 : 123 : SLP_TREE_REPRESENTATIVE (perm) = oper1;
1938 : 123 : SLP_TREE_LANE_PERMUTATION (perm) = lperm;
1939 : 123 : SLP_TREE_CHILDREN (perm).quick_push (child1);
1940 : 123 : SLP_TREE_CHILDREN (perm).quick_push (child2);
1941 : 123 : }
1942 : :
1943 : : /* Recursively build an SLP tree starting from NODE.
1944 : : Fail (and return a value not equal to zero) if def-stmts are not
1945 : : isomorphic, require data permutation or are of unsupported types of
1946 : : operation. Otherwise, return 0.
1947 : : The value returned is the depth in the SLP tree where a mismatch
1948 : : was found. */
1949 : :
1950 : : static slp_tree
1951 : 5739026 : vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1952 : : vec<stmt_vec_info> stmts, unsigned int group_size,
1953 : : poly_uint64 *max_nunits,
1954 : : bool *matches, unsigned *limit, unsigned *tree_size,
1955 : : scalar_stmts_to_slp_tree_map_t *bst_map)
1956 : : {
1957 : 5739026 : unsigned nops, i, this_tree_size = 0;
1958 : 5739026 : poly_uint64 this_max_nunits = *max_nunits;
1959 : :
1960 : 5739026 : matches[0] = false;
1961 : :
1962 : 5739026 : stmt_vec_info stmt_info = stmts[0];
1963 : 5739026 : if (!is_a<gcall *> (stmt_info->stmt)
1964 : : && !is_a<gassign *> (stmt_info->stmt)
1965 : : && !is_a<gphi *> (stmt_info->stmt))
1966 : : return NULL;
1967 : :
1968 : 5738955 : nops = gimple_num_args (stmt_info->stmt);
1969 : 5738955 : if (const int *map = vect_get_operand_map (stmt_info->stmt,
1970 : 5738955 : STMT_VINFO_GATHER_SCATTER_P
1971 : : (stmt_info)))
1972 : 23456 : nops = map[0];
1973 : :
1974 : : /* If the SLP node is a PHI (induction or reduction), terminate
1975 : : the recursion. */
1976 : 5738955 : bool *skip_args = XALLOCAVEC (bool, nops);
1977 : 5738955 : memset (skip_args, 0, sizeof (bool) * nops);
1978 : 5738955 : if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
1979 : 2773143 : if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
1980 : : {
1981 : 467061 : tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1982 : 467061 : tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
1983 : : group_size);
1984 : 467061 : if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
1985 : : max_nunits))
1986 : : return NULL;
1987 : :
1988 : 463031 : vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
1989 : 463031 : if (def_type == vect_induction_def)
1990 : : {
1991 : : /* Induction PHIs are not cycles but walk the initial
1992 : : value. Only for inner loops through, for outer loops
1993 : : we need to pick up the value from the actual PHIs
1994 : : to more easily support peeling and epilogue vectorization. */
1995 : 393674 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1996 : 393674 : if (!nested_in_vect_loop_p (loop, stmt_info))
1997 : 393021 : skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1998 : : else
1999 : : loop = loop->inner;
2000 : 393674 : skip_args[loop_latch_edge (loop)->dest_idx] = true;
2001 : : }
2002 : 69357 : else if (def_type == vect_reduction_def
2003 : : || def_type == vect_double_reduction_def
2004 : : || def_type == vect_nested_cycle
2005 : 69357 : || def_type == vect_first_order_recurrence)
2006 : : {
2007 : : /* Else def types have to match. */
2008 : : stmt_vec_info other_info;
2009 : : bool all_same = true;
2010 : 143124 : FOR_EACH_VEC_ELT (stmts, i, other_info)
2011 : : {
2012 : 74839 : if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
2013 : 1715114 : return NULL;
2014 : 74839 : if (other_info != stmt_info)
2015 : 2406 : all_same = false;
2016 : : }
2017 : 68285 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2018 : : /* Reduction initial values are not explicitely represented. */
2019 : 68285 : if (def_type != vect_first_order_recurrence
2020 : 68285 : && gimple_bb (stmt_info->stmt) == loop->header)
2021 : 65425 : skip_args[loop_preheader_edge (loop)->dest_idx] = true;
2022 : : /* Reduction chain backedge defs are filled manually.
2023 : : ??? Need a better way to identify a SLP reduction chain PHI.
2024 : : Or a better overall way to SLP match those. */
2025 : 68285 : if (stmts.length () > 1
2026 : 68285 : && all_same && def_type == vect_reduction_def)
2027 : 2817 : skip_args[loop_latch_edge (loop)->dest_idx] = true;
2028 : : }
2029 : 1072 : else if (def_type != vect_internal_def)
2030 : : return NULL;
2031 : : }
2032 : :
2033 : :
2034 : 5734925 : bool two_operators = false;
2035 : 5734925 : unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
2036 : 5734925 : tree vectype = NULL_TREE;
2037 : 5734925 : if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
2038 : : &this_max_nunits, matches, &two_operators,
2039 : : &vectype))
2040 : : return NULL;
2041 : :
2042 : : /* If the SLP node is a load, terminate the recursion unless masked. */
2043 : 4227050 : if (STMT_VINFO_DATA_REF (stmt_info)
2044 : 1848676 : && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2045 : : {
2046 : 786999 : if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2047 : : gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
2048 : : else
2049 : : {
2050 : 774974 : *max_nunits = this_max_nunits;
2051 : 774974 : (*tree_size)++;
2052 : 774974 : node = vect_create_new_slp_node (node, stmts, 0);
2053 : 774974 : SLP_TREE_VECTYPE (node) = vectype;
2054 : : /* And compute the load permutation. Whether it is actually
2055 : : a permutation depends on the unrolling factor which is
2056 : : decided later. */
2057 : 774974 : vec<unsigned> load_permutation;
2058 : 774974 : int j;
2059 : 774974 : stmt_vec_info load_info;
2060 : 774974 : load_permutation.create (group_size);
2061 : 774974 : stmt_vec_info first_stmt_info
2062 : 774974 : = STMT_VINFO_GROUPED_ACCESS (stmt_info)
2063 : 774974 : ? DR_GROUP_FIRST_ELEMENT (stmt_info) : stmt_info;
2064 : 774974 : bool any_permute = false;
2065 : 1894547 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
2066 : : {
2067 : 1119573 : int load_place;
2068 : 1119573 : if (! load_info)
2069 : : {
2070 : 50734 : if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2071 : : load_place = j;
2072 : : else
2073 : : load_place = 0;
2074 : : }
2075 : 1068839 : else if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2076 : 671463 : load_place = vect_get_place_in_interleaving_chain
2077 : 671463 : (load_info, first_stmt_info);
2078 : : else
2079 : : load_place = 0;
2080 : 722197 : gcc_assert (load_place != -1);
2081 : 1119573 : any_permute |= load_place != j;
2082 : 1119573 : load_permutation.quick_push (load_place);
2083 : : }
2084 : :
2085 : 774974 : if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
2086 : : {
2087 : 2164 : gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD));
2088 : 2164 : bool has_gaps = false;
2089 : 2164 : if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2090 : 119 : for (stmt_vec_info si = DR_GROUP_NEXT_ELEMENT (first_stmt_info);
2091 : 196 : si; si = DR_GROUP_NEXT_ELEMENT (si))
2092 : 77 : if (DR_GROUP_GAP (si) != 1)
2093 : 20 : has_gaps = true;
2094 : : /* We cannot handle permuted masked loads directly, see
2095 : : PR114375. We cannot handle strided masked loads or masked
2096 : : loads with gaps unless the mask is uniform. */
2097 : 2164 : if ((STMT_VINFO_GROUPED_ACCESS (stmt_info)
2098 : 119 : && (DR_GROUP_GAP (first_stmt_info) != 0
2099 : 59 : || (has_gaps
2100 : 20 : && STMT_VINFO_SLP_VECT_ONLY (first_stmt_info))))
2101 : 4248 : || STMT_VINFO_STRIDED_P (stmt_info))
2102 : : {
2103 : 93 : load_permutation.release ();
2104 : 93 : matches[0] = false;
2105 : 772927 : return NULL;
2106 : : }
2107 : :
2108 : : /* For permuted masked loads do an unpermuted masked load of
2109 : : the whole group followed by a SLP permute node. */
2110 : 2071 : if (any_permute
2111 : 2071 : || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2112 : 25 : && DR_GROUP_SIZE (first_stmt_info) != group_size))
2113 : : {
2114 : : /* Discover the whole unpermuted load. */
2115 : 24 : vec<stmt_vec_info> stmts2;
2116 : 24 : unsigned dr_group_size = STMT_VINFO_GROUPED_ACCESS (stmt_info)
2117 : 38 : ? DR_GROUP_SIZE (first_stmt_info) : 1;
2118 : 24 : stmts2.create (dr_group_size);
2119 : 24 : stmts2.quick_grow_cleared (dr_group_size);
2120 : 24 : unsigned i = 0;
2121 : 24 : for (stmt_vec_info si = first_stmt_info;
2122 : 74 : si; si = DR_GROUP_NEXT_ELEMENT (si))
2123 : : {
2124 : 50 : if (si != first_stmt_info)
2125 : 26 : for (unsigned k = 1; k < DR_GROUP_GAP (si); ++k)
2126 : 0 : stmts2[i++] = NULL;
2127 : 50 : stmts2[i++] = si;
2128 : : }
2129 : 24 : bool *matches2 = XALLOCAVEC (bool, dr_group_size);
2130 : 24 : slp_tree unperm_load
2131 : 24 : = vect_build_slp_tree (vinfo, stmts2, dr_group_size,
2132 : : &this_max_nunits, matches2, limit,
2133 : 24 : &this_tree_size, bst_map);
2134 : : /* When we are able to do the full masked load emit that
2135 : : followed by 'node' being the desired final permutation. */
2136 : 24 : if (unperm_load)
2137 : : {
2138 : 16 : gcc_assert
2139 : : (!SLP_TREE_LOAD_PERMUTATION (unperm_load).exists ());
2140 : 16 : lane_permutation_t lperm;
2141 : 16 : lperm.create (group_size);
2142 : 56 : for (unsigned j = 0; j < load_permutation.length (); ++j)
2143 : 40 : lperm.quick_push
2144 : 40 : (std::make_pair (0, load_permutation[j]));
2145 : 16 : SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2146 : 16 : SLP_TREE_CHILDREN (node).safe_push (unperm_load);
2147 : 16 : SLP_TREE_LANE_PERMUTATION (node) = lperm;
2148 : 16 : load_permutation.release ();
2149 : 16 : return node;
2150 : : }
2151 : 8 : stmts2.release ();
2152 : 8 : load_permutation.release ();
2153 : 8 : matches[0] = false;
2154 : 8 : return NULL;
2155 : : }
2156 : 2047 : load_permutation.release ();
2157 : : }
2158 : : else
2159 : : {
2160 : 772810 : if (!any_permute
2161 : 671657 : && STMT_VINFO_GROUPED_ACCESS (stmt_info)
2162 : 1053829 : && group_size == DR_GROUP_SIZE (first_stmt_info))
2163 : 133406 : load_permutation.release ();
2164 : 772810 : SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
2165 : 772810 : return node;
2166 : : }
2167 : : }
2168 : : }
2169 : 3440051 : else if (gimple_assign_single_p (stmt_info->stmt)
2170 : 2128672 : && !gimple_vuse (stmt_info->stmt)
2171 : 3447487 : && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
2172 : : {
2173 : : /* vect_build_slp_tree_2 determined all BIT_FIELD_REFs reference
2174 : : the same SSA name vector of a compatible type to vectype. */
2175 : 2009 : vec<std::pair<unsigned, unsigned> > lperm = vNULL;
2176 : 2009 : tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
2177 : 2009 : stmt_vec_info estmt_info;
2178 : 6607 : FOR_EACH_VEC_ELT (stmts, i, estmt_info)
2179 : : {
2180 : 4601 : gassign *estmt = as_a <gassign *> (estmt_info->stmt);
2181 : 4601 : tree bfref = gimple_assign_rhs1 (estmt);
2182 : 4601 : HOST_WIDE_INT lane;
2183 : 4601 : if (!known_eq (bit_field_size (bfref),
2184 : : tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
2185 : 9199 : || !constant_multiple_p (bit_field_offset (bfref),
2186 : 4598 : bit_field_size (bfref), &lane))
2187 : : {
2188 : 3 : lperm.release ();
2189 : 3 : matches[0] = false;
2190 : 3 : return NULL;
2191 : : }
2192 : 4598 : lperm.safe_push (std::make_pair (0, (unsigned)lane));
2193 : : }
2194 : 2006 : slp_tree vnode = vect_create_new_slp_node (vNULL);
2195 : 2006 : if (operand_equal_p (TYPE_SIZE (vectype), TYPE_SIZE (TREE_TYPE (vec))))
2196 : : /* ??? We record vectype here but we hide eventually necessary
2197 : : punning and instead rely on code generation to materialize
2198 : : VIEW_CONVERT_EXPRs as necessary. We instead should make
2199 : : this explicit somehow. */
2200 : 646 : SLP_TREE_VECTYPE (vnode) = vectype;
2201 : : else
2202 : : {
2203 : : /* For different size but compatible elements we can still
2204 : : use VEC_PERM_EXPR without punning. */
2205 : 1360 : gcc_assert (VECTOR_TYPE_P (TREE_TYPE (vec))
2206 : : && types_compatible_p (TREE_TYPE (vectype),
2207 : : TREE_TYPE (TREE_TYPE (vec))));
2208 : 1360 : SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
2209 : : }
2210 : 2006 : auto nunits = TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (vnode));
2211 : 2006 : unsigned HOST_WIDE_INT const_nunits;
2212 : 2006 : if (nunits.is_constant (&const_nunits))
2213 : 2006 : SLP_TREE_LANES (vnode) = const_nunits;
2214 : 2006 : SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
2215 : : /* We are always building a permutation node even if it is an identity
2216 : : permute to shield the rest of the vectorizer from the odd node
2217 : : representing an actual vector without any scalar ops.
2218 : : ??? We could hide it completely with making the permute node
2219 : : external? */
2220 : 2006 : node = vect_create_new_slp_node (node, stmts, 1);
2221 : 2006 : SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2222 : 2006 : SLP_TREE_LANE_PERMUTATION (node) = lperm;
2223 : 2006 : SLP_TREE_VECTYPE (node) = vectype;
2224 : 2006 : SLP_TREE_CHILDREN (node).quick_push (vnode);
2225 : 2006 : return node;
2226 : : }
2227 : : /* When discovery reaches an associatable operation see whether we can
2228 : : improve that to match up lanes in a way superior to the operand
2229 : : swapping code which at most looks at two defs.
2230 : : ??? For BB vectorization we cannot do the brute-force search
2231 : : for matching as we can succeed by means of builds from scalars
2232 : : and have no good way to "cost" one build against another. */
2233 : 3438042 : else if (is_a <loop_vec_info> (vinfo)
2234 : : /* Do not bother for single-lane SLP. */
2235 : 2085736 : && group_size > 1
2236 : : /* ??? We don't handle !vect_internal_def defs below. */
2237 : 56834 : && STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2238 : : /* ??? Do not associate a reduction, this will wreck REDUC_IDX
2239 : : mapping as long as that exists on the stmt_info level. */
2240 : 47516 : && STMT_VINFO_REDUC_IDX (stmt_info) == -1
2241 : 47026 : && is_gimple_assign (stmt_info->stmt)
2242 : 46801 : && (associative_tree_code (gimple_assign_rhs_code (stmt_info->stmt))
2243 : 36926 : || gimple_assign_rhs_code (stmt_info->stmt) == MINUS_EXPR)
2244 : 3449476 : && ((FLOAT_TYPE_P (vectype) && flag_associative_math)
2245 : 7707 : || (INTEGRAL_TYPE_P (TREE_TYPE (vectype))
2246 : 5857 : && TYPE_OVERFLOW_WRAPS (TREE_TYPE (vectype)))))
2247 : : {
2248 : : /* See if we have a chain of (mixed) adds or subtracts or other
2249 : : associatable ops. */
2250 : 6678 : enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
2251 : 6678 : if (code == MINUS_EXPR)
2252 : 664 : code = PLUS_EXPR;
2253 : 6678 : stmt_vec_info other_op_stmt_info = NULL;
2254 : 6678 : stmt_vec_info op_stmt_info = NULL;
2255 : 6678 : unsigned chain_len = 0;
2256 : 6678 : auto_vec<chain_op_t> chain;
2257 : 6678 : auto_vec<std::pair<tree_code, gimple *> > worklist;
2258 : 6678 : auto_vec<vec<chain_op_t> > chains (group_size);
2259 : 6678 : auto_vec<slp_tree, 4> children;
2260 : 6678 : bool hard_fail = true;
2261 : 7479 : for (unsigned lane = 0; lane < group_size; ++lane)
2262 : : {
2263 : 7225 : if (!stmts[lane])
2264 : : {
2265 : : /* ??? Below we require lane zero is present. */
2266 : 0 : if (lane == 0)
2267 : : {
2268 : : hard_fail = false;
2269 : 6424 : break;
2270 : : }
2271 : 0 : chains.quick_push (vNULL);
2272 : 0 : continue;
2273 : : }
2274 : : /* For each lane linearize the addition/subtraction (or other
2275 : : uniform associatable operation) expression tree. */
2276 : 7225 : gimple *op_stmt = NULL, *other_op_stmt = NULL;
2277 : 7225 : vect_slp_linearize_chain (vinfo, worklist, chain, code,
2278 : 7225 : stmts[lane]->stmt, op_stmt, other_op_stmt,
2279 : : NULL);
2280 : 7225 : if (!op_stmt_info && op_stmt)
2281 : 6161 : op_stmt_info = vinfo->lookup_stmt (op_stmt);
2282 : 7225 : if (!other_op_stmt_info && other_op_stmt)
2283 : 701 : other_op_stmt_info = vinfo->lookup_stmt (other_op_stmt);
2284 : 7225 : if (chain.length () == 2)
2285 : : {
2286 : : /* In a chain of just two elements resort to the regular
2287 : : operand swapping scheme. Likewise if we run into a
2288 : : length mismatch process regularly as well as we did not
2289 : : process the other lanes we cannot report a good hint what
2290 : : lanes to try swapping in the parent. */
2291 : : hard_fail = false;
2292 : : break;
2293 : : }
2294 : 804 : else if (chain_len == 0)
2295 : 294 : chain_len = chain.length ();
2296 : 1020 : else if (chain.length () != chain_len)
2297 : : {
2298 : : /* ??? Here we could slip in magic to compensate with
2299 : : neutral operands. */
2300 : 3 : matches[lane] = false;
2301 : 3 : if (lane != group_size - 1)
2302 : 3 : matches[0] = false;
2303 : : break;
2304 : : }
2305 : 801 : chains.quick_push (chain.copy ());
2306 : 801 : chain.truncate (0);
2307 : : }
2308 : 13356 : if (chains.length () == group_size)
2309 : : {
2310 : : /* We cannot yet use SLP_TREE_CODE to communicate the operation. */
2311 : 254 : if (!op_stmt_info)
2312 : : {
2313 : 2 : hard_fail = false;
2314 : 2 : goto out;
2315 : : }
2316 : : /* Now we have a set of chains with the same length. */
2317 : : /* 1. pre-sort according to def_type and operation. */
2318 : 943 : for (unsigned lane = 0; lane < group_size; ++lane)
2319 : 1382 : chains[lane].stablesort (dt_sort_cmp, vinfo);
2320 : 252 : if (dump_enabled_p ())
2321 : : {
2322 : 128 : dump_printf_loc (MSG_NOTE, vect_location,
2323 : : "pre-sorted chains of %s\n",
2324 : : get_tree_code_name (code));
2325 : 544 : for (unsigned lane = 0; lane < group_size; ++lane)
2326 : : {
2327 : 416 : if (!stmts[lane])
2328 : 0 : dump_printf (MSG_NOTE, "--");
2329 : : else
2330 : 1878 : for (unsigned opnum = 0; opnum < chain_len; ++opnum)
2331 : 2924 : dump_printf (MSG_NOTE, "%s %T ",
2332 : 1462 : get_tree_code_name (chains[lane][opnum].code),
2333 : 1462 : chains[lane][opnum].op);
2334 : 416 : dump_printf (MSG_NOTE, "\n");
2335 : : }
2336 : : }
2337 : : /* 2. try to build children nodes, associating as necessary. */
2338 : : /* 2a. prepare and perform early checks to avoid eating into
2339 : : discovery limit unnecessarily. */
2340 : 252 : vect_def_type *dts = XALLOCAVEC (vect_def_type, chain_len);
2341 : 1063 : for (unsigned n = 0; n < chain_len; ++n)
2342 : : {
2343 : 811 : vect_def_type dt = chains[0][n].dt;
2344 : 811 : unsigned lane;
2345 : 3130 : for (lane = 0; lane < group_size; ++lane)
2346 : 4638 : if (stmts[lane] && chains[lane][n].dt != dt)
2347 : : {
2348 : 0 : if (dt == vect_constant_def
2349 : 0 : && chains[lane][n].dt == vect_external_def)
2350 : : dt = vect_external_def;
2351 : 0 : else if (dt == vect_external_def
2352 : 0 : && chains[lane][n].dt == vect_constant_def)
2353 : : ;
2354 : : else
2355 : : break;
2356 : : }
2357 : 811 : if (lane != group_size)
2358 : : {
2359 : 0 : if (dump_enabled_p ())
2360 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
2361 : : "giving up on chain due to mismatched "
2362 : : "def types\n");
2363 : 0 : matches[lane] = false;
2364 : 0 : if (lane != group_size - 1)
2365 : 0 : matches[0] = false;
2366 : 0 : goto out;
2367 : : }
2368 : 811 : dts[n] = dt;
2369 : 811 : if (dt == vect_constant_def
2370 : 811 : || dt == vect_external_def)
2371 : : {
2372 : : /* Check whether we can build the invariant. If we can't
2373 : : we never will be able to. */
2374 : 72 : tree type = TREE_TYPE (chains[0][n].op);
2375 : 811 : if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
2376 : : && (TREE_CODE (type) == BOOLEAN_TYPE
2377 : : || !can_duplicate_and_interleave_p (vinfo, group_size,
2378 : : type)))
2379 : : {
2380 : : matches[0] = false;
2381 : : goto out;
2382 : : }
2383 : : }
2384 : 739 : else if (dt != vect_internal_def)
2385 : : {
2386 : : /* Not sure, we might need sth special.
2387 : : gcc.dg/vect/pr96854.c,
2388 : : gfortran.dg/vect/fast-math-pr37021.f90
2389 : : and gfortran.dg/vect/pr61171.f trigger. */
2390 : : /* Soft-fail for now. */
2391 : 0 : hard_fail = false;
2392 : 0 : goto out;
2393 : : }
2394 : : }
2395 : : /* 2b. do the actual build. */
2396 : 997 : for (unsigned n = 0; n < chain_len; ++n)
2397 : : {
2398 : 768 : vect_def_type dt = dts[n];
2399 : 768 : unsigned lane;
2400 : 768 : if (dt == vect_constant_def
2401 : 768 : || dt == vect_external_def)
2402 : : {
2403 : 72 : vec<tree> ops;
2404 : 72 : ops.create (group_size);
2405 : 359 : for (lane = 0; lane < group_size; ++lane)
2406 : 215 : if (stmts[lane])
2407 : 215 : ops.quick_push (chains[lane][n].op);
2408 : : else
2409 : 0 : ops.quick_push (NULL_TREE);
2410 : 72 : slp_tree child = vect_create_new_slp_node (ops);
2411 : 72 : SLP_TREE_DEF_TYPE (child) = dt;
2412 : 72 : children.safe_push (child);
2413 : : }
2414 : : else
2415 : : {
2416 : 696 : vec<stmt_vec_info> op_stmts;
2417 : 696 : op_stmts.create (group_size);
2418 : 696 : slp_tree child = NULL;
2419 : : /* Brute-force our way. We have to consider a lane
2420 : : failing after fixing an earlier fail up in the
2421 : : SLP discovery recursion. So track the current
2422 : : permute per lane. */
2423 : 696 : unsigned *perms = XALLOCAVEC (unsigned, group_size);
2424 : 696 : memset (perms, 0, sizeof (unsigned) * group_size);
2425 : 783 : do
2426 : : {
2427 : 783 : op_stmts.truncate (0);
2428 : 3824 : for (lane = 0; lane < group_size; ++lane)
2429 : 2258 : if (stmts[lane])
2430 : 2258 : op_stmts.quick_push
2431 : 2258 : (vinfo->lookup_def (chains[lane][n].op));
2432 : : else
2433 : 0 : op_stmts.quick_push (NULL);
2434 : 783 : child = vect_build_slp_tree (vinfo, op_stmts,
2435 : : group_size, &this_max_nunits,
2436 : : matches, limit,
2437 : : &this_tree_size, bst_map);
2438 : : /* ??? We're likely getting too many fatal mismatches
2439 : : here so maybe we want to ignore them (but then we
2440 : : have no idea which lanes fatally mismatched). */
2441 : 783 : if (child || !matches[0])
2442 : : break;
2443 : : /* Swap another lane we have not yet matched up into
2444 : : lanes that did not match. If we run out of
2445 : : permute possibilities for a lane terminate the
2446 : : search. */
2447 : 281 : bool term = false;
2448 : 281 : for (lane = 1; lane < group_size; ++lane)
2449 : 194 : if (!matches[lane])
2450 : : {
2451 : 166 : if (n + perms[lane] + 1 == chain_len)
2452 : : {
2453 : : term = true;
2454 : : break;
2455 : : }
2456 : 143 : if (dump_enabled_p ())
2457 : 113 : dump_printf_loc (MSG_NOTE, vect_location,
2458 : : "swapping operand %d and %d "
2459 : : "of lane %d\n",
2460 : : n, n + perms[lane] + 1, lane);
2461 : 286 : std::swap (chains[lane][n],
2462 : 143 : chains[lane][n + perms[lane] + 1]);
2463 : 143 : perms[lane]++;
2464 : : }
2465 : 110 : if (term)
2466 : : break;
2467 : : }
2468 : : while (1);
2469 : 696 : if (!child)
2470 : : {
2471 : 23 : if (dump_enabled_p ())
2472 : 18 : dump_printf_loc (MSG_NOTE, vect_location,
2473 : : "failed to match up op %d\n", n);
2474 : 23 : op_stmts.release ();
2475 : 23 : if (lane != group_size - 1)
2476 : 11 : matches[0] = false;
2477 : : else
2478 : 12 : matches[lane] = false;
2479 : 23 : goto out;
2480 : : }
2481 : 673 : if (dump_enabled_p ())
2482 : : {
2483 : 339 : dump_printf_loc (MSG_NOTE, vect_location,
2484 : : "matched up op %d to\n", n);
2485 : 339 : vect_print_slp_tree (MSG_NOTE, vect_location, child);
2486 : : }
2487 : 673 : children.safe_push (child);
2488 : : }
2489 : : }
2490 : : /* 3. build SLP nodes to combine the chain. */
2491 : 837 : for (unsigned lane = 0; lane < group_size; ++lane)
2492 : 1228 : if (stmts[lane] && chains[lane][0].code != code)
2493 : : {
2494 : : /* See if there's any alternate all-PLUS entry. */
2495 : : unsigned n;
2496 : 6 : for (n = 1; n < chain_len; ++n)
2497 : : {
2498 : 30 : for (lane = 0; lane < group_size; ++lane)
2499 : 48 : if (stmts[lane] && chains[lane][n].code != code)
2500 : : break;
2501 : 6 : if (lane == group_size)
2502 : : break;
2503 : : }
2504 : 6 : if (n != chain_len)
2505 : : {
2506 : : /* Swap that in at first position. */
2507 : 6 : std::swap (children[0], children[n]);
2508 : 30 : for (lane = 0; lane < group_size; ++lane)
2509 : 24 : if (stmts[lane])
2510 : 24 : std::swap (chains[lane][0], chains[lane][n]);
2511 : : }
2512 : : else
2513 : : {
2514 : : /* ??? When this triggers and we end up with two
2515 : : vect_constant/external_def up-front things break (ICE)
2516 : : spectacularly finding an insertion place for the
2517 : : all-constant op. We should have a fully
2518 : : vect_internal_def operand though(?) so we can swap
2519 : : that into first place and then prepend the all-zero
2520 : : constant. */
2521 : 0 : if (dump_enabled_p ())
2522 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
2523 : : "inserting constant zero to compensate "
2524 : : "for (partially) negated first "
2525 : : "operand\n");
2526 : 0 : chain_len++;
2527 : 0 : for (lane = 0; lane < group_size; ++lane)
2528 : 0 : if (stmts[lane])
2529 : 0 : chains[lane].safe_insert
2530 : 0 : (0, chain_op_t (code, vect_constant_def, NULL_TREE));
2531 : 0 : vec<tree> zero_ops;
2532 : 0 : zero_ops.create (group_size);
2533 : 0 : zero_ops.quick_push (build_zero_cst (TREE_TYPE (vectype)));
2534 : 0 : for (lane = 1; lane < group_size; ++lane)
2535 : 0 : if (stmts[lane])
2536 : 0 : zero_ops.quick_push (zero_ops[0]);
2537 : : else
2538 : 0 : zero_ops.quick_push (NULL_TREE);
2539 : 0 : slp_tree zero = vect_create_new_slp_node (zero_ops);
2540 : 0 : SLP_TREE_DEF_TYPE (zero) = vect_constant_def;
2541 : 0 : children.safe_insert (0, zero);
2542 : : }
2543 : : break;
2544 : : }
2545 : 740 : for (unsigned i = 1; i < children.length (); ++i)
2546 : : {
2547 : 511 : slp_tree op0 = children[i - 1];
2548 : 511 : slp_tree op1 = children[i];
2549 : 511 : bool this_two_op = false;
2550 : 1834 : for (unsigned lane = 0; lane < group_size; ++lane)
2551 : 2892 : if (stmts[lane] && chains[lane][i].code != chains[0][i].code)
2552 : : {
2553 : : this_two_op = true;
2554 : : break;
2555 : : }
2556 : 511 : slp_tree child;
2557 : 511 : if (i == children.length () - 1)
2558 : 229 : child = vect_create_new_slp_node (node, stmts, 2);
2559 : : else
2560 : 282 : child = vect_create_new_slp_node (2, ERROR_MARK);
2561 : 511 : if (this_two_op)
2562 : : {
2563 : 123 : vec<std::pair<unsigned, unsigned> > lperm;
2564 : 123 : lperm.create (group_size);
2565 : 465 : for (unsigned lane = 0; lane < group_size; ++lane)
2566 : 684 : lperm.quick_push (std::make_pair
2567 : 342 : (chains[lane][i].code != chains[0][i].code, lane));
2568 : 246 : vect_slp_build_two_operator_nodes (child, vectype, op0, op1,
2569 : 123 : (chains[0][i].code == code
2570 : : ? op_stmt_info
2571 : : : other_op_stmt_info),
2572 : 123 : (chains[0][i].code == code
2573 : : ? other_op_stmt_info
2574 : : : op_stmt_info),
2575 : : lperm);
2576 : : }
2577 : : else
2578 : : {
2579 : 388 : SLP_TREE_DEF_TYPE (child) = vect_internal_def;
2580 : 388 : SLP_TREE_VECTYPE (child) = vectype;
2581 : 388 : SLP_TREE_LANES (child) = group_size;
2582 : 388 : SLP_TREE_CHILDREN (child).quick_push (op0);
2583 : 388 : SLP_TREE_CHILDREN (child).quick_push (op1);
2584 : 388 : SLP_TREE_REPRESENTATIVE (child)
2585 : 776 : = (chains[0][i].code == code
2586 : 388 : ? op_stmt_info : other_op_stmt_info);
2587 : : }
2588 : 511 : children[i] = child;
2589 : : }
2590 : 229 : *tree_size += this_tree_size + 1;
2591 : 229 : *max_nunits = this_max_nunits;
2592 : 1116 : while (!chains.is_empty ())
2593 : 632 : chains.pop ().release ();
2594 : : return node;
2595 : : }
2596 : 6424 : out:
2597 : 6449 : if (dump_enabled_p ())
2598 : 2717 : dump_printf_loc (MSG_NOTE, vect_location,
2599 : : "failed to line up SLP graph by re-associating "
2600 : : "operations in lanes%s\n",
2601 : : !hard_fail ? " trying regular discovery" : "");
2602 : 6454 : while (!children.is_empty ())
2603 : 5 : vect_free_slp_tree (children.pop ());
2604 : 6618 : while (!chains.is_empty ())
2605 : 169 : chains.pop ().release ();
2606 : : /* Hard-fail, otherwise we might run into quadratic processing of the
2607 : : chains starting one stmt into the chain again. */
2608 : 6449 : if (hard_fail)
2609 : : return NULL;
2610 : : /* Fall thru to normal processing. */
2611 : 6678 : }
2612 : :
2613 : : /* Get at the operands, verifying they are compatible. */
2614 : 3451859 : vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
2615 : 3451859 : slp_oprnd_info oprnd_info;
2616 : 16071965 : FOR_EACH_VEC_ELT (stmts, i, stmt_info)
2617 : : {
2618 : 25245142 : int res = vect_get_and_check_slp_defs (vinfo, vectype,
2619 : 12622571 : swap[i], skip_args,
2620 : : stmts, i, &oprnds_info);
2621 : 12622571 : if (res != 0)
2622 : 510935 : matches[(res == -1) ? 0 : i] = false;
2623 : 12622571 : if (!matches[0])
2624 : : break;
2625 : : }
2626 : 15782093 : for (i = 0; i < group_size; ++i)
2627 : 12533372 : if (!matches[i])
2628 : : {
2629 : 203138 : vect_free_oprnd_info (oprnds_info);
2630 : 203138 : return NULL;
2631 : : }
2632 : 9746163 : swap = NULL;
2633 : :
2634 : 9746163 : bool has_two_operators_perm = false;
2635 : 19492326 : auto_vec<unsigned> two_op_perm_indices[2];
2636 : 3248721 : vec<stmt_vec_info> two_op_scalar_stmts[2] = {vNULL, vNULL};
2637 : :
2638 : 3257881 : if (two_operators && oprnds_info.length () == 2 && group_size > 2)
2639 : : {
2640 : 1225 : unsigned idx = 0;
2641 : 1225 : hash_map<gimple *, unsigned> seen;
2642 : 1225 : vec<slp_oprnd_info> new_oprnds_info
2643 : 1225 : = vect_create_oprnd_info (1, group_size);
2644 : 1225 : bool success = true;
2645 : :
2646 : 1225 : enum tree_code code = ERROR_MARK;
2647 : 1225 : if (oprnds_info[0]->def_stmts[0]
2648 : 1225 : && is_a<gassign *> (oprnds_info[0]->def_stmts[0]->stmt))
2649 : 1168 : code = gimple_assign_rhs_code (oprnds_info[0]->def_stmts[0]->stmt);
2650 : 1225 : basic_block bb = nullptr;
2651 : :
2652 : 4254 : for (unsigned j = 0; j < group_size; ++j)
2653 : : {
2654 : 10662 : FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2655 : : {
2656 : 7633 : stmt_vec_info stmt_info = oprnd_info->def_stmts[j];
2657 : 7633 : if (!stmt_info
2658 : 7460 : || !is_a<gassign *> (stmt_info->stmt)
2659 : 7460 : || gimple_assign_rhs_code (stmt_info->stmt) != code
2660 : 14412 : || skip_args[i])
2661 : : {
2662 : : success = false;
2663 : 858 : break;
2664 : : }
2665 : : /* Avoid mixing lanes with defs in different basic-blocks. */
2666 : 6779 : if (!bb)
2667 : 1326 : bb = gimple_bb (vect_orig_stmt (stmt_info)->stmt);
2668 : 6975 : else if (gimple_bb (vect_orig_stmt (stmt_info)->stmt) != bb)
2669 : : {
2670 : : success = false;
2671 : : break;
2672 : : }
2673 : :
2674 : 6775 : bool exists;
2675 : 6775 : unsigned &stmt_idx
2676 : 6775 : = seen.get_or_insert (stmt_info->stmt, &exists);
2677 : :
2678 : 6775 : if (!exists)
2679 : : {
2680 : 5766 : new_oprnds_info[0]->def_stmts.safe_push (stmt_info);
2681 : 5766 : new_oprnds_info[0]->ops.safe_push (oprnd_info->ops[j]);
2682 : 5766 : stmt_idx = idx;
2683 : 5766 : idx++;
2684 : : }
2685 : :
2686 : 6775 : two_op_perm_indices[i].safe_push (stmt_idx);
2687 : : }
2688 : :
2689 : 3887 : if (!success)
2690 : : break;
2691 : : }
2692 : :
2693 : 1225 : if (success && idx == group_size)
2694 : : {
2695 : 46 : if (dump_enabled_p ())
2696 : : {
2697 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
2698 : : "Replace two_operators operands:\n");
2699 : :
2700 : 0 : FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2701 : : {
2702 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
2703 : : "Operand %u:\n", i);
2704 : 0 : for (unsigned j = 0; j < group_size; j++)
2705 : 0 : dump_printf_loc (MSG_NOTE, vect_location, "\tstmt %u %G",
2706 : 0 : j, oprnd_info->def_stmts[j]->stmt);
2707 : : }
2708 : :
2709 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
2710 : : "With a single operand:\n");
2711 : 0 : for (unsigned j = 0; j < group_size; j++)
2712 : 0 : dump_printf_loc (MSG_NOTE, vect_location, "\tstmt %u %G",
2713 : 0 : j, new_oprnds_info[0]->def_stmts[j]->stmt);
2714 : : }
2715 : :
2716 : 46 : two_op_scalar_stmts[0].safe_splice (oprnds_info[0]->def_stmts);
2717 : 46 : two_op_scalar_stmts[1].safe_splice (oprnds_info[1]->def_stmts);
2718 : :
2719 : 46 : new_oprnds_info[0]->first_op_type = oprnds_info[0]->first_op_type;
2720 : 46 : new_oprnds_info[0]->first_dt = oprnds_info[0]->first_dt;
2721 : 46 : new_oprnds_info[0]->any_pattern = oprnds_info[0]->any_pattern;
2722 : 46 : new_oprnds_info[0]->first_gs_p = oprnds_info[0]->first_gs_p;
2723 : 46 : new_oprnds_info[0]->first_gs_info = oprnds_info[0]->first_gs_info;
2724 : :
2725 : 46 : vect_free_oprnd_info (oprnds_info);
2726 : 46 : oprnds_info = new_oprnds_info;
2727 : 46 : nops = 1;
2728 : 46 : has_two_operators_perm = true;
2729 : : }
2730 : : else
2731 : 1179 : vect_free_oprnd_info (new_oprnds_info);
2732 : 1225 : }
2733 : :
2734 : 6497442 : auto_vec<slp_tree, 4> children;
2735 : :
2736 : 3248721 : stmt_info = stmts[0];
2737 : :
2738 : 3248721 : int reduc_idx = -1;
2739 : 3248721 : int gs_scale = 0;
2740 : 3248721 : tree gs_base = NULL_TREE;
2741 : :
2742 : : /* Create SLP_TREE nodes for the definition node/s. */
2743 : 8497692 : FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2744 : : {
2745 : 5326404 : slp_tree child = nullptr;
2746 : 5326404 : unsigned int j;
2747 : :
2748 : : /* We're skipping certain operands from processing, for example
2749 : : outer loop reduction initial defs. */
2750 : 5326404 : if (skip_args[i])
2751 : : {
2752 : 854937 : children.safe_push (NULL);
2753 : 6103908 : continue;
2754 : : }
2755 : :
2756 : 4471467 : if (oprnd_info->first_dt == vect_uninitialized_def)
2757 : : {
2758 : : /* COND_EXPRs eventually have one operand too many if the
2759 : : condition is an SSA name. */
2760 : 0 : gcc_assert (i == 3 && nops == 4);
2761 : 0 : continue;
2762 : : }
2763 : :
2764 : 4471467 : if (oprnd_info->first_gs_p)
2765 : : {
2766 : 16691 : gs_scale = oprnd_info->first_gs_info.scale;
2767 : 16691 : gs_base = oprnd_info->first_gs_info.base;
2768 : : }
2769 : :
2770 : 4471467 : if (is_a <bb_vec_info> (vinfo)
2771 : 1584871 : && oprnd_info->first_dt == vect_internal_def
2772 : 5299223 : && !oprnd_info->any_pattern)
2773 : : {
2774 : : /* For BB vectorization, if all defs are the same do not
2775 : : bother to continue the build along the single-lane
2776 : : graph but use a splat of the scalar value. */
2777 : 785433 : stmt_vec_info first_def = oprnd_info->def_stmts[0];
2778 : 844903 : for (j = 1; j < group_size; ++j)
2779 : 801905 : if (oprnd_info->def_stmts[j] != first_def)
2780 : : break;
2781 : 785433 : if (j == group_size
2782 : : /* But avoid doing this for loads where we may be
2783 : : able to CSE things, unless the stmt is not
2784 : : vectorizable. */
2785 : 785433 : && (!STMT_VINFO_VECTORIZABLE (first_def)
2786 : 49018 : || !gimple_vuse (first_def->stmt)))
2787 : : {
2788 : 33768 : if (dump_enabled_p ())
2789 : 102 : dump_printf_loc (MSG_NOTE, vect_location,
2790 : : "Using a splat of the uniform operand %G",
2791 : : first_def->stmt);
2792 : 33768 : oprnd_info->first_dt = vect_external_def;
2793 : : }
2794 : : }
2795 : :
2796 : 4471467 : if (oprnd_info->first_dt == vect_external_def
2797 : 4471467 : || oprnd_info->first_dt == vect_constant_def)
2798 : : {
2799 : 1654340 : if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ())
2800 : : {
2801 : : tree op0;
2802 : : tree uniform_val = op0 = oprnd_info->ops[0];
2803 : : for (j = 1; j < oprnd_info->ops.length (); ++j)
2804 : : if (oprnd_info->ops[j]
2805 : : && !operand_equal_p (uniform_val, oprnd_info->ops[j]))
2806 : : {
2807 : : uniform_val = NULL_TREE;
2808 : : break;
2809 : : }
2810 : : if (!uniform_val
2811 : : && !can_duplicate_and_interleave_p (vinfo,
2812 : : oprnd_info->ops.length (),
2813 : : TREE_TYPE (op0)))
2814 : : {
2815 : : matches[j] = false;
2816 : : if (dump_enabled_p ())
2817 : : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2818 : : "Build SLP failed: invalid type of def "
2819 : : "for variable-length SLP %T\n", op0);
2820 : : goto fail;
2821 : : }
2822 : : }
2823 : 1654340 : slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
2824 : 1654340 : SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
2825 : 1654340 : oprnd_info->ops = vNULL;
2826 : 1654340 : children.safe_push (invnode);
2827 : 1654340 : continue;
2828 : 1654340 : }
2829 : :
2830 : : /* See which SLP operand a reduction chain continues on. We want
2831 : : to chain even PHIs but not backedges. */
2832 : 2817127 : if (STMT_VINFO_REDUC_DEF (oprnd_info->def_stmts[0])
2833 : 2817127 : || STMT_VINFO_REDUC_IDX (oprnd_info->def_stmts[0]) != -1)
2834 : : {
2835 : 140304 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
2836 : : {
2837 : 634 : if (oprnd_info->first_dt == vect_double_reduction_def)
2838 : 317 : reduc_idx = i;
2839 : : }
2840 : 139670 : else if (is_a <gphi *> (stmt_info->stmt)
2841 : 139670 : && gimple_phi_num_args
2842 : 62931 : (as_a <gphi *> (stmt_info->stmt)) != 1)
2843 : : ;
2844 : 77062 : else if (STMT_VINFO_REDUC_IDX (stmt_info) == -1
2845 : 343 : && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2846 : : ;
2847 : 77042 : else if (reduc_idx == -1)
2848 : 72918 : reduc_idx = i;
2849 : : else
2850 : : /* For .COND_* reduction operations the else value can be the
2851 : : same as one of the operation operands. The other def
2852 : : stmts have been moved, so we can't check easily. Check
2853 : : it's a call at least. */
2854 : 4124 : gcc_assert (is_a <gcall *> (stmt_info->stmt));
2855 : : }
2856 : :
2857 : : /* When we have a masked load with uniform mask discover this
2858 : : as a single-lane mask with a splat permute. This way we can
2859 : : recognize this as a masked load-lane by stripping the splat. */
2860 : 2817127 : if (is_a <gcall *> (STMT_VINFO_STMT (stmt_info))
2861 : 32996 : && gimple_call_internal_p (STMT_VINFO_STMT (stmt_info),
2862 : : IFN_MASK_LOAD)
2863 : 4582 : && STMT_VINFO_GROUPED_ACCESS (stmt_info)
2864 : 2817149 : && ! STMT_VINFO_SLP_VECT_ONLY (DR_GROUP_FIRST_ELEMENT (stmt_info)))
2865 : : {
2866 : 0 : vec<stmt_vec_info> def_stmts2;
2867 : 0 : def_stmts2.create (1);
2868 : 0 : def_stmts2.quick_push (oprnd_info->def_stmts[0]);
2869 : 0 : child = vect_build_slp_tree (vinfo, def_stmts2, 1,
2870 : : &this_max_nunits,
2871 : : matches, limit,
2872 : : &this_tree_size, bst_map);
2873 : 0 : if (child)
2874 : : {
2875 : 0 : slp_tree pnode = vect_create_new_slp_node (1, VEC_PERM_EXPR);
2876 : 0 : SLP_TREE_VECTYPE (pnode) = SLP_TREE_VECTYPE (child);
2877 : 0 : SLP_TREE_LANES (pnode) = group_size;
2878 : 0 : SLP_TREE_SCALAR_STMTS (pnode).create (group_size);
2879 : 0 : SLP_TREE_LANE_PERMUTATION (pnode).create (group_size);
2880 : 0 : for (unsigned k = 0; k < group_size; ++k)
2881 : : {
2882 : 0 : SLP_TREE_SCALAR_STMTS (pnode)
2883 : 0 : .quick_push (oprnd_info->def_stmts[0]);
2884 : 0 : SLP_TREE_LANE_PERMUTATION (pnode)
2885 : 0 : .quick_push (std::make_pair (0u, 0u));
2886 : : }
2887 : 0 : SLP_TREE_CHILDREN (pnode).quick_push (child);
2888 : 0 : pnode->max_nunits = child->max_nunits;
2889 : 0 : children.safe_push (pnode);
2890 : 0 : oprnd_info->def_stmts = vNULL;
2891 : 0 : continue;
2892 : 0 : }
2893 : : else
2894 : 0 : def_stmts2.release ();
2895 : : }
2896 : :
2897 : 2817127 : if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2898 : : group_size, &this_max_nunits,
2899 : : matches, limit,
2900 : : &this_tree_size, bst_map)) != NULL)
2901 : : {
2902 : 2361714 : oprnd_info->def_stmts = vNULL;
2903 : 2361714 : children.safe_push (child);
2904 : 2361714 : continue;
2905 : : }
2906 : :
2907 : : /* If the SLP build for operand zero failed and operand zero
2908 : : and one can be commutated try that for the scalar stmts
2909 : : that failed the match. */
2910 : 455413 : if (i == 0
2911 : : /* A first scalar stmt mismatch signals a fatal mismatch. */
2912 : 361476 : && matches[0]
2913 : : /* ??? For COND_EXPRs we can swap the comparison operands
2914 : : as well as the arms under some constraints. */
2915 : 159087 : && (nops == 2 || nops == 3)
2916 : 94242 : && oprnds_info[1]->first_dt == vect_internal_def
2917 : 53627 : && (is_gimple_assign (stmt_info->stmt)
2918 : 11297 : || is_gimple_call (stmt_info->stmt))
2919 : : /* Swapping operands for reductions breaks assumptions later on. */
2920 : 497756 : && STMT_VINFO_REDUC_IDX (stmt_info) == -1)
2921 : : {
2922 : : /* See whether we can swap the matching or the non-matching
2923 : : stmt operands. */
2924 : : bool swap_not_matching = true;
2925 : 49841 : do
2926 : : {
2927 : 7017093 : for (j = 0; j < group_size; ++j)
2928 : : {
2929 : 6981946 : if (matches[j] != !swap_not_matching)
2930 : 59659 : continue;
2931 : 6922287 : stmt_vec_info stmt_info = stmts[j];
2932 : : /* Verify if we can swap operands of this stmt. */
2933 : 6922287 : if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
2934 : : {
2935 : 6922261 : tree_code code = gimple_assign_rhs_code (stmt);
2936 : 6922261 : if (! commutative_tree_code (code)
2937 : 6922261 : && ! commutative_ternary_tree_code (code))
2938 : : {
2939 : 14670 : if (!swap_not_matching)
2940 : 6854 : goto fail;
2941 : : swap_not_matching = false;
2942 : : break;
2943 : : }
2944 : : }
2945 : 6967278 : else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
2946 : : {
2947 : 26 : internal_fn fn = (gimple_call_internal_p (call)
2948 : 26 : ? gimple_call_internal_fn (call)
2949 : : : IFN_LAST);
2950 : 26 : if ((! commutative_binary_fn_p (fn)
2951 : 26 : && ! commutative_ternary_fn_p (fn))
2952 : 28 : || first_commutative_argument (fn) != 0)
2953 : : {
2954 : 24 : if (!swap_not_matching)
2955 : 12 : goto fail;
2956 : : swap_not_matching = false;
2957 : : break;
2958 : : }
2959 : : }
2960 : : }
2961 : : }
2962 : 42975 : while (j != group_size);
2963 : :
2964 : : /* Swap mismatched definition stmts. */
2965 : 35147 : if (dump_enabled_p ())
2966 : 328 : dump_printf_loc (MSG_NOTE, vect_location,
2967 : : "Re-trying with swapped operands of stmts ");
2968 : 6993786 : for (j = 0; j < group_size; ++j)
2969 : 6958639 : if (matches[j] == !swap_not_matching)
2970 : : {
2971 : 13814914 : std::swap (oprnds_info[0]->def_stmts[j],
2972 : 6907457 : oprnds_info[1]->def_stmts[j]);
2973 : 13814914 : std::swap (oprnds_info[0]->ops[j],
2974 : 6907457 : oprnds_info[1]->ops[j]);
2975 : 6907457 : if (dump_enabled_p ())
2976 : 923 : dump_printf (MSG_NOTE, "%d ", j);
2977 : : }
2978 : 35147 : if (dump_enabled_p ())
2979 : 328 : dump_printf (MSG_NOTE, "\n");
2980 : : /* After swapping some operands we lost track whether an
2981 : : operand has any pattern defs so be conservative here. */
2982 : 67089 : if (oprnds_info[0]->any_pattern || oprnds_info[1]->any_pattern)
2983 : 3226 : oprnds_info[0]->any_pattern = oprnds_info[1]->any_pattern = true;
2984 : : /* And try again with scratch 'matches' ... */
2985 : 35147 : bool *tem = XALLOCAVEC (bool, group_size);
2986 : 35147 : if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2987 : : group_size, &this_max_nunits,
2988 : : tem, limit,
2989 : : &this_tree_size, bst_map)) != NULL)
2990 : : {
2991 : 6689 : oprnd_info->def_stmts = vNULL;
2992 : 6689 : children.safe_push (child);
2993 : 6689 : continue;
2994 : : }
2995 : : }
2996 : 448724 : fail:
2997 : :
2998 : : /* If the SLP build failed and we analyze a basic-block
2999 : : simply treat nodes we fail to build as externally defined
3000 : : (and thus build vectors from the scalar defs).
3001 : : The cost model will reject outright expensive cases.
3002 : : ??? This doesn't treat cases where permutation ultimately
3003 : : fails (or we don't try permutation below). Ideally we'd
3004 : : even compute a permutation that will end up with the maximum
3005 : : SLP tree size... */
3006 : 448724 : if (is_a <bb_vec_info> (vinfo)
3007 : : /* ??? Rejecting patterns this way doesn't work. We'd have to
3008 : : do extra work to cancel the pattern so the uses see the
3009 : : scalar version. */
3010 : 405655 : && !is_pattern_stmt_p (stmt_info)
3011 : 830683 : && !oprnd_info->any_pattern)
3012 : : {
3013 : : /* But if there's a leading vector sized set of matching stmts
3014 : : fail here so we can split the group. This matches the condition
3015 : : vect_analyze_slp_instance uses. */
3016 : : /* ??? We might want to split here and combine the results to support
3017 : : multiple vector sizes better. */
3018 : 590919 : for (j = 0; j < group_size; ++j)
3019 : 590919 : if (!matches[j])
3020 : : break;
3021 : 381718 : if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype))
3022 : 381682 : && vect_slp_can_convert_to_external (oprnd_info->def_stmts))
3023 : : {
3024 : 371291 : if (dump_enabled_p ())
3025 : 503 : dump_printf_loc (MSG_NOTE, vect_location,
3026 : : "Building vector operands from scalars\n");
3027 : 371291 : this_tree_size++;
3028 : 371291 : child = vect_create_new_slp_node (oprnd_info->ops);
3029 : 371291 : children.safe_push (child);
3030 : 371291 : oprnd_info->ops = vNULL;
3031 : 371291 : continue;
3032 : : }
3033 : : }
3034 : :
3035 : 77433 : gcc_assert (child == NULL);
3036 : 82209 : FOR_EACH_VEC_ELT (children, j, child)
3037 : 4776 : if (child)
3038 : 4776 : vect_free_slp_tree (child);
3039 : 77433 : vect_free_oprnd_info (oprnds_info);
3040 : 77433 : return NULL;
3041 : : }
3042 : :
3043 : 3171288 : vect_free_oprnd_info (oprnds_info);
3044 : :
3045 : : /* If we have all children of a child built up from uniform scalars
3046 : : or does more than one possibly expensive vector construction then
3047 : : just throw that away, causing it built up from scalars.
3048 : : The exception is the SLP node for the vector store. */
3049 : 3171288 : if (is_a <bb_vec_info> (vinfo)
3050 : 1119098 : && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
3051 : : /* ??? Rejecting patterns this way doesn't work. We'd have to
3052 : : do extra work to cancel the pattern so the uses see the
3053 : : scalar version. */
3054 : 3625180 : && !is_pattern_stmt_p (stmt_info))
3055 : : {
3056 : : slp_tree child;
3057 : : unsigned j;
3058 : : bool all_uniform_p = true;
3059 : : unsigned n_vector_builds = 0;
3060 : 1265750 : FOR_EACH_VEC_ELT (children, j, child)
3061 : : {
3062 : 836642 : if (!child)
3063 : : ;
3064 : 836642 : else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
3065 : : all_uniform_p = false;
3066 : 598590 : else if (!vect_slp_tree_uniform_p (child))
3067 : : {
3068 : 459999 : all_uniform_p = false;
3069 : 459999 : if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
3070 : 425225 : n_vector_builds++;
3071 : : }
3072 : : }
3073 : 429108 : if (all_uniform_p
3074 : 429108 : || n_vector_builds > 1
3075 : 729732 : || (n_vector_builds == children.length ()
3076 : 33117 : && is_a <gphi *> (stmt_info->stmt)))
3077 : : {
3078 : : /* Roll back. */
3079 : 136848 : matches[0] = false;
3080 : 420817 : FOR_EACH_VEC_ELT (children, j, child)
3081 : 283969 : if (child)
3082 : 283969 : vect_free_slp_tree (child);
3083 : :
3084 : 136848 : if (dump_enabled_p ())
3085 : 129 : dump_printf_loc (MSG_NOTE, vect_location,
3086 : : "Building parent vector operands from "
3087 : : "scalars instead\n");
3088 : 136848 : return NULL;
3089 : : }
3090 : : }
3091 : :
3092 : 3034440 : *tree_size += this_tree_size + 1;
3093 : 3034440 : *max_nunits = this_max_nunits;
3094 : :
3095 : 3034440 : if (two_operators)
3096 : : {
3097 : : /* ??? We'd likely want to either cache in bst_map sth like
3098 : : { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
3099 : : the true { a+b, a+b, a+b, a+b } ... but there we don't have
3100 : : explicit stmts to put in so the keying on 'stmts' doesn't
3101 : : work (but we have the same issue with nodes that use 'ops'). */
3102 : :
3103 : 4594 : if (has_two_operators_perm)
3104 : : {
3105 : 22 : slp_tree child = children[0];
3106 : 22 : children.truncate (0);
3107 : 66 : for (i = 0; i < 2; i++)
3108 : : {
3109 : 44 : slp_tree pnode
3110 : 44 : = vect_create_new_slp_node (two_op_scalar_stmts[i], 2);
3111 : 44 : SLP_TREE_CODE (pnode) = VEC_PERM_EXPR;
3112 : 44 : SLP_TREE_VECTYPE (pnode) = vectype;
3113 : 44 : SLP_TREE_CHILDREN (pnode).quick_push (child);
3114 : 44 : SLP_TREE_CHILDREN (pnode).quick_push (child);
3115 : 44 : lane_permutation_t& perm = SLP_TREE_LANE_PERMUTATION (pnode);
3116 : 44 : children.safe_push (pnode);
3117 : :
3118 : 476 : for (unsigned j = 0; j < stmts.length (); j++)
3119 : 432 : perm.safe_push (std::make_pair (0, two_op_perm_indices[i][j]));
3120 : : }
3121 : :
3122 : 22 : SLP_TREE_REF_COUNT (child) += 4;
3123 : : }
3124 : :
3125 : 4594 : slp_tree one = new _slp_tree;
3126 : 4594 : slp_tree two = new _slp_tree;
3127 : 4594 : SLP_TREE_DEF_TYPE (one) = vect_internal_def;
3128 : 4594 : SLP_TREE_DEF_TYPE (two) = vect_internal_def;
3129 : 4594 : SLP_TREE_VECTYPE (one) = vectype;
3130 : 4594 : SLP_TREE_VECTYPE (two) = vectype;
3131 : 4594 : SLP_TREE_CHILDREN (one).safe_splice (children);
3132 : 4594 : SLP_TREE_CHILDREN (two).safe_splice (children);
3133 : 4594 : slp_tree child;
3134 : 18378 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
3135 : 9190 : SLP_TREE_REF_COUNT (child)++;
3136 : :
3137 : : /* Here we record the original defs since this
3138 : : node represents the final lane configuration. */
3139 : 4594 : node = vect_create_new_slp_node (node, stmts, 2);
3140 : 4594 : SLP_TREE_VECTYPE (node) = vectype;
3141 : 4594 : SLP_TREE_CODE (node) = VEC_PERM_EXPR;
3142 : 4594 : SLP_TREE_CHILDREN (node).quick_push (one);
3143 : 4594 : SLP_TREE_CHILDREN (node).quick_push (two);
3144 : 4594 : enum tree_code code0 = ERROR_MARK;
3145 : 4594 : enum tree_code ocode = ERROR_MARK;
3146 : 4594 : if (gassign *stmt = dyn_cast <gassign *> (stmts[0]->stmt))
3147 : 4592 : code0 = gimple_assign_rhs_code (stmt);
3148 : 4594 : stmt_vec_info ostmt_info;
3149 : 4594 : unsigned j = 0;
3150 : 18006 : FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
3151 : : {
3152 : 13412 : int op = 0;
3153 : 13412 : if (gassign *ostmt = dyn_cast <gassign *> (ostmt_info->stmt))
3154 : : {
3155 : 13408 : if (gimple_assign_rhs_code (ostmt) != code0)
3156 : : {
3157 : 6732 : ocode = gimple_assign_rhs_code (ostmt);
3158 : : op = 1;
3159 : : j = i;
3160 : : }
3161 : : }
3162 : : else
3163 : : {
3164 : 8 : if (gimple_call_combined_fn (stmts[0]->stmt)
3165 : 4 : != gimple_call_combined_fn (ostmt_info->stmt))
3166 : : {
3167 : 2 : op = 1;
3168 : 2 : j = i;
3169 : : }
3170 : : }
3171 : 13412 : SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (op, i));
3172 : : }
3173 : 4594 : SLP_TREE_CODE (one) = code0;
3174 : 4594 : SLP_TREE_CODE (two) = ocode;
3175 : 4594 : SLP_TREE_LANES (one) = stmts.length ();
3176 : 4594 : SLP_TREE_LANES (two) = stmts.length ();
3177 : 4594 : SLP_TREE_REPRESENTATIVE (one) = stmts[0];
3178 : 4594 : SLP_TREE_REPRESENTATIVE (two) = stmts[j];
3179 : :
3180 : 4594 : return node;
3181 : : }
3182 : :
3183 : 3029846 : node = vect_create_new_slp_node (node, stmts, nops);
3184 : 3029846 : SLP_TREE_VECTYPE (node) = vectype;
3185 : 3029846 : SLP_TREE_CHILDREN (node).splice (children);
3186 : 3029846 : SLP_TREE_GS_SCALE (node) = gs_scale;
3187 : 3029846 : SLP_TREE_GS_BASE (node) = gs_base;
3188 : 3029846 : if (reduc_idx != -1)
3189 : : {
3190 : 72217 : gcc_assert (STMT_VINFO_REDUC_IDX (stmt_info) != -1
3191 : : || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
3192 : : || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def);
3193 : 72217 : SLP_TREE_REDUC_IDX (node) = reduc_idx;
3194 : 72217 : node->cycle_info.id = SLP_TREE_CHILDREN (node)[reduc_idx]->cycle_info.id;
3195 : : }
3196 : : /* When reaching the reduction PHI, create a vect_reduc_info. */
3197 : 2957629 : else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
3198 : 2957629 : || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
3199 : 2957629 : && is_a <gphi *> (STMT_VINFO_STMT (stmt_info)))
3200 : : {
3201 : 65425 : loop_vec_info loop_vinfo = as_a <loop_vec_info> (vinfo);
3202 : 65425 : gcc_assert (STMT_VINFO_REDUC_IDX (stmt_info) == -1);
3203 : 65425 : node->cycle_info.id = loop_vinfo->reduc_infos.length ();
3204 : 65425 : vect_reduc_info reduc_info = new vect_reduc_info_s ();
3205 : 65425 : loop_vinfo->reduc_infos.safe_push (reduc_info);
3206 : 65425 : stmt_vec_info reduc_phi = stmt_info;
3207 : : /* ??? For double reductions vect_is_simple_reduction stores the
3208 : : reduction type and code on the inner loop header PHI. */
3209 : 65425 : if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
3210 : : {
3211 : 317 : use_operand_p use_p;
3212 : 317 : gimple *use_stmt;
3213 : 317 : bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
3214 : : &use_p, &use_stmt);
3215 : 317 : gcc_assert (res);
3216 : 317 : reduc_phi = loop_vinfo->lookup_stmt (use_stmt);
3217 : : }
3218 : 65425 : VECT_REDUC_INFO_DEF_TYPE (reduc_info) = STMT_VINFO_DEF_TYPE (stmt_info);
3219 : 65425 : VECT_REDUC_INFO_TYPE (reduc_info) = STMT_VINFO_REDUC_TYPE (reduc_phi);
3220 : 65425 : VECT_REDUC_INFO_CODE (reduc_info) = STMT_VINFO_REDUC_CODE (reduc_phi);
3221 : 65425 : VECT_REDUC_INFO_FN (reduc_info) = IFN_LAST;
3222 : : }
3223 : : return node;
3224 : 9746163 : }
3225 : :
3226 : : /* Dump a single SLP tree NODE. */
3227 : :
static void
vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
		     slp_tree node)
{
  unsigned i, j;
  slp_tree child;
  stmt_vec_info stmt_info;
  tree op;

  dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
  dump_user_location_t user_loc = loc.get_user_location ();
  /* Header line: node kind, address, estimated nunits and refcount.  */
  dump_printf_loc (metadata, user_loc,
		   "node%s %p (max_nunits=" HOST_WIDE_INT_PRINT_UNSIGNED
		   ", refcnt=%u)",
		   SLP_TREE_DEF_TYPE (node) == vect_external_def
		   ? " (external)"
		   : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
		      ? " (constant)"
		      : ""), (void *) node,
		   estimated_poly_value (node->max_nunits),
		   SLP_TREE_REF_COUNT (node));
  if (SLP_TREE_VECTYPE (node))
    dump_printf (metadata, " %T", SLP_TREE_VECTYPE (node));
  dump_printf (metadata, "%s",
	       node->avoid_stlf_fail ? " (avoid-stlf-fail)" : "");
  /* -1 in both cycle fields means the node is not part of a cycle.  */
  if (node->cycle_info.id != -1 || node->cycle_info.reduc_idx != -1)
    dump_printf (metadata, " cycle %d, link %d", node->cycle_info.id,
		 node->cycle_info.reduc_idx);
  dump_printf (metadata, "\n");
  if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
    {
      if (SLP_TREE_PERMUTE_P (node))
	dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
      else
	dump_printf_loc (metadata, user_loc, "op template: %G",
			 SLP_TREE_REPRESENTATIVE (node)->stmt);
    }
  /* Either the per-lane scalar stmts (internal defs, NULL lanes dumped
     as "---") or the scalar operands (external/constant defs).  */
  if (SLP_TREE_SCALAR_STMTS (node).exists ())
    FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
      if (stmt_info)
	dump_printf_loc (metadata, user_loc, "\t%sstmt %u %G",
			 STMT_VINFO_LIVE_P (stmt_info) ? "[l] " : "",
			 i, stmt_info->stmt);
      else
	dump_printf_loc (metadata, user_loc, "\tstmt %u ---\n", i);
  else
    {
      dump_printf_loc (metadata, user_loc, "\t{ ");
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
	dump_printf (metadata, "%T%s ", op,
		     i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
      dump_printf (metadata, "}\n");
    }
  if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
    {
      dump_printf_loc (metadata, user_loc, "\tload permutation {");
      FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
	dump_printf (dump_kind, " %u", j);
      dump_printf (dump_kind, " }\n");
    }
  /* Lane permutations are pairs of (input node index, lane in input).  */
  if (SLP_TREE_LANE_PERMUTATION (node).exists ())
    {
      dump_printf_loc (metadata, user_loc, "\tlane permutation {");
      for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
	dump_printf (dump_kind, " %u[%u]",
		     SLP_TREE_LANE_PERMUTATION (node)[i].first,
		     SLP_TREE_LANE_PERMUTATION (node)[i].second);
      dump_printf (dump_kind, " }%s\n",
		   node->ldst_lanes ? " (load-lanes)" : "");
    }
  if (SLP_TREE_CHILDREN (node).is_empty ())
    return;
  dump_printf_loc (metadata, user_loc, "\tchildren");
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    dump_printf (dump_kind, " %p", (void *)child);
  dump_printf (dump_kind, "%s\n",
	       node->ldst_lanes && !SLP_TREE_LANE_PERMUTATION (node).exists ()
	       ? " (store-lanes)" : "");
}
3307 : :
/* Dump NODE from the debugger; debug_dump_context presumably redirects
   the dump machinery to a debugger-visible stream -- confirm in
   dump-context.h.  */

DEBUG_FUNCTION void
debug (slp_tree node)
{
  debug_dump_context ctx;
  vect_print_slp_tree (MSG_NOTE,
		       dump_location_t::from_location_t (UNKNOWN_LOCATION),
		       node);
}
3316 : :
3317 : : /* Recursive helper for the dot producer below. */
3318 : :
3319 : : static void
3320 : 0 : dot_slp_tree (FILE *f, slp_tree node, hash_set<slp_tree> &visited)
3321 : : {
3322 : 0 : if (visited.add (node))
3323 : : return;
3324 : :
3325 : 0 : fprintf (f, "\"%p\" [label=\"", (void *)node);
3326 : 0 : vect_print_slp_tree (MSG_NOTE,
3327 : 0 : dump_location_t::from_location_t (UNKNOWN_LOCATION),
3328 : : node);
3329 : 0 : fprintf (f, "\"];\n");
3330 : :
3331 : :
3332 : 0 : for (slp_tree child : SLP_TREE_CHILDREN (node))
3333 : 0 : fprintf (f, "\"%p\" -> \"%p\";", (void *)node, (void *)child);
3334 : :
3335 : 0 : for (slp_tree child : SLP_TREE_CHILDREN (node))
3336 : 0 : if (child)
3337 : 0 : dot_slp_tree (f, child, visited);
3338 : : }
3339 : :
3340 : : DEBUG_FUNCTION void
3341 : 0 : dot_slp_tree (const char *fname, slp_tree node)
3342 : : {
3343 : 0 : FILE *f = fopen (fname, "w");
3344 : 0 : fprintf (f, "digraph {\n");
3345 : 0 : fflush (f);
3346 : 0 : {
3347 : 0 : debug_dump_context ctx (f);
3348 : 0 : hash_set<slp_tree> visited;
3349 : 0 : dot_slp_tree (f, node, visited);
3350 : 0 : }
3351 : 0 : fflush (f);
3352 : 0 : fprintf (f, "}\n");
3353 : 0 : fclose (f);
3354 : 0 : }
3355 : :
3356 : : DEBUG_FUNCTION void
3357 : 0 : dot_slp_tree (const char *fname, const vec<slp_instance> &slp_instances)
3358 : : {
3359 : 0 : FILE *f = fopen (fname, "w");
3360 : 0 : fprintf (f, "digraph {\n");
3361 : 0 : fflush (f);
3362 : 0 : {
3363 : 0 : debug_dump_context ctx (f);
3364 : 0 : hash_set<slp_tree> visited;
3365 : 0 : for (auto inst : slp_instances)
3366 : 0 : dot_slp_tree (f, SLP_INSTANCE_TREE (inst), visited);
3367 : 0 : }
3368 : 0 : fflush (f);
3369 : 0 : fprintf (f, "}\n");
3370 : 0 : fclose (f);
3371 : 0 : }
3372 : :
3373 : : /* Dump a slp tree NODE using flags specified in DUMP_KIND. */
3374 : :
3375 : : static void
3376 : 473803 : vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
3377 : : slp_tree node, hash_set<slp_tree> &visited)
3378 : : {
3379 : 473803 : unsigned i;
3380 : 473803 : slp_tree child;
3381 : :
3382 : 473803 : if (visited.add (node))
3383 : 473803 : return;
3384 : :
3385 : 436800 : vect_print_slp_tree (dump_kind, loc, node);
3386 : :
3387 : 1321373 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3388 : 447773 : if (child)
3389 : 395442 : vect_print_slp_graph (dump_kind, loc, child, visited);
3390 : : }
3391 : :
/* Dump the SLP graph reachable from ENTRY, visiting shared nodes once.  */

static void
vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
		      slp_tree entry)
{
  hash_set<slp_tree> visited;
  vect_print_slp_graph (dump_kind, loc, entry, visited);
}
3399 : :
/* Dump the whole SLP graph of INSTANCE from the debugger.  */

DEBUG_FUNCTION void
debug (slp_instance instance)
{
  debug_dump_context ctx;
  vect_print_slp_graph (MSG_NOTE,
			dump_location_t::from_location_t (UNKNOWN_LOCATION),
			SLP_INSTANCE_TREE (instance));
}
3408 : :
3409 : : /* Mark the tree rooted at NODE with PURE_SLP. */
3410 : :
static void
vect_mark_slp_stmts (vec_info *vinfo, slp_tree node,
		     hash_set<slp_tree> &visited)
{
  int i;
  stmt_vec_info stmt_info;
  slp_tree child;

  /* Only internal defs carry scalar stmts to mark.  */
  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  /* Shared nodes are processed once.  */
  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    if (stmt_info)
      {
	STMT_SLP_TYPE (stmt_info) = pure_slp;
	/* ??? For .MASK_LOAD and .MASK_STORE detected as load/store-lanes
	   when there is the mask_conversion pattern applied we have lost the
	   alternate lanes of the uniform mask which nevertheless
	   have separate pattern defs.  To not confuse hybrid
	   analysis we mark those as covered as well here.  */
	if (node->ldst_lanes)
	  if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
	    if (gimple_call_internal_p (call, IFN_MASK_LOAD)
		|| gimple_call_internal_p (call, IFN_MASK_STORE))
	      {
		/* Locate the mask operand of the masked load/store and mark
		   its (pattern) definition as pure SLP as well.  */
		tree mask = gimple_call_arg (call,
					     internal_fn_mask_index
					       (gimple_call_internal_fn (call)));
		if (TREE_CODE (mask) == SSA_NAME)
		  if (stmt_vec_info mask_info = vinfo->lookup_def (mask))
		    {
		      mask_info = vect_stmt_to_vectorize (mask_info);
		      STMT_SLP_TYPE (mask_info) = pure_slp;
		    }
	      }
      }

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (child)
      vect_mark_slp_stmts (vinfo, child, visited);
}
3455 : :
/* Entry point: mark the tree rooted at NODE with PURE_SLP, providing
   the visited set for the recursive walk.  */

static void
vect_mark_slp_stmts (vec_info *vinfo, slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_mark_slp_stmts (vinfo, node, visited);
}
3462 : :
3463 : : /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
3464 : :
3465 : : static void
3466 : 2349447 : vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
3467 : : {
3468 : 2349447 : int i;
3469 : 2349447 : stmt_vec_info stmt_info;
3470 : 2349447 : slp_tree child;
3471 : :
3472 : 2349447 : if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
3473 : : return;
3474 : :
3475 : 1380731 : if (visited.add (node))
3476 : : return;
3477 : :
3478 : 4331342 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3479 : 3049994 : if (stmt_info)
3480 : : {
3481 : 3049994 : gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
3482 : : || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
3483 : 3049994 : STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
3484 : : }
3485 : :
3486 : 2842678 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3487 : 1561330 : if (child)
3488 : 1561330 : vect_mark_slp_stmts_relevant (child, visited);
3489 : : }
3490 : :
/* Entry point: mark the statements of the tree rooted at NODE as
   relevant, providing the visited set for the recursive walk.  */

static void
vect_mark_slp_stmts_relevant (slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_mark_slp_stmts_relevant (node, visited);
}
3497 : :
3498 : :
/* Gather load nodes in the SLP graph NODE and populate the LOADS array.  */
3500 : :
3501 : : static void
3502 : 11855123 : vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
3503 : : hash_set<slp_tree> &visited)
3504 : : {
3505 : 11855123 : if (!node || visited.add (node))
3506 : 2510144 : return;
3507 : :
3508 : 9344979 : if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
3509 : : return;
3510 : :
3511 : 6685963 : if (!SLP_TREE_PERMUTE_P (node))
3512 : : {
3513 : 6470686 : stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
3514 : 6470686 : if (STMT_VINFO_DATA_REF (stmt_info)
3515 : 2383742 : && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
3516 : 1282739 : loads.safe_push (node);
3517 : : }
3518 : :
3519 : : unsigned i;
3520 : : slp_tree child;
3521 : 15925029 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3522 : 9239066 : vect_gather_slp_loads (loads, child, visited);
3523 : : }
3524 : :
3525 : :
/* Find the last scalar statement in NODE.  */
3527 : :
3528 : : stmt_vec_info
3529 : 2746045 : vect_find_last_scalar_stmt_in_slp (slp_tree node)
3530 : : {
3531 : 2746045 : stmt_vec_info last = NULL;
3532 : 2746045 : stmt_vec_info stmt_vinfo;
3533 : :
3534 : 10030482 : for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
3535 : 7284437 : if (stmt_vinfo)
3536 : : {
3537 : 7284437 : stmt_vinfo = vect_orig_stmt (stmt_vinfo);
3538 : 7284437 : last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
3539 : : }
3540 : :
3541 : 2746045 : return last;
3542 : : }
3543 : :
3544 : : /* Find the first stmt in NODE. */
3545 : :
3546 : : stmt_vec_info
3547 : 532918 : vect_find_first_scalar_stmt_in_slp (slp_tree node)
3548 : : {
3549 : 532918 : stmt_vec_info first = NULL;
3550 : 532918 : stmt_vec_info stmt_vinfo;
3551 : :
3552 : 1794516 : for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
3553 : 1261598 : if (stmt_vinfo)
3554 : : {
3555 : 1258987 : stmt_vinfo = vect_orig_stmt (stmt_vinfo);
3556 : 1258987 : if (!first
3557 : 1258987 : || get_later_stmt (stmt_vinfo, first) == first)
3558 : : first = stmt_vinfo;
3559 : : }
3560 : :
3561 : 532918 : return first;
3562 : : }
3563 : :
3564 : : /* Splits a group of stores, currently beginning at FIRST_VINFO, into
3565 : : two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
3566 : : (also containing the first GROUP1_SIZE stmts, since stores are
3567 : : consecutive), the second containing the remainder.
3568 : : Return the first stmt in the second group. */
3569 : :
static stmt_vec_info
vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
{
  /* Splitting must start at the group leader and leave a non-empty
     second group.  */
  gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
  gcc_assert (group1_size > 0);
  int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
  gcc_assert (group2_size > 0);
  DR_GROUP_SIZE (first_vinfo) = group1_size;

  /* Walk to the last element of the first group.  */
  stmt_vec_info stmt_info = first_vinfo;
  for (unsigned i = group1_size; i > 1; i--)
    {
      stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
      gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
    }
  /* STMT is now the last element of the first group.  */
  stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
  /* Cut the chain between the two groups.  */
  DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;

  /* Re-parent all elements of the second group onto its new leader.  */
  DR_GROUP_SIZE (group2) = group2_size;
  for (stmt_info = group2; stmt_info;
       stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
    {
      DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
      gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
    }

  /* For the second group, the DR_GROUP_GAP is that before the original group,
     plus skipping over the first vector.  */
  DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;

  /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
  DR_GROUP_GAP (first_vinfo) += group2_size;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
		     group1_size, group2_size);

  /* Return the leader of the second group.  */
  return group2;
}
3610 : :
3611 : : /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
3612 : : statements and a vector of NUNITS elements. */
3613 : :
3614 : : static poly_uint64
3615 : 4501296 : calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
3616 : : {
3617 : 4501296 : return exact_div (common_multiple (nunits, group_size), group_size);
3618 : : }
3619 : :
3620 : : /* Helper that checks to see if a node is a load node. */
3621 : :
3622 : : static inline bool
3623 : 89 : vect_is_slp_load_node (slp_tree root)
3624 : : {
3625 : 89 : return (!SLP_TREE_PERMUTE_P (root)
3626 : 77 : && SLP_TREE_DEF_TYPE (root) == vect_internal_def
3627 : 65 : && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
3628 : 129 : && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root))));
3629 : : }
3630 : :
3631 : :
3632 : : /* Helper function of optimize_load_redistribution that performs the operation
3633 : : recursively. */
3634 : :
static slp_tree
optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
				vec_info *vinfo, unsigned int group_size,
				hash_map<slp_tree, slp_tree> *load_map,
				slp_tree root)
{
  /* LOAD_MAP caches the result per node: a replacement load node, or
     NULL for "already processed, no replacement".  */
  if (slp_tree *leader = load_map->get (root))
    return *leader;

  slp_tree node;
  unsigned i;

  /* For now, we don't know anything about externals so do not do anything.  */
  if (!root || SLP_TREE_DEF_TYPE (root) != vect_internal_def)
    return NULL;
  else if (SLP_TREE_PERMUTE_P (root))
    {
      /* First convert this node into a load node and add it to the leaves
	 list and flatten the permute from a lane to a load one.  If it's
	 unneeded it will be elided later.  */
      vec<stmt_vec_info> stmts;
      stmts.create (SLP_TREE_LANES (root));
      lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
      for (unsigned j = 0; j < lane_perm.length (); j++)
	{
	  std::pair<unsigned, unsigned> perm = lane_perm[j];
	  node = SLP_TREE_CHILDREN (root)[perm.first];

	  /* Every permute input must be a leaf load node, otherwise the
	     blend cannot be expressed as a single permuted load.  */
	  if (!vect_is_slp_load_node (node)
	      || SLP_TREE_CHILDREN (node).exists ())
	    {
	      stmts.release ();
	      goto next;
	    }

	  /* Collect the scalar load stmt selected by this lane.  */
	  stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
	}

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "converting stmts on permute node %p\n",
			 (void *) root);

      /* Rebuild (and CSE via BST_MAP) a load node from the gathered
	 scalar stmts.  */
      bool *matches = XALLOCAVEC (bool, group_size);
      poly_uint64 max_nunits = 1;
      unsigned tree_size = 0, limit = 1;
      node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
				  matches, &limit, &tree_size, bst_map);
      if (!node)
	stmts.release ();

      load_map->put (root, node);
      return node;
    }

next:
  /* Mark ROOT as processed without a replacement before recursing.  */
  load_map->put (root, NULL);

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
    {
      slp_tree value
	= optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
					  node);
      if (value)
	{
	  /* Splice the replacement in and release the old child.  */
	  SLP_TREE_REF_COUNT (value)++;
	  SLP_TREE_CHILDREN (root)[i] = value;
	  /* ??? We know the original leafs of the replaced nodes will
	     be referenced by bst_map, only the permutes created by
	     pattern matching are not.  */
	  if (SLP_TREE_REF_COUNT (node) == 1)
	    load_map->remove (node);
	  vect_free_slp_tree (node);
	}
    }

  return NULL;
}
3713 : :
3714 : : /* Temporary workaround for loads not being CSEd during SLP build. This
3715 : : function will traverse the SLP tree rooted in ROOT for INSTANCE and find
3716 : : VEC_PERM nodes that blend vectors from multiple nodes that all read from the
3717 : : same DR such that the final operation is equal to a permuted load. Such
3718 : : NODES are then directly converted into LOADS themselves. The nodes are
3719 : : CSEd using BST_MAP. */
3720 : :
3721 : : static void
3722 : 3076 : optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
3723 : : vec_info *vinfo, unsigned int group_size,
3724 : : hash_map<slp_tree, slp_tree> *load_map,
3725 : : slp_tree root)
3726 : : {
3727 : 3076 : slp_tree node;
3728 : 3076 : unsigned i;
3729 : :
3730 : 7196 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
3731 : : {
3732 : 4120 : slp_tree value
3733 : 4120 : = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
3734 : : node);
3735 : 4120 : if (value)
3736 : : {
3737 : 0 : SLP_TREE_REF_COUNT (value)++;
3738 : 0 : SLP_TREE_CHILDREN (root)[i] = value;
3739 : : /* ??? We know the original leafs of the replaced nodes will
3740 : : be referenced by bst_map, only the permutes created by
3741 : : pattern matching are not. */
3742 : 0 : if (SLP_TREE_REF_COUNT (node) == 1)
3743 : 0 : load_map->remove (node);
3744 : 0 : vect_free_slp_tree (node);
3745 : : }
3746 : : }
3747 : 3076 : }
3748 : :
3749 : : /* Helper function of vect_match_slp_patterns.
3750 : :
3751 : : Attempts to match patterns against the slp tree rooted in REF_NODE using
3752 : : VINFO. Patterns are matched in post-order traversal.
3753 : :
3754 : : If matching is successful the value in REF_NODE is updated and returned, if
3755 : : not then it is returned unchanged. */
3756 : :
static bool
vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
			   slp_tree_to_load_perm_map_t *perm_cache,
			   slp_compat_nodes_map_t *compat_cache,
			   hash_set<slp_tree> *visited)
{
  unsigned i;
  slp_tree node = *ref_node;
  bool found_p = false;
  /* Shared nodes are matched only once.  */
  if (!node || visited->add (node))
    return false;

  /* Post-order: match children first so inner patterns are replaced
     before this node is considered.  */
  slp_tree child;
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
					  vinfo, perm_cache, compat_cache,
					  visited);

  /* Try every registered pattern on this node; a matcher returns a
     pattern object which, once built, may rewrite *REF_NODE.  */
  for (unsigned x = 0; x < num__slp_patterns; x++)
    {
      vect_pattern *pattern
	= slp_patterns[x] (perm_cache, compat_cache, ref_node);
      if (pattern)
	{
	  pattern->build (vinfo);
	  delete pattern;
	  found_p = true;
	}
    }

  return found_p;
}
3789 : :
3790 : : /* Applies pattern matching to the given SLP tree rooted in REF_NODE using
3791 : : vec_info VINFO.
3792 : :
3793 : : The modified tree is returned. Patterns are tried in order and multiple
3794 : : patterns may match. */
3795 : :
3796 : : static bool
3797 : 1746433 : vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
3798 : : hash_set<slp_tree> *visited,
3799 : : slp_tree_to_load_perm_map_t *perm_cache,
3800 : : slp_compat_nodes_map_t *compat_cache)
3801 : : {
3802 : 1746433 : DUMP_VECT_SCOPE ("vect_match_slp_patterns");
3803 : 1746433 : slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
3804 : :
3805 : 1746433 : if (dump_enabled_p ())
3806 : 31874 : dump_printf_loc (MSG_NOTE, vect_location,
3807 : : "Analyzing SLP tree %p for patterns\n",
3808 : 31874 : (void *) SLP_INSTANCE_TREE (instance));
3809 : :
3810 : 1746433 : return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, compat_cache,
3811 : 1746433 : visited);
3812 : : }
3813 : :
3814 : : /* STMT_INFO is a store group of size GROUP_SIZE that we are considering
3815 : : vectorizing with VECTYPE that might be NULL. MASKED_P indicates whether
3816 : : the stores are masked.
3817 : : Return true if we could use IFN_STORE_LANES instead and if that appears
3818 : : to be the better approach. */
3819 : :
3820 : : static bool
3821 : 4929 : vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
3822 : : tree vectype, bool masked_p,
3823 : : unsigned int group_size,
3824 : : unsigned int new_group_size)
3825 : : {
3826 : 4929 : if (!vectype)
3827 : : {
3828 : 4929 : tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3829 : 4929 : vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
3830 : : }
3831 : 4929 : if (!vectype)
3832 : : return false;
3833 : : /* Allow the split if one of the two new groups would operate on full
3834 : : vectors *within* rather than across one scalar loop iteration.
3835 : : This is purely a heuristic, but it should work well for group
3836 : : sizes of 3 and 4, where the possible splits are:
3837 : :
3838 : : 3->2+1: OK if the vector has exactly two elements
3839 : : 4->2+2: Likewise
3840 : : 4->3+1: Less clear-cut. */
3841 : 4929 : if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
3842 : 2561 : || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
3843 : 2419 : return false;
3844 : 2510 : return vect_store_lanes_supported (vectype, group_size, masked_p) != IFN_LAST;
3845 : : }
3846 : :
3847 : : /* Analyze an SLP instance starting from a group of grouped stores. Call
3848 : : vect_build_slp_tree to build a tree of packed stmts if possible.
3849 : : Return FALSE if it's impossible to SLP any stmt in the loop. */
3850 : :
3851 : : static bool
3852 : : vect_analyze_slp_instance (vec_info *vinfo,
3853 : : scalar_stmts_to_slp_tree_map_t *bst_map,
3854 : : stmt_vec_info stmt_info, slp_instance_kind kind,
3855 : : unsigned max_tree_size, unsigned *limit,
3856 : : bool force_single_lane);
3857 : :
3858 : : /* Build an interleaving scheme for the store sources RHS_NODES from
3859 : : SCALAR_STMTS. */
3860 : :
static slp_tree
vect_build_slp_store_interleaving (vec<slp_tree> &rhs_nodes,
				   vec<stmt_vec_info> &scalar_stmts,
				   poly_uint64 max_nunits)
{
  /* The result is one store node with one VEC_PERM child per operand
     position of the RHS nodes.  */
  unsigned int group_size = scalar_stmts.length ();
  slp_tree node = vect_create_new_slp_node (scalar_stmts,
					    SLP_TREE_CHILDREN
					      (rhs_nodes[0]).length ());
  SLP_TREE_VECTYPE (node) = SLP_TREE_VECTYPE (rhs_nodes[0]);
  node->max_nunits = max_nunits;
  for (unsigned l = 0;
       l < SLP_TREE_CHILDREN (rhs_nodes[0]).length (); ++l)
    {
      /* And a permute merging all RHS SLP trees.  */
      slp_tree perm = vect_create_new_slp_node (rhs_nodes.length (),
						VEC_PERM_EXPR);
      SLP_TREE_CHILDREN (node).quick_push (perm);
      SLP_TREE_LANE_PERMUTATION (perm).create (group_size);
      SLP_TREE_VECTYPE (perm) = SLP_TREE_VECTYPE (node);
      perm->max_nunits = max_nunits;
      SLP_TREE_LANES (perm) = group_size;
      /* ??? We should set this NULL but that's not expected.  */
      SLP_TREE_REPRESENTATIVE (perm)
	= SLP_TREE_REPRESENTATIVE (SLP_TREE_CHILDREN (rhs_nodes[0])[l]);
      /* Initially the permute selects all lanes of all inputs in order:
	 input J, lane K becomes permutation entry (J, K).  */
      for (unsigned j = 0; j < rhs_nodes.length (); ++j)
	{
	  SLP_TREE_CHILDREN (perm)
	    .quick_push (SLP_TREE_CHILDREN (rhs_nodes[j])[l]);
	  SLP_TREE_CHILDREN (rhs_nodes[j])[l]->refcnt++;
	  for (unsigned k = 0;
	       k < SLP_TREE_SCALAR_STMTS (rhs_nodes[j]).length (); ++k)
	    {
	      /* ??? We should populate SLP_TREE_SCALAR_STMTS
		 or SLP_TREE_SCALAR_OPS but then we might have
		 a mix of both in our children.  */
	      SLP_TREE_LANE_PERMUTATION (perm)
		.quick_push (std::make_pair (j, k));
	    }
	}

      /* Now we have a single permute node but we cannot code-generate
	 the case with more than two inputs.
	 Perform pairwise reduction, reducing the two inputs
	 with the least number of lanes to one and then repeat until
	 we end up with two inputs.  That scheme makes sure we end
	 up with permutes satisfying the restriction of requiring at
	 most two vector inputs to produce a single vector output
	 when the number of lanes is even.  */
      while (SLP_TREE_CHILDREN (perm).length () > 2)
	{
	  /* When we have three equal sized groups left the pairwise
	     reduction does not result in a scheme that avoids using
	     three vectors.  Instead merge the first two groups
	     to the final size with do-not-care elements (chosen
	     from the first group) and then merge with the third.
	       { A0, B0, x, A1, B1, x, ... }
	       -> { A0, B0, C0, A1, B1, C1, ... }
	     This handles group size of three (and at least
	     power-of-two multiples of that).  */
	  if (SLP_TREE_CHILDREN (perm).length () == 3
	      && (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[0])
		  == SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[1]))
	      && (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[0])
		  == SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[2])))
	    {
	      int ai = 0;
	      int bi = 1;
	      slp_tree a = SLP_TREE_CHILDREN (perm)[ai];
	      slp_tree b = SLP_TREE_CHILDREN (perm)[bi];
	      unsigned n = SLP_TREE_LANES (perm);

	      /* Build a two-input permute of A and B padded with
		 do-not-care lanes up to the full lane count N.  */
	      slp_tree permab = vect_create_new_slp_node (2, VEC_PERM_EXPR);
	      SLP_TREE_LANES (permab) = n;
	      SLP_TREE_LANE_PERMUTATION (permab).create (n);
	      SLP_TREE_VECTYPE (permab) = SLP_TREE_VECTYPE (perm);
	      permab->max_nunits = max_nunits;
	      /* ??? Should be NULL but that's not expected.  */
	      SLP_TREE_REPRESENTATIVE (permab) = SLP_TREE_REPRESENTATIVE (perm);
	      SLP_TREE_CHILDREN (permab).quick_push (a);
	      for (unsigned k = 0; k < SLP_TREE_LANES (a); ++k)
		SLP_TREE_LANE_PERMUTATION (permab)
		  .quick_push (std::make_pair (0, k));
	      SLP_TREE_CHILDREN (permab).quick_push (b);
	      for (unsigned k = 0; k < SLP_TREE_LANES (b); ++k)
		SLP_TREE_LANE_PERMUTATION (permab)
		  .quick_push (std::make_pair (1, k));
	      /* Push the do-not-care lanes.  */
	      for (unsigned k = 0; k < SLP_TREE_LANES (a); ++k)
		SLP_TREE_LANE_PERMUTATION (permab)
		  .quick_push (std::make_pair (0, k));

	      /* Put the merged node into 'perm', in place of a.  */
	      SLP_TREE_CHILDREN (perm)[ai] = permab;
	      /* Adjust the references to b in the permutation
		 of perm and to the later children which we'll
		 remove.  */
	      for (unsigned k = 0; k < SLP_TREE_LANES (perm); ++k)
		{
		  std::pair<unsigned, unsigned> &p
		    = SLP_TREE_LANE_PERMUTATION (perm)[k];
		  if (p.first == (unsigned) bi)
		    {
		      p.first = ai;
		      p.second += SLP_TREE_LANES (a);
		    }
		  else if (p.first > (unsigned) bi)
		    p.first--;
		}
	      SLP_TREE_CHILDREN (perm).ordered_remove (bi);
	      break;
	    }

	  /* Pick the two nodes with the least number of lanes,
	     prefer the earliest candidate and maintain ai < bi.  */
	  int ai = -1;
	  int bi = -1;
	  for (unsigned ci = 0; ci < SLP_TREE_CHILDREN (perm).length (); ++ci)
	    {
	      if (ai == -1)
		ai = ci;
	      else if (bi == -1)
		bi = ci;
	      else if ((SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ci])
			< SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ai]))
		       || (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ci])
			   < SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[bi])))
		{
		  if (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ai])
		      <= SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[bi]))
		    bi = ci;
		  else
		    {
		      ai = bi;
		      bi = ci;
		    }
		}
	    }

	  /* Produce a merge of nodes ai and bi.  */
	  slp_tree a = SLP_TREE_CHILDREN (perm)[ai];
	  slp_tree b = SLP_TREE_CHILDREN (perm)[bi];
	  unsigned n = SLP_TREE_LANES (a) + SLP_TREE_LANES (b);
	  slp_tree permab = vect_create_new_slp_node (2, VEC_PERM_EXPR);
	  SLP_TREE_LANES (permab) = n;
	  SLP_TREE_LANE_PERMUTATION (permab).create (n);
	  SLP_TREE_VECTYPE (permab) = SLP_TREE_VECTYPE (perm);
	  permab->max_nunits = max_nunits;
	  /* ??? Should be NULL but that's not expected.  */
	  SLP_TREE_REPRESENTATIVE (permab) = SLP_TREE_REPRESENTATIVE (perm);
	  SLP_TREE_CHILDREN (permab).quick_push (a);
	  for (unsigned k = 0; k < SLP_TREE_LANES (a); ++k)
	    SLP_TREE_LANE_PERMUTATION (permab)
	      .quick_push (std::make_pair (0, k));
	  SLP_TREE_CHILDREN (permab).quick_push (b);
	  for (unsigned k = 0; k < SLP_TREE_LANES (b); ++k)
	    SLP_TREE_LANE_PERMUTATION (permab)
	      .quick_push (std::make_pair (1, k));

	  /* Put the merged node into 'perm', in place of a.  */
	  SLP_TREE_CHILDREN (perm)[ai] = permab;
	  /* Adjust the references to b in the permutation
	     of perm and to the later children which we'll
	     remove.  */
	  for (unsigned k = 0; k < SLP_TREE_LANES (perm); ++k)
	    {
	      std::pair<unsigned, unsigned> &p
		= SLP_TREE_LANE_PERMUTATION (perm)[k];
	      if (p.first == (unsigned) bi)
		{
		  p.first = ai;
		  p.second += SLP_TREE_LANES (a);
		}
	      else if (p.first > (unsigned) bi)
		p.first--;
	    }
	  SLP_TREE_CHILDREN (perm).ordered_remove (bi);
	}
    }

  return node;
}
4043 : :
4044 : : /* Analyze an SLP instance starting from SCALAR_STMTS which are a group
4045 : : of KIND. Return true if successful. */
4046 : :
4047 : : static bool
4048 : 2176941 : vect_build_slp_instance (vec_info *vinfo,
4049 : : slp_instance_kind kind,
4050 : : vec<stmt_vec_info> &scalar_stmts,
4051 : : vec<stmt_vec_info> &root_stmt_infos,
4052 : : vec<tree> &remain,
4053 : : unsigned max_tree_size, unsigned *limit,
4054 : : scalar_stmts_to_slp_tree_map_t *bst_map,
4055 : : bool force_single_lane)
4056 : : {
4057 : : /* If there's no budget left bail out early. */
4058 : 2176941 : if (*limit == 0)
4059 : : return false;
4060 : :
4061 : 2149724 : if (kind == slp_inst_kind_ctor)
4062 : : {
4063 : 11599 : if (dump_enabled_p ())
4064 : 54 : dump_printf_loc (MSG_NOTE, vect_location,
4065 : : "Analyzing vectorizable constructor: %G\n",
4066 : 27 : root_stmt_infos[0]->stmt);
4067 : : }
4068 : 2138125 : else if (kind == slp_inst_kind_gcond)
4069 : : {
4070 : 247453 : if (dump_enabled_p ())
4071 : 5178 : dump_printf_loc (MSG_NOTE, vect_location,
4072 : : "Analyzing vectorizable control flow: %G",
4073 : 2589 : root_stmt_infos[0]->stmt);
4074 : : }
4075 : :
4076 : 2149724 : if (dump_enabled_p ())
4077 : : {
4078 : 31156 : dump_printf_loc (MSG_NOTE, vect_location,
4079 : : "Starting SLP discovery for\n");
4080 : 66009 : for (unsigned i = 0; i < scalar_stmts.length (); ++i)
4081 : 69706 : dump_printf_loc (MSG_NOTE, vect_location,
4082 : 34853 : " %G", scalar_stmts[i]->stmt);
4083 : : }
4084 : :
4085 : : /* Build the tree for the SLP instance. */
4086 : 2149724 : unsigned int group_size = scalar_stmts.length ();
4087 : 2149724 : bool *matches = XALLOCAVEC (bool, group_size);
4088 : 2149724 : poly_uint64 max_nunits = 1;
4089 : 2149724 : unsigned tree_size = 0;
4090 : :
4091 : 2149724 : slp_tree node = NULL;
4092 : 2149724 : if (group_size > 1 && force_single_lane)
4093 : : {
4094 : 0 : matches[0] = true;
4095 : 0 : matches[1] = false;
4096 : : }
4097 : : else
4098 : 2149724 : node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
4099 : : &max_nunits, matches, limit,
4100 : : &tree_size, bst_map);
4101 : 2149724 : if (node != NULL)
4102 : : {
4103 : : /* Calculate the unrolling factor based on the smallest type. */
4104 : 1070159 : poly_uint64 unrolling_factor
4105 : 1070159 : = calculate_unrolling_factor (max_nunits, group_size);
4106 : :
4107 : 1070159 : if (maybe_ne (unrolling_factor, 1U)
4108 : 1070159 : && is_a <bb_vec_info> (vinfo))
4109 : : {
4110 : 0 : unsigned HOST_WIDE_INT const_max_nunits;
4111 : 0 : if (!max_nunits.is_constant (&const_max_nunits)
4112 : 0 : || const_max_nunits > group_size)
4113 : : {
4114 : 0 : if (dump_enabled_p ())
4115 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4116 : : "Build SLP failed: store group "
4117 : : "size not a multiple of the vector size "
4118 : : "in basic block SLP\n");
4119 : 0 : vect_free_slp_tree (node);
4120 : 0 : return false;
4121 : : }
4122 : : /* Fatal mismatch. */
4123 : 0 : if (dump_enabled_p ())
4124 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
4125 : : "SLP discovery succeeded but node needs "
4126 : : "splitting\n");
4127 : 0 : memset (matches, true, group_size);
4128 : 0 : matches[group_size / const_max_nunits * const_max_nunits] = false;
4129 : 0 : vect_free_slp_tree (node);
4130 : : }
4131 : : else
4132 : : {
4133 : : /* Create a new SLP instance. */
4134 : 1070159 : slp_instance new_instance = XNEW (class _slp_instance);
4135 : 1070159 : SLP_INSTANCE_TREE (new_instance) = node;
4136 : 1070159 : SLP_INSTANCE_LOADS (new_instance) = vNULL;
4137 : 1070159 : SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
4138 : 1070159 : SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
4139 : 1070159 : SLP_INSTANCE_KIND (new_instance) = kind;
4140 : 1070159 : new_instance->reduc_phis = NULL;
4141 : 1070159 : new_instance->cost_vec = vNULL;
4142 : 1070159 : new_instance->subgraph_entries = vNULL;
4143 : :
4144 : 1070159 : if (dump_enabled_p ())
4145 : 27989 : dump_printf_loc (MSG_NOTE, vect_location,
4146 : : "SLP size %u vs. limit %u.\n",
4147 : : tree_size, max_tree_size);
4148 : :
4149 : 1070159 : vinfo->slp_instances.safe_push (new_instance);
4150 : :
4151 : : /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
4152 : : the number of scalar stmts in the root in a few places.
4153 : : Verify that assumption holds. */
4154 : 2140318 : gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
4155 : : .length () == group_size);
4156 : :
4157 : 1070159 : if (dump_enabled_p ())
4158 : : {
4159 : 27989 : dump_printf_loc (MSG_NOTE, vect_location,
4160 : : "Final SLP tree for instance %p:\n",
4161 : : (void *) new_instance);
4162 : 27989 : vect_print_slp_graph (MSG_NOTE, vect_location,
4163 : : SLP_INSTANCE_TREE (new_instance));
4164 : : }
4165 : :
4166 : 1070159 : return true;
4167 : : }
4168 : : }
4169 : : /* Failed to SLP. */
4170 : :
4171 : : /* While we arrive here even with slp_inst_kind_store we should only
4172 : : for group_size == 1. The code to split store groups is only in
4173 : : vect_analyze_slp_instance now. */
4174 : 1079565 : gcc_assert (kind != slp_inst_kind_store || group_size == 1);
4175 : :
4176 : : /* Free the allocated memory. */
4177 : 1079565 : scalar_stmts.release ();
4178 : :
4179 : : /* Failed to SLP. */
4180 : 1079565 : if (dump_enabled_p ())
4181 : 3167 : dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
4182 : : return false;
4183 : : }
4184 : :
4185 : :  /* Analyze an SLP instance starting from the start of a reduction chain.
4186 : : Call vect_build_slp_tree to build a tree of packed stmts if possible.
4187 : : Return FALSE if SLP build fails. */
4188 : :
4189 : : static bool
4190 : 3053 : vect_analyze_slp_reduc_chain (vec_info *vinfo,
4191 : : scalar_stmts_to_slp_tree_map_t *bst_map,
4192 : : stmt_vec_info stmt_info,
4193 : : unsigned max_tree_size, unsigned *limit)
4194 : : {
4195 : 3053 : vec<stmt_vec_info> scalar_stmts;
4196 : :
4197 : : /* Collect the reduction stmts and store them in scalar_stmts. */
4198 : 3053 : scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info));
4199 : 3053 : stmt_vec_info next_info = stmt_info;
4200 : 13626 : while (next_info)
4201 : : {
4202 : 7520 : scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
4203 : 7520 : next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
4204 : : }
4205 : : /* Mark the first element of the reduction chain as reduction to properly
4206 : : transform the node. In the reduction analysis phase only the last
4207 : : element of the chain is marked as reduction. */
4208 : 3053 : STMT_VINFO_DEF_TYPE (stmt_info)
4209 : 3053 : = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
4210 : 3053 : STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
4211 : 3085 : = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
4212 : :
4213 : : /* Build the tree for the SLP instance. */
4214 : 3053 : vec<stmt_vec_info> root_stmt_infos = vNULL;
4215 : 3053 : vec<tree> remain = vNULL;
4216 : :
4217 : : /* If there's no budget left bail out early. */
4218 : 3053 : if (*limit == 0)
4219 : : return false;
4220 : :
4221 : 3053 : if (dump_enabled_p ())
4222 : : {
4223 : 186 : dump_printf_loc (MSG_NOTE, vect_location,
4224 : : "Starting SLP discovery for\n");
4225 : 940 : for (unsigned i = 0; i < scalar_stmts.length (); ++i)
4226 : 1508 : dump_printf_loc (MSG_NOTE, vect_location,
4227 : 754 : " %G", scalar_stmts[i]->stmt);
4228 : : }
4229 : :
4230 : : /* Build the tree for the SLP instance. */
4231 : 3053 : unsigned int group_size = scalar_stmts.length ();
4232 : 3053 : bool *matches = XALLOCAVEC (bool, group_size);
4233 : 3053 : poly_uint64 max_nunits = 1;
4234 : 3053 : unsigned tree_size = 0;
4235 : :
4236 : 3053 : slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
4237 : : &max_nunits, matches, limit,
4238 : 3053 : &tree_size, bst_map);
4239 : 3053 : if (node != NULL)
4240 : : {
4241 : : /* Calculate the unrolling factor based on the smallest type. */
4242 : 2815 : poly_uint64 unrolling_factor
4243 : 2815 : = calculate_unrolling_factor (max_nunits, group_size);
4244 : :
4245 : 2815 : if (maybe_ne (unrolling_factor, 1U)
4246 : 2815 : && is_a <bb_vec_info> (vinfo))
4247 : : {
4248 : 0 : unsigned HOST_WIDE_INT const_max_nunits;
4249 : 0 : if (!max_nunits.is_constant (&const_max_nunits)
4250 : 0 : || const_max_nunits > group_size)
4251 : : {
4252 : 0 : if (dump_enabled_p ())
4253 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4254 : : "Build SLP failed: store group "
4255 : : "size not a multiple of the vector size "
4256 : : "in basic block SLP\n");
4257 : 0 : vect_free_slp_tree (node);
4258 : 0 : return false;
4259 : : }
4260 : : /* Fatal mismatch. */
4261 : 0 : if (dump_enabled_p ())
4262 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
4263 : : "SLP discovery succeeded but node needs "
4264 : : "splitting\n");
4265 : 0 : memset (matches, true, group_size);
4266 : 0 : matches[group_size / const_max_nunits * const_max_nunits] = false;
4267 : 0 : vect_free_slp_tree (node);
4268 : : }
4269 : : else
4270 : : {
4271 : : /* Create a new SLP instance. */
4272 : 2815 : slp_instance new_instance = XNEW (class _slp_instance);
4273 : 2815 : SLP_INSTANCE_TREE (new_instance) = node;
4274 : 2815 : SLP_INSTANCE_LOADS (new_instance) = vNULL;
4275 : 2815 : SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
4276 : 2815 : SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
4277 : 2815 : SLP_INSTANCE_KIND (new_instance) = slp_inst_kind_reduc_chain;
4278 : 2815 : new_instance->reduc_phis = NULL;
4279 : 2815 : new_instance->cost_vec = vNULL;
4280 : 2815 : new_instance->subgraph_entries = vNULL;
4281 : :
4282 : 2815 : if (dump_enabled_p ())
4283 : 131 : dump_printf_loc (MSG_NOTE, vect_location,
4284 : : "SLP size %u vs. limit %u.\n",
4285 : : tree_size, max_tree_size);
4286 : :
4287 : : /* Fixup SLP reduction chains. If this is a reduction chain with
4288 : : a conversion in front amend the SLP tree with a node for that. */
4289 : 2815 : gimple *scalar_def
4290 : 2815 : = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
4291 : 2815 : if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
4292 : : {
4293 : : /* Get at the conversion stmt - we know it's the single use
4294 : : of the last stmt of the reduction chain. */
4295 : 36 : use_operand_p use_p;
4296 : 36 : bool r = single_imm_use (gimple_assign_lhs (scalar_def),
4297 : : &use_p, &scalar_def);
4298 : 36 : gcc_assert (r);
4299 : 36 : stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
4300 : 36 : next_info = vect_stmt_to_vectorize (next_info);
4301 : 36 : scalar_stmts = vNULL;
4302 : 36 : scalar_stmts.create (group_size);
4303 : 114 : for (unsigned i = 0; i < group_size; ++i)
4304 : 78 : scalar_stmts.quick_push (next_info);
4305 : 36 : slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
4306 : 36 : SLP_TREE_VECTYPE (conv)
4307 : 36 : = get_vectype_for_scalar_type (vinfo,
4308 : 36 : TREE_TYPE
4309 : : (gimple_assign_lhs (scalar_def)),
4310 : : group_size);
4311 : 36 : SLP_TREE_REDUC_IDX (conv) = 0;
4312 : 36 : conv->cycle_info.id = node->cycle_info.id;
4313 : 36 : SLP_TREE_CHILDREN (conv).quick_push (node);
4314 : 36 : SLP_INSTANCE_TREE (new_instance) = conv;
4315 : : /* We also have to fake this conversion stmt as SLP reduction
4316 : : group so we don't have to mess with too much code
4317 : : elsewhere. */
4318 : 36 : REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
4319 : 36 : REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
4320 : : }
4321 : : /* Fill the backedge child of the PHI SLP node. The
4322 : : general matching code cannot find it because the
4323 : : scalar code does not reflect how we vectorize the
4324 : : reduction. */
4325 : 2815 : use_operand_p use_p;
4326 : 2815 : imm_use_iterator imm_iter;
4327 : 2815 : class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
4328 : 10961 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
4329 : : gimple_get_lhs (scalar_def))
4330 : : /* There are exactly two non-debug uses, the reduction
4331 : : PHI and the loop-closed PHI node. */
4332 : 8146 : if (!is_gimple_debug (USE_STMT (use_p))
4333 : 8146 : && gimple_bb (USE_STMT (use_p)) == loop->header)
4334 : : {
4335 : 2815 : auto_vec<stmt_vec_info, 64> phis (group_size);
4336 : 2815 : stmt_vec_info phi_info
4337 : 2815 : = vinfo->lookup_stmt (USE_STMT (use_p));
4338 : 9759 : for (unsigned i = 0; i < group_size; ++i)
4339 : 6944 : phis.quick_push (phi_info);
4340 : 2815 : slp_tree *phi_node = bst_map->get (phis);
4341 : 2815 : unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
4342 : 5630 : SLP_TREE_CHILDREN (*phi_node)[dest_idx]
4343 : 2815 : = SLP_INSTANCE_TREE (new_instance);
4344 : 2815 : SLP_INSTANCE_TREE (new_instance)->refcnt++;
4345 : 2815 : }
4346 : :
4347 : 2815 : vinfo->slp_instances.safe_push (new_instance);
4348 : :
4349 : : /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
4350 : : the number of scalar stmts in the root in a few places.
4351 : : Verify that assumption holds. */
4352 : 5630 : gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
4353 : : .length () == group_size);
4354 : :
4355 : 2815 : if (dump_enabled_p ())
4356 : : {
4357 : 131 : dump_printf_loc (MSG_NOTE, vect_location,
4358 : : "Final SLP tree for instance %p:\n",
4359 : : (void *) new_instance);
4360 : 131 : vect_print_slp_graph (MSG_NOTE, vect_location,
4361 : : SLP_INSTANCE_TREE (new_instance));
4362 : : }
4363 : :
4364 : 2815 : return true;
4365 : : }
4366 : : }
4367 : : /* Failed to SLP. */
4368 : :
4369 : : /* Free the allocated memory. */
4370 : 238 : scalar_stmts.release ();
4371 : :
4372 : : /* Failed to SLP. */
4373 : 238 : if (dump_enabled_p ())
4374 : 55 : dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
4375 : : return false;
4376 : : }
4377 : :
4378 : : /* Analyze an SLP instance starting from a group of grouped stores. Call
4379 : : vect_build_slp_tree to build a tree of packed stmts if possible.
4380 : : Return FALSE if it's impossible to SLP any stmt in the group. */
4381 : :
4382 : : static bool
4383 : 1090407 : vect_analyze_slp_instance (vec_info *vinfo,
4384 : : scalar_stmts_to_slp_tree_map_t *bst_map,
4385 : : stmt_vec_info stmt_info,
4386 : : slp_instance_kind kind,
4387 : : unsigned max_tree_size, unsigned *limit,
4388 : : bool force_single_lane)
4389 : : {
4390 : 1090407 : vec<stmt_vec_info> scalar_stmts;
4391 : :
4392 : 1090407 : if (is_a <bb_vec_info> (vinfo))
4393 : 1068465 : vect_location = stmt_info->stmt;
4394 : :
4395 : 1090407 : gcc_assert (kind == slp_inst_kind_store);
4396 : :
4397 : : /* Collect the stores and store them in scalar_stmts. */
4398 : 1090407 : scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
4399 : 1090407 : stmt_vec_info next_info = stmt_info;
4400 : 5421172 : while (next_info)
4401 : : {
4402 : 3240358 : scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
4403 : 3240358 : next_info = DR_GROUP_NEXT_ELEMENT (next_info);
4404 : : }
4405 : :
4406 : 1090407 : vec<stmt_vec_info> root_stmt_infos = vNULL;
4407 : 1090407 : vec<tree> remain = vNULL;
4408 : :
4409 : : /* Build the tree for the SLP instance. */
4410 : :
4411 : : /* If there's no budget left bail out early. */
4412 : 1090407 : if (*limit == 0)
4413 : : return false;
4414 : :
4415 : 1090384 : if (dump_enabled_p ())
4416 : : {
4417 : 3973 : dump_printf_loc (MSG_NOTE, vect_location,
4418 : : "Starting SLP discovery for\n");
4419 : 22654 : for (unsigned i = 0; i < scalar_stmts.length (); ++i)
4420 : 37362 : dump_printf_loc (MSG_NOTE, vect_location,
4421 : 18681 : " %G", scalar_stmts[i]->stmt);
4422 : : }
4423 : :
4424 : : /* Build the tree for the SLP instance. */
4425 : 1090384 : unsigned int group_size = scalar_stmts.length ();
4426 : 1090384 : bool *matches = XALLOCAVEC (bool, group_size);
4427 : 1090384 : poly_uint64 max_nunits = 1;
4428 : 1090384 : unsigned tree_size = 0;
4429 : 1090384 : unsigned i;
4430 : :
4431 : 1090384 : slp_tree node = NULL;
4432 : 1090384 : if (group_size > 1 && force_single_lane)
4433 : : {
4434 : 1460 : matches[0] = true;
4435 : 1460 : matches[1] = false;
4436 : : }
4437 : : else
4438 : 1088924 : node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
4439 : : &max_nunits, matches, limit,
4440 : : &tree_size, bst_map);
4441 : 1090384 : if (node != NULL)
4442 : : {
4443 : : /* Calculate the unrolling factor based on the smallest type. */
4444 : 680454 : poly_uint64 unrolling_factor
4445 : 680454 : = calculate_unrolling_factor (max_nunits, group_size);
4446 : :
4447 : 680454 : if (maybe_ne (unrolling_factor, 1U)
4448 : 680454 : && is_a <bb_vec_info> (vinfo))
4449 : : {
4450 : 0 : unsigned HOST_WIDE_INT const_max_nunits;
4451 : 0 : if (!max_nunits.is_constant (&const_max_nunits)
4452 : 0 : || const_max_nunits > group_size)
4453 : : {
4454 : 0 : if (dump_enabled_p ())
4455 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4456 : : "Build SLP failed: store group "
4457 : : "size not a multiple of the vector size "
4458 : : "in basic block SLP\n");
4459 : 0 : vect_free_slp_tree (node);
4460 : 0 : return false;
4461 : : }
4462 : : /* Fatal mismatch. */
4463 : 0 : if (dump_enabled_p ())
4464 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
4465 : : "SLP discovery succeeded but node needs "
4466 : : "splitting\n");
4467 : 0 : memset (matches, true, group_size);
4468 : 0 : matches[group_size / const_max_nunits * const_max_nunits] = false;
4469 : 0 : vect_free_slp_tree (node);
4470 : : }
4471 : : else
4472 : : {
4473 : : /* Create a new SLP instance. */
4474 : 680454 : slp_instance new_instance = XNEW (class _slp_instance);
4475 : 680454 : SLP_INSTANCE_TREE (new_instance) = node;
4476 : 680454 : SLP_INSTANCE_LOADS (new_instance) = vNULL;
4477 : 680454 : SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
4478 : 680454 : SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
4479 : 680454 : SLP_INSTANCE_KIND (new_instance) = kind;
4480 : 680454 : new_instance->reduc_phis = NULL;
4481 : 680454 : new_instance->cost_vec = vNULL;
4482 : 680454 : new_instance->subgraph_entries = vNULL;
4483 : :
4484 : 680454 : if (dump_enabled_p ())
4485 : 3014 : dump_printf_loc (MSG_NOTE, vect_location,
4486 : : "SLP size %u vs. limit %u.\n",
4487 : : tree_size, max_tree_size);
4488 : :
4489 : 680454 : vinfo->slp_instances.safe_push (new_instance);
4490 : :
4491 : : /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
4492 : : the number of scalar stmts in the root in a few places.
4493 : : Verify that assumption holds. */
4494 : 1360908 : gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
4495 : : .length () == group_size);
4496 : :
4497 : 680454 : if (dump_enabled_p ())
4498 : : {
4499 : 3014 : dump_printf_loc (MSG_NOTE, vect_location,
4500 : : "Final SLP tree for instance %p:\n",
4501 : : (void *) new_instance);
4502 : 3014 : vect_print_slp_graph (MSG_NOTE, vect_location,
4503 : : SLP_INSTANCE_TREE (new_instance));
4504 : : }
4505 : :
4506 : 680454 : return true;
4507 : : }
4508 : : }
4509 : : /* Failed to SLP. */
4510 : :
4511 : : /* Try to break the group up into pieces. */
4512 : 409930 : if (*limit > 0 && kind == slp_inst_kind_store)
4513 : : {
4514 : : /* ??? We could delay all the actual splitting of store-groups
4515 : : until after SLP discovery of the original group completed.
4516 : : Then we can recurse to vect_build_slp_instance directly. */
4517 : 1070621 : for (i = 0; i < group_size; i++)
4518 : 1070621 : if (!matches[i])
4519 : : break;
4520 : :
4521 : : /* For basic block SLP, try to break the group up into multiples of
4522 : : a vector size. */
4523 : 409929 : if (is_a <bb_vec_info> (vinfo)
4524 : 409929 : && (i > 1 && i < group_size))
4525 : : {
4526 : : /* Free the allocated memory. */
4527 : 155400 : scalar_stmts.release ();
4528 : :
4529 : 155400 : tree scalar_type
4530 : 155400 : = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
4531 : 310800 : tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
4532 : 155400 : 1 << floor_log2 (i));
4533 : 155400 : unsigned HOST_WIDE_INT const_nunits;
4534 : 155400 : if (vectype
4535 : 155400 : && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
4536 : : {
4537 : : /* Split into two groups at the first vector boundary. */
4538 : 155400 : gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
4539 : 155400 : unsigned group1_size = i & ~(const_nunits - 1);
4540 : :
4541 : 155400 : if (dump_enabled_p ())
4542 : 51 : dump_printf_loc (MSG_NOTE, vect_location,
4543 : : "Splitting SLP group at stmt %u\n", i);
4544 : 155400 : stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
4545 : : group1_size);
4546 : 155400 : bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
4547 : : kind, max_tree_size,
4548 : : limit, false);
4549 : : /* Split the rest at the failure point and possibly
4550 : : re-analyze the remaining matching part if it has
4551 : : at least two lanes. */
4552 : 155400 : if (group1_size < i
4553 : 5195 : && (i + 1 < group_size
4554 : 3039 : || i - group1_size > 1))
4555 : : {
4556 : 2182 : stmt_vec_info rest2 = rest;
4557 : 2182 : rest = vect_split_slp_store_group (rest, i - group1_size);
4558 : 2182 : if (i - group1_size > 1)
4559 : 59 : res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
4560 : : kind, max_tree_size,
4561 : : limit, false);
4562 : : }
4563 : : /* Re-analyze the non-matching tail if it has at least
4564 : : two lanes. */
4565 : 155400 : if (i + 1 < group_size)
4566 : 22045 : res |= vect_analyze_slp_instance (vinfo, bst_map,
4567 : : rest, kind, max_tree_size,
4568 : : limit, false);
4569 : 155400 : return res;
4570 : : }
4571 : : }
4572 : :
4573 : : /* For loop vectorization split the RHS into arbitrary pieces of
4574 : : size >= 1. */
4575 : 254529 : else if (is_a <loop_vec_info> (vinfo)
4576 : 254529 : && (group_size != 1 && i < group_size))
4577 : : {
4578 : 6683 : gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
4579 : 27 : bool masked_p = call
4580 : 27 : && gimple_call_internal_p (call)
4581 : 27 : && internal_fn_mask_index (gimple_call_internal_fn (call)) != -1;
4582 : : /* There are targets that cannot do even/odd interleaving schemes
4583 : : so they absolutely need to use load/store-lanes. For now
4584 : : force single-lane SLP for them - they would be happy with
4585 : : uniform power-of-two lanes (but depending on element size),
4586 : : but even if we can use 'i' as indicator we would need to
4587 : : backtrack when later lanes fail to discover with the same
4588 : : granularity. We cannot turn any of strided or scatter store
4589 : : into store-lanes. */
4590 : : /* ??? If this is not in sync with what get_load_store_type
4591 : : later decides the SLP representation is not good for other
4592 : : store vectorization methods. */
4593 : 6683 : bool want_store_lanes
4594 : 6683 : = (! STMT_VINFO_GATHER_SCATTER_P (stmt_info)
4595 : 6683 : && ! STMT_VINFO_STRIDED_P (stmt_info)
4596 : 4955 : && ! STMT_VINFO_SLP_VECT_ONLY (stmt_info)
4597 : 4952 : && compare_step_with_zero (vinfo, stmt_info) > 0
4598 : 11612 : && vect_slp_prefer_store_lanes_p (vinfo, stmt_info, NULL_TREE,
4599 : 13366 : masked_p, group_size, i));
4600 : 6683 : if (want_store_lanes || force_single_lane)
4601 : : i = 1;
4602 : :
4603 : : /* A fatal discovery fail doesn't always mean single-lane SLP
4604 : : isn't a possibility, so try. */
4605 : 5223 : if (i == 0)
4606 : : i = 1;
4607 : :
4608 : 6683 : if (dump_enabled_p ())
4609 : 866 : dump_printf_loc (MSG_NOTE, vect_location,
4610 : : "Splitting SLP group at stmt %u\n", i);
4611 : :
4612 : : /* Analyze the stored values and pinch them together with
4613 : : a permute node so we can preserve the whole store group. */
4614 : 6683 : auto_vec<slp_tree> rhs_nodes;
4615 : 6683 : poly_uint64 max_nunits = 1;
4616 : :
4617 : 6683 : unsigned int rhs_common_nlanes = 0;
4618 : 6683 : unsigned int start = 0, end = i;
4619 : 29884 : while (start < group_size)
4620 : : {
4621 : 23467 : gcc_assert (end - start >= 1);
4622 : 23467 : vec<stmt_vec_info> substmts;
4623 : 23467 : substmts.create (end - start);
4624 : 70544 : for (unsigned j = start; j < end; ++j)
4625 : 47077 : substmts.quick_push (scalar_stmts[j]);
4626 : 23467 : max_nunits = 1;
4627 : 23467 : node = vect_build_slp_tree (vinfo, substmts, end - start,
4628 : : &max_nunits,
4629 : : matches, limit, &tree_size, bst_map);
4630 : 23467 : if (node)
4631 : : {
4632 : 18727 : rhs_nodes.safe_push (node);
4633 : 18727 : vect_update_max_nunits (&max_nunits, node->max_nunits);
4634 : 18727 : if (start == 0)
4635 : 6421 : rhs_common_nlanes = SLP_TREE_LANES (node);
4636 : 12306 : else if (rhs_common_nlanes != SLP_TREE_LANES (node))
4637 : 1337 : rhs_common_nlanes = 0;
4638 : 18727 : start = end;
4639 : 18727 : if (want_store_lanes || force_single_lane)
4640 : 4476 : end = start + 1;
4641 : : else
4642 : : end = group_size;
4643 : : }
4644 : : else
4645 : : {
4646 : 4740 : substmts.release ();
4647 : 4740 : if (end - start == 1)
4648 : : {
4649 : :                       /* Single-lane discovery failed.  Free resources.  */
4650 : 280 : for (auto node : rhs_nodes)
4651 : 6 : vect_free_slp_tree (node);
4652 : 266 : scalar_stmts.release ();
4653 : 266 : if (dump_enabled_p ())
4654 : 38 : dump_printf_loc (MSG_NOTE, vect_location,
4655 : : "SLP discovery failed\n");
4656 : 266 : return false;
4657 : : }
4658 : :
4659 : : /* ??? It really happens that we soft-fail SLP
4660 : : build at a mismatch but the matching part hard-fails
4661 : : later. As we know we arrived here with a group
4662 : : larger than one try a group of size one! */
4663 : 4474 : if (!matches[0])
4664 : 42 : end = start + 1;
4665 : : else
4666 : 9956 : for (unsigned j = start; j < end; j++)
4667 : 9956 : if (!matches[j - start])
4668 : : {
4669 : : end = j;
4670 : : break;
4671 : : }
4672 : : }
4673 : : }
4674 : :
4675 : : /* Now re-assess whether we want store lanes in case the
4676 : : discovery ended up producing all single-lane RHSs. */
4677 : 6417 : if (! want_store_lanes
4678 : 6417 : && rhs_common_nlanes == 1
4679 : 5485 : && ! STMT_VINFO_GATHER_SCATTER_P (stmt_info)
4680 : 5485 : && ! STMT_VINFO_STRIDED_P (stmt_info)
4681 : 4050 : && ! STMT_VINFO_SLP_VECT_ONLY (stmt_info)
4682 : 4047 : && compare_step_with_zero (vinfo, stmt_info) > 0
4683 : 10453 : && (vect_store_lanes_supported (SLP_TREE_VECTYPE (rhs_nodes[0]),
4684 : : group_size, masked_p)
4685 : : != IFN_LAST))
4686 : : want_store_lanes = true;
4687 : :
4688 : : /* Now we assume we can build the root SLP node from all stores. */
4689 : 6417 : if (want_store_lanes)
4690 : : {
4691 : : /* For store-lanes feed the store node with all RHS nodes
4692 : : in order. */
4693 : 0 : node = vect_create_new_slp_node (scalar_stmts,
4694 : 0 : SLP_TREE_CHILDREN
4695 : : (rhs_nodes[0]).length ());
4696 : 0 : SLP_TREE_VECTYPE (node) = SLP_TREE_VECTYPE (rhs_nodes[0]);
4697 : 0 : node->max_nunits = max_nunits;
4698 : 0 : node->ldst_lanes = true;
4699 : 0 : SLP_TREE_CHILDREN (node)
4700 : 0 : .reserve_exact (SLP_TREE_CHILDREN (rhs_nodes[0]).length ()
4701 : 0 : + rhs_nodes.length () - 1);
4702 : : /* First store value and possibly mask. */
4703 : 0 : SLP_TREE_CHILDREN (node)
4704 : 0 : .splice (SLP_TREE_CHILDREN (rhs_nodes[0]));
4705 : : /* Rest of the store values. All mask nodes are the same,
4706 : : this should be guaranteed by dataref group discovery. */
4707 : 0 : for (unsigned j = 1; j < rhs_nodes.length (); ++j)
4708 : 0 : SLP_TREE_CHILDREN (node)
4709 : 0 : .quick_push (SLP_TREE_CHILDREN (rhs_nodes[j])[0]);
4710 : 0 : for (slp_tree child : SLP_TREE_CHILDREN (node))
4711 : 0 : child->refcnt++;
4712 : : }
4713 : : else
4714 : 6417 : node = vect_build_slp_store_interleaving (rhs_nodes, scalar_stmts,
4715 : : max_nunits);
4716 : :
4717 : 25138 : while (!rhs_nodes.is_empty ())
4718 : 18721 : vect_free_slp_tree (rhs_nodes.pop ());
4719 : :
4720 : : /* Create a new SLP instance. */
4721 : 6417 : slp_instance new_instance = XNEW (class _slp_instance);
4722 : 6417 : SLP_INSTANCE_TREE (new_instance) = node;
4723 : 6417 : SLP_INSTANCE_LOADS (new_instance) = vNULL;
4724 : 6417 : SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
4725 : 6417 : SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
4726 : 6417 : SLP_INSTANCE_KIND (new_instance) = kind;
4727 : 6417 : new_instance->reduc_phis = NULL;
4728 : 6417 : new_instance->cost_vec = vNULL;
4729 : 6417 : new_instance->subgraph_entries = vNULL;
4730 : :
4731 : 6417 : if (dump_enabled_p ())
4732 : 828 : dump_printf_loc (MSG_NOTE, vect_location,
4733 : : "SLP size %u vs. limit %u.\n",
4734 : : tree_size, max_tree_size);
4735 : :
4736 : 6417 : vinfo->slp_instances.safe_push (new_instance);
4737 : :
4738 : : /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
4739 : : the number of scalar stmts in the root in a few places.
4740 : : Verify that assumption holds. */
4741 : 12834 : gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
4742 : : .length () == group_size);
4743 : :
4744 : 6417 : if (dump_enabled_p ())
4745 : : {
4746 : 828 : dump_printf_loc (MSG_NOTE, vect_location,
4747 : : "Final SLP tree for instance %p:\n",
4748 : : (void *) new_instance);
4749 : 828 : vect_print_slp_graph (MSG_NOTE, vect_location,
4750 : : SLP_INSTANCE_TREE (new_instance));
4751 : : }
4752 : 6417 : return true;
4753 : 6683 : }
4754 : : else
4755 : : /* Free the allocated memory. */
4756 : 247846 : scalar_stmts.release ();
4757 : :
4758 : : /* Even though the first vector did not all match, we might be able to SLP
4759 : : (some) of the remainder. FORNOW ignore this possibility. */
4760 : : }
4761 : : else
4762 : : /* Free the allocated memory. */
4763 : 1 : scalar_stmts.release ();
4764 : :
4765 : : /* Failed to SLP. */
4766 : 247847 : if (dump_enabled_p ())
4767 : 42 : dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
4768 : : return false;
4769 : : }
4770 : :
4771 : : /* qsort comparator ordering SLP load nodes. */
4772 : :
4773 : : static int
4774 : 2172750 : vllp_cmp (const void *a_, const void *b_)
4775 : : {
4776 : 2172750 : const slp_tree a = *(const slp_tree *)a_;
4777 : 2172750 : const slp_tree b = *(const slp_tree *)b_;
4778 : 2172750 : stmt_vec_info a0 = SLP_TREE_SCALAR_STMTS (a)[0];
4779 : 2172750 : stmt_vec_info b0 = SLP_TREE_SCALAR_STMTS (b)[0];
4780 : 2172750 : if (STMT_VINFO_GROUPED_ACCESS (a0)
4781 : 1339819 : && STMT_VINFO_GROUPED_ACCESS (b0)
4782 : 3459161 : && DR_GROUP_FIRST_ELEMENT (a0) == DR_GROUP_FIRST_ELEMENT (b0))
4783 : : {
4784 : : /* Same group, order after lanes used. */
4785 : 333502 : if (SLP_TREE_LANES (a) < SLP_TREE_LANES (b))
4786 : : return 1;
4787 : 320929 : else if (SLP_TREE_LANES (a) > SLP_TREE_LANES (b))
4788 : : return -1;
4789 : : else
4790 : : {
4791 : : /* Try to order loads using the same lanes together, breaking
4792 : : the tie with the lane number that first differs. */
4793 : 309366 : if (!SLP_TREE_LOAD_PERMUTATION (a).exists ()
4794 : 309366 : && !SLP_TREE_LOAD_PERMUTATION (b).exists ())
4795 : : return 0;
4796 : 309366 : else if (SLP_TREE_LOAD_PERMUTATION (a).exists ()
4797 : 309366 : && !SLP_TREE_LOAD_PERMUTATION (b).exists ())
4798 : : return 1;
4799 : 307919 : else if (!SLP_TREE_LOAD_PERMUTATION (a).exists ()
4800 : 307919 : && SLP_TREE_LOAD_PERMUTATION (b).exists ())
4801 : : return -1;
4802 : : else
4803 : : {
4804 : 306458 : for (unsigned i = 0; i < SLP_TREE_LANES (a); ++i)
4805 : 306458 : if (SLP_TREE_LOAD_PERMUTATION (a)[i]
4806 : 306458 : != SLP_TREE_LOAD_PERMUTATION (b)[i])
4807 : : {
4808 : : /* In-order lane first, that's what the above case for
4809 : : no permutation does. */
4810 : 306250 : if (SLP_TREE_LOAD_PERMUTATION (a)[i] == i)
4811 : : return -1;
4812 : 185596 : else if (SLP_TREE_LOAD_PERMUTATION (b)[i] == i)
4813 : : return 1;
4814 : 89836 : else if (SLP_TREE_LOAD_PERMUTATION (a)[i]
4815 : 89836 : < SLP_TREE_LOAD_PERMUTATION (b)[i])
4816 : : return -1;
4817 : : else
4818 : : return 1;
4819 : : }
4820 : : return 0;
4821 : : }
4822 : : }
4823 : : }
4824 : : else /* Different groups or non-groups. */
4825 : : {
4826 : : /* Order groups as their first element to keep them together. */
4827 : 1839248 : if (STMT_VINFO_GROUPED_ACCESS (a0))
4828 : 1839248 : a0 = DR_GROUP_FIRST_ELEMENT (a0);
4829 : 1839248 : if (STMT_VINFO_GROUPED_ACCESS (b0))
4830 : 1839248 : b0 = DR_GROUP_FIRST_ELEMENT (b0);
4831 : 1839248 : if (a0 == b0)
4832 : : return 0;
4833 : : /* Tie using UID. */
4834 : 1839236 : else if (gimple_uid (STMT_VINFO_STMT (a0))
4835 : 1839236 : < gimple_uid (STMT_VINFO_STMT (b0)))
4836 : : return -1;
4837 : : else
4838 : : {
4839 : 820786 : gcc_assert (gimple_uid (STMT_VINFO_STMT (a0))
4840 : : != gimple_uid (STMT_VINFO_STMT (b0)));
4841 : : return 1;
4842 : : }
4843 : : }
4844 : : }
4845 : :
4846 : : /* Process the set of LOADS that are all from the same dataref group. */
4847 : :
4848 : : static void
4849 : 135382 : vect_lower_load_permutations (loop_vec_info loop_vinfo,
4850 : : scalar_stmts_to_slp_tree_map_t *bst_map,
4851 : : const array_slice<slp_tree> &loads,
4852 : : bool force_single_lane)
4853 : : {
4854 : : /* We at this point want to lower without a fixed VF or vector
4855 : : size in mind which means we cannot actually compute whether we
4856 : : need three or more vectors for a load permutation yet. So always
4857 : : lower. */
4858 : 135382 : stmt_vec_info first
4859 : 135382 : = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (loads[0])[0]);
4860 : 135382 : unsigned group_lanes = DR_GROUP_SIZE (first);
4861 : :
4862 : : /* Verify if all load permutations can be implemented with a suitably
4863 : : large element load-lanes operation. */
4864 : 135382 : unsigned ld_lanes_lanes = SLP_TREE_LANES (loads[0]);
4865 : 135382 : if (STMT_VINFO_STRIDED_P (first)
4866 : 133763 : || compare_step_with_zero (loop_vinfo, first) <= 0
4867 : 131652 : || exact_log2 (ld_lanes_lanes) == -1
4868 : : /* ??? For now only support the single-lane case as there is
4869 : : missing support on the store-lane side and code generation
4870 : : isn't up to the task yet. */
4871 : 129809 : || ld_lanes_lanes != 1
4872 : 258001 : || vect_load_lanes_supported (SLP_TREE_VECTYPE (loads[0]),
4873 : : group_lanes / ld_lanes_lanes,
4874 : : false) == IFN_LAST)
4875 : : ld_lanes_lanes = 0;
4876 : : else
4877 : : /* Verify the loads access the same number of lanes aligned to
4878 : : ld_lanes_lanes. */
4879 : 0 : for (slp_tree load : loads)
4880 : : {
4881 : 0 : if (SLP_TREE_LANES (load) != ld_lanes_lanes)
4882 : : {
4883 : : ld_lanes_lanes = 0;
4884 : : break;
4885 : : }
4886 : 0 : unsigned first = SLP_TREE_LOAD_PERMUTATION (load)[0];
4887 : 0 : if (first % ld_lanes_lanes != 0)
4888 : : {
4889 : : ld_lanes_lanes = 0;
4890 : : break;
4891 : : }
4892 : 0 : for (unsigned i = 1; i < SLP_TREE_LANES (load); ++i)
4893 : : if (SLP_TREE_LOAD_PERMUTATION (load)[i] != first + i)
4894 : : {
4895 : : ld_lanes_lanes = 0;
4896 : : break;
4897 : : }
4898 : : }
4899 : :
4900 : : /* Only a power-of-two number of lanes matches interleaving with N levels.
4901 : : ??? An even number of lanes could be reduced to 1<<ceil_log2(N)-1 lanes
4902 : : at each step. */
4903 : 220030 : if (ld_lanes_lanes == 0 && exact_log2 (group_lanes) == -1 && group_lanes != 3)
4904 : : return;
4905 : :
4906 : 238783 : for (slp_tree load : loads)
4907 : : {
4908 : : /* Leave masked or gather loads alone for now. */
4909 : 162626 : if (!SLP_TREE_CHILDREN (load).is_empty ())
4910 : 43070 : continue;
4911 : :
4912 : : /* For single-element interleaving spanning multiple vectors avoid
4913 : : lowering, we want to use VMAT_ELEMENTWISE later. */
4914 : 162620 : if (ld_lanes_lanes == 0
4915 : 162620 : && SLP_TREE_LANES (load) == 1
4916 : 151710 : && !DR_GROUP_NEXT_ELEMENT (first)
4917 : 219833 : && maybe_gt (group_lanes,
4918 : : TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (load))))
4919 : 28967 : return;
4920 : :
4921 : : /* We want to pattern-match special cases here and keep those
4922 : : alone. Candidates are splats and load-lane. */
4923 : :
4924 : : /* We need to lower only loads of less than half of the groups
4925 : : lanes, including duplicate lanes. Note this leaves nodes
4926 : : with a non-1:1 load permutation around instead of canonicalizing
4927 : : those into a load and a permute node. Removing this early
4928 : : check would do such canonicalization. */
4929 : 133653 : if (SLP_TREE_LANES (load) >= (group_lanes + 1) / 2
4930 : 39637 : && ld_lanes_lanes == 0)
4931 : 39637 : continue;
4932 : :
4933 : : /* Build the permute to get the original load permutation order. */
4934 : 94016 : bool contiguous = true;
4935 : 94016 : lane_permutation_t final_perm;
4936 : 94016 : final_perm.create (SLP_TREE_LANES (load));
4937 : 188574 : for (unsigned i = 0; i < SLP_TREE_LANES (load); ++i)
4938 : : {
4939 : 94558 : final_perm.quick_push
4940 : 94558 : (std::make_pair (0, SLP_TREE_LOAD_PERMUTATION (load)[i]));
4941 : 94558 : if (i != 0
4942 : 94558 : && (SLP_TREE_LOAD_PERMUTATION (load)[i]
4943 : 542 : != SLP_TREE_LOAD_PERMUTATION (load)[i-1] + 1))
4944 : : contiguous = false;
4945 : : }
4946 : :
4947 : : /* When the load permutation accesses a contiguous unpermuted,
4948 : : power-of-two aligned and sized chunk leave the load alone.
4949 : : We can likely (re-)load it more efficiently rather than
4950 : : extracting it from the larger load.
4951 : : ??? Long-term some of the lowering should move to where
4952 : : the vector types involved are fixed. */
4953 : 97443 : if (!force_single_lane
4954 : 94016 : && ld_lanes_lanes == 0
4955 : 58758 : && contiguous
4956 : 58549 : && (SLP_TREE_LANES (load) > 1 || loads.size () == 1)
4957 : 6552 : && pow2p_hwi (SLP_TREE_LANES (load))
4958 : 6552 : && pow2p_hwi (group_lanes)
4959 : 3427 : && SLP_TREE_LOAD_PERMUTATION (load)[0] % SLP_TREE_LANES (load) == 0
4960 : 97443 : && group_lanes % SLP_TREE_LANES (load) == 0)
4961 : : {
4962 : 3427 : final_perm.release ();
4963 : 3427 : continue;
4964 : : }
4965 : :
4966 : : /* First build (and possibly re-use) a load node for the
4967 : : unpermuted group. Gaps in the middle and on the end are
4968 : : represented with NULL stmts. */
4969 : 90589 : vec<stmt_vec_info> stmts;
4970 : 90589 : stmts.create (group_lanes);
4971 : 305480 : for (stmt_vec_info s = first; s; s = DR_GROUP_NEXT_ELEMENT (s))
4972 : : {
4973 : 214891 : if (s != first)
4974 : 128214 : for (unsigned i = 1; i < DR_GROUP_GAP (s); ++i)
4975 : 3912 : stmts.quick_push (NULL);
4976 : 214891 : stmts.quick_push (s);
4977 : : }
4978 : 171211 : for (unsigned i = 0; i < DR_GROUP_GAP (first); ++i)
4979 : 80622 : stmts.quick_push (NULL);
4980 : 90589 : poly_uint64 max_nunits = 1;
4981 : 90589 : bool *matches = XALLOCAVEC (bool, group_lanes);
4982 : 90589 : unsigned limit = 1;
4983 : 90589 : unsigned tree_size = 0;
4984 : 90589 : slp_tree l0 = vect_build_slp_tree (loop_vinfo, stmts,
4985 : : group_lanes,
4986 : : &max_nunits, matches, &limit,
4987 : 90589 : &tree_size, bst_map);
4988 : 90589 : gcc_assert (!SLP_TREE_LOAD_PERMUTATION (l0).exists ());
4989 : :
4990 : 90589 : if (ld_lanes_lanes != 0)
4991 : : {
4992 : : /* ??? If this is not in sync with what get_load_store_type
4993 : : later decides the SLP representation is not good for other
4994 : : store vectorization methods. */
4995 : 0 : l0->ldst_lanes = true;
4996 : 0 : load->ldst_lanes = true;
4997 : : }
4998 : :
4999 : 279061 : while (1)
5000 : : {
5001 : 184825 : unsigned group_lanes = SLP_TREE_LANES (l0);
5002 : 184825 : if (ld_lanes_lanes != 0
5003 : 184825 : || SLP_TREE_LANES (load) >= (group_lanes + 1) / 2)
5004 : : break;
5005 : :
5006 : : /* Try to lower by reducing the group to half its size using an
5007 : : interleaving scheme. For this try to compute whether all
5008 : : elements needed for this load are in even or odd elements of
5009 : : an even/odd decomposition with N consecutive elements.
5010 : : Thus { e, e, o, o, e, e, o, o } woud be an even/odd decomposition
5011 : : with N == 2. */
5012 : : /* ??? Only an even number of lanes can be handed this way, but the
5013 : : fallback below could work for any number. We have to make sure
5014 : : to round up in that case. */
5015 : 94236 : gcc_assert ((group_lanes & 1) == 0 || group_lanes == 3);
5016 : 9857 : unsigned even = 0, odd = 0;
5017 : 9857 : if ((group_lanes & 1) == 0)
5018 : : {
5019 : 9857 : even = (1 << ceil_log2 (group_lanes)) - 1;
5020 : 9857 : odd = even;
5021 : 39843 : for (auto l : final_perm)
5022 : : {
5023 : 10272 : even &= ~l.second;
5024 : 10272 : odd &= l.second;
5025 : : }
5026 : : }
5027 : :
5028 : : /* Now build an even or odd extraction from the unpermuted load. */
5029 : 94236 : lane_permutation_t perm;
5030 : 94236 : perm.create ((group_lanes + 1) / 2);
5031 : 94236 : unsigned even_level = even ? 1 << ctz_hwi (even) : 0;
5032 : 94236 : unsigned odd_level = odd ? 1 << ctz_hwi (odd) : 0;
5033 : 94236 : if (even_level
5034 : 9207 : && group_lanes % (2 * even_level) == 0
5035 : : /* ??? When code generating permutes we do not try to pun
5036 : : to larger component modes so level != 1 isn't a natural
5037 : : even/odd extract. Prefer one if possible. */
5038 : 9207 : && (even_level == 1 || !odd_level || odd_level != 1))
5039 : : {
5040 : : /* { 0, 1, ... 4, 5 ..., } */
5041 : 31757 : for (unsigned i = 0; i < group_lanes / 2 / even_level; ++i)
5042 : 49054 : for (unsigned j = 0; j < even_level; ++j)
5043 : 24584 : perm.quick_push (std::make_pair (0, 2 * i * even_level + j));
5044 : : }
5045 : 85029 : else if (odd_level)
5046 : : {
5047 : : /* { ..., 2, 3, ... 6, 7 } */
5048 : 2570 : gcc_assert (group_lanes % (2 * odd_level) == 0);
5049 : 11404 : for (unsigned i = 0; i < group_lanes / 2 / odd_level; ++i)
5050 : 17668 : for (unsigned j = 0; j < odd_level; ++j)
5051 : 8834 : perm.quick_push
5052 : 8834 : (std::make_pair (0, (2 * i + 1) * odd_level + j));
5053 : : }
5054 : : else
5055 : : {
5056 : : /* As fallback extract all used lanes and fill to half the
5057 : : group size by repeating the last element.
5058 : : ??? This is quite a bad strathegy for re-use - we could
5059 : : brute force our way to find more optimal filling lanes to
5060 : : maximize re-use when looking at all loads from the group. */
5061 : 84379 : auto_bitmap l;
5062 : 337516 : for (auto p : final_perm)
5063 : 84379 : bitmap_set_bit (l, p.second);
5064 : 84379 : unsigned i = 0;
5065 : 84379 : bitmap_iterator bi;
5066 : 168758 : EXECUTE_IF_SET_IN_BITMAP (l, 0, i, bi)
5067 : 84379 : perm.quick_push (std::make_pair (0, i));
5068 : 337516 : while (perm.length () < (group_lanes + 1) / 2)
5069 : 84379 : perm.quick_push (perm.last ());
5070 : 84379 : }
5071 : :
5072 : : /* Update final_perm with the intermediate permute. */
5073 : 188887 : for (unsigned i = 0; i < final_perm.length (); ++i)
5074 : : {
5075 : 94651 : unsigned l = final_perm[i].second;
5076 : 94651 : unsigned j;
5077 : 101045 : for (j = 0; j < perm.length (); ++j)
5078 : 101045 : if (perm[j].second == l)
5079 : : {
5080 : 94651 : final_perm[i].second = j;
5081 : 94651 : break;
5082 : : }
5083 : 94651 : gcc_assert (j < perm.length ());
5084 : : }
5085 : :
5086 : : /* And create scalar stmts. */
5087 : 94236 : vec<stmt_vec_info> perm_stmts;
5088 : 94236 : perm_stmts.create (perm.length ());
5089 : 296412 : for (unsigned i = 0; i < perm.length (); ++i)
5090 : 202176 : perm_stmts.quick_push (SLP_TREE_SCALAR_STMTS (l0)[perm[i].second]);
5091 : :
5092 : 94236 : slp_tree p = vect_create_new_slp_node (1, VEC_PERM_EXPR);
5093 : 94236 : SLP_TREE_CHILDREN (p).quick_push (l0);
5094 : 94236 : SLP_TREE_LANE_PERMUTATION (p) = perm;
5095 : 94236 : SLP_TREE_VECTYPE (p) = SLP_TREE_VECTYPE (load);
5096 : 94236 : SLP_TREE_LANES (p) = perm.length ();
5097 : 94236 : SLP_TREE_REPRESENTATIVE (p) = SLP_TREE_REPRESENTATIVE (load);
5098 : : /* ??? As we have scalar stmts for this intermediate permute we
5099 : : could CSE it via bst_map but we do not want to pick up
5100 : : another SLP node with a load permutation. We instead should
5101 : : have a "local" CSE map here. */
5102 : 94236 : SLP_TREE_SCALAR_STMTS (p) = perm_stmts;
5103 : :
5104 : : /* We now have a node for (group_lanes + 1) / 2 lanes. */
5105 : 94236 : l0 = p;
5106 : 94236 : }
5107 : :
5108 : : /* And finally from the ordered reduction node create the
5109 : : permute to shuffle the lanes into the original load-permutation
5110 : : order. We replace the original load node with this. */
5111 : 90589 : SLP_TREE_CODE (load) = VEC_PERM_EXPR;
5112 : 90589 : SLP_TREE_LOAD_PERMUTATION (load).release ();
5113 : 90589 : SLP_TREE_LANE_PERMUTATION (load) = final_perm;
5114 : 90589 : SLP_TREE_CHILDREN (load).create (1);
5115 : 90589 : SLP_TREE_CHILDREN (load).quick_push (l0);
5116 : : }
5117 : : }
5118 : :
5119 : : /* Transform SLP loads in the SLP graph created by SLP discovery to
5120 : : group loads from the same group and lower load permutations that
5121 : : are unlikely to be supported into a series of permutes.
5122 : : In the degenerate case of having only single-lane SLP instances
5123 : : this should result in a series of permute nodes emulating an
5124 : : interleaving scheme. */
5125 : :
5126 : : static void
5127 : 380554 : vect_lower_load_permutations (loop_vec_info loop_vinfo,
5128 : : scalar_stmts_to_slp_tree_map_t *bst_map,
5129 : : bool force_single_lane)
5130 : : {
5131 : : /* Gather and sort loads across all instances. */
5132 : 380554 : hash_set<slp_tree> visited;
5133 : 380554 : auto_vec<slp_tree> loads;
5134 : 2089948 : for (auto inst : loop_vinfo->slp_instances)
5135 : 950280 : vect_gather_slp_loads (loads, SLP_INSTANCE_TREE (inst), visited);
5136 : 380554 : if (loads.is_empty ())
5137 : 68438 : return;
5138 : 312116 : loads.qsort (vllp_cmp);
5139 : :
5140 : : /* Now process each dataref group separately. */
5141 : 312116 : unsigned firsti = 0;
5142 : 587332 : for (unsigned i = 1; i < loads.length (); ++i)
5143 : : {
5144 : 275216 : slp_tree first = loads[firsti];
5145 : 275216 : slp_tree next = loads[i];
5146 : 275216 : stmt_vec_info a0 = SLP_TREE_SCALAR_STMTS (first)[0];
5147 : 275216 : stmt_vec_info b0 = SLP_TREE_SCALAR_STMTS (next)[0];
5148 : 275216 : if (STMT_VINFO_GROUPED_ACCESS (a0)
5149 : 140220 : && STMT_VINFO_GROUPED_ACCESS (b0)
5150 : 403449 : && DR_GROUP_FIRST_ELEMENT (a0) == DR_GROUP_FIRST_ELEMENT (b0))
5151 : 62605 : continue;
5152 : : /* Now we have one or multiple SLP loads of the same group from
5153 : : firsti to i - 1. */
5154 : 212611 : if (STMT_VINFO_GROUPED_ACCESS (a0))
5155 : 77615 : vect_lower_load_permutations (loop_vinfo, bst_map,
5156 : 77615 : make_array_slice (&loads[firsti],
5157 : : i - firsti),
5158 : : force_single_lane);
5159 : : firsti = i;
5160 : : }
5161 : 624232 : if (firsti < loads.length ()
5162 : 624232 : && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (loads[firsti])[0]))
5163 : 57767 : vect_lower_load_permutations (loop_vinfo, bst_map,
5164 : 57767 : make_array_slice (&loads[firsti],
5165 : 57767 : loads.length () - firsti),
5166 : : force_single_lane);
5167 : 380554 : }
5168 : :
5169 : : /* Check if there are stmts in the loop can be vectorized using SLP. Build SLP
5170 : : trees of packed scalar stmts if SLP is possible. */
5171 : :
5172 : : opt_result
5173 : 1045474 : vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size,
5174 : : bool force_single_lane)
5175 : : {
5176 : 1045474 : loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
5177 : 1045474 : unsigned int i;
5178 : 1045474 : stmt_vec_info first_element;
5179 : 1045474 : slp_instance instance;
5180 : :
5181 : 1045474 : DUMP_VECT_SCOPE ("vect_analyze_slp");
5182 : :
5183 : 1045474 : unsigned limit = max_tree_size;
5184 : :
5185 : 1045474 : scalar_stmts_to_slp_tree_map_t *bst_map
5186 : 1045474 : = new scalar_stmts_to_slp_tree_map_t ();
5187 : :
5188 : : /* Find SLP sequences starting from groups of grouped stores. */
5189 : 3003577 : FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
5190 : 912903 : if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
5191 : : slp_inst_kind_store, max_tree_size, &limit,
5192 : : force_single_lane)
5193 : 912903 : && loop_vinfo)
5194 : 274 : return opt_result::failure_at (vect_location, "SLP build failed.\n");
5195 : :
5196 : : /* For loops also start SLP discovery from non-grouped stores. */
5197 : 1045200 : if (loop_vinfo)
5198 : : {
5199 : : data_reference_p dr;
5200 : 1332639 : FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr)
5201 : 929665 : if (DR_IS_WRITE (dr))
5202 : : {
5203 : 289708 : stmt_vec_info stmt_info = vinfo->lookup_dr (dr)->stmt;
5204 : : /* Grouped stores are already handled above. */
5205 : 289708 : if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
5206 : 72932 : continue;
5207 : 216776 : vec<stmt_vec_info> stmts;
5208 : 216776 : vec<stmt_vec_info> roots = vNULL;
5209 : 216776 : vec<tree> remain = vNULL;
5210 : 216776 : stmts.create (1);
5211 : 216776 : stmts.quick_push (stmt_info);
5212 : 216776 : if (! vect_build_slp_instance (vinfo, slp_inst_kind_store,
5213 : : stmts, roots, remain, max_tree_size,
5214 : : &limit, bst_map, force_single_lane))
5215 : 5406 : return opt_result::failure_at (vect_location,
5216 : : "SLP build failed.\n");
5217 : : }
5218 : :
5219 : : stmt_vec_info stmt_info;
5220 : 403014 : FOR_EACH_VEC_ELT (LOOP_VINFO_ALTERNATE_DEFS (loop_vinfo), i, stmt_info)
5221 : : {
5222 : 20 : vec<stmt_vec_info> stmts;
5223 : 20 : vec<stmt_vec_info> roots = vNULL;
5224 : 20 : vec<tree> remain = vNULL;
5225 : 20 : stmts.create (1);
5226 : 20 : stmts.quick_push (stmt_info);
5227 : 20 : if (! vect_build_slp_instance (vinfo, slp_inst_kind_store,
5228 : : stmts, roots, remain, max_tree_size,
5229 : : &limit, bst_map, force_single_lane))
5230 : 0 : return opt_result::failure_at (vect_location,
5231 : : "SLP build failed.\n");
5232 : : }
5233 : : }
5234 : :
5235 : 1039794 : if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
5236 : : {
5237 : 1846668 : for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
5238 : : {
5239 : 1209848 : vect_location = bb_vinfo->roots[i].roots[0]->stmt;
5240 : : /* Apply patterns. */
5241 : 3781922 : for (unsigned j = 0; j < bb_vinfo->roots[i].stmts.length (); ++j)
5242 : 5144148 : bb_vinfo->roots[i].stmts[j]
5243 : 2645331 : = vect_stmt_to_vectorize (bb_vinfo->roots[i].stmts[j]);
5244 : 1209848 : if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
5245 : 1209848 : bb_vinfo->roots[i].stmts,
5246 : 1209848 : bb_vinfo->roots[i].roots,
5247 : 1209848 : bb_vinfo->roots[i].remain,
5248 : : max_tree_size, &limit, bst_map, false))
5249 : : {
5250 : 130950 : bb_vinfo->roots[i].stmts = vNULL;
5251 : 130950 : bb_vinfo->roots[i].roots = vNULL;
5252 : 130950 : bb_vinfo->roots[i].remain = vNULL;
5253 : : }
5254 : : }
5255 : : }
5256 : :
5257 : 1039794 : if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
5258 : : {
5259 : : /* Find SLP sequences starting from reduction chains. */
5260 : 406013 : FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
5261 : 3053 : if (! STMT_VINFO_RELEVANT_P (first_element)
5262 : 0 : && ! STMT_VINFO_LIVE_P (first_element))
5263 : : ;
5264 : 3053 : else if (force_single_lane
5265 : 3053 : || ! vect_analyze_slp_reduc_chain (vinfo, bst_map,
5266 : : first_element,
5267 : : max_tree_size, &limit))
5268 : : {
5269 : 238 : if (dump_enabled_p ())
5270 : 55 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5271 : : "SLP discovery of reduction chain failed\n");
5272 : : /* Dissolve reduction chain group. */
5273 : 238 : stmt_vec_info vinfo = first_element;
5274 : 238 : stmt_vec_info last = NULL;
5275 : 814 : while (vinfo)
5276 : : {
5277 : 576 : stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
5278 : 576 : REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
5279 : 576 : REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
5280 : 576 : last = vinfo;
5281 : 576 : vinfo = next;
5282 : : }
5283 : 238 : STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
5284 : : /* ??? When there's a conversion around the reduction
5285 : : chain 'last' isn't the entry of the reduction. */
5286 : 238 : if (STMT_VINFO_DEF_TYPE (last) != vect_reduction_def)
5287 : 14 : return opt_result::failure_at (vect_location,
5288 : : "SLP build failed.\n");
5289 : : /* It can be still vectorized as part of an SLP reduction. */
5290 : 224 : loop_vinfo->reductions.safe_push (last);
5291 : : }
5292 : :
5293 : : /* Find SLP sequences starting from groups of reductions. */
5294 : 402960 : if (loop_vinfo->reductions.length () > 0)
5295 : : {
5296 : : /* Collect reduction statements we can combine into
5297 : : a SLP reduction. */
5298 : 50003 : vec<stmt_vec_info> scalar_stmts;
5299 : 50003 : scalar_stmts.create (loop_vinfo->reductions.length ());
5300 : 215080 : for (auto next_info : loop_vinfo->reductions)
5301 : : {
5302 : 65071 : next_info = vect_stmt_to_vectorize (next_info);
5303 : 65071 : if ((STMT_VINFO_RELEVANT_P (next_info)
5304 : 4 : || STMT_VINFO_LIVE_P (next_info))
5305 : : /* ??? Make sure we didn't skip a conversion around a
5306 : : reduction path. In that case we'd have to reverse
5307 : : engineer that conversion stmt following the chain using
5308 : : reduc_idx and from the PHI using reduc_def. */
5309 : 65067 : && (STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def
5310 : 65067 : || (STMT_VINFO_DEF_TYPE (next_info)
5311 : : == vect_double_reduction_def)))
5312 : : {
5313 : : /* Do not discover SLP reductions combining lane-reducing
5314 : : ops, that will fail later. */
5315 : 65067 : if (!force_single_lane
5316 : 65067 : && !lane_reducing_stmt_p (STMT_VINFO_STMT (next_info)))
5317 : 49229 : scalar_stmts.quick_push (next_info);
5318 : : else
5319 : : {
5320 : : /* Do SLP discovery for single-lane reductions. */
5321 : 15838 : vec<stmt_vec_info> stmts;
5322 : 15838 : vec<stmt_vec_info> roots = vNULL;
5323 : 15838 : vec<tree> remain = vNULL;
5324 : 15838 : stmts.create (1);
5325 : 15838 : stmts.quick_push (next_info);
5326 : 15838 : if (! vect_build_slp_instance (vinfo,
5327 : : slp_inst_kind_reduc_group,
5328 : : stmts, roots, remain,
5329 : : max_tree_size, &limit,
5330 : : bst_map,
5331 : : force_single_lane))
5332 : 0 : return opt_result::failure_at (vect_location,
5333 : : "SLP build failed.\n");
5334 : : }
5335 : : }
5336 : : }
5337 : : /* Save for re-processing on failure. */
5338 : 50003 : vec<stmt_vec_info> saved_stmts = scalar_stmts.copy ();
5339 : 50003 : vec<stmt_vec_info> roots = vNULL;
5340 : 50003 : vec<tree> remain = vNULL;
5341 : 50003 : if (scalar_stmts.length () <= 1
5342 : 50003 : || !vect_build_slp_instance (loop_vinfo,
5343 : : slp_inst_kind_reduc_group,
5344 : : scalar_stmts, roots, remain,
5345 : : max_tree_size, &limit, bst_map,
5346 : : force_single_lane))
5347 : : {
5348 : 49689 : if (scalar_stmts.length () <= 1)
5349 : 49689 : scalar_stmts.release ();
5350 : : /* Do SLP discovery for single-lane reductions. */
5351 : 169967 : for (auto stmt_info : saved_stmts)
5352 : : {
5353 : 48140 : vec<stmt_vec_info> stmts;
5354 : 48140 : vec<stmt_vec_info> roots = vNULL;
5355 : 48140 : vec<tree> remain = vNULL;
5356 : 48140 : stmts.create (1);
5357 : 48140 : stmts.quick_push (vect_stmt_to_vectorize (stmt_info));
5358 : 48140 : if (! vect_build_slp_instance (vinfo,
5359 : : slp_inst_kind_reduc_group,
5360 : : stmts, roots, remain,
5361 : : max_tree_size, &limit,
5362 : : bst_map, force_single_lane))
5363 : 2328 : return opt_result::failure_at (vect_location,
5364 : : "SLP build failed.\n");
5365 : : }
5366 : : }
5367 : 47675 : saved_stmts.release ();
5368 : : }
5369 : :
5370 : : /* Make sure to vectorize only-live stmts, usually inductions. */
5371 : 1849738 : for (edge e : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
5372 : 1188369 : for (auto gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi);
5373 : 533419 : gsi_next (&gsi))
5374 : : {
5375 : 540527 : gphi *lc_phi = *gsi;
5376 : 540527 : tree def = gimple_phi_arg_def_from_edge (lc_phi, e);
5377 : 540527 : stmt_vec_info stmt_info;
5378 : 540527 : if (TREE_CODE (def) == SSA_NAME
5379 : 446081 : && !virtual_operand_p (def)
5380 : 212604 : && (stmt_info = loop_vinfo->lookup_def (def))
5381 : 192936 : && ((stmt_info = vect_stmt_to_vectorize (stmt_info)), true)
5382 : 192936 : && STMT_VINFO_RELEVANT (stmt_info) == vect_used_only_live
5383 : 147766 : && STMT_VINFO_LIVE_P (stmt_info)
5384 : 147766 : && !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info))
5385 : 623278 : && STMT_VINFO_REDUC_IDX (stmt_info) == -1)
5386 : : {
5387 : 82680 : vec<stmt_vec_info> stmts;
5388 : 82680 : vec<stmt_vec_info> roots = vNULL;
5389 : 82680 : vec<tree> remain = vNULL;
5390 : 82680 : stmts.create (1);
5391 : 82680 : stmts.quick_push (vect_stmt_to_vectorize (stmt_info));
5392 : 82680 : if (! vect_build_slp_instance (vinfo,
5393 : : slp_inst_kind_reduc_group,
5394 : : stmts, roots, remain,
5395 : : max_tree_size, &limit,
5396 : : bst_map, force_single_lane))
5397 : 7108 : return opt_result::failure_at (vect_location,
5398 : : "SLP build failed.\n");
5399 : : }
5400 : 7108 : }
5401 : :
5402 : : /* Find SLP sequences starting from gconds. */
5403 : 1023149 : for (auto cond : LOOP_VINFO_LOOP_CONDS (loop_vinfo))
5404 : : {
5405 : 249779 : auto cond_info = loop_vinfo->lookup_stmt (cond);
5406 : :
5407 : 249779 : cond_info = vect_stmt_to_vectorize (cond_info);
5408 : 249779 : vec<stmt_vec_info> roots = vNULL;
5409 : 249779 : roots.safe_push (cond_info);
5410 : 249779 : gimple *stmt = STMT_VINFO_STMT (cond_info);
5411 : 249779 : tree args0 = gimple_cond_lhs (stmt);
5412 : 249779 : tree args1 = gimple_cond_rhs (stmt);
5413 : :
5414 : : /* These should be enforced by cond lowering, but if it failed
5415 : : bail. */
5416 : 249779 : if (gimple_cond_code (stmt) != NE_EXPR
5417 : 248504 : || TREE_TYPE (args0) != boolean_type_node
5418 : 497232 : || !integer_zerop (args1))
5419 : : {
5420 : 2326 : roots.release ();
5421 : 2326 : return opt_result::failure_at (vect_location,
5422 : : "SLP build failed.\n");
5423 : : }
5424 : :
5425 : : /* An argument without a loop def will be codegened from vectorizing the
5426 : : root gcond itself. As such we don't need to try to build an SLP tree
5427 : : from them. It's highly likely that the resulting SLP tree here if both
5428 : : arguments have a def will be incompatible, but we rely on it being split
5429 : : later on. */
5430 : 247453 : auto varg = loop_vinfo->lookup_def (args0);
5431 : 247453 : vec<stmt_vec_info> stmts;
5432 : 247453 : vec<tree> remain = vNULL;
5433 : 247453 : stmts.create (1);
5434 : 247453 : stmts.quick_push (vect_stmt_to_vectorize (varg));
5435 : :
5436 : 247453 : if (! vect_build_slp_instance (vinfo, slp_inst_kind_gcond,
5437 : : stmts, roots, remain,
5438 : : max_tree_size, &limit,
5439 : : bst_map, force_single_lane))
5440 : : {
5441 : 2330 : roots.release ();
5442 : 2330 : return opt_result::failure_at (vect_location,
5443 : : "SLP build failed.\n");
5444 : : }
5445 : : }
5446 : :
5447 : : /* Find and create slp instances for inductions that have been forced
5448 : : live due to early break. */
5449 : 388868 : edge latch_e = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
5450 : 1098924 : for (auto stmt_info : LOOP_VINFO_EARLY_BREAKS_LIVE_IVS (loop_vinfo))
5451 : : {
5452 : 343180 : vec<stmt_vec_info> stmts;
5453 : 343180 : vec<stmt_vec_info> roots = vNULL;
5454 : 343180 : vec<tree> remain = vNULL;
5455 : 343180 : gphi *phi = as_a<gphi *> (STMT_VINFO_STMT (stmt_info));
5456 : 343180 : tree def = gimple_phi_arg_def_from_edge (phi, latch_e);
5457 : 343180 : stmt_vec_info lc_info = loop_vinfo->lookup_def (def);
5458 : 343180 : if (lc_info)
5459 : : {
5460 : 343180 : stmts.create (1);
5461 : 343210 : stmts.quick_push (vect_stmt_to_vectorize (lc_info));
5462 : 343180 : if (! vect_build_slp_instance (vinfo, slp_inst_kind_reduc_group,
5463 : : stmts, roots, remain,
5464 : : max_tree_size, &limit,
5465 : : bst_map, force_single_lane))
5466 : 8314 : return opt_result::failure_at (vect_location,
5467 : : "SLP build failed.\n");
5468 : : }
5469 : : /* When the latch def is from a different cycle this can only
5470 : : be a induction. Build a simple instance for this.
5471 : : ??? We should be able to start discovery from the PHI
5472 : : for all inductions, but then there will be stray
5473 : : non-SLP stmts we choke on as needing non-SLP handling. */
5474 : 334866 : auto_vec<stmt_vec_info, 1> tem;
5475 : 334866 : tem.quick_push (stmt_info);
5476 : 334866 : if (!bst_map->get (tem))
5477 : : {
5478 : 10294 : stmts.create (1);
5479 : 10294 : stmts.quick_push (stmt_info);
5480 : 10294 : if (! vect_build_slp_instance (vinfo, slp_inst_kind_reduc_group,
5481 : : stmts, roots, remain,
5482 : : max_tree_size, &limit,
5483 : : bst_map, force_single_lane))
5484 : 0 : return opt_result::failure_at (vect_location,
5485 : : "SLP build failed.\n");
5486 : : }
5487 : 334866 : }
5488 : : }
5489 : :
5490 : 1017374 : hash_set<slp_tree> visited_patterns;
5491 : 1017374 : slp_tree_to_load_perm_map_t perm_cache;
5492 : 1017374 : slp_compat_nodes_map_t compat_cache;
5493 : :
5494 : : /* See if any patterns can be found in the SLP tree. */
5495 : 1017374 : bool pattern_found = false;
5496 : 3781181 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
5497 : 1746433 : pattern_found |= vect_match_slp_patterns (instance, vinfo,
5498 : : &visited_patterns, &perm_cache,
5499 : : &compat_cache);
5500 : :
5501 : : /* If any were found optimize permutations of loads. */
5502 : 1017374 : if (pattern_found)
5503 : : {
5504 : 201 : hash_map<slp_tree, slp_tree> load_map;
5505 : 3478 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
5506 : : {
5507 : 3076 : slp_tree root = SLP_INSTANCE_TREE (instance);
5508 : 3076 : optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
5509 : : &load_map, root);
5510 : : }
5511 : 201 : }
5512 : :
5513 : : /* Check whether we should force some SLP instances to use load/store-lanes
5514 : : and do so by forcing SLP re-discovery with single lanes. We used
5515 : : to cancel SLP when this applied to all instances in a loop but now
5516 : : we decide this per SLP instance. It's important to do this only
5517 : : after SLP pattern recognition. */
5518 : 1017374 : if (is_a <loop_vec_info> (vinfo))
5519 : 1330834 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
5520 : 950280 : if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
5521 : 231152 : && !SLP_INSTANCE_TREE (instance)->ldst_lanes)
5522 : : {
5523 : 231152 : slp_tree slp_root = SLP_INSTANCE_TREE (instance);
5524 : 231152 : unsigned int group_size = SLP_TREE_LANES (slp_root);
5525 : 231152 : tree vectype = SLP_TREE_VECTYPE (slp_root);
5526 : :
5527 : 231152 : stmt_vec_info rep_info = SLP_TREE_REPRESENTATIVE (slp_root);
5528 : 231152 : gimple *rep = STMT_VINFO_STMT (rep_info);
5529 : 231152 : bool masked = (is_gimple_call (rep)
5530 : 1379 : && gimple_call_internal_p (rep)
5531 : 232511 : && internal_fn_mask_index
5532 : 1359 : (gimple_call_internal_fn (rep)) != -1);
5533 : 231132 : if (!STMT_VINFO_GROUPED_ACCESS (rep_info)
5534 : 21642 : || slp_root->ldst_lanes
5535 : 252794 : || (vect_store_lanes_supported (vectype, group_size, masked)
5536 : : == IFN_LAST))
5537 : 231152 : continue;
5538 : :
5539 : 0 : auto_vec<slp_tree> loads;
5540 : 0 : hash_set<slp_tree> visited;
5541 : 0 : vect_gather_slp_loads (loads, slp_root, visited);
5542 : :
5543 : : /* Check whether any load in the SLP instance is possibly
5544 : : permuted. */
5545 : 0 : bool loads_permuted = false;
5546 : 0 : slp_tree load_node;
5547 : 0 : unsigned j;
5548 : 0 : FOR_EACH_VEC_ELT (loads, j, load_node)
5549 : : {
5550 : 0 : if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
5551 : 0 : continue;
5552 : : unsigned k;
5553 : : stmt_vec_info load_info;
5554 : 0 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), k, load_info)
5555 : 0 : if (SLP_TREE_LOAD_PERMUTATION (load_node)[k] != k)
5556 : : {
5557 : : loads_permuted = true;
5558 : : break;
5559 : : }
5560 : : }
5561 : :
5562 : : /* If the loads and stores can use load/store-lanes force re-discovery
5563 : : with single lanes. */
5564 : 0 : if (loads_permuted)
5565 : : {
5566 : 0 : bool can_use_lanes = true;
5567 : : bool prefer_load_lanes = false;
5568 : 0 : FOR_EACH_VEC_ELT (loads, j, load_node)
5569 : 0 : if (STMT_VINFO_GROUPED_ACCESS
5570 : : (SLP_TREE_REPRESENTATIVE (load_node)))
5571 : : {
5572 : 0 : stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
5573 : : (SLP_TREE_REPRESENTATIVE (load_node));
5574 : 0 : rep = STMT_VINFO_STMT (stmt_vinfo);
5575 : 0 : masked = (is_gimple_call (rep)
5576 : 0 : && gimple_call_internal_p (rep)
5577 : 0 : && internal_fn_mask_index
5578 : 0 : (gimple_call_internal_fn (rep)));
5579 : : /* Use SLP for strided accesses (or if we can't
5580 : : load-lanes). */
5581 : 0 : if (STMT_VINFO_STRIDED_P (stmt_vinfo)
5582 : 0 : || compare_step_with_zero (vinfo, stmt_vinfo) <= 0
5583 : 0 : || vect_load_lanes_supported
5584 : 0 : (SLP_TREE_VECTYPE (load_node),
5585 : 0 : DR_GROUP_SIZE (stmt_vinfo), masked) == IFN_LAST
5586 : : /* ??? During SLP re-discovery with a single lane
5587 : : a masked grouped load will appear permuted and
5588 : : discovery will fail. We have to rework this
5589 : : on the discovery side - for now avoid ICEing. */
5590 : 0 : || masked)
5591 : : {
5592 : : can_use_lanes = false;
5593 : : break;
5594 : : }
5595 : : /* Make sure that the target would prefer store-lanes
5596 : : for at least one of the loads.
5597 : :
5598 : : ??? Perhaps we should instead require this for
5599 : : all loads? */
5600 : 0 : prefer_load_lanes
5601 : : = (prefer_load_lanes
5602 : 0 : || SLP_TREE_LANES (load_node) == group_size
5603 : 0 : || (vect_slp_prefer_store_lanes_p
5604 : 0 : (vinfo, stmt_vinfo,
5605 : : SLP_TREE_VECTYPE (load_node), masked,
5606 : : group_size, SLP_TREE_LANES (load_node))));
5607 : : }
5608 : :
5609 : 0 : if (can_use_lanes && prefer_load_lanes)
5610 : : {
5611 : 0 : if (dump_enabled_p ())
5612 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
5613 : : "SLP instance %p can use load/store-lanes,"
5614 : : " re-discovering with single-lanes\n",
5615 : : (void *) instance);
5616 : :
5617 : 0 : stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (slp_root);
5618 : :
5619 : 0 : vect_free_slp_instance (instance);
5620 : 0 : limit = max_tree_size;
5621 : 0 : bool res = vect_analyze_slp_instance (vinfo, bst_map,
5622 : : stmt_info,
5623 : : slp_inst_kind_store,
5624 : : max_tree_size, &limit,
5625 : : true);
5626 : 0 : gcc_assert (res);
5627 : 0 : auto new_inst = LOOP_VINFO_SLP_INSTANCES (vinfo).pop ();
5628 : 0 : LOOP_VINFO_SLP_INSTANCES (vinfo)[i] = new_inst;
5629 : : }
5630 : : }
5631 : 0 : }
5632 : :
5633 : : /* When we end up with load permutations that we cannot possibly handle,
5634 : : like those requiring three vector inputs, lower them using interleaving
5635 : : like schemes. */
5636 : 1017374 : if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
5637 : : {
5638 : 380554 : vect_lower_load_permutations (loop_vinfo, bst_map, force_single_lane);
5639 : 380554 : if (dump_enabled_p ())
5640 : : {
5641 : 18255 : dump_printf_loc (MSG_NOTE, vect_location,
5642 : : "SLP graph after lowering permutations:\n");
5643 : 18255 : hash_set<slp_tree> visited;
5644 : 85336 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
5645 : 30595 : vect_print_slp_graph (MSG_NOTE, vect_location,
5646 : : SLP_INSTANCE_TREE (instance), visited);
5647 : 18255 : }
5648 : : }
5649 : :
5650 : 1017374 : release_scalar_stmts_to_slp_tree_map (bst_map);
5651 : :
5652 : 1017374 : if (pattern_found && dump_enabled_p ())
5653 : : {
5654 : 23 : dump_printf_loc (MSG_NOTE, vect_location,
5655 : : "Pattern matched SLP tree\n");
5656 : 23 : hash_set<slp_tree> visited;
5657 : 143 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
5658 : 74 : vect_print_slp_graph (MSG_NOTE, vect_location,
5659 : : SLP_INSTANCE_TREE (instance), visited);
5660 : 23 : }
5661 : :
5662 : 1017374 : return opt_result::success ();
5663 : 1017374 : }
5664 : :
5665 : : /* Estimates the cost of inserting layout changes into the SLP graph.
5666 : : It can also say that the insertion is impossible. */
5667 : :
struct slpg_layout_cost
{
  slpg_layout_cost () = default;
  slpg_layout_cost (sreal, bool);

  /* Return a cost that marks a combination as unusable.  Encoded as the
     maximum representable depth, which no real cost can reach.  */
  static slpg_layout_cost impossible () { return { sreal::max (), 0 }; }
  /* Return true unless this cost came from impossible ().  */
  bool is_possible () const { return depth != sreal::max (); }

  bool operator== (const slpg_layout_cost &) const;
  bool operator!= (const slpg_layout_cost &) const;

  bool is_better_than (const slpg_layout_cost &, bool) const;

  void add_parallel_cost (const slpg_layout_cost &);
  void add_serial_cost (const slpg_layout_cost &);
  void split (unsigned int);

  /* The longest sequence of layout changes needed during any traversal
     of the partition dag, weighted by execution frequency.

     This is the most important metric when optimizing for speed, since
     it helps to ensure that we keep the number of operations on
     critical paths to a minimum.  */
  sreal depth = 0;

  /* An estimate of the total number of operations needed.  It is weighted by
     execution frequency when optimizing for speed but not when optimizing for
     size.  In order to avoid double-counting, a node with a fanout of N will
     distribute 1/N of its total cost to each successor.

     This is the most important metric when optimizing for size, since
     it helps to keep the total number of operations to a minimum.  */
  sreal total = 0;
};
5702 : :
5703 : : /* Construct costs for a node with weight WEIGHT. A higher weight
5704 : : indicates more frequent execution. IS_FOR_SIZE is true if we are
5705 : : optimizing for size rather than speed. */
5706 : :
5707 : 1224014 : slpg_layout_cost::slpg_layout_cost (sreal weight, bool is_for_size)
5708 : 1224941 : : depth (weight), total (is_for_size && weight > 0 ? 1 : weight)
5709 : : {
5710 : 1224014 : }
5711 : :
5712 : : bool
5713 : 0 : slpg_layout_cost::operator== (const slpg_layout_cost &other) const
5714 : : {
5715 : 0 : return depth == other.depth && total == other.total;
5716 : : }
5717 : :
5718 : : bool
5719 : 0 : slpg_layout_cost::operator!= (const slpg_layout_cost &other) const
5720 : : {
5721 : 0 : return !operator== (other);
5722 : : }
5723 : :
5724 : : /* Return true if these costs are better than OTHER. IS_FOR_SIZE is
5725 : : true if we are optimizing for size rather than speed. */
5726 : :
5727 : : bool
5728 : 309828 : slpg_layout_cost::is_better_than (const slpg_layout_cost &other,
5729 : : bool is_for_size) const
5730 : : {
5731 : 309828 : if (is_for_size)
5732 : : {
5733 : 413 : if (total != other.total)
5734 : 177 : return total < other.total;
5735 : 236 : return depth < other.depth;
5736 : : }
5737 : : else
5738 : : {
5739 : 309415 : if (depth != other.depth)
5740 : 139583 : return depth < other.depth;
5741 : 169832 : return total < other.total;
5742 : : }
5743 : : }
5744 : :
5745 : : /* Increase the costs to account for something with cost INPUT_COST
5746 : : happening in parallel with the current costs. */
5747 : :
5748 : : void
5749 : 354559 : slpg_layout_cost::add_parallel_cost (const slpg_layout_cost &input_cost)
5750 : : {
5751 : 354559 : depth = std::max (depth, input_cost.depth);
5752 : 354559 : total += input_cost.total;
5753 : 354559 : }
5754 : :
5755 : : /* Increase the costs to account for something with cost INPUT_COST
5756 : : happening in series with the current costs. */
5757 : :
5758 : : void
5759 : 1487684 : slpg_layout_cost::add_serial_cost (const slpg_layout_cost &other)
5760 : : {
5761 : 1487684 : depth += other.depth;
5762 : 1487684 : total += other.total;
5763 : 1487684 : }
5764 : :
5765 : : /* Split the total cost among TIMES successors or predecessors. */
5766 : :
5767 : : void
5768 : 1191599 : slpg_layout_cost::split (unsigned int times)
5769 : : {
5770 : 1191599 : if (times > 1)
5771 : 459149 : total /= times;
5772 : 1191599 : }
5773 : :
5774 : : /* Information about one node in the SLP graph, for use during
5775 : : vect_optimize_slp_pass. */
5776 : :
struct slpg_vertex
{
  /* Wrap NODE_; every other field keeps its in-class default.  */
  slpg_vertex (slp_tree node_) : node (node_) {}

  /* The node itself.  */
  slp_tree node;

  /* Which partition the node belongs to, or -1 if none.  Nodes outside of
     partitions are flexible; they can have whichever layout consumers
     want them to have.  */
  int partition = -1;

  /* The number of nodes that directly use the result of this one
     (i.e. the number of nodes that count this one as a child).  */
  unsigned int out_degree = 0;

  /* The execution frequency of the node.  */
  sreal weight = 0;

  /* The total execution frequency of all nodes that directly use the
     result of this one.  */
  sreal out_weight = 0;
};
5800 : :
5801 : : /* Information about one partition of the SLP graph, for use during
5802 : : vect_optimize_slp_pass. */
5803 : :
struct slpg_partition_info
{
  /* The nodes in the partition occupy indices [NODE_BEGIN, NODE_END)
     of m_partitioned_nodes.  */
  unsigned int node_begin = 0;
  unsigned int node_end = 0;

  /* Which layout we've chosen to use for this partition, or -1 if
     we haven't picked one yet.  (NOTE(review): this appears to index
     vect_optimize_slp_pass::m_perms, with 0 meaning "no change" —
     confirm against the layout-selection code.)  */
  int layout = -1;

  /* The number of predecessors and successors in the partition dag.
     The predecessors always have lower partition numbers and the
     successors always have higher partition numbers.

     Note that the directions of these edges are not necessarily the
     same as in the data flow graph.  For example, if an SCC has separate
     partitions for an inner loop and an outer loop, the inner loop's
     partition will have at least two incoming edges from the outer loop's
     partition: one for a live-in value and one for a live-out value.
     In data flow terms, one of these edges would also be from the outer loop
     to the inner loop, but the other would be in the opposite direction.  */
  unsigned int in_degree = 0;
  unsigned int out_degree = 0;
};
5829 : :
5830 : : /* Information about the costs of using a particular layout for a
5831 : : particular partition. It can also say that the combination is
5832 : : impossible. */
5833 : :
struct slpg_partition_layout_costs
{
  /* Return true if this <partition, layout> combination is usable.
     Only INTERNAL_COST carries the "impossible" marker, so checking
     it alone is sufficient.  */
  bool is_possible () const { return internal_cost.is_possible (); }
  /* Mark this <partition, layout> combination as unusable.  */
  void mark_impossible () { internal_cost = slpg_layout_cost::impossible (); }

  /* The costs inherited from predecessor partitions.  */
  slpg_layout_cost in_cost;

  /* The inherent cost of the layout within the node itself.  For example,
     this is nonzero for a load if choosing a particular layout would require
     the load to permute the loaded elements.  It is nonzero for a
     VEC_PERM_EXPR if the permutation cannot be eliminated or converted
     to full-vector moves.  */
  slpg_layout_cost internal_cost;

  /* The costs inherited from successor partitions.  */
  slpg_layout_cost out_cost;
};
5852 : :
5853 : : /* This class tries to optimize the layout of vectors in order to avoid
5854 : : unnecessary shuffling. At the moment, the set of possible layouts are
5855 : : restricted to bijective permutations.
5856 : :
5857 : : The goal of the pass depends on whether we're optimizing for size or
5858 : : for speed. When optimizing for size, the goal is to reduce the overall
5859 : : number of layout changes (including layout changes implied by things
5860 : : like load permutations). When optimizing for speed, the goal is to
5861 : : reduce the maximum latency attributable to layout changes on any
5862 : : non-cyclical path through the data flow graph.
5863 : :
5864 : : For example, when optimizing a loop nest for speed, we will prefer
5865 : : to make layout changes outside of a loop rather than inside of a loop,
5866 : : and will prefer to make layout changes in parallel rather than serially,
5867 : : even if that increases the overall number of layout changes.
5868 : :
5869 : : The high-level procedure is:
5870 : :
5871 : : (1) Build a graph in which edges go from uses (parents) to definitions
5872 : : (children).
5873 : :
5874 : : (2) Divide the graph into a dag of strongly-connected components (SCCs).
5875 : :
5876 : : (3) When optimizing for speed, partition the nodes in each SCC based
5877 : : on their containing cfg loop. When optimizing for size, treat
5878 : : each SCC as a single partition.
5879 : :
5880 : : This gives us a dag of partitions. The goal is now to assign a
5881 : : layout to each partition.
5882 : :
5883 : : (4) Construct a set of vector layouts that are worth considering.
5884 : : Record which nodes must keep their current layout.
5885 : :
5886 : : (5) Perform a forward walk over the partition dag (from loads to stores)
5887 : : accumulating the "forward" cost of using each layout. When visiting
5888 : : each partition, assign a tentative choice of layout to the partition
5889 : : and use that choice when calculating the cost of using a different
5890 : : layout in successor partitions.
5891 : :
5892 : : (6) Perform a backward walk over the partition dag (from stores to loads),
5893 : : accumulating the "backward" cost of using each layout. When visiting
5894 : : each partition, make a final choice of layout for that partition based
5895 : : on the accumulated forward costs (from (5)) and backward costs
5896 : : (from (6)).
5897 : :
5898 : : (7) Apply the chosen layouts to the SLP graph.
5899 : :
5900 : : For example, consider the SLP statements:
5901 : :
5902 : : S1: a_1 = load
5903 : : loop:
5904 : : S2: a_2 = PHI<a_1, a_3>
5905 : : S3: b_1 = load
5906 : : S4: a_3 = a_2 + b_1
5907 : : exit:
5908 : : S5: a_4 = PHI<a_3>
5909 : : S6: store a_4
5910 : :
5911 : : S2 and S4 form an SCC and are part of the same loop. Every other
5912 : : statement is in a singleton SCC. In this example there is a one-to-one
5913 : : mapping between SCCs and partitions and the partition dag looks like this;
5914 : :
5915 : : S1 S3
5916 : : \ /
5917 : : S2+S4
5918 : : |
5919 : : S5
5920 : : |
5921 : : S6
5922 : :
5923 : : S2, S3 and S4 will have a higher execution frequency than the other
5924 : : statements, so when optimizing for speed, the goal is to avoid any
5925 : : layout changes:
5926 : :
5927 : : - within S3
5928 : : - within S2+S4
5929 : : - on the S3->S2+S4 edge
5930 : :
5931 : : For example, if S3 was originally a reversing load, the goal of the
5932 : : pass is to make it an unreversed load and change the layout on the
5933 : : S1->S2+S4 and S2+S4->S5 edges to compensate. (Changing the layout
5934 : : on S1->S2+S4 and S5->S6 would also be acceptable.)
5935 : :
5936 : : The difference between SCCs and partitions becomes important if we
5937 : : add an outer loop:
5938 : :
5939 : : S1: a_1 = ...
5940 : : loop1:
5941 : : S2: a_2 = PHI<a_1, a_6>
5942 : : S3: b_1 = load
5943 : : S4: a_3 = a_2 + b_1
5944 : : loop2:
5945 : : S5: a_4 = PHI<a_3, a_5>
5946 : : S6: c_1 = load
5947 : : S7: a_5 = a_4 + c_1
5948 : : exit2:
5949 : : S8: a_6 = PHI<a_5>
5950 : : S9: store a_6
5951 : : exit1:
5952 : :
5953 : : Here, S2, S4, S5, S7 and S8 form a single SCC. However, when optimizing
5954 : : for speed, we usually do not want restrictions in the outer loop to "infect"
5955 : : the decision for the inner loop. For example, if an outer-loop node
5956 : : in the SCC contains a statement with a fixed layout, that should not
5957 : : prevent the inner loop from using a different layout. Conversely,
5958 : : the inner loop should not dictate a layout to the outer loop: if the
5959 : : outer loop does a lot of computation, then it may not be efficient to
5960 : : do all of that computation in the inner loop's preferred layout.
5961 : :
5962 : : So when optimizing for speed, we partition the SCC into S2+S4+S8 (outer)
5963 : : and S5+S7 (inner). We also try to arrange partitions so that:
5964 : :
5965 : : - the partition for an outer loop comes before the partition for
5966 : : an inner loop
5967 : :
5968 : : - if a sibling loop A dominates a sibling loop B, A's partition
5969 : : comes before B's
5970 : :
5971 : : This gives the following partition dag for the example above:
5972 : :
5973 : : S1 S3
5974 : : \ /
5975 : : S2+S4+S8 S6
5976 : : | \\ /
5977 : : | S5+S7
5978 : : |
5979 : : S9
5980 : :
5981 : : There are two edges from S2+S4+S8 to S5+S7: one for the edge S4->S5 and
5982 : : one for a reversal of the edge S7->S8.
5983 : :
5984 : : The backward walk picks a layout for S5+S7 before S2+S4+S8. The choice
5985 : : for S2+S4+S8 therefore has to balance the cost of using the outer loop's
5986 : : preferred layout against the cost of changing the layout on entry to the
5987 : : inner loop (S4->S5) and on exit from the inner loop (S7->S8 reversed).
5988 : :
5989 : : Although this works well when optimizing for speed, it has the downside
5990 : : when optimizing for size that the choice of layout for S5+S7 is completely
5991 : : independent of S9, which lessens the chance of reducing the overall number
5992 : : of permutations. We therefore do not partition SCCs when optimizing
5993 : : for size.
5994 : :
5995 : : To give a concrete example of the difference between optimizing
5996 : : for size and speed, consider:
5997 : :
5998 : : a[0] = (b[1] << c[3]) - d[1];
5999 : : a[1] = (b[0] << c[2]) - d[0];
6000 : : a[2] = (b[3] << c[1]) - d[3];
6001 : : a[3] = (b[2] << c[0]) - d[2];
6002 : :
6003 : : There are three different layouts here: one for a, one for b and d,
6004 : : and one for c. When optimizing for speed it is better to permute each
6005 : : of b, c and d into the order required by a, since those permutations
6006 : : happen in parallel. But when optimizing for size, it is better to:
6007 : :
6008 : : - permute c into the same order as b
6009 : : - do the arithmetic
6010 : : - permute the result into the order required by a
6011 : :
6012 : : This gives 2 permutations rather than 3. */
6013 : :
class vect_optimize_slp_pass
{
public:
  /* Prepare to optimize the SLP graph recorded in VINFO.  */
  vect_optimize_slp_pass (vec_info *vinfo) : m_vinfo (vinfo) {}
  /* Run the pass: build the graph, choose layouts, and apply them
     (see the class-level comment above for the full procedure).  */
  void run ();

private:
  /* Graph building.  */
  struct loop *containing_loop (slp_tree);
  bool is_cfg_latch_edge (graph_edge *);
  void build_vertices (hash_set<slp_tree> &, slp_tree);
  void build_vertices ();
  void build_graph ();

  /* Partitioning.  */
  void create_partitions ();
  template<typename T> void for_each_partition_edge (unsigned int, T);

  /* Layout selection.  */
  bool is_compatible_layout (slp_tree, unsigned int);
  int change_layout_cost (slp_tree, unsigned int, unsigned int);
  slpg_partition_layout_costs &partition_layout_costs (unsigned int,
						       unsigned int);
  void change_vec_perm_layout (slp_tree, lane_permutation_t &,
			       int, unsigned int);
  int internal_node_cost (slp_tree, int, unsigned int);
  void start_choosing_layouts ();

  /* Cost propagation.  */
  slpg_layout_cost edge_layout_cost (graph_edge *, unsigned int,
				     unsigned int, unsigned int);
  slpg_layout_cost total_in_cost (unsigned int);
  slpg_layout_cost forward_cost (graph_edge *, unsigned int, unsigned int);
  slpg_layout_cost backward_cost (graph_edge *, unsigned int, unsigned int);
  void forward_pass ();
  void backward_pass ();

  /* Rematerialization.  */
  slp_tree get_result_with_layout (slp_tree, unsigned int);
  void materialize ();

  /* Clean-up.  */
  void remove_redundant_permutations ();

  /* Masked load lanes discovery.  */
  void decide_masked_load_lanes ();

  void dump ();

  vec_info *m_vinfo;

  /* True if we should optimize the graph for size, false if we should
     optimize it for speed.  (It wouldn't be easy to make this decision
     more locally.)  */
  bool m_optimize_size;

  /* A graph of all SLP nodes, with edges leading from uses to definitions.
     In other words, a node's predecessors are its slp_tree parents and
     a node's successors are its slp_tree children.  */
  graph *m_slpg = nullptr;

  /* The vertices of M_SLPG, indexed by slp_tree::vertex.  */
  auto_vec<slpg_vertex> m_vertices;

  /* The list of all leaves of M_SLPG, such as external definitions, constants,
     and loads.  */
  auto_vec<int> m_leafs;

  /* This array has one entry for every vector layout that we're considering.
     Element 0 is null and indicates "no change".  Other entries describe
     permutations that are inherent in the current graph and that we would
     like to reverse if possible.

     For example, a permutation { 1, 2, 3, 0 } means that something has
     effectively been permuted in that way, such as a load group
     { a[1], a[2], a[3], a[0] } (viewed as a permutation of a[0:3]).
     We'd then like to apply the reverse permutation { 3, 0, 1, 2 }
     in order to put things "back" in order.  */
  auto_vec<vec<unsigned> > m_perms;

  /* A partitioning of the nodes for which a layout must be chosen.
     Each partition represents an <SCC, cfg loop> pair; that is,
     nodes in different SCCs belong to different partitions, and nodes
     within an SCC can be further partitioned according to a containing
     cfg loop.  Partition <SCC1, L1> comes before <SCC2, L2> if:

     - SCC1 != SCC2 and SCC1 is a predecessor of SCC2 in a forward walk
       from leaves (such as loads) to roots (such as stores).

     - SCC1 == SCC2 and L1's header strictly dominates L2's header.  */
  auto_vec<slpg_partition_info> m_partitions;

  /* The list of all nodes for which a layout must be chosen.  Nodes for
     partition P come before the nodes for partition P+1.  Nodes within a
     partition are in reverse postorder.  */
  auto_vec<unsigned int> m_partitioned_nodes;

  /* Index P * num-layouts + L contains the cost of using layout L
     for partition P.  */
  auto_vec<slpg_partition_layout_costs> m_partition_layout_costs;

  /* Index N * num-layouts + L, if nonnull, is a node that provides the
     original output of node N adjusted to have layout L.  */
  auto_vec<slp_tree> m_node_layouts;
};
6119 : :
6120 : : /* Fill the vertices and leafs vector with all nodes in the SLP graph.
6121 : : Also record whether we should optimize anything for speed rather
6122 : : than size. */
6123 : :
6124 : : void
6125 : 11324626 : vect_optimize_slp_pass::build_vertices (hash_set<slp_tree> &visited,
6126 : : slp_tree node)
6127 : : {
6128 : 11324626 : unsigned i;
6129 : 11324626 : slp_tree child;
6130 : :
6131 : 11324626 : if (visited.add (node))
6132 : 11324626 : return;
6133 : :
6134 : 10538246 : if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
6135 : : {
6136 : 7925319 : basic_block bb = gimple_bb (vect_orig_stmt (rep)->stmt);
6137 : 7212299 : if (optimize_bb_for_speed_p (bb))
6138 : 7094303 : m_optimize_size = false;
6139 : : }
6140 : :
6141 : 10538246 : node->vertex = m_vertices.length ();
6142 : 10538246 : m_vertices.safe_push (slpg_vertex (node));
6143 : :
6144 : 10538246 : bool leaf = true;
6145 : 10538246 : bool force_leaf = false;
6146 : 20101200 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6147 : 9562954 : if (child)
6148 : : {
6149 : 7993072 : leaf = false;
6150 : 7993072 : build_vertices (visited, child);
6151 : : }
6152 : : else
6153 : : force_leaf = true;
6154 : : /* Since SLP discovery works along use-def edges all cycles have an
6155 : : entry - but there's the exception of cycles where we do not handle
6156 : : the entry explicitely (but with a NULL SLP node), like some reductions
6157 : : and inductions. Force those SLP PHIs to act as leafs to make them
6158 : : backwards reachable. */
6159 : 10538246 : if (leaf || force_leaf)
6160 : 5396748 : m_leafs.safe_push (node->vertex);
6161 : : }
6162 : :
6163 : : /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
6164 : :
6165 : : void
6166 : 1253444 : vect_optimize_slp_pass::build_vertices ()
6167 : : {
6168 : 1253444 : hash_set<slp_tree> visited;
6169 : 1253444 : unsigned i;
6170 : 1253444 : slp_instance instance;
6171 : 1253444 : m_vertices.truncate (0);
6172 : 1253444 : m_leafs.truncate (0);
6173 : 7091886 : FOR_EACH_VEC_ELT (m_vinfo->slp_instances, i, instance)
6174 : 3331554 : build_vertices (visited, SLP_INSTANCE_TREE (instance));
6175 : 1253444 : }
6176 : :
6177 : : /* Apply (reverse) bijectite PERM to VEC. */
6178 : :
6179 : : template <class T>
6180 : : static void
6181 : 184217 : vect_slp_permute (vec<unsigned> perm,
6182 : : vec<T> &vec, bool reverse)
6183 : : {
6184 : 184217 : auto_vec<T, 64> saved;
6185 : 184217 : saved.create (vec.length ());
6186 : 608979 : for (unsigned i = 0; i < vec.length (); ++i)
6187 : 424762 : saved.quick_push (vec[i]);
6188 : :
6189 : 184217 : if (reverse)
6190 : : {
6191 : 1207828 : for (unsigned i = 0; i < vec.length (); ++i)
6192 : 423094 : vec[perm[i]] = saved[i];
6193 : 606537 : for (unsigned i = 0; i < vec.length (); ++i)
6194 : 737231 : gcc_assert (vec[perm[i]] == saved[i]);
6195 : : }
6196 : : else
6197 : : {
6198 : 4884 : for (unsigned i = 0; i < vec.length (); ++i)
6199 : 1668 : vec[i] = saved[perm[i]];
6200 : 185885 : for (unsigned i = 0; i < vec.length (); ++i)
6201 : 2502 : gcc_assert (vec[i] == saved[perm[i]]);
6202 : : }
6203 : 184217 : }
6204 : :
6205 : : /* Return the cfg loop that contains NODE. */
6206 : :
6207 : : struct loop *
6208 : 3881754 : vect_optimize_slp_pass::containing_loop (slp_tree node)
6209 : : {
6210 : 3881754 : stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
6211 : 3881754 : if (!rep)
6212 : 4335 : return ENTRY_BLOCK_PTR_FOR_FN (cfun)->loop_father;
6213 : 4242559 : return gimple_bb (vect_orig_stmt (rep)->stmt)->loop_father;
6214 : : }
6215 : :
6216 : : /* Return true if UD (an edge from a use to a definition) is associated
6217 : : with a loop latch edge in the cfg. */
6218 : :
6219 : : bool
6220 : 7993072 : vect_optimize_slp_pass::is_cfg_latch_edge (graph_edge *ud)
6221 : : {
6222 : 7993072 : slp_tree use = m_vertices[ud->src].node;
6223 : 7993072 : slp_tree def = m_vertices[ud->dest].node;
6224 : 7993072 : if ((SLP_TREE_DEF_TYPE (use) != vect_internal_def
6225 : 7993072 : || SLP_TREE_PERMUTE_P (use))
6226 : 7624120 : || SLP_TREE_DEF_TYPE (def) != vect_internal_def)
6227 : : return false;
6228 : :
6229 : 4308408 : stmt_vec_info use_rep = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (use));
6230 : 4308408 : return (is_a<gphi *> (use_rep->stmt)
6231 : 309346 : && bb_loop_header_p (gimple_bb (use_rep->stmt))
6232 : 4456768 : && containing_loop (def) == containing_loop (use));
6233 : : }
6234 : :
6235 : : /* Build the graph. Mark edges that correspond to cfg loop latch edges with
6236 : : a nonnull data field. */
6237 : :
6238 : : void
6239 : 1253444 : vect_optimize_slp_pass::build_graph ()
6240 : : {
6241 : 1253444 : m_optimize_size = true;
6242 : 1253444 : build_vertices ();
6243 : :
6244 : 2506888 : m_slpg = new_graph (m_vertices.length ());
6245 : 14298578 : for (slpg_vertex &v : m_vertices)
6246 : 32072120 : for (slp_tree child : SLP_TREE_CHILDREN (v.node))
6247 : 9562954 : if (child)
6248 : : {
6249 : 7993072 : graph_edge *ud = add_edge (m_slpg, v.node->vertex, child->vertex);
6250 : 7993072 : if (is_cfg_latch_edge (ud))
6251 : 140154 : ud->data = this;
6252 : : }
6253 : 1253444 : }
6254 : :
6255 : : /* Return true if E corresponds to a loop latch edge in the cfg. */
6256 : :
6257 : : static bool
6258 : 4066254 : skip_cfg_latch_edges (graph_edge *e)
6259 : : {
6260 : 4066254 : return e->data;
6261 : : }
6262 : :
6263 : : /* Create the node partitions. */
6264 : :
6265 : : void
6266 : 626722 : vect_optimize_slp_pass::create_partitions ()
6267 : : {
6268 : : /* Calculate a postorder of the graph, ignoring edges that correspond
6269 : : to natural latch edges in the cfg. Reading the vector from the end
6270 : : to the beginning gives the reverse postorder. */
6271 : 626722 : auto_vec<int> initial_rpo;
6272 : 1253444 : graphds_dfs (m_slpg, &m_leafs[0], m_leafs.length (), &initial_rpo,
6273 : : false, NULL, skip_cfg_latch_edges);
6274 : 1880166 : gcc_assert (initial_rpo.length () == m_vertices.length ());
6275 : :
6276 : : /* Calculate the strongly connected components of the graph. */
6277 : 626722 : auto_vec<int> scc_grouping;
6278 : 626722 : unsigned int num_sccs = graphds_scc (m_slpg, NULL, NULL, &scc_grouping);
6279 : :
6280 : : /* Create a new index order in which all nodes from the same SCC are
6281 : : consecutive. Use scc_pos to record the index of the first node in
6282 : : each SCC. */
6283 : 626722 : auto_vec<unsigned int> scc_pos (num_sccs);
6284 : 626722 : int last_component = -1;
6285 : 626722 : unsigned int node_count = 0;
6286 : 7148932 : for (unsigned int node_i : scc_grouping)
6287 : : {
6288 : 5268766 : if (last_component != m_slpg->vertices[node_i].component)
6289 : : {
6290 : 5189484 : last_component = m_slpg->vertices[node_i].component;
6291 : 10378968 : gcc_assert (last_component == int (scc_pos.length ()));
6292 : 5189484 : scc_pos.quick_push (node_count);
6293 : : }
6294 : 5268766 : node_count += 1;
6295 : : }
6296 : 1253444 : gcc_assert (node_count == initial_rpo.length ()
6297 : : && last_component + 1 == int (num_sccs));
6298 : :
6299 : : /* Use m_partitioned_nodes to group nodes into SCC order, with the nodes
6300 : : inside each SCC following the RPO we calculated above. The fact that
6301 : : we ignored natural latch edges when calculating the RPO should ensure
6302 : : that, for natural loop nests:
6303 : :
6304 : : - the first node that we encounter in a cfg loop is the loop header phi
6305 : : - the loop header phis are in dominance order
6306 : :
6307 : : Arranging for this is an optimization (see below) rather than a
6308 : : correctness issue. Unnatural loops with a tangled mess of backedges
6309 : : will still work correctly, but might give poorer results.
6310 : :
6311 : : Also update scc_pos so that it gives 1 + the index of the last node
6312 : : in the SCC. */
6313 : 626722 : m_partitioned_nodes.safe_grow (node_count);
6314 : 6522210 : for (unsigned int old_i = initial_rpo.length (); old_i-- > 0;)
6315 : : {
6316 : 5268766 : unsigned int node_i = initial_rpo[old_i];
6317 : 5268766 : unsigned int new_i = scc_pos[m_slpg->vertices[node_i].component]++;
6318 : 5268766 : m_partitioned_nodes[new_i] = node_i;
6319 : : }
6320 : :
6321 : : /* When optimizing for speed, partition each SCC based on the containing
6322 : : cfg loop. The order we constructed above should ensure that, for natural
6323 : : cfg loops, we'll create sub-SCC partitions for outer loops before
6324 : : the corresponding sub-SCC partitions for inner loops. Similarly,
6325 : : when one sibling loop A dominates another sibling loop B, we should
6326 : : create a sub-SCC partition for A before a sub-SCC partition for B.
6327 : :
6328 : : As above, nothing depends for correctness on whether this achieves
6329 : : a natural nesting, but we should get better results when it does. */
6330 : 1253444 : m_partitions.reserve (m_vertices.length ());
6331 : 626722 : unsigned int next_partition_i = 0;
6332 : 626722 : hash_map<struct loop *, int> loop_partitions;
6333 : 626722 : unsigned int rpo_begin = 0;
6334 : 626722 : unsigned int num_partitioned_nodes = 0;
6335 : 7069650 : for (unsigned int rpo_end : scc_pos)
6336 : : {
6337 : 5189484 : loop_partitions.empty ();
6338 : : unsigned int partition_i = next_partition_i;
6339 : 10458250 : for (unsigned int rpo_i = rpo_begin; rpo_i < rpo_end; ++rpo_i)
6340 : : {
6341 : : /* Handle externals and constants optimistically throughout.
6342 : : But treat existing vectors as fixed since we do not handle
6343 : : permuting them. */
6344 : 5268766 : unsigned int node_i = m_partitioned_nodes[rpo_i];
6345 : 5268766 : auto &vertex = m_vertices[node_i];
6346 : 5268766 : if ((SLP_TREE_DEF_TYPE (vertex.node) == vect_external_def
6347 : 504251 : && !SLP_TREE_VEC_DEFS (vertex.node).exists ())
6348 : 5270772 : || SLP_TREE_DEF_TYPE (vertex.node) == vect_constant_def)
6349 : 1658638 : vertex.partition = -1;
6350 : : else
6351 : : {
6352 : 3610128 : bool existed;
6353 : 3610128 : if (m_optimize_size)
6354 : 25094 : existed = next_partition_i > partition_i;
6355 : : else
6356 : : {
6357 : 3585034 : struct loop *loop = containing_loop (vertex.node);
6358 : 3585034 : auto &entry = loop_partitions.get_or_insert (loop, &existed);
6359 : 3585034 : if (!existed)
6360 : 3506724 : entry = next_partition_i;
6361 : 3585034 : partition_i = entry;
6362 : : }
6363 : 3610128 : if (!existed)
6364 : : {
6365 : 3531735 : m_partitions.quick_push (slpg_partition_info ());
6366 : 3531735 : next_partition_i += 1;
6367 : : }
6368 : 3610128 : vertex.partition = partition_i;
6369 : 3610128 : num_partitioned_nodes += 1;
6370 : 3610128 : m_partitions[partition_i].node_end += 1;
6371 : : }
6372 : : }
6373 : 5189484 : rpo_begin = rpo_end;
6374 : : }
6375 : :
6376 : : /* Assign ranges of consecutive node indices to each partition,
6377 : : in partition order. Start with node_end being the same as
6378 : : node_begin so that the next loop can use it as a counter. */
6379 : 626722 : unsigned int node_begin = 0;
6380 : 5411901 : for (auto &partition : m_partitions)
6381 : : {
6382 : 3531735 : partition.node_begin = node_begin;
6383 : 3531735 : node_begin += partition.node_end;
6384 : 3531735 : partition.node_end = partition.node_begin;
6385 : : }
6386 : 626722 : gcc_assert (node_begin == num_partitioned_nodes);
6387 : :
6388 : : /* Finally build the list of nodes in partition order. */
6389 : 626722 : m_partitioned_nodes.truncate (num_partitioned_nodes);
6390 : 5895488 : for (unsigned int node_i = 0; node_i < m_vertices.length (); ++node_i)
6391 : : {
6392 : 5268766 : int partition_i = m_vertices[node_i].partition;
6393 : 5268766 : if (partition_i >= 0)
6394 : : {
6395 : 3610128 : unsigned int order_i = m_partitions[partition_i].node_end++;
6396 : 3610128 : m_partitioned_nodes[order_i] = node_i;
6397 : : }
6398 : : }
6399 : 626722 : }
6400 : :
6401 : : /* Look for edges from earlier partitions into node NODE_I and edges from
6402 : : node NODE_I into later partitions. Call:
6403 : :
6404 : : FN (ud, other_node_i)
6405 : :
6406 : : for each such use-to-def edge ud, where other_node_i is the node at the
6407 : : other end of the edge. */
6408 : :
6409 : : template<typename T>
6410 : : void
6411 : 4027141 : vect_optimize_slp_pass::for_each_partition_edge (unsigned int node_i, T fn)
6412 : : {
6413 : 4027141 : int partition_i = m_vertices[node_i].partition;
6414 : 4027141 : for (graph_edge *pred = m_slpg->vertices[node_i].pred;
6415 : 6757460 : pred; pred = pred->pred_next)
6416 : : {
6417 : 2730319 : int src_partition_i = m_vertices[pred->src].partition;
6418 : 2730319 : if (src_partition_i >= 0 && src_partition_i != partition_i)
6419 : 2535054 : fn (pred, pred->src);
6420 : : }
6421 : 4027141 : for (graph_edge *succ = m_slpg->vertices[node_i].succ;
6422 : 8564371 : succ; succ = succ->succ_next)
6423 : : {
6424 : 4537230 : int dest_partition_i = m_vertices[succ->dest].partition;
6425 : 4537230 : if (dest_partition_i >= 0 && dest_partition_i != partition_i)
6426 : 2552209 : fn (succ, succ->dest);
6427 : : }
6428 : 4027141 : }
6429 : :
6430 : : /* Return true if layout LAYOUT_I is compatible with the number of SLP lanes
6431 : : that NODE would operate on. This test is independent of NODE's actual
6432 : : operation. */
6433 : :
6434 : : bool
6435 : 1620832 : vect_optimize_slp_pass::is_compatible_layout (slp_tree node,
6436 : : unsigned int layout_i)
6437 : : {
6438 : 1620832 : if (layout_i == 0)
6439 : : return true;
6440 : :
6441 : 900078 : if (SLP_TREE_LANES (node) != m_perms[layout_i].length ())
6442 : 19850 : return false;
6443 : :
6444 : : return true;
6445 : : }
6446 : :
6447 : : /* Return the cost (in arbtirary units) of going from layout FROM_LAYOUT_I
6448 : : to layout TO_LAYOUT_I for a node like NODE. Return -1 if either of the
6449 : : layouts is incompatible with NODE or if the change is not possible for
6450 : : some other reason.
6451 : :
6452 : : The properties taken from NODE include the number of lanes and the
6453 : : vector type. The actual operation doesn't matter. */
6454 : :
6455 : : int
6456 : 696327 : vect_optimize_slp_pass::change_layout_cost (slp_tree node,
6457 : : unsigned int from_layout_i,
6458 : : unsigned int to_layout_i)
6459 : : {
6460 : 696327 : if (!is_compatible_layout (node, from_layout_i)
6461 : 696327 : || !is_compatible_layout (node, to_layout_i))
6462 : 545 : return -1;
6463 : :
6464 : 695782 : if (from_layout_i == to_layout_i)
6465 : : return 0;
6466 : :
6467 : 287545 : auto_vec<slp_tree, 1> children (1);
6468 : 287545 : children.quick_push (node);
6469 : 287545 : auto_lane_permutation_t perm (SLP_TREE_LANES (node));
6470 : 287545 : if (from_layout_i > 0)
6471 : 833276 : for (unsigned int i : m_perms[from_layout_i])
6472 : 367451 : perm.quick_push ({ 0, i });
6473 : : else
6474 : 430013 : for (unsigned int i = 0; i < SLP_TREE_LANES (node); ++i)
6475 : 297743 : perm.quick_push ({ 0, i });
6476 : 287545 : if (to_layout_i > 0)
6477 : 132697 : vect_slp_permute (m_perms[to_layout_i], perm, true);
6478 : 287545 : auto count = vectorizable_slp_permutation_1 (m_vinfo, nullptr, node, perm,
6479 : : children, false);
6480 : 287545 : if (count >= 0)
6481 : 282812 : return MAX (count, 1);
6482 : :
6483 : : /* ??? In principle we could try changing via layout 0, giving two
6484 : : layout changes rather than 1. Doing that would require
6485 : : corresponding support in get_result_with_layout. */
6486 : : return -1;
6487 : 287545 : }
6488 : :
6489 : : /* Return the costs of assigning layout LAYOUT_I to partition PARTITION_I. */
6490 : :
6491 : : inline slpg_partition_layout_costs &
6492 : 1036670 : vect_optimize_slp_pass::partition_layout_costs (unsigned int partition_i,
6493 : : unsigned int layout_i)
6494 : : {
6495 : 2073340 : return m_partition_layout_costs[partition_i * m_perms.length () + layout_i];
6496 : : }
6497 : :
6498 : : /* Change PERM in one of two ways:
6499 : :
6500 : : - if IN_LAYOUT_I < 0, accept input operand I in the layout that has been
6501 : : chosen for child I of NODE.
6502 : :
6503 : : - if IN_LAYOUT >= 0, accept all inputs operands with that layout.
6504 : :
6505 : : In both cases, arrange for the output to have layout OUT_LAYOUT_I */
6506 : :
6507 : : void
6508 : 23223 : vect_optimize_slp_pass::
6509 : : change_vec_perm_layout (slp_tree node, lane_permutation_t &perm,
6510 : : int in_layout_i, unsigned int out_layout_i)
6511 : : {
6512 : 138909 : for (auto &entry : perm)
6513 : : {
6514 : 69240 : int this_in_layout_i = in_layout_i;
6515 : 69240 : if (this_in_layout_i < 0)
6516 : : {
6517 : 52429 : slp_tree in_node = SLP_TREE_CHILDREN (node)[entry.first];
6518 : 52429 : unsigned int in_partition_i = m_vertices[in_node->vertex].partition;
6519 : 52429 : if (in_partition_i == -1u)
6520 : 329 : continue;
6521 : 52100 : this_in_layout_i = m_partitions[in_partition_i].layout;
6522 : : }
6523 : 68911 : if (this_in_layout_i > 0)
6524 : 14357 : entry.second = m_perms[this_in_layout_i][entry.second];
6525 : : }
6526 : 23223 : if (out_layout_i > 0)
6527 : 4862 : vect_slp_permute (m_perms[out_layout_i], perm, true);
6528 : 23223 : }
6529 : :
6530 : : /* Check whether the target allows NODE to be rearranged so that the node's
6531 : : output has layout OUT_LAYOUT_I. Return the cost of the change if so,
6532 : : in the same arbitrary units as for change_layout_cost. Return -1 otherwise.
6533 : :
6534 : : If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I < 0, also check whether
6535 : : NODE can adapt to the layout changes that have (perhaps provisionally)
6536 : : been chosen for NODE's children, so that no extra permutations are
6537 : : needed on either the input or the output of NODE.
6538 : :
6539 : : If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I >= 0, instead assume
6540 : : that all inputs will be forced into layout IN_LAYOUT_I beforehand.
6541 : :
6542 : : IN_LAYOUT_I has no meaning for other types of node.
6543 : :
6544 : : Keeping the node as-is is always valid. If the target doesn't appear
6545 : : to support the node as-is, but might realistically support other layouts,
6546 : : then layout 0 instead has the cost of a worst-case permutation. On the
6547 : : one hand, this ensures that every node has at least one valid layout,
6548 : : avoiding what would otherwise be an awkward special case. On the other,
6549 : : it still encourages the pass to change an invalid pre-existing layout
6550 : : choice into a valid one. */
6551 : :
6552 : : int
6553 : 220871 : vect_optimize_slp_pass::internal_node_cost (slp_tree node, int in_layout_i,
6554 : : unsigned int out_layout_i)
6555 : : {
6556 : 220871 : const int fallback_cost = 1;
6557 : :
6558 : 220871 : if (SLP_TREE_PERMUTE_P (node))
6559 : : {
6560 : 20353 : auto_lane_permutation_t tmp_perm;
6561 : 20353 : tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
6562 : :
6563 : : /* Check that the child nodes support the chosen layout. Checking
6564 : : the first child is enough, since any second child would have the
6565 : : same shape. */
6566 : 20353 : auto first_child = SLP_TREE_CHILDREN (node)[0];
6567 : 20353 : if (in_layout_i > 0
6568 : 20353 : && !is_compatible_layout (first_child, in_layout_i))
6569 : : return -1;
6570 : :
6571 : 19810 : change_vec_perm_layout (node, tmp_perm, in_layout_i, out_layout_i);
6572 : 39620 : int count = vectorizable_slp_permutation_1 (m_vinfo, nullptr,
6573 : : node, tmp_perm,
6574 : 19810 : SLP_TREE_CHILDREN (node),
6575 : : false);
6576 : 19810 : if (count < 0)
6577 : : {
6578 : 1472 : if (in_layout_i == 0 && out_layout_i == 0)
6579 : : {
6580 : : /* Use the fallback cost if the node could in principle support
6581 : : some nonzero layout for both the inputs and the outputs.
6582 : : Otherwise assume that the node will be rejected later
6583 : : and rebuilt from scalars. */
6584 : 349 : if (SLP_TREE_LANES (node) == SLP_TREE_LANES (first_child))
6585 : : return fallback_cost;
6586 : 275 : return 0;
6587 : : }
6588 : : return -1;
6589 : : }
6590 : :
6591 : : /* We currently have no way of telling whether the new layout is cheaper
6592 : : or more expensive than the old one. But at least in principle,
6593 : : it should be worth making zero permutations (whole-vector shuffles)
6594 : : cheaper than real permutations, in case the pass is able to remove
6595 : : the latter. */
6596 : 18338 : return count == 0 ? 0 : 1;
6597 : 20353 : }
6598 : :
6599 : 200518 : stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
6600 : 200518 : if (rep
6601 : 199616 : && STMT_VINFO_DATA_REF (rep)
6602 : 68971 : && DR_IS_READ (STMT_VINFO_DATA_REF (rep))
6603 : 252922 : && SLP_TREE_LOAD_PERMUTATION (node).exists ())
6604 : : {
6605 : 46646 : auto_load_permutation_t tmp_perm;
6606 : 46646 : tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
6607 : 46646 : if (out_layout_i > 0)
6608 : 17072 : vect_slp_permute (m_perms[out_layout_i], tmp_perm, true);
6609 : :
6610 : 46646 : poly_uint64 vf = 1;
6611 : 46646 : if (auto loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo))
6612 : 10504 : vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6613 : 46646 : unsigned int n_perms;
6614 : 46646 : if (!vect_transform_slp_perm_load_1 (m_vinfo, node, tmp_perm, vNULL,
6615 : : nullptr, vf, true, false, &n_perms))
6616 : : {
6617 : 2051 : auto rep = SLP_TREE_REPRESENTATIVE (node);
6618 : 2051 : if (out_layout_i == 0)
6619 : : {
6620 : : /* Use the fallback cost if the load is an N-to-N permutation.
6621 : : Otherwise assume that the node will be rejected later
6622 : : and rebuilt from scalars. */
6623 : 1585 : if (STMT_VINFO_GROUPED_ACCESS (rep)
6624 : 3170 : && (DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (rep))
6625 : 1585 : == SLP_TREE_LANES (node)))
6626 : 1014 : return fallback_cost;
6627 : : return 0;
6628 : : }
6629 : : return -1;
6630 : : }
6631 : :
6632 : : /* See the comment above the corresponding VEC_PERM_EXPR handling. */
6633 : 44595 : return n_perms == 0 ? 0 : 1;
6634 : 46646 : }
6635 : :
6636 : : return 0;
6637 : : }
6638 : :
6639 : : /* Decide which element layouts we should consider using. Calculate the
6640 : : weights associated with inserting layout changes on partition edges.
6641 : : Also mark partitions that cannot change layout, by setting their
6642 : : layout to zero. */
6643 : :
6644 : : void
6645 : 626722 : vect_optimize_slp_pass::start_choosing_layouts ()
6646 : : {
6647 : : /* Used to assign unique permutation indices. */
6648 : 626722 : using perm_hash = unbounded_hashmap_traits<
6649 : : vec_free_hash_base<int_hash_base<unsigned>>,
6650 : : int_hash<int, -1, -2>
6651 : : >;
6652 : 626722 : hash_map<vec<unsigned>, int, perm_hash> layout_ids;
6653 : :
6654 : : /* Layout 0 is "no change". */
6655 : 626722 : m_perms.safe_push (vNULL);
6656 : :
6657 : : /* Create layouts from existing permutations. */
6658 : 626722 : auto_load_permutation_t tmp_perm;
6659 : 5490294 : for (unsigned int node_i : m_partitioned_nodes)
6660 : : {
6661 : : /* Leafs also double as entries to the reverse graph. Allow the
6662 : : layout of those to be changed. */
6663 : 3610128 : auto &vertex = m_vertices[node_i];
6664 : 3610128 : auto &partition = m_partitions[vertex.partition];
6665 : 3610128 : if (!m_slpg->vertices[node_i].succ)
6666 : 980715 : partition.layout = 0;
6667 : :
6668 : : /* Loads and VEC_PERM_EXPRs are the only things generating permutes. */
6669 : 3610128 : slp_tree node = vertex.node;
6670 : 3610128 : stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
6671 : 3610128 : slp_tree child;
6672 : 3610128 : unsigned HOST_WIDE_INT imin, imax = 0;
6673 : 3610128 : bool any_permute = false;
6674 : 3610128 : tmp_perm.truncate (0);
6675 : 3610128 : if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
6676 : : {
6677 : : /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the node
6678 : : unpermuted, record a layout that reverses this permutation.
6679 : :
6680 : : We would need more work to cope with loads that are internally
6681 : : permuted and also have inputs (such as masks for
6682 : : IFN_MASK_LOADs). */
6683 : 487513 : gcc_assert (partition.layout == 0 && !m_slpg->vertices[node_i].succ);
6684 : 487513 : if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
6685 : : {
6686 : 347523 : partition.layout = -1;
6687 : 3594644 : continue;
6688 : : }
6689 : 139990 : dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
6690 : 139990 : imin = DR_GROUP_SIZE (dr_stmt) + 1;
6691 : 139990 : tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
6692 : : }
6693 : 6089167 : else if (SLP_TREE_PERMUTE_P (node)
6694 : 170090 : && SLP_TREE_CHILDREN (node).length () == 1
6695 : 156063 : && (child = SLP_TREE_CHILDREN (node)[0])
6696 : 3278678 : && (TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (child))
6697 : 156063 : .is_constant (&imin)))
6698 : : {
6699 : : /* If the child has the same vector size as this node,
6700 : : reversing the permutation can make the permutation a no-op.
6701 : : In other cases it can change a true permutation into a
6702 : : full-vector extract. */
6703 : 156063 : tmp_perm.reserve (SLP_TREE_LANES (node));
6704 : 407256 : for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
6705 : 251193 : tmp_perm.quick_push (SLP_TREE_LANE_PERMUTATION (node)[j].second);
6706 : : }
6707 : : else
6708 : 2966552 : continue;
6709 : :
6710 : 783827 : for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
6711 : : {
6712 : 487774 : unsigned idx = tmp_perm[j];
6713 : 487774 : imin = MIN (imin, idx);
6714 : 487774 : imax = MAX (imax, idx);
6715 : 487774 : if (idx - tmp_perm[0] != j)
6716 : 148515 : any_permute = true;
6717 : : }
6718 : : /* If the span doesn't match we'd disrupt VF computation, avoid
6719 : : that for now. */
6720 : 296053 : if (imax - imin + 1 != SLP_TREE_LANES (node))
6721 : 98008 : continue;
6722 : : /* If there's no permute no need to split one out. In this case
6723 : : we can consider turning a load into a permuted load, if that
6724 : : turns out to be cheaper than alternatives. */
6725 : 198045 : if (!any_permute)
6726 : : {
6727 : 182408 : partition.layout = -1;
6728 : 182408 : continue;
6729 : : }
6730 : :
6731 : : /* For now only handle true permutes, like
6732 : : vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
6733 : : when permuting constants and invariants keeping the permute
6734 : : bijective. */
6735 : 15637 : auto_sbitmap load_index (SLP_TREE_LANES (node));
6736 : 15637 : bitmap_clear (load_index);
6737 : 60665 : for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
6738 : 45028 : bitmap_set_bit (load_index, tmp_perm[j] - imin);
6739 : : unsigned j;
6740 : 59933 : for (j = 0; j < SLP_TREE_LANES (node); ++j)
6741 : 44449 : if (!bitmap_bit_p (load_index, j))
6742 : : break;
6743 : 15637 : if (j != SLP_TREE_LANES (node))
6744 : 153 : continue;
6745 : :
6746 : 15484 : vec<unsigned> perm = vNULL;
6747 : 15484 : perm.safe_grow (SLP_TREE_LANES (node), true);
6748 : 59675 : for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
6749 : 44191 : perm[j] = tmp_perm[j] - imin;
6750 : :
6751 : 30968 : if (int (m_perms.length ()) >= param_vect_max_layout_candidates)
6752 : : {
6753 : : /* Continue to use existing layouts, but don't add any more. */
6754 : 0 : int *entry = layout_ids.get (perm);
6755 : 0 : partition.layout = entry ? *entry : 0;
6756 : 0 : perm.release ();
6757 : : }
6758 : : else
6759 : : {
6760 : 15484 : bool existed;
6761 : 15484 : int &layout_i = layout_ids.get_or_insert (perm, &existed);
6762 : 15484 : if (existed)
6763 : 5449 : perm.release ();
6764 : : else
6765 : : {
6766 : 10035 : layout_i = m_perms.length ();
6767 : 10035 : m_perms.safe_push (perm);
6768 : : }
6769 : 15484 : partition.layout = layout_i;
6770 : : }
6771 : 15637 : }
6772 : :
6773 : : /* Initially assume that every layout is possible and has zero cost
6774 : : in every partition. */
6775 : 626722 : m_partition_layout_costs.safe_grow_cleared (m_partitions.length ()
6776 : 1253444 : * m_perms.length ());
6777 : :
6778 : : /* We have to mark outgoing permutations facing non-associating-reduction
6779 : : graph entries that are not represented as to be materialized.
6780 : : slp_inst_kind_bb_reduc currently only covers associatable reductions. */
6781 : 3545943 : for (slp_instance instance : m_vinfo->slp_instances)
6782 : 1665777 : if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor)
6783 : : {
6784 : 5195 : unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
6785 : 5195 : m_partitions[m_vertices[node_i].partition].layout = 0;
6786 : : }
6787 : 1660582 : else if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_reduc_chain)
6788 : : {
6789 : 2771 : stmt_vec_info stmt_info
6790 : 2771 : = SLP_TREE_REPRESENTATIVE (SLP_INSTANCE_TREE (instance));
6791 : 2771 : vect_reduc_info reduc_info
6792 : 2771 : = info_for_reduction (as_a <loop_vec_info> (m_vinfo),
6793 : : SLP_INSTANCE_TREE (instance));
6794 : 2771 : if (needs_fold_left_reduction_p (TREE_TYPE
6795 : : (gimple_get_lhs (stmt_info->stmt)),
6796 : : VECT_REDUC_INFO_CODE (reduc_info)))
6797 : : {
6798 : 60 : unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
6799 : 60 : m_partitions[m_vertices[node_i].partition].layout = 0;
6800 : : }
6801 : : }
6802 : :
6803 : : /* Check which layouts each node and partition can handle. Calculate the
6804 : : weights associated with inserting layout changes on edges. */
6805 : 5490294 : for (unsigned int node_i : m_partitioned_nodes)
6806 : : {
6807 : 3610128 : auto &vertex = m_vertices[node_i];
6808 : 3610128 : auto &partition = m_partitions[vertex.partition];
6809 : 3610128 : slp_tree node = vertex.node;
6810 : :
6811 : 3610128 : if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
6812 : : {
6813 : 3605793 : vertex.weight = vect_slp_node_weight (node);
6814 : :
6815 : : /* We do not handle stores with a permutation, so all
6816 : : incoming permutations must have been materialized.
6817 : :
6818 : : We also don't handle masked grouped loads, which lack a
6819 : : permutation vector. In this case the memory locations
6820 : : form an implicit second input to the loads, on top of the
6821 : : explicit mask input, and the memory input's layout cannot
6822 : : be changed.
6823 : :
6824 : : On the other hand, we do support permuting gather loads and
6825 : : masked gather loads, where each scalar load is independent
6826 : : of the others. This can be useful if the address/index input
6827 : : benefits from permutation. */
6828 : 3605793 : if (STMT_VINFO_DATA_REF (rep)
6829 : 1652402 : && STMT_VINFO_GROUPED_ACCESS (rep)
6830 : 4711944 : && !SLP_TREE_LOAD_PERMUTATION (node).exists ())
6831 : 966161 : partition.layout = 0;
6832 : :
6833 : : /* We cannot change the layout of an operation that is
6834 : : not independent on lanes. Note this is an explicit
6835 : : negative list since that's much shorter than the respective
6836 : : positive one but it's critical to keep maintaining it. */
6837 : 3605793 : if (is_gimple_call (STMT_VINFO_STMT (rep)))
6838 : 22281 : switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep)))
6839 : : {
6840 : 1149 : case CFN_COMPLEX_ADD_ROT90:
6841 : 1149 : case CFN_COMPLEX_ADD_ROT270:
6842 : 1149 : case CFN_COMPLEX_MUL:
6843 : 1149 : case CFN_COMPLEX_MUL_CONJ:
6844 : 1149 : case CFN_VEC_ADDSUB:
6845 : 1149 : case CFN_VEC_FMADDSUB:
6846 : 1149 : case CFN_VEC_FMSUBADD:
6847 : 1149 : partition.layout = 0;
6848 : : default:;
6849 : : }
6850 : : }
6851 : :
6852 : 7975402 : auto process_edge = [&](graph_edge *ud, unsigned int other_node_i)
6853 : : {
6854 : 4365274 : auto &other_vertex = m_vertices[other_node_i];
6855 : :
6856 : : /* Count the number of edges from earlier partitions and the number
6857 : : of edges to later partitions. */
6858 : 4365274 : if (other_vertex.partition < vertex.partition)
6859 : 2182637 : partition.in_degree += 1;
6860 : : else
6861 : 2182637 : partition.out_degree += 1;
6862 : :
6863 : : /* If the current node uses the result of OTHER_NODE_I, accumulate
6864 : : the effects of that. */
6865 : 4365274 : if (ud->src == int (node_i))
6866 : : {
6867 : 2182637 : other_vertex.out_weight += vertex.weight;
6868 : 2182637 : other_vertex.out_degree += 1;
6869 : : }
6870 : 7975402 : };
6871 : 3610128 : for_each_partition_edge (node_i, process_edge);
6872 : : }
6873 : 626722 : }
6874 : :
6875 : : /* Return the incoming costs for node NODE_I, assuming that each input keeps
6876 : : its current (provisional) choice of layout. The inputs do not necessarily
6877 : : have the same layout as each other. */
6878 : :
6879 : : slpg_layout_cost
6880 : 3018 : vect_optimize_slp_pass::total_in_cost (unsigned int node_i)
6881 : : {
6882 : 3018 : auto &vertex = m_vertices[node_i];
6883 : 3018 : slpg_layout_cost cost;
6884 : 11034 : auto add_cost = [&](graph_edge *, unsigned int other_node_i)
6885 : : {
6886 : 8016 : auto &other_vertex = m_vertices[other_node_i];
6887 : 8016 : if (other_vertex.partition < vertex.partition)
6888 : : {
6889 : 5097 : auto &other_partition = m_partitions[other_vertex.partition];
6890 : 10194 : auto &other_costs = partition_layout_costs (other_vertex.partition,
6891 : 5097 : other_partition.layout);
6892 : 5097 : slpg_layout_cost this_cost = other_costs.in_cost;
6893 : 5097 : this_cost.add_serial_cost (other_costs.internal_cost);
6894 : 5097 : this_cost.split (other_partition.out_degree);
6895 : 5097 : cost.add_parallel_cost (this_cost);
6896 : : }
6897 : 11034 : };
6898 : 3018 : for_each_partition_edge (node_i, add_cost);
6899 : 3018 : return cost;
6900 : : }
6901 : :
6902 : : /* Return the cost of switching between layout LAYOUT1_I (at node NODE1_I)
6903 : : and layout LAYOUT2_I on cross-partition use-to-def edge UD. Return
6904 : : slpg_layout_cost::impossible () if the change isn't possible. */
6905 : :
6906 : : slpg_layout_cost
6907 : 696327 : vect_optimize_slp_pass::
6908 : : edge_layout_cost (graph_edge *ud, unsigned int node1_i, unsigned int layout1_i,
6909 : : unsigned int layout2_i)
6910 : : {
6911 : 696327 : auto &def_vertex = m_vertices[ud->dest];
6912 : 696327 : auto &use_vertex = m_vertices[ud->src];
6913 : 696327 : auto def_layout_i = ud->dest == int (node1_i) ? layout1_i : layout2_i;
6914 : 696327 : auto use_layout_i = ud->dest == int (node1_i) ? layout2_i : layout1_i;
6915 : 696327 : auto factor = change_layout_cost (def_vertex.node, def_layout_i,
6916 : : use_layout_i);
6917 : 696327 : if (factor < 0)
6918 : 5278 : return slpg_layout_cost::impossible ();
6919 : :
6920 : : /* We have a choice of putting the layout change at the site of the
6921 : : definition or at the site of the use. Prefer the former when
6922 : : optimizing for size or when the execution frequency of the
6923 : : definition is no greater than the combined execution frequencies of
6924 : : the uses. When putting the layout change at the site of the definition,
6925 : : divvy up the cost among all consumers. */
6926 : 691049 : if (m_optimize_size || def_vertex.weight <= def_vertex.out_weight)
6927 : : {
6928 : 672803 : slpg_layout_cost cost = { def_vertex.weight * factor, m_optimize_size };
6929 : 672803 : cost.split (def_vertex.out_degree);
6930 : 672803 : return cost;
6931 : : }
6932 : 18246 : return { use_vertex.weight * factor, m_optimize_size };
6933 : : }
6934 : :
6935 : : /* UD represents a use-def link between FROM_NODE_I and a node in a later
6936 : : partition; FROM_NODE_I could be the definition node or the use node.
6937 : : The node at the other end of the link wants to use layout TO_LAYOUT_I.
6938 : : Return the cost of any necessary fix-ups on edge UD, or return
6939 : : slpg_layout_cost::impossible () if the change isn't possible.
6940 : :
6941 : : At this point, FROM_NODE_I's partition has chosen the cheapest
6942 : : layout based on the information available so far, but this choice
6943 : : is only provisional. */
6944 : :
6945 : : slpg_layout_cost
6946 : 181369 : vect_optimize_slp_pass::forward_cost (graph_edge *ud, unsigned int from_node_i,
6947 : : unsigned int to_layout_i)
6948 : : {
6949 : 181369 : auto &from_vertex = m_vertices[from_node_i];
6950 : 181369 : unsigned int from_partition_i = from_vertex.partition;
6951 : 181369 : slpg_partition_info &from_partition = m_partitions[from_partition_i];
6952 : 181369 : gcc_assert (from_partition.layout >= 0);
6953 : :
6954 : : /* First calculate the cost on the assumption that FROM_PARTITION sticks
6955 : : with its current layout preference. */
6956 : 181369 : slpg_layout_cost cost = slpg_layout_cost::impossible ();
6957 : 181369 : auto edge_cost = edge_layout_cost (ud, from_node_i,
6958 : 181369 : from_partition.layout, to_layout_i);
6959 : 181369 : if (edge_cost.is_possible ())
6960 : : {
6961 : 357304 : auto &from_costs = partition_layout_costs (from_partition_i,
6962 : 178652 : from_partition.layout);
6963 : 178652 : cost = from_costs.in_cost;
6964 : 178652 : cost.add_serial_cost (from_costs.internal_cost);
6965 : 178652 : cost.split (from_partition.out_degree);
6966 : 178652 : cost.add_serial_cost (edge_cost);
6967 : : }
6968 : 2717 : else if (from_partition.layout == 0)
6969 : : /* We must allow the source partition to have layout 0 as a fallback,
6970 : : in case all other options turn out to be impossible. */
6971 : 2717 : return cost;
6972 : :
6973 : : /* Take the minimum of that cost and the cost that applies if
6974 : : FROM_PARTITION instead switches to TO_LAYOUT_I. */
6975 : 178652 : auto &direct_layout_costs = partition_layout_costs (from_partition_i,
6976 : : to_layout_i);
6977 : 178652 : if (direct_layout_costs.is_possible ())
6978 : : {
6979 : 164237 : slpg_layout_cost direct_cost = direct_layout_costs.in_cost;
6980 : 164237 : direct_cost.add_serial_cost (direct_layout_costs.internal_cost);
6981 : 164237 : direct_cost.split (from_partition.out_degree);
6982 : 164237 : if (!cost.is_possible ()
6983 : 164237 : || direct_cost.is_better_than (cost, m_optimize_size))
6984 : 44757 : cost = direct_cost;
6985 : : }
6986 : :
6987 : 178652 : return cost;
6988 : : }
6989 : :
6990 : : /* UD represents a use-def link between TO_NODE_I and a node in an earlier
6991 : : partition; TO_NODE_I could be the definition node or the use node.
6992 : : The node at the other end of the link wants to use layout FROM_LAYOUT_I;
6993 : : return the cost of any necessary fix-ups on edge UD, or
6994 : : slpg_layout_cost::impossible () if the choice cannot be made.
6995 : :
6996 : : At this point, TO_NODE_I's partition has a fixed choice of layout. */
6997 : :
6998 : : slpg_layout_cost
6999 : 170810 : vect_optimize_slp_pass::backward_cost (graph_edge *ud, unsigned int to_node_i,
7000 : : unsigned int from_layout_i)
7001 : : {
7002 : 170810 : auto &to_vertex = m_vertices[to_node_i];
7003 : 170810 : unsigned int to_partition_i = to_vertex.partition;
7004 : 170810 : slpg_partition_info &to_partition = m_partitions[to_partition_i];
7005 : 170810 : gcc_assert (to_partition.layout >= 0);
7006 : :
7007 : : /* If TO_NODE_I is a VEC_PERM_EXPR consumer, see whether it can be
7008 : : adjusted for this input having layout FROM_LAYOUT_I. Assume that
7009 : : any other inputs keep their current choice of layout. */
7010 : 170810 : auto &to_costs = partition_layout_costs (to_partition_i,
7011 : : to_partition.layout);
7012 : 170810 : if (ud->src == int (to_node_i)
7013 : 170648 : && SLP_TREE_PERMUTE_P (to_vertex.node))
7014 : : {
7015 : 9092 : auto &from_partition = m_partitions[m_vertices[ud->dest].partition];
7016 : 9092 : auto old_layout = from_partition.layout;
7017 : 9092 : from_partition.layout = from_layout_i;
7018 : 18184 : int factor = internal_node_cost (to_vertex.node, -1,
7019 : 9092 : to_partition.layout);
7020 : 9092 : from_partition.layout = old_layout;
7021 : 9092 : if (factor >= 0)
7022 : : {
7023 : 8468 : slpg_layout_cost cost = to_costs.out_cost;
7024 : 16936 : cost.add_serial_cost ({ to_vertex.weight * factor,
7025 : 8468 : m_optimize_size });
7026 : 8468 : cost.split (to_partition.in_degree);
7027 : 8468 : return cost;
7028 : : }
7029 : : }
7030 : :
7031 : : /* Compute the cost if we insert any necessary layout change on edge UD. */
7032 : 162342 : auto edge_cost = edge_layout_cost (ud, to_node_i,
7033 : 162342 : to_partition.layout, from_layout_i);
7034 : 162342 : if (edge_cost.is_possible ())
7035 : : {
7036 : 162342 : slpg_layout_cost cost = to_costs.out_cost;
7037 : 162342 : cost.add_serial_cost (to_costs.internal_cost);
7038 : 162342 : cost.split (to_partition.in_degree);
7039 : 162342 : cost.add_serial_cost (edge_cost);
7040 : 162342 : return cost;
7041 : : }
7042 : :
7043 : 0 : return slpg_layout_cost::impossible ();
7044 : : }
7045 : :
7046 : : /* Make a forward pass through the partitions, accumulating input costs.
7047 : : Make a tentative (provisional) choice of layout for each partition,
7048 : : ensuring that this choice still allows later partitions to keep
7049 : : their original layout. */
7050 : :
7051 : : void
7052 : 9615 : vect_optimize_slp_pass::forward_pass ()
7053 : : {
7054 : 130285 : for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
7055 : : ++partition_i)
7056 : : {
7057 : 120670 : auto &partition = m_partitions[partition_i];
7058 : :
7059 : : /* If the partition consists of a single VEC_PERM_EXPR, precompute
7060 : : the incoming cost that would apply if every predecessor partition
7061 : : keeps its current layout. This is used within the loop below. */
7062 : 120670 : slpg_layout_cost in_cost;
7063 : 120670 : slp_tree single_node = nullptr;
7064 : 120670 : if (partition.node_end == partition.node_begin + 1)
7065 : : {
7066 : 115158 : unsigned int node_i = m_partitioned_nodes[partition.node_begin];
7067 : 115158 : single_node = m_vertices[node_i].node;
7068 : 115158 : if (SLP_TREE_PERMUTE_P (single_node))
7069 : 3018 : in_cost = total_in_cost (node_i);
7070 : : }
7071 : :
7072 : : /* Go through the possible layouts. Decide which ones are valid
7073 : : for this partition and record which of the valid layouts has
7074 : : the lowest cost. */
7075 : 120670 : unsigned int min_layout_i = 0;
7076 : 120670 : slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
7077 : 367936 : for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
7078 : : {
7079 : 247266 : auto &layout_costs = partition_layout_costs (partition_i, layout_i);
7080 : 247266 : if (!layout_costs.is_possible ())
7081 : 56228 : continue;
7082 : :
7083 : : /* If the recorded layout is already 0 then the layout cannot
7084 : : change. */
7085 : 247266 : if (partition.layout == 0 && layout_i != 0)
7086 : : {
7087 : 33959 : layout_costs.mark_impossible ();
7088 : 33959 : continue;
7089 : : }
7090 : :
7091 : 213307 : bool is_possible = true;
7092 : 418723 : for (unsigned int order_i = partition.node_begin;
7093 : 418723 : order_i < partition.node_end; ++order_i)
7094 : : {
7095 : 225261 : unsigned int node_i = m_partitioned_nodes[order_i];
7096 : 225261 : auto &vertex = m_vertices[node_i];
7097 : :
7098 : : /* Reject the layout if it is individually incompatible
7099 : : with any node in the partition. */
7100 : 225261 : if (!is_compatible_layout (vertex.node, layout_i))
7101 : : {
7102 : 18762 : is_possible = false;
7103 : 19845 : break;
7104 : : }
7105 : :
7106 : 561967 : auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
7107 : : {
7108 : 355468 : auto &other_vertex = m_vertices[other_node_i];
7109 : 355468 : if (other_vertex.partition < vertex.partition)
7110 : : {
7111 : : /* Accumulate the incoming costs from earlier
7112 : : partitions, plus the cost of any layout changes
7113 : : on UD itself. */
7114 : 181369 : auto cost = forward_cost (ud, other_node_i, layout_i);
7115 : 181369 : if (!cost.is_possible ())
7116 : 2717 : is_possible = false;
7117 : : else
7118 : 178652 : layout_costs.in_cost.add_parallel_cost (cost);
7119 : : }
7120 : : else
7121 : : /* Reject the layout if it would make layout 0 impossible
7122 : : for later partitions. This amounts to testing that the
7123 : : target supports reversing the layout change on edges
7124 : : to later partitions.
7125 : :
7126 : : In principle, it might be possible to push a layout
7127 : : change all the way down a graph, so that it never
7128 : : needs to be reversed and so that the target doesn't
7129 : : need to support the reverse operation. But it would
7130 : : be awkward to bail out if we hit a partition that
7131 : : does not support the new layout, especially since
7132 : : we are not dealing with a lattice. */
7133 : 174099 : is_possible &= edge_layout_cost (ud, other_node_i, 0,
7134 : 174099 : layout_i).is_possible ();
7135 : 561967 : };
7136 : 206499 : for_each_partition_edge (node_i, add_cost);
7137 : :
7138 : : /* Accumulate the cost of using LAYOUT_I within NODE,
7139 : : both for the inputs and the outputs. */
7140 : 206499 : int factor = internal_node_cost (vertex.node, layout_i,
7141 : : layout_i);
7142 : 206499 : if (factor < 0)
7143 : : {
7144 : 1083 : is_possible = false;
7145 : 1083 : break;
7146 : : }
7147 : 205416 : else if (factor)
7148 : 35427 : layout_costs.internal_cost.add_serial_cost
7149 : 35427 : ({ vertex.weight * factor, m_optimize_size });
7150 : : }
7151 : 213307 : if (!is_possible)
7152 : : {
7153 : 22269 : layout_costs.mark_impossible ();
7154 : 22269 : continue;
7155 : : }
7156 : :
7157 : : /* Combine the incoming and partition-internal costs. */
7158 : 191038 : slpg_layout_cost combined_cost = layout_costs.in_cost;
7159 : 191038 : combined_cost.add_serial_cost (layout_costs.internal_cost);
7160 : :
7161 : : /* If this partition consists of a single VEC_PERM_EXPR, see
7162 : : if the VEC_PERM_EXPR can be changed to support output layout
7163 : : LAYOUT_I while keeping all the provisional choices of input
7164 : : layout. */
7165 : 191038 : if (single_node && SLP_TREE_PERMUTE_P (single_node))
7166 : : {
7167 : 5280 : int factor = internal_node_cost (single_node, -1, layout_i);
7168 : 5280 : if (factor >= 0)
7169 : : {
7170 : 4855 : auto weight = m_vertices[single_node->vertex].weight;
7171 : 4855 : slpg_layout_cost internal_cost
7172 : 4855 : = { weight * factor, m_optimize_size };
7173 : :
7174 : 4855 : slpg_layout_cost alt_cost = in_cost;
7175 : 4855 : alt_cost.add_serial_cost (internal_cost);
7176 : 4855 : if (alt_cost.is_better_than (combined_cost, m_optimize_size))
7177 : : {
7178 : 1539 : combined_cost = alt_cost;
7179 : 1539 : layout_costs.in_cost = in_cost;
7180 : 1539 : layout_costs.internal_cost = internal_cost;
7181 : : }
7182 : : }
7183 : : }
7184 : :
7185 : : /* Record the layout with the lowest cost. Prefer layout 0 in
7186 : : the event of a tie between it and another layout. */
7187 : 191038 : if (!min_layout_cost.is_possible ()
7188 : 70368 : || combined_cost.is_better_than (min_layout_cost,
7189 : 70368 : m_optimize_size))
7190 : : {
7191 : 139198 : min_layout_i = layout_i;
7192 : 139198 : min_layout_cost = combined_cost;
7193 : : }
7194 : : }
7195 : :
7196 : : /* This loop's handling of earlier partitions should ensure that
7197 : : choosing the original layout for the current partition is no
7198 : : less valid than it was in the original graph, even with the
7199 : : provisional layout choices for those earlier partitions. */
7200 : 120670 : gcc_assert (min_layout_cost.is_possible ());
7201 : 120670 : partition.layout = min_layout_i;
7202 : : }
7203 : 9615 : }
7204 : :
7205 : : /* Make a backward pass through the partitions, accumulating output costs.
7206 : : Make a final choice of layout for each partition. */
7207 : :
7208 : : void
7209 : 9615 : vect_optimize_slp_pass::backward_pass ()
7210 : : {
7211 : 139900 : for (unsigned int partition_i = m_partitions.length (); partition_i-- > 0;)
7212 : : {
7213 : 120670 : auto &partition = m_partitions[partition_i];
7214 : :
7215 : 120670 : unsigned int min_layout_i = 0;
7216 : 120670 : slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
7217 : 367936 : for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
7218 : : {
7219 : 247266 : auto &layout_costs = partition_layout_costs (partition_i, layout_i);
7220 : 247266 : if (!layout_costs.is_possible ())
7221 : 56228 : continue;
7222 : :
7223 : : /* Accumulate the costs from successor partitions. */
7224 : 191038 : bool is_possible = true;
7225 : 393992 : for (unsigned int order_i = partition.node_begin;
7226 : 393992 : order_i < partition.node_end; ++order_i)
7227 : : {
7228 : 202954 : unsigned int node_i = m_partitioned_nodes[order_i];
7229 : 202954 : auto &vertex = m_vertices[node_i];
7230 : 552281 : auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
7231 : : {
7232 : 349327 : auto &other_vertex = m_vertices[other_node_i];
7233 : 349327 : auto &other_partition = m_partitions[other_vertex.partition];
7234 : 349327 : if (other_vertex.partition > vertex.partition)
7235 : : {
7236 : : /* Accumulate the incoming costs from later
7237 : : partitions, plus the cost of any layout changes
7238 : : on UD itself. */
7239 : 170810 : auto cost = backward_cost (ud, other_node_i, layout_i);
7240 : 170810 : if (!cost.is_possible ())
7241 : 0 : is_possible = false;
7242 : : else
7243 : 170810 : layout_costs.out_cost.add_parallel_cost (cost);
7244 : : }
7245 : : else
7246 : : /* Make sure that earlier partitions can (if necessary
7247 : : or beneficial) keep the layout that they chose in
7248 : : the forward pass. This ensures that there is at
7249 : : least one valid choice of layout. */
7250 : 178517 : is_possible &= edge_layout_cost (ud, other_node_i,
7251 : 178517 : other_partition.layout,
7252 : 178517 : layout_i).is_possible ();
7253 : 552281 : };
7254 : 202954 : for_each_partition_edge (node_i, add_cost);
7255 : : }
7256 : 191038 : if (!is_possible)
7257 : : {
7258 : 0 : layout_costs.mark_impossible ();
7259 : 0 : continue;
7260 : : }
7261 : :
7262 : : /* Locally combine the costs from the forward and backward passes.
7263 : : (This combined cost is not passed on, since that would lead
7264 : : to double counting.) */
7265 : 191038 : slpg_layout_cost combined_cost = layout_costs.in_cost;
7266 : 191038 : combined_cost.add_serial_cost (layout_costs.internal_cost);
7267 : 191038 : combined_cost.add_serial_cost (layout_costs.out_cost);
7268 : :
7269 : : /* Record the layout with the lowest cost. Prefer layout 0 in
7270 : : the event of a tie between it and another layout. */
7271 : 191038 : if (!min_layout_cost.is_possible ()
7272 : 70368 : || combined_cost.is_better_than (min_layout_cost,
7273 : 70368 : m_optimize_size))
7274 : : {
7275 : 133759 : min_layout_i = layout_i;
7276 : 133759 : min_layout_cost = combined_cost;
7277 : : }
7278 : : }
7279 : :
7280 : 120670 : gcc_assert (min_layout_cost.is_possible ());
7281 : 120670 : partition.layout = min_layout_i;
7282 : : }
7283 : 9615 : }
7284 : :
7285 : : /* Return a node that applies layout TO_LAYOUT_I to the original form of NODE.
7286 : : NODE already has the layout that was selected for its partition. */
7287 : :
7288 : : slp_tree
7289 : 151790 : vect_optimize_slp_pass::get_result_with_layout (slp_tree node,
7290 : : unsigned int to_layout_i)
7291 : : {
7292 : 151790 : unsigned int result_i = node->vertex * m_perms.length () + to_layout_i;
7293 : 151790 : slp_tree result = m_node_layouts[result_i];
7294 : 151790 : if (result)
7295 : : return result;
7296 : :
7297 : 151220 : if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
7298 : 151220 : || (SLP_TREE_DEF_TYPE (node) == vect_external_def
7299 : : /* We can't permute vector defs in place. */
7300 : 20204 : && SLP_TREE_VEC_DEFS (node).is_empty ()))
7301 : : {
7302 : : /* If the vector is uniform or unchanged, there's nothing to do. */
7303 : 38090 : if (to_layout_i == 0 || vect_slp_tree_uniform_p (node))
7304 : : result = node;
7305 : : else
7306 : : {
7307 : 1637 : auto scalar_ops = SLP_TREE_SCALAR_OPS (node).copy ();
7308 : 1637 : result = vect_create_new_slp_node (scalar_ops);
7309 : 1637 : vect_slp_permute (m_perms[to_layout_i], scalar_ops, true);
7310 : : }
7311 : : }
7312 : : else
7313 : : {
7314 : 113130 : unsigned int partition_i = m_vertices[node->vertex].partition;
7315 : 113130 : unsigned int from_layout_i = m_partitions[partition_i].layout;
7316 : 113130 : if (from_layout_i == to_layout_i)
7317 : 112414 : return node;
7318 : :
7319 : : /* If NODE is itself a VEC_PERM_EXPR, try to create a parallel
7320 : : permutation instead of a serial one. Leave the new permutation
7321 : : in TMP_PERM on success. */
7322 : 716 : auto_lane_permutation_t tmp_perm;
7323 : 716 : unsigned int num_inputs = 1;
7324 : 716 : if (SLP_TREE_PERMUTE_P (node))
7325 : : {
7326 : 6 : tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
7327 : 6 : if (from_layout_i != 0)
7328 : 6 : vect_slp_permute (m_perms[from_layout_i], tmp_perm, false);
7329 : 6 : if (to_layout_i != 0)
7330 : 4 : vect_slp_permute (m_perms[to_layout_i], tmp_perm, true);
7331 : 6 : if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
7332 : : tmp_perm,
7333 : 6 : SLP_TREE_CHILDREN (node),
7334 : : false) >= 0)
7335 : 6 : num_inputs = SLP_TREE_CHILDREN (node).length ();
7336 : : else
7337 : 0 : tmp_perm.truncate (0);
7338 : : }
7339 : :
7340 : 716 : if (dump_enabled_p ())
7341 : : {
7342 : 72 : if (tmp_perm.length () > 0)
7343 : 6 : dump_printf_loc (MSG_NOTE, vect_location,
7344 : : "duplicating permutation node %p with"
7345 : : " layout %d\n",
7346 : : (void *) node, to_layout_i);
7347 : : else
7348 : 66 : dump_printf_loc (MSG_NOTE, vect_location,
7349 : : "inserting permutation node in place of %p\n",
7350 : : (void *) node);
7351 : : }
7352 : :
7353 : 716 : unsigned int num_lanes = SLP_TREE_LANES (node);
7354 : 716 : result = vect_create_new_slp_node (num_inputs, VEC_PERM_EXPR);
7355 : 716 : if (SLP_TREE_SCALAR_STMTS (node).length ())
7356 : : {
7357 : 715 : auto &stmts = SLP_TREE_SCALAR_STMTS (result);
7358 : 715 : stmts.safe_splice (SLP_TREE_SCALAR_STMTS (node));
7359 : 715 : if (from_layout_i != 0)
7360 : 387 : vect_slp_permute (m_perms[from_layout_i], stmts, false);
7361 : 715 : if (to_layout_i != 0)
7362 : 332 : vect_slp_permute (m_perms[to_layout_i], stmts, true);
7363 : : }
7364 : 716 : SLP_TREE_REPRESENTATIVE (result) = SLP_TREE_REPRESENTATIVE (node);
7365 : 716 : SLP_TREE_LANES (result) = num_lanes;
7366 : 716 : SLP_TREE_VECTYPE (result) = SLP_TREE_VECTYPE (node);
7367 : 716 : result->vertex = -1;
7368 : :
7369 : 716 : auto &lane_perm = SLP_TREE_LANE_PERMUTATION (result);
7370 : 716 : if (tmp_perm.length ())
7371 : : {
7372 : 6 : lane_perm.safe_splice (tmp_perm);
7373 : 6 : SLP_TREE_CHILDREN (result).safe_splice (SLP_TREE_CHILDREN (node));
7374 : : }
7375 : : else
7376 : : {
7377 : 710 : lane_perm.create (num_lanes);
7378 : 2188 : for (unsigned j = 0; j < num_lanes; ++j)
7379 : 1478 : lane_perm.quick_push ({ 0, j });
7380 : 710 : if (from_layout_i != 0)
7381 : 381 : vect_slp_permute (m_perms[from_layout_i], lane_perm, false);
7382 : 710 : if (to_layout_i != 0)
7383 : 329 : vect_slp_permute (m_perms[to_layout_i], lane_perm, true);
7384 : 710 : SLP_TREE_CHILDREN (result).safe_push (node);
7385 : : }
7386 : 2868 : for (slp_tree child : SLP_TREE_CHILDREN (result))
7387 : 720 : child->refcnt++;
7388 : 716 : }
7389 : 38806 : m_node_layouts[result_i] = result;
7390 : 38806 : return result;
7391 : : }
7392 : :
7393 : : /* Apply the chosen vector layouts to the SLP graph. */
7394 : :
7395 : : void
7396 : 9615 : vect_optimize_slp_pass::materialize ()
7397 : : {
7398 : : /* We no longer need the costs, so avoid having two O(N * P) arrays
7399 : : live at the same time. */
7400 : 9615 : m_partition_layout_costs.release ();
7401 : 28845 : m_node_layouts.safe_grow_cleared (m_vertices.length () * m_perms.length ());
7402 : :
7403 : 19230 : auto_sbitmap fully_folded (m_vertices.length ());
7404 : 9615 : bitmap_clear (fully_folded);
7405 : 158035 : for (unsigned int node_i : m_partitioned_nodes)
7406 : : {
7407 : 129190 : auto &vertex = m_vertices[node_i];
7408 : 129190 : slp_tree node = vertex.node;
7409 : 129190 : int layout_i = m_partitions[vertex.partition].layout;
7410 : 129190 : gcc_assert (layout_i >= 0);
7411 : :
7412 : : /* Rearrange the scalar statements to match the chosen layout. */
7413 : 129190 : if (layout_i > 0)
7414 : 13304 : vect_slp_permute (m_perms[layout_i],
7415 : 13304 : SLP_TREE_SCALAR_STMTS (node), true);
7416 : :
7417 : : /* Update load and lane permutations. */
7418 : 129190 : if (SLP_TREE_PERMUTE_P (node))
7419 : : {
7420 : : /* First try to absorb the input vector layouts. If that fails,
7421 : : force the inputs to have layout LAYOUT_I too. We checked that
7422 : : that was possible before deciding to use nonzero output layouts.
7423 : : (Note that at this stage we don't really have any guarantee that
7424 : : the target supports the original VEC_PERM_EXPR.) */
7425 : 3064 : auto &perm = SLP_TREE_LANE_PERMUTATION (node);
7426 : 3064 : auto_lane_permutation_t tmp_perm;
7427 : 3064 : tmp_perm.safe_splice (perm);
7428 : 3064 : change_vec_perm_layout (node, tmp_perm, -1, layout_i);
7429 : 3064 : if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
7430 : : tmp_perm,
7431 : 3064 : SLP_TREE_CHILDREN (node),
7432 : : false) >= 0)
7433 : : {
7434 : 2715 : if (dump_enabled_p ()
7435 : 3525 : && !std::equal (tmp_perm.begin (), tmp_perm.end (),
7436 : : perm.begin ()))
7437 : 62 : dump_printf_loc (MSG_NOTE, vect_location,
7438 : : "absorbing input layouts into %p\n",
7439 : : (void *) node);
7440 : 16404 : std::copy (tmp_perm.begin (), tmp_perm.end (), perm.begin ());
7441 : 2715 : bitmap_set_bit (fully_folded, node_i);
7442 : : }
7443 : : else
7444 : : {
7445 : : /* Not MSG_MISSED because it would make no sense to users. */
7446 : 349 : if (dump_enabled_p ())
7447 : 46 : dump_printf_loc (MSG_NOTE, vect_location,
7448 : : "failed to absorb input layouts into %p\n",
7449 : : (void *) node);
7450 : 349 : change_vec_perm_layout (nullptr, perm, layout_i, layout_i);
7451 : : }
7452 : 3064 : }
7453 : : else
7454 : : {
7455 : 126126 : gcc_assert (!SLP_TREE_LANE_PERMUTATION (node).exists ());
7456 : 126126 : auto &load_perm = SLP_TREE_LOAD_PERMUTATION (node);
7457 : 126126 : if (layout_i > 0)
7458 : : /* ??? When we handle non-bijective permutes the idea
7459 : : is that we can force the load-permutation to be
7460 : : { min, min + 1, min + 2, ... max }. But then the
7461 : : scalar defs might no longer match the lane content
7462 : : which means wrong-code with live lane vectorization.
7463 : : So we possibly have to have NULL entries for those. */
7464 : 13206 : vect_slp_permute (m_perms[layout_i], load_perm, true);
7465 : : }
7466 : : }
7467 : :
7468 : : /* Do this before any nodes disappear, since it involves a walk
7469 : : over the leaves. */
7470 : 9615 : remove_redundant_permutations ();
7471 : :
7472 : : /* Replace each child with a correctly laid-out version. */
7473 : 158035 : for (unsigned int node_i : m_partitioned_nodes)
7474 : : {
7475 : : /* Skip nodes that have already been handled above. */
7476 : 129190 : if (bitmap_bit_p (fully_folded, node_i))
7477 : 2715 : continue;
7478 : :
7479 : 126475 : auto &vertex = m_vertices[node_i];
7480 : 126475 : int in_layout_i = m_partitions[vertex.partition].layout;
7481 : 126475 : gcc_assert (in_layout_i >= 0);
7482 : :
7483 : : unsigned j;
7484 : : slp_tree child;
7485 : 376318 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (vertex.node), j, child)
7486 : : {
7487 : 156869 : if (!child)
7488 : 5079 : continue;
7489 : :
7490 : 151790 : slp_tree new_child = get_result_with_layout (child, in_layout_i);
7491 : 151790 : if (new_child != child)
7492 : : {
7493 : 2667 : vect_free_slp_tree (child);
7494 : 2667 : SLP_TREE_CHILDREN (vertex.node)[j] = new_child;
7495 : 2667 : new_child->refcnt += 1;
7496 : : }
7497 : : }
7498 : : }
7499 : 9615 : }
7500 : :
7501 : : /* Elide load permutations that are not necessary. Such permutations might
7502 : : be pre-existing, rather than created by the layout optimizations. */
7503 : :
7504 : : void
7505 : 626722 : vect_optimize_slp_pass::remove_redundant_permutations ()
7506 : : {
7507 : 4578540 : for (unsigned int node_i : m_leafs)
7508 : : {
7509 : 2698374 : slp_tree node = m_vertices[node_i].node;
7510 : 2698374 : if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
7511 : 2210861 : continue;
7512 : :
7513 : : /* In basic block vectorization we allow any subchain of an interleaving
7514 : : chain.
7515 : : FORNOW: not in loop SLP because of realignment complications. */
7516 : 487513 : if (is_a <bb_vec_info> (m_vinfo))
7517 : : {
7518 : 152773 : bool subchain_p = true;
7519 : : stmt_vec_info next_load_info = NULL;
7520 : : stmt_vec_info load_info;
7521 : : unsigned j;
7522 : 152773 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
7523 : : {
7524 : 123968 : if (j != 0
7525 : 123968 : && (next_load_info != load_info
7526 : 57306 : || ! load_info
7527 : 57306 : || DR_GROUP_GAP (load_info) != 1))
7528 : : {
7529 : : subchain_p = false;
7530 : : break;
7531 : : }
7532 : 101956 : next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
7533 : : }
7534 : 50817 : if (subchain_p)
7535 : : {
7536 : 28805 : SLP_TREE_LOAD_PERMUTATION (node).release ();
7537 : 28805 : continue;
7538 : : }
7539 : : }
7540 : : else
7541 : : {
7542 : 436696 : loop_vec_info loop_vinfo = as_a<loop_vec_info> (m_vinfo);
7543 : 436696 : stmt_vec_info load_info;
7544 : 436696 : bool this_load_permuted = false;
7545 : 436696 : unsigned j;
7546 : 1298988 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
7547 : 440113 : if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
7548 : : {
7549 : : this_load_permuted = true;
7550 : : break;
7551 : : }
7552 : : /* When this isn't a grouped access we know it's single element
7553 : : and contiguous. */
7554 : 436696 : if (!STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (node)[0]))
7555 : : {
7556 : 347523 : if (!this_load_permuted
7557 : 347523 : && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
7558 : 347073 : || SLP_TREE_LANES (node) == 1))
7559 : 347073 : SLP_TREE_LOAD_PERMUTATION (node).release ();
7560 : 347523 : continue;
7561 : : }
7562 : 89173 : stmt_vec_info first_stmt_info
7563 : 89173 : = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
7564 : 89516 : if (!this_load_permuted
7565 : : /* The load requires permutation when unrolling exposes
7566 : : a gap either because the group is larger than the SLP
7567 : : group-size or because there is a gap between the groups. */
7568 : 89173 : && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
7569 : 74860 : || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
7570 : 97 : && DR_GROUP_GAP (first_stmt_info) == 0)))
7571 : : {
7572 : 343 : SLP_TREE_LOAD_PERMUTATION (node).release ();
7573 : 343 : continue;
7574 : : }
7575 : : }
7576 : : }
7577 : 626722 : }
7578 : :
7579 : : /* Print the partition graph and layout information to the dump file. */
7580 : :
7581 : : void
7582 : 617 : vect_optimize_slp_pass::dump ()
7583 : : {
7584 : 617 : dump_printf_loc (MSG_NOTE, vect_location,
7585 : : "SLP optimize permutations:\n");
7586 : 1247 : for (unsigned int layout_i = 1; layout_i < m_perms.length (); ++layout_i)
7587 : : {
7588 : 630 : dump_printf_loc (MSG_NOTE, vect_location, " %d: { ", layout_i);
7589 : 630 : const char *sep = "";
7590 : 5351 : for (unsigned int idx : m_perms[layout_i])
7591 : : {
7592 : 3461 : dump_printf (MSG_NOTE, "%s%d", sep, idx);
7593 : 3461 : sep = ", ";
7594 : : }
7595 : 630 : dump_printf (MSG_NOTE, " }\n");
7596 : : }
7597 : 617 : dump_printf_loc (MSG_NOTE, vect_location,
7598 : : "SLP optimize partitions:\n");
7599 : 4981 : for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
7600 : : ++partition_i)
7601 : : {
7602 : 4364 : auto &partition = m_partitions[partition_i];
7603 : 4364 : dump_printf_loc (MSG_NOTE, vect_location, " -------------\n");
7604 : 4364 : dump_printf_loc (MSG_NOTE, vect_location,
7605 : : " partition %d (layout %d):\n",
7606 : : partition_i, partition.layout);
7607 : 4364 : dump_printf_loc (MSG_NOTE, vect_location, " nodes:\n");
7608 : 8906 : for (unsigned int order_i = partition.node_begin;
7609 : 8906 : order_i < partition.node_end; ++order_i)
7610 : : {
7611 : 4542 : auto &vertex = m_vertices[m_partitioned_nodes[order_i]];
7612 : 9084 : dump_printf_loc (MSG_NOTE, vect_location, " - %p:\n",
7613 : 4542 : (void *) vertex.node);
7614 : 4542 : dump_printf_loc (MSG_NOTE, vect_location,
7615 : : " weight: %f\n",
7616 : : vertex.weight.to_double ());
7617 : 4542 : if (vertex.out_degree)
7618 : 3540 : dump_printf_loc (MSG_NOTE, vect_location,
7619 : : " out weight: %f (degree %d)\n",
7620 : : vertex.out_weight.to_double (),
7621 : : vertex.out_degree);
7622 : 4542 : if (SLP_TREE_PERMUTE_P (vertex.node))
7623 : 451 : dump_printf_loc (MSG_NOTE, vect_location,
7624 : : " op: VEC_PERM_EXPR\n");
7625 : 4091 : else if (auto rep = SLP_TREE_REPRESENTATIVE (vertex.node))
7626 : 4073 : dump_printf_loc (MSG_NOTE, vect_location,
7627 : : " op template: %G", rep->stmt);
7628 : : }
7629 : 4364 : dump_printf_loc (MSG_NOTE, vect_location, " edges:\n");
7630 : 8906 : for (unsigned int order_i = partition.node_begin;
7631 : 8906 : order_i < partition.node_end; ++order_i)
7632 : : {
7633 : 4542 : unsigned int node_i = m_partitioned_nodes[order_i];
7634 : 4542 : auto &vertex = m_vertices[node_i];
7635 : 13720 : auto print_edge = [&](graph_edge *, unsigned int other_node_i)
7636 : : {
7637 : 9178 : auto &other_vertex = m_vertices[other_node_i];
7638 : 9178 : if (other_vertex.partition < vertex.partition)
7639 : 4589 : dump_printf_loc (MSG_NOTE, vect_location,
7640 : : " - %p [%d] --> %p\n",
7641 : 4589 : (void *) other_vertex.node,
7642 : : other_vertex.partition,
7643 : 4589 : (void *) vertex.node);
7644 : : else
7645 : 4589 : dump_printf_loc (MSG_NOTE, vect_location,
7646 : : " - %p --> [%d] %p\n",
7647 : 4589 : (void *) vertex.node,
7648 : : other_vertex.partition,
7649 : 4589 : (void *) other_vertex.node);
7650 : 13720 : };
7651 : 4542 : for_each_partition_edge (node_i, print_edge);
7652 : : }
7653 : :
7654 : 13291 : for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
7655 : : {
7656 : 8927 : auto &layout_costs = partition_layout_costs (partition_i, layout_i);
7657 : 8927 : if (layout_costs.is_possible ())
7658 : : {
7659 : 7249 : dump_printf_loc (MSG_NOTE, vect_location,
7660 : : " layout %d:%s\n", layout_i,
7661 : 7249 : partition.layout == int (layout_i)
7662 : : ? " (*)" : "");
7663 : 7249 : slpg_layout_cost combined_cost = layout_costs.in_cost;
7664 : 7249 : combined_cost.add_serial_cost (layout_costs.internal_cost);
7665 : 7249 : combined_cost.add_serial_cost (layout_costs.out_cost);
7666 : : #define TEMPLATE "{depth: %f, total: %f}"
7667 : 7249 : dump_printf_loc (MSG_NOTE, vect_location,
7668 : : " " TEMPLATE "\n",
7669 : : layout_costs.in_cost.depth.to_double (),
7670 : : layout_costs.in_cost.total.to_double ());
7671 : 7249 : dump_printf_loc (MSG_NOTE, vect_location,
7672 : : " + " TEMPLATE "\n",
7673 : : layout_costs.internal_cost.depth.to_double (),
7674 : : layout_costs.internal_cost.total.to_double ());
7675 : 7249 : dump_printf_loc (MSG_NOTE, vect_location,
7676 : : " + " TEMPLATE "\n",
7677 : : layout_costs.out_cost.depth.to_double (),
7678 : : layout_costs.out_cost.total.to_double ());
7679 : 7249 : dump_printf_loc (MSG_NOTE, vect_location,
7680 : : " = " TEMPLATE "\n",
7681 : : combined_cost.depth.to_double (),
7682 : : combined_cost.total.to_double ());
7683 : : #undef TEMPLATE
7684 : : }
7685 : : else
7686 : 1678 : dump_printf_loc (MSG_NOTE, vect_location,
7687 : : " layout %d: rejected\n", layout_i);
7688 : : }
7689 : : }
7690 : 617 : }
7691 : :
7692 : : /* Masked load lanes discovery. */
7693 : :
7694 : : void
7695 : 626722 : vect_optimize_slp_pass::decide_masked_load_lanes ()
7696 : : {
7697 : 7149646 : for (auto v : m_vertices)
7698 : : {
7699 : 5269480 : slp_tree node = v.node;
7700 : 5269480 : if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
7701 : 3608830 : || SLP_TREE_PERMUTE_P (node))
7702 : 1831454 : continue;
7703 : 3438026 : stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
7704 : 1495715 : if (! STMT_VINFO_GROUPED_ACCESS (stmt_info)
7705 : : /* The mask has to be uniform. */
7706 : 950008 : || STMT_VINFO_SLP_VECT_ONLY (stmt_info)
7707 : 949918 : || ! is_a <gcall *> (STMT_VINFO_STMT (stmt_info))
7708 : 3438081 : || ! gimple_call_internal_p (STMT_VINFO_STMT (stmt_info),
7709 : : IFN_MASK_LOAD))
7710 : 3438023 : continue;
7711 : 3 : stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
7712 : 6 : if (STMT_VINFO_STRIDED_P (stmt_info)
7713 : 3 : || compare_step_with_zero (m_vinfo, stmt_info) <= 0
7714 : 3 : || vect_load_lanes_supported (SLP_TREE_VECTYPE (node),
7715 : 0 : DR_GROUP_SIZE (stmt_info),
7716 : : true) == IFN_LAST)
7717 : 3 : continue;
7718 : :
7719 : : /* Uniform masks need to be suitably represented. */
7720 : 0 : slp_tree mask = SLP_TREE_CHILDREN (node)[0];
7721 : 0 : if (!SLP_TREE_PERMUTE_P (mask)
7722 : 0 : || SLP_TREE_CHILDREN (mask).length () != 1)
7723 : 0 : continue;
7724 : 0 : bool match = true;
7725 : 0 : for (auto perm : SLP_TREE_LANE_PERMUTATION (mask))
7726 : 0 : if (perm.first != 0 || perm.second != 0)
7727 : : {
7728 : : match = false;
7729 : : break;
7730 : : }
7731 : 0 : if (!match)
7732 : 0 : continue;
7733 : :
7734 : : /* Now see if the consumer side matches. */
7735 : 0 : for (graph_edge *pred = m_slpg->vertices[node->vertex].pred;
7736 : 0 : pred; pred = pred->pred_next)
7737 : : {
7738 : 0 : slp_tree pred_node = m_vertices[pred->src].node;
7739 : : /* All consumers should be a permute with a single outgoing lane. */
7740 : 0 : if (!SLP_TREE_PERMUTE_P (pred_node)
7741 : 0 : || SLP_TREE_LANES (pred_node) != 1)
7742 : : {
7743 : : match = false;
7744 : : break;
7745 : : }
7746 : 0 : gcc_assert (SLP_TREE_CHILDREN (pred_node).length () == 1);
7747 : : }
7748 : 0 : if (!match)
7749 : 0 : continue;
7750 : : /* Now we can mark the nodes as to use load lanes. */
7751 : 0 : node->ldst_lanes = true;
7752 : 0 : for (graph_edge *pred = m_slpg->vertices[node->vertex].pred;
7753 : 0 : pred; pred = pred->pred_next)
7754 : 0 : m_vertices[pred->src].node->ldst_lanes = true;
7755 : : /* The catch is we have to massage the mask. We have arranged
7756 : : analyzed uniform masks to be represented by a splat VEC_PERM
7757 : : which we can now simply elide as we cannot easily re-do SLP
7758 : : discovery here. */
7759 : 0 : slp_tree new_mask = SLP_TREE_CHILDREN (mask)[0];
7760 : 0 : SLP_TREE_REF_COUNT (new_mask)++;
7761 : 0 : SLP_TREE_CHILDREN (node)[0] = new_mask;
7762 : 0 : vect_free_slp_tree (mask);
7763 : : }
7764 : 626722 : }
7765 : :
7766 : : /* Main entry point for the SLP graph optimization pass. */
7767 : :
7768 : : void
7769 : 626722 : vect_optimize_slp_pass::run ()
7770 : : {
7771 : 626722 : build_graph ();
7772 : 626722 : create_partitions ();
7773 : 626722 : start_choosing_layouts ();
7774 : 626722 : if (m_perms.length () > 1)
7775 : : {
7776 : 9615 : forward_pass ();
7777 : 9615 : backward_pass ();
7778 : 9615 : if (dump_enabled_p ())
7779 : 617 : dump ();
7780 : 9615 : materialize ();
7781 : 38880 : while (!m_perms.is_empty ())
7782 : 19650 : m_perms.pop ().release ();
7783 : : }
7784 : : else
7785 : 617107 : remove_redundant_permutations ();
7786 : 626722 : free_graph (m_slpg);
7787 : 626722 : build_graph ();
7788 : 626722 : decide_masked_load_lanes ();
7789 : 626722 : free_graph (m_slpg);
7790 : 626722 : }
7791 : :
7792 : : /* Apply CSE to NODE and its children using BST_MAP. */
7793 : :
7794 : : static void
7795 : 5659586 : vect_cse_slp_nodes (scalar_stmts_to_slp_tree_map_t *bst_map, slp_tree& node)
7796 : : {
7797 : 5659586 : bool put_p = false;
7798 : 5659586 : if (SLP_TREE_DEF_TYPE (node) == vect_internal_def
7799 : : /* Besides some VEC_PERM_EXPR, two-operator nodes also
7800 : : lack scalar stmts and thus CSE doesn't work via bst_map. Ideally
7801 : : we'd have sth that works for all internal and external nodes. */
7802 : 5659586 : && !SLP_TREE_SCALAR_STMTS (node).is_empty ())
7803 : : {
7804 : 3979767 : slp_tree *leader = bst_map->get (SLP_TREE_SCALAR_STMTS (node));
7805 : 3979767 : if (leader)
7806 : : {
7807 : : /* We've visited this node already. */
7808 : 392020 : if (!*leader || *leader == node)
7809 : : return;
7810 : :
7811 : 2548 : if (dump_enabled_p ())
7812 : 812 : dump_printf_loc (MSG_NOTE, vect_location,
7813 : : "re-using SLP tree %p for %p\n",
7814 : : (void *)*leader, (void *)node);
7815 : 2548 : vect_free_slp_tree (node);
7816 : 2548 : (*leader)->refcnt += 1;
7817 : 2548 : node = *leader;
7818 : 2548 : return;
7819 : : }
7820 : :
7821 : : /* Avoid creating a cycle by populating the map only after recursion. */
7822 : 3587747 : bst_map->put (SLP_TREE_SCALAR_STMTS (node).copy (), nullptr);
7823 : 3587747 : node->refcnt += 1;
7824 : 3587747 : put_p = true;
7825 : : /* And recurse. */
7826 : : }
7827 : :
7828 : 16026444 : for (slp_tree &child : SLP_TREE_CHILDREN (node))
7829 : 4778750 : if (child)
7830 : 3993809 : vect_cse_slp_nodes (bst_map, child);
7831 : :
7832 : : /* Now record the node for CSE in other siblings. */
7833 : 5267566 : if (put_p)
7834 : 3587747 : *bst_map->get (SLP_TREE_SCALAR_STMTS (node)) = node;
7835 : : }
7836 : :
7837 : : /* Optimize the SLP graph of VINFO. */
7838 : :
7839 : : void
7840 : 978913 : vect_optimize_slp (vec_info *vinfo)
7841 : : {
7842 : 978913 : if (vinfo->slp_instances.is_empty ())
7843 : : return;
7844 : 626722 : vect_optimize_slp_pass (vinfo).run ();
7845 : :
7846 : : /* Apply CSE again to nodes after permute optimization. */
7847 : 626722 : scalar_stmts_to_slp_tree_map_t *bst_map
7848 : 626722 : = new scalar_stmts_to_slp_tree_map_t ();
7849 : :
7850 : 3545943 : for (auto inst : vinfo->slp_instances)
7851 : 1665777 : vect_cse_slp_nodes (bst_map, SLP_INSTANCE_TREE (inst));
7852 : :
7853 : 626722 : release_scalar_stmts_to_slp_tree_map (bst_map);
7854 : : }
7855 : :
7856 : : /* Gather loads reachable from the individual SLP graph entries. */
7857 : :
7858 : : void
7859 : 978913 : vect_gather_slp_loads (vec_info *vinfo)
7860 : : {
7861 : 978913 : unsigned i;
7862 : 978913 : slp_instance instance;
7863 : 2644690 : FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
7864 : : {
7865 : 1665777 : hash_set<slp_tree> visited;
7866 : 1665777 : vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
7867 : : SLP_INSTANCE_TREE (instance), visited);
7868 : 1665777 : }
7869 : 978913 : }
7870 : :
/* For NODE update VF based on the number of lanes and the vector types
   used.  */

static void
vect_update_slp_vf_for_node (slp_tree node, poly_uint64 &vf,
			     hash_set<slp_tree> &visited)
{
  /* Only internal nodes carry a meaningful max_nunits; skip constants,
     externals and NULL children, and each node only once.  */
  if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;
  if (visited.add (node))
    return;

  /* Recurse into children first; VF is accumulated in post-order.  */
  for (slp_tree child : SLP_TREE_CHILDREN (node))
    vect_update_slp_vf_for_node (child, vf, visited);

  /* We do not visit SLP nodes for constants or externals - those neither
     have a vector type set yet (vectorizable_* does this) nor do they
     have max_nunits set.  Instead we rely on internal nodes max_nunit
     to cover constant/external operands.
     Note that when we stop using fixed size vectors externs and constants
     shouldn't influence the (minimum) vectorization factor, instead
     vectorizable_* should honor the vectorization factor when trying to
     assign vector types to constants and externals and cause iteration
     to a higher vectorization factor when required.  */
  poly_uint64 node_vf
    = calculate_unrolling_factor (node->max_nunits, SLP_TREE_LANES (node));
  vf = force_common_multiple (vf, node_vf);

  /* For permute nodes that are fed from externs or constants we have to
     consider their number of lanes as well.  Likewise for store-lanes.  */
  if (SLP_TREE_PERMUTE_P (node) || node->ldst_lanes)
    for (slp_tree child : SLP_TREE_CHILDREN (node))
      if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
	{
	  /* Note the child's lane count is combined with the parent's
	     max_nunits here since the child has none of its own.  */
	  poly_uint64 child_vf
	    = calculate_unrolling_factor (node->max_nunits,
					  SLP_TREE_LANES (child));
	  vf = force_common_multiple (vf, child_vf);
	}
}
7911 : :
/* For each possible SLP instance decide whether to SLP it and calculate overall
   unrolling factor needed to SLP the loop.  Return TRUE if decided to SLP at
   least one instance.  */

bool
vect_make_slp_decision (loop_vec_info loop_vinfo)
{
  unsigned int i;
  poly_uint64 unrolling_factor = 1;
  const vec<slp_instance> &slp_instances
    = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
  slp_instance instance;
  int decided_to_slp = 0;

  DUMP_VECT_SCOPE ("vect_make_slp_decision");

  /* A single visited set shared across all instances so shared subtrees
     contribute to the unrolling factor only once.  */
  hash_set<slp_tree> visited;
  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      slp_tree root = SLP_INSTANCE_TREE (instance);

      /* All unroll factors have the form:

	   GET_MODE_SIZE (vinfo->vector_mode) * X

	 for some rational X, so they must have a common multiple.  */
      vect_update_slp_vf_for_node (root, unrolling_factor, visited);

      /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts.  Later we
	 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
	 loop-based vectorization.  Such stmts will be marked as HYBRID.  */
      vect_mark_slp_stmts (loop_vinfo, root);

      /* If all instances ended up with vector(1) T roots make sure to
	 not vectorize.  RVV for example relies on loop vectorization
	 when some instances are essentially kept scalar.  See PR121048.  */
      if (SLP_TREE_VECTYPE (root)
	  && known_gt (TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (root)), 1U))
	decided_to_slp++;
    }

  LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;

  if (decided_to_slp && dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
		       "Decided to SLP %d instances. Unrolling factor ",
		       decided_to_slp);
      dump_dec (MSG_NOTE, unrolling_factor);
      dump_printf (MSG_NOTE, "\n");
    }

  return (decided_to_slp > 0);
}
7966 : :
/* Initialize a bb_vec_info struct for the statements in BBS basic blocks.  */

_bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
  : vec_info (vec_info::bb, shared),
    roots (vNULL)
{
  /* The region we are operating on.  bbs[0] is the entry, excluding
     its PHI nodes.  In the future we might want to track an explicit
     entry edge to cover bbs[0] PHI nodes and have a region entry
     insert location.  */
  bbs = _bbs.address ();
  nbbs = _bbs.length ();

  /* Register every stmt of the region with a stmt_vec_info; uid 0
     marks a stmt as belonging to this region (the destructor resets
     the uid to -1).  */
  for (unsigned i = 0; i < nbbs; ++i)
    {
      /* PHIs of the entry block are excluded from the region,
	 see the comment above.  */
      if (i != 0)
	for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
	     gsi_next (&si))
	  {
	    gphi *phi = si.phi ();
	    gimple_set_uid (phi, 0);
	    add_stmt (phi);
	  }
      for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
	   !gsi_end_p (gsi); gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);
	  gimple_set_uid (stmt, 0);
	  /* Debug stmts get a uid but no stmt_vec_info.  */
	  if (is_gimple_debug (stmt))
	    continue;
	  add_stmt (stmt);
	}
    }
}
8001 : :
8002 : :
/* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
   stmts in the basic block.  */

_bb_vec_info::~_bb_vec_info ()
{
  /* Reset region marker.  */
  for (unsigned i = 0; i < nbbs; ++i)
    {
      /* Mirror the constructor: entry block PHIs were never marked.  */
      if (i != 0)
	for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
	     gsi_next (&si))
	  {
	    gphi *phi = si.phi ();
	    gimple_set_uid (phi, -1);
	  }
      for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
	   !gsi_end_p (gsi); gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);
	  gimple_set_uid (stmt, -1);
	}
    }

  /* Release the per-root scalar stmt/def vectors before the roots
     vector itself.  */
  for (unsigned i = 0; i < roots.length (); ++i)
    {
      roots[i].stmts.release ();
      roots[i].roots.release ();
      roots[i].remain.release ();
    }
  roots.release ();
}
8034 : :
/* Subroutine of vect_slp_analyze_node_operations.  Handle the root of NODE,
   given then that child nodes have already been processed, and that
   their def types currently match their SLP node's def type.  */

static bool
vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
				    slp_instance node_instance,
				    stmt_vector_for_cost *cost_vec)
{
  /* Handle purely internal nodes.  */
  if (SLP_TREE_PERMUTE_P (node))
    {
      /* A permute node has no scalar operation of its own; verify the
	 permutation itself is supported and cost it.  */
      if (!vectorizable_slp_permutation (vinfo, NULL, node, cost_vec))
	return false;

      /* Lanes of the permute may still be live outside the vectorized
	 region; check each such lane can be extracted.  */
      stmt_vec_info slp_stmt_info;
      unsigned int i;
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
	{
	  if (slp_stmt_info
	      && STMT_VINFO_LIVE_P (slp_stmt_info)
	      && !vectorizable_live_operation (vinfo, slp_stmt_info, node,
					       node_instance, i,
					       false, cost_vec))
	    return false;
	}
      SLP_TREE_TYPE (node) = permute_info_type;
      return true;
    }

  /* Everything else is dispatched through the generic stmt analysis.  */
  return vect_analyze_stmt (vinfo, node, node_instance, cost_vec);
}
8067 : :
/* qsort comparator sorting ints in ascending order.

   Uses an overflow-safe three-way comparison instead of the classic
   "a - b" idiom: subtracting two ints whose difference does not fit in
   int (e.g. INT_MIN and a positive value) is signed-overflow undefined
   behavior and in practice yields a wrong sign, producing an invalid
   ordering for qsort.  */

static int
sort_ints (const void *a_, const void *b_)
{
  int a = *(const int *)a_;
  int b = *(const int *)b_;
  /* Returns -1, 0 or 1 without any intermediate arithmetic that
     could overflow.  */
  return (a > b) - (a < b);
}
8075 : :
/* Verify if we can externalize a set of internal defs.  */

static bool
vect_slp_can_convert_to_external (const vec<stmt_vec_info> &stmts)
{
  /* Constant generation uses get_later_stmt which can only handle
     defs from the same BB or a set of defs that can be ordered
     with a dominance query.  */
  basic_block bb = NULL;
  bool all_same = true;
  auto_vec<int> bbs;
  bbs.reserve_exact (stmts.length ());
  for (stmt_vec_info stmt : stmts)
    {
      /* A gap (NULL stmt) has no def to externalize.  */
      if (!stmt)
	return false;
      else if (!bb)
	bb = gimple_bb (stmt->stmt);
      else if (gimple_bb (stmt->stmt) != bb)
	all_same = false;
      bbs.quick_push (gimple_bb (stmt->stmt)->index);
    }
  /* All defs in one BB are trivially orderable.  */
  if (all_same)
    return true;

  /* Produce a vector of unique BB indexes for the defs.  */
  bbs.qsort (sort_ints);
  unsigned i, j;
  for (i = 1, j = 1; i < bbs.length (); ++i)
    if (bbs[i] != bbs[j-1])
      bbs[j++] = bbs[i];
  /* all_same was false, so at least two distinct BBs must remain.  */
  gcc_assert (j >= 2);
  bbs.truncate (j);

  /* For exactly two BBs a single dominance query in either direction
     establishes a total order.  */
  if (bbs.length () == 2)
    return (dominated_by_p (CDI_DOMINATORS,
			    BASIC_BLOCK_FOR_FN (cfun, bbs[0]),
			    BASIC_BLOCK_FOR_FN (cfun, bbs[1]))
	    || dominated_by_p (CDI_DOMINATORS,
			       BASIC_BLOCK_FOR_FN (cfun, bbs[1]),
			       BASIC_BLOCK_FOR_FN (cfun, bbs[0])));

  /* ??? For more than two BBs we can sort the vector and verify the
     result is a total order.  But we can't use vec::qsort with a
     compare function using a dominance query since there's no way to
     signal failure and any fallback for an unordered pair would
     fail qsort_chk later.
     For now simply hope that ordering after BB index provides the
     best candidate total order.  If required we can implement our
     own mergesort or export an entry without checking.  */
  for (unsigned i = 1; i < bbs.length (); ++i)
    if (!dominated_by_p (CDI_DOMINATORS,
			 BASIC_BLOCK_FOR_FN (cfun, bbs[i]),
			 BASIC_BLOCK_FOR_FN (cfun, bbs[i-1])))
      return false;

  return true;
}
8134 : :
/* Try to build NODE from scalars, returning true on success.
   NODE_INSTANCE is the SLP instance that contains NODE.  */

static bool
vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
			      slp_instance node_instance)
{
  stmt_vec_info stmt_info;
  unsigned int i;

  /* Externalizing is only done for BB vectorization, never for the
     instance root, and requires actual scalar stmts whose defs can
     be ordered (see vect_slp_can_convert_to_external).  */
  if (!is_a <bb_vec_info> (vinfo)
      || node == SLP_INSTANCE_TREE (node_instance)
      || !SLP_TREE_SCALAR_STMTS (node).exists ()
      || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node))
      /* Force the mask use to be built from scalars instead.  */
      || VECTOR_BOOLEAN_TYPE_P (SLP_TREE_VECTYPE (node))
      || !vect_slp_can_convert_to_external (SLP_TREE_SCALAR_STMTS (node)))
    return false;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "Building vector operands of %p from scalars instead\n",
		     (void *) node);

  /* Don't remove and free the child nodes here, since they could be
     referenced by other structures.  The analysis and scheduling phases
     (need to) ignore child nodes of anything that isn't vect_internal_def.  */
  unsigned int group_size = SLP_TREE_LANES (node);
  SLP_TREE_DEF_TYPE (node) = vect_external_def;
  /* Invariants get their vector type from the uses.  */
  SLP_TREE_VECTYPE (node) = NULL_TREE;
  SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
  SLP_TREE_LOAD_PERMUTATION (node).release ();
  /* The external defs are the LHSs of the original scalar stmts.  */
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    {
      tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
      SLP_TREE_SCALAR_OPS (node)[i] = lhs;
    }
  return true;
}
8175 : :
8176 : : /* Return true if all elements of the slice are the same. */
8177 : : bool
8178 : 460983 : vect_scalar_ops_slice::all_same_p () const
8179 : : {
8180 : 509837 : for (unsigned int i = 1; i < length; ++i)
8181 : 432917 : if (!operand_equal_p (op (0), op (i)))
8182 : : return false;
8183 : : return true;
8184 : : }
8185 : :
8186 : : hashval_t
8187 : 419274 : vect_scalar_ops_slice_hash::hash (const value_type &s)
8188 : : {
8189 : 419274 : hashval_t hash = 0;
8190 : 1576012 : for (unsigned i = 0; i < s.length; ++i)
8191 : 1156738 : hash = iterative_hash_expr (s.op (i), hash);
8192 : 419274 : return hash;
8193 : : }
8194 : :
8195 : : bool
8196 : 227453 : vect_scalar_ops_slice_hash::equal (const value_type &s1,
8197 : : const compare_type &s2)
8198 : : {
8199 : 227453 : if (s1.length != s2.length)
8200 : : return false;
8201 : 395614 : for (unsigned i = 0; i < s1.length; ++i)
8202 : 338988 : if (!operand_equal_p (s1.op (i), s2.op (i)))
8203 : : return false;
8204 : : return true;
8205 : : }
8206 : :
/* Compute the prologue cost for invariant or constant operands represented
   by NODE.  */

static void
vect_prologue_cost_for_slp (vec_info *vinfo, slp_tree node,
			    stmt_vector_for_cost *cost_vec)
{
  /* There's a special case of an existing vector, that costs nothing.  */
  if (SLP_TREE_SCALAR_OPS (node).length () == 0
      && !SLP_TREE_VEC_DEFS (node).is_empty ())
    return;
  /* Without looking at the actual initializer a vector of
     constants can be implemented as load from the constant pool.
     When all elements are the same we can use a splat.  */
  tree vectype = SLP_TREE_VECTYPE (node);
  unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
  unsigned HOST_WIDE_INT const_nunits;
  unsigned nelt_limit;
  unsigned nvectors = vect_get_num_copies (vinfo, node);
  auto ops = &SLP_TREE_SCALAR_OPS (node);
  /* STARTS records the offset into OPS at which each vector that
     needs a distinct construction begins.  */
  auto_vec<unsigned int> starts (nvectors);
  if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
      && ! multiple_p (const_nunits, group_size))
    {
      /* Distinct slices of the ops may differ; deduplicate identical
	 slices via a hash set so equal vectors are costed once.  */
      nelt_limit = const_nunits;
      hash_set<vect_scalar_ops_slice_hash> vector_ops;
      for (unsigned int i = 0; i < nvectors; ++i)
	if (!vector_ops.add ({ ops, i * nelt_limit, nelt_limit }))
	  starts.quick_push (i * nelt_limit);
    }
  else
    {
      /* If either the vector has variable length or the vectors
	 are composed of repeated whole groups we only need to
	 cost construction once.  All vectors will be the same.  */
      nelt_limit = group_size;
      starts.quick_push (0);
    }
  /* ??? We're just tracking whether vectors in a single node are the same.
     Ideally we'd do something more global.  */
  bool passed = false;
  for (unsigned int start : starts)
    {
      vect_cost_for_stmt kind;
      if (SLP_TREE_DEF_TYPE (node) == vect_constant_def)
	kind = vector_load;
      else if (vect_scalar_ops_slice { ops, start, nelt_limit }.all_same_p ())
	kind = scalar_to_vec;
      else
	kind = vec_construct;
      /* The target cost hook has no idea which part of the SLP node
	 we are costing so avoid passing it down more than once.  Pass
	 it to the first vec_construct or scalar_to_vec part since for those
	 the x86 backend tries to account for GPR to XMM register moves.  */
      record_stmt_cost (cost_vec, 1, kind, nullptr,
			(kind != vector_load && !passed) ? node : nullptr,
			vectype, 0, vect_prologue);
      if (kind != vector_load)
	passed = true;
    }
}
8268 : :
/* Analyze statements contained in SLP tree NODE after recursively analyzing
   the subtree.  NODE_INSTANCE contains NODE and VINFO contains INSTANCE.

   Return true if the operations are supported.  */

static bool
vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
				  slp_instance node_instance,
				  hash_set<slp_tree> &visited_set,
				  vec<slp_tree> &visited_vec,
				  stmt_vector_for_cost *cost_vec)
{
  int i, j;
  slp_tree child;

  /* Assume we can code-generate all invariants.  */
  if (!node
      || SLP_TREE_DEF_TYPE (node) == vect_constant_def
      || SLP_TREE_DEF_TYPE (node) == vect_external_def)
    return true;

  if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Failed cyclic SLP reference in %p\n", (void *) node);
      return false;
    }
  gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);

  /* If we already analyzed the exact same set of scalar stmts we're done.
     We share the generated vector stmts for those.  */
  if (visited_set.add (node))
    return true;
  /* VISITED_VEC mirrors VISITED_SET insertion order so failed subtrees
     can be rolled back below.  */
  visited_vec.safe_push (node);

  bool res = true;
  /* Remember the rollback points for the visited and cost vectors.  */
  unsigned visited_rec_start = visited_vec.length ();
  unsigned cost_vec_rec_start = cost_vec->length ();
  bool seen_non_constant_child = false;
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    {
      res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
					      visited_set, visited_vec,
					      cost_vec);
      if (!res)
	break;
      if (child && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
	seen_non_constant_child = true;
    }
  /* We're having difficulties scheduling nodes with just constant
     operands and no scalar stmts since we then cannot compute a stmt
     insertion place.  */
  if (res
      && !seen_non_constant_child
      && SLP_TREE_SCALAR_STMTS (node).is_empty ())
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Cannot vectorize all-constant op node %p\n",
			 (void *) node);
      res = false;
    }

  if (res)
    res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
					      cost_vec);
  /* If analysis failed we have to pop all recursive visited nodes
     plus ourselves.  */
  if (!res)
    {
      while (visited_vec.length () >= visited_rec_start)
	visited_set.remove (visited_vec.pop ());
      cost_vec->truncate (cost_vec_rec_start);
    }

  /* When the node can be vectorized cost invariant nodes it references.
     This is not done in DFS order to allow the refering node
     vectorizable_* calls to nail down the invariant nodes vector type
     and possibly unshare it if it needs a different vector type than
     other referrers.  */
  if (res)
    FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
      if (child
	  && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
	      || SLP_TREE_DEF_TYPE (child) == vect_external_def)
	  /* Perform usual caching, note code-generation still
	     code-gens these nodes multiple times but we expect
	     to CSE them later.  */
	  && !visited_set.add (child))
	{
	  visited_vec.safe_push (child);
	  /* ??? After auditing more code paths make a "default"
	     and push the vector type from NODE to all children
	     if it is not already set.  */
	  /* Compute the number of vectors to be generated.  */
	  tree vector_type = SLP_TREE_VECTYPE (child);
	  if (!vector_type)
	    {
	      /* Masked loads can have an undefined (default SSA definition)
		 else operand.  We do not need to cost it.  */
	      vec<tree> ops = SLP_TREE_SCALAR_OPS (child);
	      if (SLP_TREE_TYPE (node) == load_vec_info_type
		  && ((ops.length ()
		       && TREE_CODE (ops[0]) == SSA_NAME
		       && SSA_NAME_IS_DEFAULT_DEF (ops[0])
		       && VAR_P (SSA_NAME_VAR (ops[0])))
		      || SLP_TREE_DEF_TYPE (child) == vect_constant_def))
		continue;

	      /* For shifts with a scalar argument we don't need
		 to cost or code-generate anything.
		 ??? Represent this more explicitely.  */
	      gcc_assert (SLP_TREE_TYPE (node) == shift_vec_info_type
			  && j == 1);
	      continue;
	    }

	  /* And cost them.  */
	  vect_prologue_cost_for_slp (vinfo, child, cost_vec);
	}

  /* If this node or any of its children can't be vectorized, try pruning
     the tree here rather than felling the whole thing.  */
  if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
    {
      /* We'll need to revisit this for invariant costing and number
	 of vectorized stmt setting.  */
      res = true;
    }

  return res;
}
8402 : :
/* Given a definition DEF, analyze if it will have any live scalar use after
   performing SLP vectorization whose information is represented by BB_VINFO,
   and record result into hash map SCALAR_USE_MAP as cache for later fast
   check.  If recursion DEPTH exceeds a limit, stop analysis and make a
   conservative assumption.  Return 0 if no scalar use, 1 if there is, -1
   means recursion is limited.  */

static int
vec_slp_has_scalar_use (bb_vec_info bb_vinfo, tree def,
			hash_map<tree, int> &scalar_use_map,
			int depth = 0)
{
  const int depth_limit = 2;
  imm_use_iterator use_iter;
  gimple *use_stmt;

  /* Fast path: result already cached (including pre-seeded entries
     from external SLP nodes, see the driver).  */
  if (int *res = scalar_use_map.get (def))
    return *res;

  /* Conservatively assume a scalar use exists; only a completed walk
     over all immediate uses (no early break) clears this below.  */
  int scalar_use = 1;

  FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
    {
      if (is_gimple_debug (use_stmt))
	continue;

      stmt_vec_info use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);

      /* A use outside the vectorized region is a scalar use.  */
      if (!use_stmt_info)
	break;

      if (PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
	continue;

      /* Do not step forward when encounter PHI statement, since it may
	 involve cyclic reference and cause infinite recursive invocation.  */
      if (gimple_code (use_stmt) == GIMPLE_PHI)
	break;

      /* When pattern recognition is involved, a statement whose definition is
	 consumed in some pattern, may not be included in the final replacement
	 pattern statements, so would be skipped when building SLP graph.

	 * Original
	  char a_c = *(char *) a;
	  char b_c = *(char *) b;
	  unsigned short a_s = (unsigned short) a_c;
	  int a_i = (int) a_s;
	  int b_i = (int) b_c;
	  int r_i = a_i - b_i;

	 * After pattern replacement
	  a_s = (unsigned short) a_c;
	  a_i = (int) a_s;

	  patt_b_s = (unsigned short) b_c;    // b_i = (int) b_c
	  patt_b_i = (int) patt_b_s;          // b_i = (int) b_c

	  patt_r_s = widen_minus(a_c, b_c);   // r_i = a_i - b_i
	  patt_r_i = (int) patt_r_s;          // r_i = a_i - b_i

	 The definitions of a_i(original statement) and b_i(pattern statement)
	 are related to, but actually not part of widen_minus pattern.
	 Vectorizing the pattern does not cause these definition statements to
	 be marked as PURE_SLP.  For this case, we need to recursively check
	 whether their uses are all absorbed into vectorized code.  But there
	 is an exception that some use may participate in an vectorized
	 operation via an external SLP node containing that use as an element.
	 The parameter "scalar_use_map" tags such kind of SSA as having scalar
	 use in advance.  */
      tree lhs = gimple_get_lhs (use_stmt);

      if (!lhs || TREE_CODE (lhs) != SSA_NAME)
	break;

      /* Give up (conservatively) once the recursion budget is spent;
	 note -1 results are deliberately not cached below.  */
      if (depth_limit && depth >= depth_limit)
	return -1;

      if ((scalar_use = vec_slp_has_scalar_use (bb_vinfo, lhs, scalar_use_map,
						depth + 1)))
	break;
    }

  /* Only if the iteration ran to completion did every use prove to be
     absorbed by vectorized code.  */
  if (end_imm_use_stmt_p (&use_iter))
    scalar_use = 0;

  /* If recursion is limited, do not cache result for non-root defs.  */
  if (!depth || scalar_use >= 0)
    {
      bool added = scalar_use_map.put (def, scalar_use);
      gcc_assert (!added);
    }

  return scalar_use;
}
8498 : :
/* Mark lanes of NODE that are live outside of the basic-block vectorized
   region and that can be vectorized using vectorizable_live_operation
   with STMT_VINFO_LIVE_P.  Not handled live operations will cause the
   scalar code computing it to be retained.  */

static void
vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
			     slp_instance instance,
			     stmt_vector_for_cost *cost_vec,
			     hash_map<tree, int> &scalar_use_map,
			     hash_set<stmt_vec_info> &svisited,
			     hash_set<slp_tree> &visited)
{
  if (visited.add (node))
    return;

  unsigned i;
  stmt_vec_info stmt_info;
  /* Conservative insertion point for lane extracts, see below.  */
  stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    {
      if (!stmt_info || svisited.contains (stmt_info))
	continue;
      stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
      if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
	  && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
	/* Only the pattern root stmt computes the original scalar value.  */
	continue;
      bool mark_visited = true;
      gimple *orig_stmt = orig_stmt_info->stmt;
      ssa_op_iter op_iter;
      def_operand_p def_p;
      FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
	{
	  if (vec_slp_has_scalar_use (bb_vinfo, DEF_FROM_PTR (def_p),
				      scalar_use_map))
	    {
	      STMT_VINFO_LIVE_P (stmt_info) = true;
	      if (vectorizable_live_operation (bb_vinfo, stmt_info, node,
					       instance, i, false, cost_vec))
		/* ??? So we know we can vectorize the live stmt from one SLP
		   node.  If we cannot do so from all or none consistently
		   we'd have to record which SLP node (and lane) we want to
		   use for the live operation.  So make sure we can
		   code-generate from all nodes.  */
		mark_visited = false;
	      else
		STMT_VINFO_LIVE_P (stmt_info) = false;
	    }

	  /* We have to verify whether we can insert the lane extract
	     before all uses.  The following is a conservative approximation.
	     We cannot put this into vectorizable_live_operation because
	     iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
	     doesn't work.
	     Note that while the fact that we emit code for loads at the
	     first load should make this a non-problem leafs we construct
	     from scalars are vectorized after the last scalar def.
	     ??? If we'd actually compute the insert location during
	     analysis we could use sth less conservative than the last
	     scalar stmt in the node for the dominance check.  */
	  /* ??? What remains is "live" uses in vector CTORs in the same
	     SLP graph which is where those uses can end up code-generated
	     right after their definition instead of close to their original
	     use.  But that would restrict us to code-generate lane-extracts
	     from the latest stmt in a node.  So we compensate for this
	     during code-generation, simply not replacing uses for those
	     hopefully rare cases.  */
	  imm_use_iterator use_iter;
	  gimple *use_stmt;
	  stmt_vec_info use_stmt_info;

	  if (STMT_VINFO_LIVE_P (stmt_info))
	    FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
	      if (!is_gimple_debug (use_stmt)
		  && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
		      || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
		  && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Cannot determine insertion place for "
				     "lane extract\n");
		  STMT_VINFO_LIVE_P (stmt_info) = false;
		  mark_visited = true;
		}
	}
      if (mark_visited)
	svisited.add (stmt_info);
    }

  /* Recurse into internal children only; externals/constants have no
     scalar stmts to mark.  */
  slp_tree child;
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
      vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance, cost_vec,
				   scalar_use_map, svisited, visited);
}
8596 : :
/* Traverse all slp instances of BB_VINFO, and mark lanes of every node that
   are live outside of the basic-block vectorized region and that can be
   vectorized using vectorizable_live_operation with STMT_VINFO_LIVE_P.  */

static void
vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo)
{
  if (bb_vinfo->slp_instances.is_empty ())
    return;

  hash_set<stmt_vec_info> svisited;
  hash_set<slp_tree> visited;
  hash_map<tree, int> scalar_use_map;
  auto_vec<slp_tree> worklist;

  /* Pre-seed SCALAR_USE_MAP: SSA names consumed by BB-reduction
     remains count as scalar uses.  */
  for (slp_instance instance : bb_vinfo->slp_instances)
    {
      if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc)
	for (tree op : SLP_INSTANCE_REMAIN_DEFS (instance))
	  if (TREE_CODE (op) == SSA_NAME)
	    scalar_use_map.put (op, 1);
      if (!visited.add (SLP_INSTANCE_TREE (instance)))
	worklist.safe_push (SLP_INSTANCE_TREE (instance));
    }

  /* Walk the whole graph to also pre-seed SSA names that feed external
     SLP nodes - such uses are absorbed into a vector CTOR but must be
     treated as scalar uses by vec_slp_has_scalar_use.  */
  do
    {
      slp_tree node = worklist.pop ();

      if (SLP_TREE_DEF_TYPE (node) == vect_external_def)
	{
	  for (tree op : SLP_TREE_SCALAR_OPS (node))
	    if (TREE_CODE (op) == SSA_NAME)
	      scalar_use_map.put (op, 1);
	}
      else
	{
	  for (slp_tree child : SLP_TREE_CHILDREN (node))
	    if (child && !visited.add (child))
	      worklist.safe_push (child);
	}
    }
  while (!worklist.is_empty ());

  /* Reuse the visited set for the marking walk below.  */
  visited.empty ();

  for (slp_instance instance : bb_vinfo->slp_instances)
    {
      vect_location = instance->location ();
      vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
				   instance, &instance->cost_vec,
				   scalar_use_map, svisited, visited);
    }
}
8651 : :
/* Determine whether we can vectorize the reduction epilogue for INSTANCE,
   recording the cost of the epilogue in COST_VEC when we can.  Returns
   true on success, false (with a missed-optimization dump note) when the
   target cannot perform the reduction on the chosen vector type.  */

static bool
vectorizable_bb_reduc_epilogue (slp_instance instance,
				stmt_vector_for_cost *cost_vec)
{
  gassign *stmt = as_a <gassign *> (instance->root_stmts[0]->stmt);
  enum tree_code reduc_code = gimple_assign_rhs_code (stmt);
  /* A MINUS_EXPR reduction is handled via a PLUS_EXPR reduction
     internal function; query support for the latter.  */
  if (reduc_code == MINUS_EXPR)
    reduc_code = PLUS_EXPR;
  internal_fn reduc_fn;
  tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
  /* Require a vector type, a matching reduction IFN supported by the
     target, and that the scalar result type agrees with the vector
     element type.  */
  if (!vectype
      || !reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
      || reduc_fn == IFN_LAST
      || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH)
      || !useless_type_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
				     TREE_TYPE (vectype)))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: basic block reduction epilogue "
			 "operation unsupported.\n");
      return false;
    }

  /* There's no way to cost a horizontal vector reduction via REDUC_FN so
     cost log2 vector operations plus shuffles and one extraction.  */
  unsigned steps = floor_log2 (vect_nunits_for_cost (vectype));
  record_stmt_cost (cost_vec, steps, vector_stmt, instance->root_stmts[0],
		    vectype, 0, vect_body);
  record_stmt_cost (cost_vec, steps, vec_perm, instance->root_stmts[0],
		    vectype, 0, vect_body);
  record_stmt_cost (cost_vec, 1, vec_to_scalar, instance->root_stmts[0],
		    vectype, 0, vect_body);

  /* Since we replace all stmts of a possibly longer scalar reduction
     chain account for the extra scalar stmts for that.  */
  if (!instance->remain_defs.is_empty ())
    record_stmt_cost (cost_vec, instance->remain_defs.length (), scalar_stmt,
		      instance->root_stmts[0], 0, vect_body);
  return true;
}
8695 : :
8696 : : /* Prune from ROOTS all stmts that are computed as part of lanes of NODE
8697 : : and recurse to children. */
8698 : :
8699 : : static void
8700 : 186641 : vect_slp_prune_covered_roots (slp_tree node, hash_set<stmt_vec_info> &roots,
8701 : : hash_set<slp_tree> &visited)
8702 : : {
8703 : 186641 : if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
8704 : 186641 : || visited.add (node))
8705 : 81776 : return;
8706 : :
8707 : : stmt_vec_info stmt;
8708 : : unsigned i;
8709 : 347952 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
8710 : 243087 : if (stmt)
8711 : 246685 : roots.remove (vect_orig_stmt (stmt));
8712 : :
8713 : : slp_tree child;
8714 : 238383 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
8715 : 133518 : if (child)
8716 : 126794 : vect_slp_prune_covered_roots (child, roots, visited);
8717 : : }
8718 : :
/* Analyze statements in SLP instances of VINFO.  Return true if the
   operations are supported.  For loop vectorization any unsupported
   instance fails the whole analysis; for BB vectorization unsupported
   instances are removed individually and analysis succeeds as long as
   at least one instance remains.  */

bool
vect_slp_analyze_operations (vec_info *vinfo)
{
  slp_instance instance;
  int i;

  DUMP_VECT_SCOPE ("vect_slp_analyze_operations");

  hash_set<slp_tree> visited;
  /* Note I is only incremented when the instance is kept; on removal
     ordered_remove shifts the next instance into slot I.  */
  for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
    {
      auto_vec<slp_tree> visited_vec;
      stmt_vector_for_cost cost_vec;
      cost_vec.create (2);
      if (is_a <bb_vec_info> (vinfo))
	vect_location = instance->location ();
      if (!vect_slp_analyze_node_operations (vinfo,
					     SLP_INSTANCE_TREE (instance),
					     instance, visited, visited_vec,
					     &cost_vec)
	  /* CTOR instances require vectorized defs for the SLP tree root.  */
	  || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor
	      && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
		  != vect_internal_def
		  /* Make sure we vectorized with the expected type.  */
		  || !useless_type_conversion_p
			(TREE_TYPE (TREE_TYPE (gimple_assign_rhs1
					       (instance->root_stmts[0]->stmt))),
			 TREE_TYPE (SLP_TREE_VECTYPE
				    (SLP_INSTANCE_TREE (instance))))))
	  /* Check we can vectorize the reduction.  */
	  || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc
	      && !vectorizable_bb_reduc_epilogue (instance, &cost_vec))
	  /* Check we can vectorize the gcond.  */
	  || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_gcond
	      && !vectorizable_early_exit (as_a <loop_vec_info> (vinfo),
					   SLP_INSTANCE_ROOT_STMTS (instance)[0],
					   NULL,
					   SLP_INSTANCE_TREE (instance),
					   &cost_vec)))
	{
	  /* Analysis failed.  Pick a representative stmt for dumping.  */
	  cost_vec.release ();
	  slp_tree node = SLP_INSTANCE_TREE (instance);
	  stmt_vec_info stmt_info;
	  if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
	    stmt_info = SLP_INSTANCE_ROOT_STMTS (instance)[0];
	  else
	    stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
	  /* For loop vectorization one unsupported instance fails all.  */
	  if (is_a <loop_vec_info> (vinfo))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "unsupported SLP instance starting from: %G",
				 stmt_info->stmt);
	      return false;
	    }
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "removing SLP instance operations starting from: %G",
			     stmt_info->stmt);
	  vect_free_slp_instance (instance);
	  vinfo->slp_instances.ordered_remove (i);
	  /* Un-cache nodes first visited for this instance so sharing
	     with later instances is re-analyzed from scratch.  */
	  while (!visited_vec.is_empty ())
	    visited.remove (visited_vec.pop ());
	}
      else
	{
	  i++;
	  if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
	    {
	      add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
	      cost_vec.release ();
	    }
	  else
	    /* For BB vectorization remember the SLP graph entry
	       cost for later.  */
	    instance->cost_vec = cost_vec;
	}
    }

  /* Now look for SLP instances with a root that are covered by other
     instances and remove them.  */
  hash_set<stmt_vec_info> roots;
  for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
    if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
      roots.add (SLP_INSTANCE_ROOT_STMTS (instance)[0]);
  if (!roots.is_empty ())
    {
      visited.empty ();
      for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
	vect_slp_prune_covered_roots (SLP_INSTANCE_TREE (instance), roots,
				      visited);
      /* Instances whose root was pruned (covered elsewhere) are freed.  */
      for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
	if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()
	    && !roots.contains (SLP_INSTANCE_ROOT_STMTS (instance)[0]))
	  {
	    stmt_vec_info root = SLP_INSTANCE_ROOT_STMTS (instance)[0];
	    if (dump_enabled_p ())
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "removing SLP instance operations starting "
			       "from: %G", root->stmt);
	    vect_free_slp_instance (instance);
	    vinfo->slp_instances.ordered_remove (i);
	  }
	else
	  ++i;
    }

  /* Compute vectorizable live stmts.  */
  if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
    vect_bb_slp_mark_live_stmts (bb_vinfo);

  return !vinfo->slp_instances.is_empty ();
}
8836 : :
8837 : : /* Get the SLP instance leader from INSTANCE_LEADER thereby transitively
8838 : : closing the eventual chain. */
8839 : :
8840 : : static slp_instance
8841 : 742199 : get_ultimate_leader (slp_instance instance,
8842 : : hash_map<slp_instance, slp_instance> &instance_leader)
8843 : : {
8844 : 742199 : auto_vec<slp_instance *, 8> chain;
8845 : 742199 : slp_instance *tem;
8846 : 812226 : while (*(tem = instance_leader.get (instance)) != instance)
8847 : : {
8848 : 70027 : chain.safe_push (tem);
8849 : 70027 : instance = *tem;
8850 : : }
8851 : 812226 : while (!chain.is_empty ())
8852 : 70027 : *chain.pop () = instance;
8853 : 742199 : return instance;
8854 : 742199 : }
8855 : :
8856 : : namespace {
8857 : : /* Subroutine of vect_bb_partition_graph_r. Map KEY to INSTANCE in
8858 : : KEY_TO_INSTANCE, making INSTANCE the leader of any previous mapping
8859 : : for KEY. Return true if KEY was already in KEY_TO_INSTANCE.
8860 : :
8861 : : INSTANCE_LEADER is as for get_ultimate_leader. */
8862 : :
8863 : : template<typename T>
8864 : : bool
8865 : 3296606 : vect_map_to_instance (slp_instance instance, T key,
8866 : : hash_map<T, slp_instance> &key_to_instance,
8867 : : hash_map<slp_instance, slp_instance> &instance_leader)
8868 : : {
8869 : : bool existed_p;
8870 : 3296606 : slp_instance &key_instance = key_to_instance.get_or_insert (key, &existed_p);
8871 : 3296606 : if (!existed_p)
8872 : : ;
8873 : 149037 : else if (key_instance != instance)
8874 : : {
8875 : : /* If we're running into a previously marked key make us the
8876 : : leader of the current ultimate leader. This keeps the
8877 : : leader chain acyclic and works even when the current instance
8878 : : connects two previously independent graph parts. */
8879 : 58878 : slp_instance key_leader
8880 : 58878 : = get_ultimate_leader (key_instance, instance_leader);
8881 : 58878 : if (key_leader != instance)
8882 : 17712 : instance_leader.put (key_leader, instance);
8883 : : }
8884 : 3296606 : key_instance = instance;
8885 : 3296606 : return existed_p;
8886 : : }
8887 : : }
8888 : :
8889 : : /* Worker of vect_bb_partition_graph, recurse on NODE. */
8890 : :
8891 : : static void
8892 : 911442 : vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
8893 : : slp_instance instance, slp_tree node,
8894 : : hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
8895 : : hash_map<slp_tree, slp_instance> &node_to_instance,
8896 : : hash_map<slp_instance, slp_instance> &instance_leader)
8897 : : {
8898 : 911442 : stmt_vec_info stmt_info;
8899 : 911442 : unsigned i;
8900 : :
8901 : 3296606 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
8902 : 2385164 : if (stmt_info)
8903 : 2385164 : vect_map_to_instance (instance, stmt_info, stmt_to_instance,
8904 : : instance_leader);
8905 : :
8906 : 911442 : if (vect_map_to_instance (instance, node, node_to_instance,
8907 : : instance_leader))
8908 : 911442 : return;
8909 : :
8910 : : slp_tree child;
8911 : 1754380 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
8912 : 877273 : if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
8913 : 228121 : vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
8914 : : node_to_instance, instance_leader);
8915 : : }
8916 : :
8917 : : /* Partition the SLP graph into pieces that can be costed independently. */
8918 : :
8919 : : static void
8920 : 250759 : vect_bb_partition_graph (bb_vec_info bb_vinfo)
8921 : : {
8922 : 250759 : DUMP_VECT_SCOPE ("vect_bb_partition_graph");
8923 : :
8924 : : /* First walk the SLP graph assigning each involved scalar stmt a
8925 : : corresponding SLP graph entry and upon visiting a previously
8926 : : marked stmt, make the stmts leader the current SLP graph entry. */
8927 : 250759 : hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
8928 : 250759 : hash_map<slp_tree, slp_instance> node_to_instance;
8929 : 250759 : hash_map<slp_instance, slp_instance> instance_leader;
8930 : 250759 : slp_instance instance;
8931 : 934080 : for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
8932 : : {
8933 : 683321 : instance_leader.put (instance, instance);
8934 : 683321 : vect_bb_partition_graph_r (bb_vinfo,
8935 : : instance, SLP_INSTANCE_TREE (instance),
8936 : : stmt_to_instance, node_to_instance,
8937 : : instance_leader);
8938 : : }
8939 : :
8940 : : /* Then collect entries to each independent subgraph. */
8941 : 1184839 : for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
8942 : : {
8943 : 683321 : slp_instance leader = get_ultimate_leader (instance, instance_leader);
8944 : 683321 : leader->subgraph_entries.safe_push (instance);
8945 : 683321 : if (dump_enabled_p ()
8946 : 683321 : && leader != instance)
8947 : 67 : dump_printf_loc (MSG_NOTE, vect_location,
8948 : : "instance %p is leader of %p\n",
8949 : : (void *) leader, (void *) instance);
8950 : : }
8951 : 250759 : }
8952 : :
8953 : : /* Compute the set of scalar stmts participating in internal and external
8954 : : nodes. */
8955 : :
8956 : : static void
8957 : 1547718 : vect_slp_gather_vectorized_scalar_stmts (vec_info *vinfo, slp_tree node,
8958 : : hash_set<slp_tree> &visited,
8959 : : hash_set<stmt_vec_info> &vstmts,
8960 : : hash_set<stmt_vec_info> &estmts)
8961 : : {
8962 : 1547718 : int i;
8963 : 1547718 : stmt_vec_info stmt_info;
8964 : 1547718 : slp_tree child;
8965 : :
8966 : 1547718 : if (visited.add (node))
8967 : 34257 : return;
8968 : :
8969 : 1513461 : if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
8970 : : {
8971 : 3120762 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
8972 : 2252190 : if (stmt_info)
8973 : 2252190 : vstmts.add (stmt_info);
8974 : :
8975 : 3145719 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
8976 : 867703 : if (child)
8977 : 867703 : vect_slp_gather_vectorized_scalar_stmts (vinfo, child, visited,
8978 : : vstmts, estmts);
8979 : : }
8980 : : else
8981 : 3626843 : for (tree def : SLP_TREE_SCALAR_OPS (node))
8982 : : {
8983 : 1693152 : stmt_vec_info def_stmt = vinfo->lookup_def (def);
8984 : 1693152 : if (def_stmt)
8985 : 332625 : estmts.add (def_stmt);
8986 : : }
8987 : : }
8988 : :
8989 : :
/* Compute the scalar cost of the SLP node NODE and its children
   and return it.  Do not account defs that are marked in LIFE and
   update LIFE according to uses of NODE.  Costs are recorded in
   COST_VEC; VECTORIZED_SCALAR_STMTS and SCALAR_STMTS_IN_EXTERNS are
   as computed by vect_slp_gather_vectorized_scalar_stmts.  Uses the
   gimple visited flag to count each original stmt only once across
   the whole subgraph walk (callers are responsible for clearing it).  */

static void
vect_bb_slp_scalar_cost (vec_info *vinfo,
			 slp_tree node, vec<bool, va_heap> *life,
			 stmt_vector_for_cost *cost_vec,
			 hash_set<stmt_vec_info> &vectorized_scalar_stmts,
			 hash_set<stmt_vec_info> &scalar_stmts_in_externs,
			 hash_set<slp_tree> &visited)
{
  unsigned i;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    {
      ssa_op_iter op_iter;
      def_operand_p def_p;

      if (!stmt_info
	  || (*life)[i]
	  /* Defs also used in external nodes are not in the
	     vectorized_scalar_stmts set as they need to be preserved.
	     Honor that.  */
	  || scalar_stmts_in_externs.contains (stmt_info))
	continue;

      stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
      gimple *orig_stmt = orig_stmt_info->stmt;

      /* If there is a non-vectorized use of the defs then the scalar
	 stmt is kept live in which case we do not account it or any
	 required defs in the SLP children in the scalar cost.  This
	 way we make the vectorization more costly when compared to
	 the scalar cost.  */
      if (!STMT_VINFO_LIVE_P (stmt_info))
	{
	  /* Worklist over the stmt and, transitively, pattern stmts
	     whose uses must be checked recursively.  */
	  auto_vec<gimple *, 8> worklist;
	  hash_set<gimple *> *worklist_visited = NULL;
	  worklist.quick_push (orig_stmt);
	  do
	    {
	      gimple *work_stmt = worklist.pop ();
	      FOR_EACH_PHI_OR_STMT_DEF (def_p, work_stmt, op_iter, SSA_OP_DEF)
		{
		  imm_use_iterator use_iter;
		  gimple *use_stmt;
		  FOR_EACH_IMM_USE_STMT (use_stmt, use_iter,
					 DEF_FROM_PTR (def_p))
		    if (!is_gimple_debug (use_stmt))
		      {
			stmt_vec_info use_stmt_info
			  = vinfo->lookup_stmt (use_stmt);
			if (!use_stmt_info
			    || !vectorized_scalar_stmts.contains (use_stmt_info))
			  {
			    if (use_stmt_info
				&& STMT_VINFO_IN_PATTERN_P (use_stmt_info))
			      {
				/* For stmts participating in patterns we have
				   to check its uses recursively.  */
				if (!worklist_visited)
				  worklist_visited = new hash_set<gimple *> ();
				if (!worklist_visited->add (use_stmt))
				  worklist.safe_push (use_stmt);
				continue;
			      }
			    /* Found a non-vectorized use: mark the lane
			       live and stop scanning this lane.  */
			    (*life)[i] = true;
			    goto next_lane;
			  }
		      }
		}
	    }
	  while (!worklist.is_empty ());
	next_lane:
	  if (worklist_visited)
	    delete worklist_visited;
	  if ((*life)[i])
	    continue;
	}

      /* Count scalar stmts only once.  */
      if (gimple_visited_p (orig_stmt))
	continue;
      gimple_set_visited (orig_stmt, true);

      vect_cost_for_stmt kind;
      if (STMT_VINFO_DATA_REF (orig_stmt_info))
	{
	  data_reference_p dr = STMT_VINFO_DATA_REF (orig_stmt_info);
	  tree base = get_base_address (DR_REF (dr));
	  /* When the scalar access is to a non-global not address-taken
	     decl that is not BLKmode assume we can access it with a single
	     non-load/store instruction.  */
	  if (DECL_P (base)
	      && !is_global_var (base)
	      && !TREE_ADDRESSABLE (base)
	      && DECL_MODE (base) != BLKmode)
	    kind = scalar_stmt;
	  else if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
	    kind = scalar_load;
	  else
	    kind = scalar_store;
	}
      else if (vect_nop_conversion_p (orig_stmt_info))
	continue;
      /* For single-argument PHIs assume coalescing which means zero cost
	 for the scalar and the vector PHIs.  This avoids artificially
	 favoring the vector path (but may pessimize it in some cases).  */
      else if (is_a <gphi *> (orig_stmt_info->stmt)
	       && gimple_phi_num_args
		    (as_a <gphi *> (orig_stmt_info->stmt)) == 1)
	continue;
      else
	kind = scalar_stmt;
      record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
			SLP_TREE_VECTYPE (node), 0, vect_body);
    }

  auto_vec<bool, 20> subtree_life;
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    {
      if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
	{
	  /* Do not directly pass LIFE to the recursive call, copy it to
	     confine changes in the callee to the current child/subtree.  */
	  if (SLP_TREE_PERMUTE_P (node))
	    {
	      /* For permute nodes map the life of our lanes through the
		 lane permutation onto the child's lanes.  */
	      subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
	      for (unsigned j = 0;
		   j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
		{
		  auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
		  if (perm.first == i)
		    subtree_life[perm.second] = (*life)[j];
		}
	    }
	  else
	    {
	      gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
	      subtree_life.safe_splice (*life);
	    }
	  vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
				   vectorized_scalar_stmts,
				   scalar_stmts_in_externs, visited);
	  subtree_life.truncate (0);
	}
    }
}
9144 : :
9145 : : /* Comparator for the loop-index sorted cost vectors. */
9146 : :
9147 : : static int
9148 : 17631770 : li_cost_vec_cmp (const void *a_, const void *b_)
9149 : : {
9150 : 17631770 : auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
9151 : 17631770 : auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
9152 : 17631770 : if (a->first < b->first)
9153 : : return -1;
9154 : 16915896 : else if (a->first == b->first)
9155 : 16283305 : return 0;
9156 : : return 1;
9157 : : }
9158 : :
/* Check if vectorization of the basic block is profitable for the
   subgraph denoted by SLP_INSTANCES.  Returns true when, for every
   loop the subgraph touches, the vector cost does not exceed the
   scalar cost.  ORIG_LOOP, when non-NULL, indicates the caller wants
   the gimple visited flags kept set on profitable subgraphs (they are
   cleared here otherwise).  */

static bool
vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
				    vec<slp_instance> slp_instances,
				    loop_p orig_loop)
{
  slp_instance instance;
  int i;
  unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
  unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "Costing subgraph: \n");
      hash_set<slp_tree> visited;
      FOR_EACH_VEC_ELT (slp_instances, i, instance)
	vect_print_slp_graph (MSG_NOTE, vect_location,
			      SLP_INSTANCE_TREE (instance), visited);
    }

  /* Compute the set of scalar stmts we know will go away 'locally' when
     vectorizing.  This used to be tracked with just PURE_SLP_STMT but that's
     not accurate for nodes promoted extern late or for scalar stmts that
     are used both in extern defs and in vectorized defs.  */
  hash_set<stmt_vec_info> vectorized_scalar_stmts;
  hash_set<stmt_vec_info> scalar_stmts_in_externs;
  hash_set<slp_tree> visited;
  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      vect_slp_gather_vectorized_scalar_stmts (bb_vinfo,
					       SLP_INSTANCE_TREE (instance),
					       visited,
					       vectorized_scalar_stmts,
					       scalar_stmts_in_externs);
      for (stmt_vec_info rstmt : SLP_INSTANCE_ROOT_STMTS (instance))
	vectorized_scalar_stmts.add (rstmt);
    }
  /* Scalar stmts used as defs in external nodes need to be preserved, so
     remove them from vectorized_scalar_stmts.  */
  for (stmt_vec_info stmt : scalar_stmts_in_externs)
    vectorized_scalar_stmts.remove (stmt);

  /* Calculate scalar cost and sum the cost for the vector stmts
     previously collected.  */
  stmt_vector_for_cost scalar_costs = vNULL;
  stmt_vector_for_cost vector_costs = vNULL;
  visited.empty ();
  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      auto_vec<bool, 20> life;
      life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
			      true);
      if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
	record_stmt_cost (&scalar_costs,
			  SLP_INSTANCE_ROOT_STMTS (instance).length (),
			  scalar_stmt,
			  SLP_INSTANCE_ROOT_STMTS (instance)[0], 0, vect_body);
      vect_bb_slp_scalar_cost (bb_vinfo,
			       SLP_INSTANCE_TREE (instance),
			       &life, &scalar_costs, vectorized_scalar_stmts,
			       scalar_stmts_in_externs, visited);
      /* The per-instance vector cost was stashed at analysis time;
	 take ownership of it here.  */
      vector_costs.safe_splice (instance->cost_vec);
      instance->cost_vec.release ();
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");

  /* When costing non-loop vectorization we need to consider each covered
     loop independently and make sure vectorization is profitable.  For
     now we assume a loop may be not entered or executed an arbitrary
     number of iterations (???  static information can provide more
     precise info here) which means we can simply cost each containing
     loops stmts separately.  */

  /* First produce cost vectors sorted by loop index.  */
  auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
    li_scalar_costs (scalar_costs.length ());
  auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
    li_vector_costs (vector_costs.length ());
  stmt_info_for_cost *cost;
  FOR_EACH_VEC_ELT (scalar_costs, i, cost)
    {
      unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
      li_scalar_costs.quick_push (std::make_pair (l, cost));
    }
  /* Use a random used loop as fallback in case the first vector_costs
     entry does not have a stmt_info associated with it.  */
  unsigned l = li_scalar_costs[0].first;
  FOR_EACH_VEC_ELT (vector_costs, i, cost)
    {
      /* We inherit from the previous COST, invariants, externals and
	 extracts immediately follow the cost for the related stmt.  */
      if (cost->stmt_info)
	l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
      li_vector_costs.quick_push (std::make_pair (l, cost));
    }
  li_scalar_costs.qsort (li_cost_vec_cmp);
  li_vector_costs.qsort (li_cost_vec_cmp);

  /* Now cost the portions individually.  Each iteration of the loop
     consumes one run of equal loop indexes from both sorted vectors.  */
  unsigned vi = 0;
  unsigned si = 0;
  bool profitable = true;
  while (si < li_scalar_costs.length ()
	 && vi < li_vector_costs.length ())
    {
      unsigned sl = li_scalar_costs[si].first;
      unsigned vl = li_vector_costs[vi].first;
      if (sl != vl)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Scalar %d and vector %d loop part do not "
			     "match up, skipping scalar part\n", sl, vl);
	  /* Skip the scalar part, assuming zero cost on the vector side.  */
	  do
	    {
	      si++;
	    }
	  while (si < li_scalar_costs.length ()
		 && li_scalar_costs[si].first == sl);
	  continue;
	}

      class vector_costs *scalar_target_cost_data = init_cost (bb_vinfo, true);
      do
	{
	  add_stmt_cost (scalar_target_cost_data, li_scalar_costs[si].second);
	  si++;
	}
      while (si < li_scalar_costs.length ()
	     && li_scalar_costs[si].first == sl);
      scalar_target_cost_data->finish_cost (nullptr);
      scalar_cost = scalar_target_cost_data->body_cost ();

      /* Complete the target-specific vector cost calculation.  */
      class vector_costs *vect_target_cost_data = init_cost (bb_vinfo, false);
      do
	{
	  add_stmt_cost (vect_target_cost_data, li_vector_costs[vi].second);
	  vi++;
	}
      while (vi < li_vector_costs.length ()
	     && li_vector_costs[vi].first == vl);
      vect_target_cost_data->finish_cost (scalar_target_cost_data);
      vec_prologue_cost = vect_target_cost_data->prologue_cost ();
      vec_inside_cost = vect_target_cost_data->body_cost ();
      vec_epilogue_cost = vect_target_cost_data->epilogue_cost ();
      delete scalar_target_cost_data;
      delete vect_target_cost_data;

      vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;

      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "Cost model analysis for part in loop %d:\n", sl);
	  dump_printf (MSG_NOTE, " Vector cost: %d\n",
		       vec_inside_cost + vec_outside_cost);
	  dump_printf (MSG_NOTE, " Scalar cost: %d\n", scalar_cost);
	}

      /* Vectorization is profitable if its cost is more than the cost of scalar
	 version.  Note that we err on the vector side for equal cost because
	 the cost estimate is otherwise quite pessimistic (constant uses are
	 free on the scalar side but cost a load on the vector side for
	 example).  */
      if (vec_outside_cost + vec_inside_cost > scalar_cost)
	{
	  profitable = false;
	  break;
	}
    }
  /* Leftover vector-side parts with no scalar counterpart mean pure
     extra vector cost, so the subgraph cannot be profitable.  */
  if (profitable && vi < li_vector_costs.length ())
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Excess vector cost for part in loop %d:\n",
			 li_vector_costs[vi].first);
      profitable = false;
    }

  /* Unset visited flag.  This is delayed when the subgraph is profitable
     and we process the loop for remaining unvectorized if-converted code.  */
  if (!orig_loop || !profitable)
    FOR_EACH_VEC_ELT (scalar_costs, i, cost)
      gimple_set_visited (cost->stmt_info->stmt, false);

  scalar_costs.release ();
  vector_costs.release ();

  return profitable;
}
9355 : :
9356 : : /* qsort comparator for lane defs. */
9357 : :
9358 : : static int
9359 : 40 : vld_cmp (const void *a_, const void *b_)
9360 : : {
9361 : 40 : auto *a = (const std::pair<unsigned, tree> *)a_;
9362 : 40 : auto *b = (const std::pair<unsigned, tree> *)b_;
9363 : 40 : return a->first - b->first;
9364 : : }
9365 : :
/* Return true if USE_STMT is a vector lane insert into VEC and set
   *THIS_LANE to the lane number that is set.  When VEC is NULL any
   inserted-into vector is accepted; the vector actually seen (rhs1
   of the BIT_INSERT_EXPR) then takes its place for the subsequent
   element-type and lane checks (note the comma-expression assignment
   below evaluates to false so the condition chain continues).  */

static bool
vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
{
  gassign *use_ass = dyn_cast <gassign *> (use_stmt);
  if (!use_ass
      || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
      || (vec
	  ? gimple_assign_rhs1 (use_ass) != vec
	  : ((vec = gimple_assign_rhs1 (use_ass)), false))
      /* The inserted scalar must match the vector element type.  */
      || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
				     TREE_TYPE (gimple_assign_rhs2 (use_ass)))
      /* The bit position (rhs3) must be a multiple of the element
	 size; the multiple is the lane number.  */
      || !constant_multiple_p
	    (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
	     tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
	     this_lane))
    return false;
  return true;
}
9387 : :
9388 : : /* Find any vectorizable constructors and add them to the grouped_store
9389 : : array. */
9390 : :
9391 : : static void
9392 : 2429453 : vect_slp_check_for_roots (bb_vec_info bb_vinfo)
9393 : : {
9394 : 18965834 : for (unsigned i = 0; i < bb_vinfo->nbbs; ++i)
9395 : 33072762 : for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
9396 : 136010380 : !gsi_end_p (gsi); gsi_next (&gsi))
9397 : : {
9398 : 119473999 : gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
9399 : : /* This can be used to start SLP discovery for early breaks for BB early breaks
9400 : : when we get that far. */
9401 : 119473999 : if (!assign)
9402 : 177714614 : continue;
9403 : :
9404 : 31706569 : tree rhs = gimple_assign_rhs1 (assign);
9405 : 31706569 : enum tree_code code = gimple_assign_rhs_code (assign);
9406 : 31706569 : use_operand_p use_p;
9407 : 31706569 : gimple *use_stmt;
9408 : 31706569 : if (code == CONSTRUCTOR)
9409 : : {
9410 : 1918223 : if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
9411 : 58564 : || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
9412 : 86608 : CONSTRUCTOR_NELTS (rhs))
9413 : 40202 : || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
9414 : 1958424 : || uniform_vector_p (rhs))
9415 : 1906626 : continue;
9416 : :
9417 : : unsigned j;
9418 : : tree val;
9419 : 57523 : FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
9420 : 45926 : if (TREE_CODE (val) != SSA_NAME
9421 : 45926 : || !bb_vinfo->lookup_def (val))
9422 : : break;
9423 : 27874 : if (j != CONSTRUCTOR_NELTS (rhs))
9424 : 2340 : continue;
9425 : :
9426 : 11597 : vec<stmt_vec_info> roots = vNULL;
9427 : 11597 : roots.safe_push (bb_vinfo->lookup_stmt (assign));
9428 : 11597 : vec<stmt_vec_info> stmts;
9429 : 11597 : stmts.create (CONSTRUCTOR_NELTS (rhs));
9430 : 65368 : FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
9431 : 42174 : stmts.quick_push
9432 : 42174 : (vect_stmt_to_vectorize (bb_vinfo->lookup_def (val)));
9433 : 11597 : bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
9434 : 11597 : stmts, roots));
9435 : : }
9436 : 29788346 : else if (code == BIT_INSERT_EXPR
9437 : 872 : && VECTOR_TYPE_P (TREE_TYPE (rhs))
9438 : 559 : && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
9439 : 559 : && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
9440 : 556 : && integer_zerop (gimple_assign_rhs3 (assign))
9441 : 312 : && useless_type_conversion_p
9442 : 312 : (TREE_TYPE (TREE_TYPE (rhs)),
9443 : 312 : TREE_TYPE (gimple_assign_rhs2 (assign)))
9444 : 29788910 : && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
9445 : : {
9446 : : /* We start to match on insert to lane zero but since the
9447 : : inserts need not be ordered we'd have to search both
9448 : : the def and the use chains. */
9449 : 211 : tree vectype = TREE_TYPE (rhs);
9450 : 211 : unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
9451 : 211 : auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
9452 : 211 : auto_sbitmap lanes (nlanes);
9453 : 211 : bitmap_clear (lanes);
9454 : 211 : bitmap_set_bit (lanes, 0);
9455 : 211 : tree def = gimple_assign_lhs (assign);
9456 : 211 : lane_defs.quick_push
9457 : 211 : (std::make_pair (0, gimple_assign_rhs2 (assign)));
9458 : 211 : unsigned lanes_found = 1;
9459 : : /* Start with the use chains, the last stmt will be the root. */
9460 : 211 : stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
9461 : 211 : vec<stmt_vec_info> roots = vNULL;
9462 : 211 : roots.safe_push (last);
9463 : 213 : do
9464 : : {
9465 : 213 : use_operand_p use_p;
9466 : 213 : gimple *use_stmt;
9467 : 213 : if (!single_imm_use (def, &use_p, &use_stmt))
9468 : : break;
9469 : 207 : unsigned this_lane;
9470 : 207 : if (!bb_vinfo->lookup_stmt (use_stmt)
9471 : 207 : || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
9472 : 229 : || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
9473 : : break;
9474 : 22 : if (bitmap_bit_p (lanes, this_lane))
9475 : : break;
9476 : 2 : lanes_found++;
9477 : 2 : bitmap_set_bit (lanes, this_lane);
9478 : 2 : gassign *use_ass = as_a <gassign *> (use_stmt);
9479 : 2 : lane_defs.quick_push (std::make_pair
9480 : 2 : (this_lane, gimple_assign_rhs2 (use_ass)));
9481 : 2 : last = bb_vinfo->lookup_stmt (use_ass);
9482 : 2 : roots.safe_push (last);
9483 : 2 : def = gimple_assign_lhs (use_ass);
9484 : : }
9485 : 2 : while (lanes_found < nlanes);
9486 : 211 : if (roots.length () > 1)
9487 : 2 : std::swap(roots[0], roots[roots.length () - 1]);
9488 : 211 : if (lanes_found < nlanes)
9489 : : {
9490 : : /* Now search the def chain. */
9491 : 211 : def = gimple_assign_rhs1 (assign);
9492 : 213 : do
9493 : : {
9494 : 213 : if (TREE_CODE (def) != SSA_NAME
9495 : 213 : || !has_single_use (def))
9496 : : break;
9497 : 56 : gimple *def_stmt = SSA_NAME_DEF_STMT (def);
9498 : 56 : unsigned this_lane;
9499 : 56 : if (!bb_vinfo->lookup_stmt (def_stmt)
9500 : 33 : || !vect_slp_is_lane_insert (def_stmt,
9501 : : NULL_TREE, &this_lane)
9502 : 80 : || !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
9503 : : break;
9504 : 24 : if (bitmap_bit_p (lanes, this_lane))
9505 : : break;
9506 : 4 : lanes_found++;
9507 : 4 : bitmap_set_bit (lanes, this_lane);
9508 : 8 : lane_defs.quick_push (std::make_pair
9509 : 4 : (this_lane,
9510 : 4 : gimple_assign_rhs2 (def_stmt)));
9511 : 4 : roots.safe_push (bb_vinfo->lookup_stmt (def_stmt));
9512 : 4 : def = gimple_assign_rhs1 (def_stmt);
9513 : : }
9514 : 4 : while (lanes_found < nlanes);
9515 : : }
9516 : 211 : if (lanes_found == nlanes)
9517 : : {
9518 : : /* Sort lane_defs after the lane index and register the root. */
9519 : 2 : lane_defs.qsort (vld_cmp);
9520 : 2 : vec<stmt_vec_info> stmts;
9521 : 2 : stmts.create (nlanes);
9522 : 10 : for (unsigned i = 0; i < nlanes; ++i)
9523 : 8 : stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
9524 : 2 : bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
9525 : 2 : stmts, roots));
9526 : : }
9527 : : else
9528 : 209 : roots.release ();
9529 : 211 : }
9530 : 29788135 : else if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
9531 : 28867153 : && (associative_tree_code (code) || code == MINUS_EXPR)
9532 : : /* ??? This pessimizes a two-element reduction. PR54400.
9533 : : ??? In-order reduction could be handled if we only
9534 : : traverse one operand chain in vect_slp_linearize_chain. */
9535 : 33688289 : && !needs_fold_left_reduction_p (TREE_TYPE (rhs), code)
9536 : : /* Ops with constants at the tail can be stripped here. */
9537 : 5777829 : && TREE_CODE (rhs) == SSA_NAME
9538 : 5713948 : && TREE_CODE (gimple_assign_rhs2 (assign)) == SSA_NAME
9539 : : /* Should be the chain end. */
9540 : 32066336 : && (!single_imm_use (gimple_assign_lhs (assign),
9541 : : &use_p, &use_stmt)
9542 : 1758480 : || !is_gimple_assign (use_stmt)
9543 : 1168513 : || (gimple_assign_rhs_code (use_stmt) != code
9544 : 857830 : && ((code != PLUS_EXPR && code != MINUS_EXPR)
9545 : 478834 : || (gimple_assign_rhs_code (use_stmt)
9546 : 478834 : != (code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR))))))
9547 : : {
9548 : : /* We start the match at the end of a possible association
9549 : : chain. */
9550 : 1877675 : auto_vec<chain_op_t> chain;
9551 : 1877675 : auto_vec<std::pair<tree_code, gimple *> > worklist;
9552 : 1877675 : auto_vec<gimple *> chain_stmts;
9553 : 1877675 : gimple *code_stmt = NULL, *alt_code_stmt = NULL;
9554 : 1877675 : if (code == MINUS_EXPR)
9555 : 315115 : code = PLUS_EXPR;
9556 : 1877675 : internal_fn reduc_fn;
9557 : 2150803 : if (!reduction_fn_for_scalar_code (code, &reduc_fn)
9558 : 1877675 : || reduc_fn == IFN_LAST)
9559 : 273128 : continue;
9560 : 1604547 : vect_slp_linearize_chain (bb_vinfo, worklist, chain, code, assign,
9561 : : /* ??? */
9562 : : code_stmt, alt_code_stmt, &chain_stmts);
9563 : 3209094 : if (chain.length () > 1)
9564 : : {
9565 : : /* Sort the chain according to def_type and operation. */
9566 : 1604547 : chain.sort (dt_sort_cmp, bb_vinfo);
9567 : : /* ??? Now we'd want to strip externals and constants
9568 : : but record those to be handled in the epilogue. */
9569 : : /* ??? For now do not allow mixing ops or externs/constants. */
9570 : 1604547 : bool invalid = false;
9571 : 1604547 : unsigned remain_cnt = 0;
9572 : 1604547 : unsigned last_idx = 0;
9573 : 4834552 : for (unsigned i = 0; i < chain.length (); ++i)
9574 : : {
9575 : 3566910 : if (chain[i].code != code)
9576 : : {
9577 : : invalid = true;
9578 : : break;
9579 : : }
9580 : 3230005 : if (chain[i].dt != vect_internal_def
9581 : : /* Avoid stmts where the def is not the LHS, like
9582 : : ASMs. */
9583 : 6227250 : || (gimple_get_lhs (bb_vinfo->lookup_def
9584 : 2997245 : (chain[i].op)->stmt)
9585 : 2997245 : != chain[i].op))
9586 : 235696 : remain_cnt++;
9587 : : else
9588 : : last_idx = i;
9589 : : }
9590 : : /* Make sure to have an even number of lanes as we later do
9591 : : all-or-nothing discovery, not trying to split further. */
9592 : 1604547 : if ((chain.length () - remain_cnt) & 1)
9593 : 188762 : remain_cnt++;
9594 : 1604547 : if (!invalid && chain.length () - remain_cnt > 1)
9595 : : {
9596 : 1198249 : vec<stmt_vec_info> stmts;
9597 : 1198249 : vec<tree> remain = vNULL;
9598 : 1198249 : stmts.create (chain.length ());
9599 : 1198249 : if (remain_cnt > 0)
9600 : 110701 : remain.create (remain_cnt);
9601 : 3850259 : for (unsigned i = 0; i < chain.length (); ++i)
9602 : : {
9603 : 2652010 : stmt_vec_info stmt_info;
9604 : 2652010 : if (chain[i].dt == vect_internal_def
9605 : 2614856 : && ((stmt_info = bb_vinfo->lookup_def (chain[i].op)),
9606 : 2614856 : gimple_get_lhs (stmt_info->stmt) == chain[i].op)
9607 : 5266782 : && (i != last_idx
9608 : 1198249 : || (stmts.length () & 1)))
9609 : 2529892 : stmts.quick_push (stmt_info);
9610 : : else
9611 : 122118 : remain.quick_push (chain[i].op);
9612 : : }
9613 : 1198249 : vec<stmt_vec_info> roots;
9614 : 1198249 : roots.create (chain_stmts.length ());
9615 : 2652010 : for (unsigned i = 0; i < chain_stmts.length (); ++i)
9616 : 1453761 : roots.quick_push (bb_vinfo->lookup_stmt (chain_stmts[i]));
9617 : 1198249 : bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_bb_reduc,
9618 : 1198249 : stmts, roots, remain));
9619 : : }
9620 : : }
9621 : 1877675 : }
9622 : : }
9623 : 2429453 : }
9624 : :
9625 : : /* Walk the grouped store chains and replace entries with their
9626 : : pattern variant if any. */
9627 : :
9628 : : static void
9629 : 636820 : vect_fixup_store_groups_with_patterns (vec_info *vinfo)
9630 : : {
9631 : 636820 : stmt_vec_info first_element;
9632 : 636820 : unsigned i;
9633 : :
9634 : 1527781 : FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
9635 : : {
9636 : : /* We also have CTORs in this array. */
9637 : 890961 : if (!STMT_VINFO_GROUPED_ACCESS (first_element))
9638 : 0 : continue;
9639 : 890961 : if (STMT_VINFO_IN_PATTERN_P (first_element))
9640 : : {
9641 : 244 : stmt_vec_info orig = first_element;
9642 : 244 : first_element = STMT_VINFO_RELATED_STMT (first_element);
9643 : 244 : DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
9644 : 244 : DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
9645 : 244 : DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
9646 : 244 : DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
9647 : 244 : vinfo->grouped_stores[i] = first_element;
9648 : : }
9649 : 890961 : stmt_vec_info prev = first_element;
9650 : 2510098 : while (DR_GROUP_NEXT_ELEMENT (prev))
9651 : : {
9652 : 1619137 : stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
9653 : 1619137 : if (STMT_VINFO_IN_PATTERN_P (elt))
9654 : : {
9655 : 866 : stmt_vec_info orig = elt;
9656 : 866 : elt = STMT_VINFO_RELATED_STMT (elt);
9657 : 866 : DR_GROUP_NEXT_ELEMENT (prev) = elt;
9658 : 866 : DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
9659 : 866 : DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
9660 : : }
9661 : 1619137 : DR_GROUP_FIRST_ELEMENT (elt) = first_element;
9662 : 1619137 : prev = elt;
9663 : : }
9664 : : }
9665 : 636820 : }
9666 : :
9667 : : /* Check if the region described by BB_VINFO can be vectorized, returning
9668 : : true if so. When returning false, set FATAL to true if the same failure
9669 : : would prevent vectorization at other vector sizes, false if it is still
9670 : : worth trying other sizes. N_STMTS is the number of statements in the
9671 : : region. */
9672 : :
9673 : : static bool
9674 : 2429453 : vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
9675 : : vec<int> *dataref_groups)
9676 : : {
9677 : 2429453 : DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
9678 : :
9679 : 2429453 : slp_instance instance;
9680 : 2429453 : int i;
9681 : :
9682 : : /* The first group of checks is independent of the vector size. */
9683 : 2429453 : fatal = true;
9684 : :
9685 : : /* Analyze the data references. */
9686 : :
9687 : 2429453 : if (!vect_analyze_data_refs (bb_vinfo, NULL))
9688 : : {
9689 : 0 : if (dump_enabled_p ())
9690 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9691 : : "not vectorized: unhandled data-ref in basic "
9692 : : "block.\n");
9693 : 0 : return false;
9694 : : }
9695 : :
9696 : 2429453 : if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
9697 : : {
9698 : 0 : if (dump_enabled_p ())
9699 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9700 : : "not vectorized: unhandled data access in "
9701 : : "basic block.\n");
9702 : 0 : return false;
9703 : : }
9704 : :
9705 : 2429453 : vect_slp_check_for_roots (bb_vinfo);
9706 : :
9707 : : /* If there are no grouped stores and no constructors in the region
9708 : : there is no need to continue with pattern recog as vect_analyze_slp
9709 : : will fail anyway. */
9710 : 2429453 : if (bb_vinfo->grouped_stores.is_empty ()
9711 : 2067227 : && bb_vinfo->roots.is_empty ())
9712 : : {
9713 : 1792633 : if (dump_enabled_p ())
9714 : 1033 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9715 : : "not vectorized: no grouped stores in "
9716 : : "basic block.\n");
9717 : 1792633 : return false;
9718 : : }
9719 : :
9720 : : /* While the rest of the analysis below depends on it in some way. */
9721 : 636820 : fatal = false;
9722 : :
9723 : 636820 : vect_pattern_recog (bb_vinfo);
9724 : :
9725 : : /* Update store groups from pattern processing. */
9726 : 636820 : vect_fixup_store_groups_with_patterns (bb_vinfo);
9727 : :
9728 : : /* Check the SLP opportunities in the basic block, analyze and build SLP
9729 : : trees. */
9730 : 636820 : if (!vect_analyze_slp (bb_vinfo, n_stmts, false))
9731 : : {
9732 : 0 : if (dump_enabled_p ())
9733 : : {
9734 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9735 : : "Failed to SLP the basic block.\n");
9736 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9737 : : "not vectorized: failed to find SLP opportunities "
9738 : : "in basic block.\n");
9739 : : }
9740 : 0 : return false;
9741 : : }
9742 : :
9743 : : /* Optimize permutations. */
9744 : 636820 : vect_optimize_slp (bb_vinfo);
9745 : :
9746 : : /* Gather the loads reachable from the SLP graph entries. */
9747 : 636820 : vect_gather_slp_loads (bb_vinfo);
9748 : :
9749 : 636820 : vect_record_base_alignments (bb_vinfo);
9750 : :
9751 : : /* Analyze and verify the alignment of data references and the
9752 : : dependence in the SLP instances. */
9753 : 1432973 : for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
9754 : : {
9755 : 796153 : vect_location = instance->location ();
9756 : 796153 : if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
9757 : 796153 : || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
9758 : : {
9759 : 8036 : slp_tree node = SLP_INSTANCE_TREE (instance);
9760 : 8036 : stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
9761 : 8036 : if (dump_enabled_p ())
9762 : 4 : dump_printf_loc (MSG_NOTE, vect_location,
9763 : : "removing SLP instance operations starting from: %G",
9764 : : stmt_info->stmt);
9765 : 8036 : vect_free_slp_instance (instance);
9766 : 8036 : BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
9767 : 8036 : continue;
9768 : 8036 : }
9769 : :
9770 : : /* Mark all the statements that we want to vectorize as pure SLP and
9771 : : relevant. */
9772 : 788117 : vect_mark_slp_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance));
9773 : 788117 : vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
9774 : 788117 : unsigned j;
9775 : 788117 : stmt_vec_info root;
9776 : : /* Likewise consider instance root stmts as vectorized. */
9777 : 1741424 : FOR_EACH_VEC_ELT (SLP_INSTANCE_ROOT_STMTS (instance), j, root)
9778 : 165190 : STMT_SLP_TYPE (root) = pure_slp;
9779 : :
9780 : 788117 : i++;
9781 : : }
9782 : 2461344 : if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
9783 : : return false;
9784 : :
9785 : 282650 : if (!vect_slp_analyze_operations (bb_vinfo))
9786 : : {
9787 : 31891 : if (dump_enabled_p ())
9788 : 84 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9789 : : "not vectorized: bad operation in basic block.\n");
9790 : 31891 : return false;
9791 : : }
9792 : :
9793 : 250759 : vect_bb_partition_graph (bb_vinfo);
9794 : :
9795 : 250759 : return true;
9796 : : }
9797 : :
9798 : : /* Subroutine of vect_slp_bb. Try to vectorize the statements for all
9799 : : basic blocks in BBS, returning true on success.
9800 : : The region has N_STMTS statements and has the datarefs given by DATAREFS. */
9801 : :
9802 : : static bool
9803 : 2099119 : vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
9804 : : vec<int> *dataref_groups, unsigned int n_stmts,
9805 : : loop_p orig_loop)
9806 : : {
9807 : 2099119 : bb_vec_info bb_vinfo;
9808 : 2099119 : auto_vector_modes vector_modes;
9809 : :
9810 : : /* Autodetect first vector size we try. */
9811 : 2099119 : machine_mode next_vector_mode = VOIDmode;
9812 : 2099119 : targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
9813 : 2099119 : unsigned int mode_i = 0;
9814 : :
9815 : 2099119 : vec_info_shared shared;
9816 : :
9817 : 2099119 : machine_mode autodetected_vector_mode = VOIDmode;
9818 : 2759787 : while (1)
9819 : : {
9820 : 2429453 : bool vectorized = false;
9821 : 2429453 : bool fatal = false;
9822 : 2429453 : bb_vinfo = new _bb_vec_info (bbs, &shared);
9823 : :
9824 : 2429453 : bool first_time_p = shared.datarefs.is_empty ();
9825 : 2429453 : BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
9826 : 2429453 : if (first_time_p)
9827 : 2121690 : bb_vinfo->shared->save_datarefs ();
9828 : : else
9829 : 307763 : bb_vinfo->shared->check_datarefs ();
9830 : 2429453 : bb_vinfo->vector_mode = next_vector_mode;
9831 : :
9832 : 2429453 : if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
9833 : : {
9834 : 250759 : if (dump_enabled_p ())
9835 : : {
9836 : 1488 : dump_printf_loc (MSG_NOTE, vect_location,
9837 : : "***** Analysis succeeded with vector mode"
9838 : 744 : " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
9839 : 744 : dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
9840 : : }
9841 : :
9842 : 250759 : bb_vinfo->shared->check_datarefs ();
9843 : :
9844 : 250759 : bool force_clear = false;
9845 : 250759 : auto_vec<slp_instance> profitable_subgraphs;
9846 : 1435598 : for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo))
9847 : : {
9848 : 683321 : if (instance->subgraph_entries.is_empty ())
9849 : 215917 : continue;
9850 : :
9851 : 665609 : dump_user_location_t saved_vect_location = vect_location;
9852 : 665609 : vect_location = instance->location ();
9853 : 665609 : if (!unlimited_cost_model (NULL)
9854 : 1328040 : && !vect_bb_vectorization_profitable_p
9855 : 662431 : (bb_vinfo, instance->subgraph_entries, orig_loop))
9856 : : {
9857 : 180493 : if (dump_enabled_p ())
9858 : 29 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9859 : : "not vectorized: vectorization is not "
9860 : : "profitable.\n");
9861 : 180493 : vect_location = saved_vect_location;
9862 : 180493 : continue;
9863 : : }
9864 : :
9865 : 485116 : vect_location = saved_vect_location;
9866 : 485116 : if (!dbg_cnt (vect_slp))
9867 : : {
9868 : 0 : force_clear = true;
9869 : 0 : continue;
9870 : : }
9871 : :
9872 : 485116 : profitable_subgraphs.safe_push (instance);
9873 : : }
9874 : :
9875 : : /* When we're vectorizing an if-converted loop body make sure
9876 : : we vectorized all if-converted code. */
9877 : 419400 : if ((!profitable_subgraphs.is_empty () || force_clear) && orig_loop)
9878 : : {
9879 : 106 : gcc_assert (bb_vinfo->nbbs == 1);
9880 : 212 : for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]);
9881 : 4395 : !gsi_end_p (gsi); gsi_next (&gsi))
9882 : : {
9883 : : /* The costing above left us with DCEable vectorized scalar
9884 : : stmts having the visited flag set on profitable
9885 : : subgraphs. Do the delayed clearing of the flag here. */
9886 : 4289 : if (gimple_visited_p (gsi_stmt (gsi)))
9887 : : {
9888 : 1186 : gimple_set_visited (gsi_stmt (gsi), false);
9889 : 1186 : continue;
9890 : : }
9891 : 3103 : if (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED)
9892 : 868 : continue;
9893 : :
9894 : 6335 : if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi)))
9895 : 2648 : if (gimple_assign_rhs_code (ass) == COND_EXPR)
9896 : : {
9897 : 72 : if (!profitable_subgraphs.is_empty ()
9898 : 29 : && dump_enabled_p ())
9899 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
9900 : : "not profitable because of "
9901 : : "unprofitable if-converted scalar "
9902 : : "code\n");
9903 : 43 : profitable_subgraphs.truncate (0);
9904 : : }
9905 : : }
9906 : : }
9907 : :
9908 : : /* Finally schedule the profitable subgraphs. */
9909 : 1073118 : for (slp_instance instance : profitable_subgraphs)
9910 : : {
9911 : 485077 : if (!vectorized && dump_enabled_p ())
9912 : 718 : dump_printf_loc (MSG_NOTE, vect_location,
9913 : : "Basic block will be vectorized "
9914 : : "using SLP\n");
9915 : 485077 : vectorized = true;
9916 : :
9917 : : /* Dump before scheduling as store vectorization will remove
9918 : : the original stores and mess with the instance tree
9919 : : so querying its location will eventually ICE. */
9920 : 485077 : if (flag_checking)
9921 : 1950041 : for (slp_instance sub : instance->subgraph_entries)
9922 : 494810 : gcc_assert (SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub)));
9923 : 485077 : unsigned HOST_WIDE_INT bytes;
9924 : 485077 : if (dump_enabled_p ())
9925 : 3391 : for (slp_instance sub : instance->subgraph_entries)
9926 : : {
9927 : 898 : tree vtype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub));
9928 : 1796 : if (GET_MODE_SIZE (TYPE_MODE (vtype)).is_constant (&bytes))
9929 : 898 : dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
9930 : 898 : sub->location (),
9931 : : "basic block part vectorized using %wu "
9932 : : "byte vectors\n", bytes);
9933 : : else
9934 : : dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
9935 : : sub->location (),
9936 : : "basic block part vectorized using "
9937 : : "variable length vectors\n");
9938 : : }
9939 : :
9940 : 485077 : dump_user_location_t saved_vect_location = vect_location;
9941 : 485077 : vect_location = instance->location ();
9942 : :
9943 : 485077 : vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
9944 : :
9945 : 485077 : vect_location = saved_vect_location;
9946 : : }
9947 : :
9948 : :
9949 : : /* Generate the invariant statements. */
9950 : 250759 : if (!gimple_seq_empty_p (bb_vinfo->inv_pattern_def_seq))
9951 : : {
9952 : 23 : if (dump_enabled_p ())
9953 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
9954 : : "------>generating invariant statements\n");
9955 : :
9956 : 23 : bb_vinfo->insert_seq_on_entry (NULL,
9957 : : bb_vinfo->inv_pattern_def_seq);
9958 : : }
9959 : 250759 : }
9960 : : else
9961 : : {
9962 : 2178694 : if (dump_enabled_p ())
9963 : 1325 : dump_printf_loc (MSG_NOTE, vect_location,
9964 : : "***** Analysis failed with vector mode %s\n",
9965 : 1325 : GET_MODE_NAME (bb_vinfo->vector_mode));
9966 : : }
9967 : :
9968 : 2429453 : if (mode_i == 0)
9969 : 2099119 : autodetected_vector_mode = bb_vinfo->vector_mode;
9970 : :
9971 : 2429453 : if (!fatal)
9972 : 3417355 : while (mode_i < vector_modes.length ()
9973 : 1837890 : && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
9974 : : {
9975 : 351082 : if (dump_enabled_p ())
9976 : 1638 : dump_printf_loc (MSG_NOTE, vect_location,
9977 : : "***** The result for vector mode %s would"
9978 : : " be the same\n",
9979 : 819 : GET_MODE_NAME (vector_modes[mode_i]));
9980 : 351082 : mode_i += 1;
9981 : : }
9982 : :
9983 : 2429453 : delete bb_vinfo;
9984 : :
9985 : 2429453 : if (mode_i < vector_modes.length ()
9986 : 2244888 : && VECTOR_MODE_P (autodetected_vector_mode)
9987 : 2104282 : && (related_vector_mode (vector_modes[mode_i],
9988 : : GET_MODE_INNER (autodetected_vector_mode))
9989 : 1052141 : == autodetected_vector_mode)
9990 : 4674341 : && (related_vector_mode (autodetected_vector_mode,
9991 : 552767 : GET_MODE_INNER (vector_modes[mode_i]))
9992 : 1105534 : == vector_modes[mode_i]))
9993 : : {
9994 : 552767 : if (dump_enabled_p ())
9995 : 209 : dump_printf_loc (MSG_NOTE, vect_location,
9996 : : "***** Skipping vector mode %s, which would"
9997 : : " repeat the analysis for %s\n",
9998 : 209 : GET_MODE_NAME (vector_modes[mode_i]),
9999 : 209 : GET_MODE_NAME (autodetected_vector_mode));
10000 : 552767 : mode_i += 1;
10001 : : }
10002 : :
10003 : 2429453 : if (vectorized
10004 : 2260841 : || mode_i == vector_modes.length ()
10005 : 2076316 : || autodetected_vector_mode == VOIDmode
10006 : : /* If vect_slp_analyze_bb_1 signaled that analysis for all
10007 : : vector sizes will fail do not bother iterating. */
10008 : 3313022 : || fatal)
10009 : 4198238 : return vectorized;
10010 : :
10011 : : /* Try the next biggest vector size. */
10012 : 330334 : next_vector_mode = vector_modes[mode_i++];
10013 : 330334 : if (dump_enabled_p ())
10014 : 219 : dump_printf_loc (MSG_NOTE, vect_location,
10015 : : "***** Re-trying analysis with vector mode %s\n",
10016 : 219 : GET_MODE_NAME (next_vector_mode));
10017 : 330334 : }
10018 : 2099119 : }
10019 : :
10020 : :
10021 : : /* Main entry for the BB vectorizer. Analyze and transform BBS, returns
10022 : : true if anything in the basic-block was vectorized. */
10023 : :
10024 : : static bool
10025 : 2099119 : vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop)
10026 : : {
10027 : 2099119 : vec<data_reference_p> datarefs = vNULL;
10028 : 2099119 : auto_vec<int> dataref_groups;
10029 : 2099119 : int insns = 0;
10030 : 2099119 : int current_group = 0;
10031 : :
10032 : 13470832 : for (unsigned i = 0; i < bbs.length (); i++)
10033 : : {
10034 : 11371713 : basic_block bb = bbs[i];
10035 : 92023332 : for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
10036 : 80651619 : gsi_next (&gsi))
10037 : : {
10038 : 80651619 : gimple *stmt = gsi_stmt (gsi);
10039 : 80651619 : if (is_gimple_debug (stmt))
10040 : 50122278 : continue;
10041 : :
10042 : 30529341 : insns++;
10043 : :
10044 : 30529341 : if (gimple_location (stmt) != UNKNOWN_LOCATION)
10045 : 27510110 : vect_location = stmt;
10046 : :
10047 : 30529341 : if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
10048 : : &dataref_groups, current_group))
10049 : 5221563 : ++current_group;
10050 : : }
10051 : : /* New BBs always start a new DR group. */
10052 : 11371713 : ++current_group;
10053 : : }
10054 : :
10055 : 2099119 : return vect_slp_region (bbs, datarefs, &dataref_groups, insns, orig_loop);
10056 : 2099119 : }
10057 : :
10058 : : /* Special entry for the BB vectorizer. Analyze and transform a single
10059 : : if-converted BB with ORIG_LOOPs body being the not if-converted
10060 : : representation. Returns true if anything in the basic-block was
10061 : : vectorized. */
10062 : :
10063 : : bool
10064 : 18346 : vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop)
10065 : : {
10066 : 18346 : auto_vec<basic_block> bbs;
10067 : 18346 : bbs.safe_push (bb);
10068 : 18346 : return vect_slp_bbs (bbs, orig_loop);
10069 : 18346 : }
10070 : :
10071 : : /* Main entry for the BB vectorizer. Analyze and transform BB, returns
10072 : : true if anything in the basic-block was vectorized. */
10073 : :
10074 : : bool
10075 : 912251 : vect_slp_function (function *fun)
10076 : : {
10077 : 912251 : bool r = false;
10078 : 912251 : int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
10079 : 912251 : auto_bitmap exit_bbs;
10080 : 912251 : bitmap_set_bit (exit_bbs, EXIT_BLOCK);
10081 : 912251 : edge entry = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (fun));
10082 : 912251 : unsigned n = rev_post_order_and_mark_dfs_back_seme (fun, entry, exit_bbs,
10083 : 912251 : true, rpo, NULL);
10084 : :
10085 : : /* For the moment split the function into pieces to avoid making
10086 : : the iteration on the vector mode moot. Split at points we know
10087 : : to not handle well which is CFG merges (SLP discovery doesn't
10088 : : handle non-loop-header PHIs) and loop exits. Since pattern
10089 : : recog requires reverse iteration to visit uses before defs
10090 : : simply chop RPO into pieces. */
10091 : 912251 : auto_vec<basic_block> bbs;
10092 : 12292709 : for (unsigned i = 0; i < n; i++)
10093 : : {
10094 : 11380458 : basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
10095 : 11380458 : bool split = false;
10096 : :
10097 : : /* Split when a BB is not dominated by the first block. */
10098 : 21487786 : if (!bbs.is_empty ()
10099 : 10107328 : && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
10100 : : {
10101 : 823256 : if (dump_enabled_p ())
10102 : 162 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10103 : : "splitting region at dominance boundary bb%d\n",
10104 : : bb->index);
10105 : : split = true;
10106 : : }
10107 : : /* Split when the loop determined by the first block
10108 : : is exited. This is because we eventually insert
10109 : : invariants at region begin. */
10110 : 19841274 : else if (!bbs.is_empty ()
10111 : 9284072 : && bbs[0]->loop_father != bb->loop_father
10112 : 2332801 : && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
10113 : : {
10114 : 5182 : if (dump_enabled_p ())
10115 : 6 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10116 : : "splitting region at loop %d exit at bb%d\n",
10117 : 3 : bbs[0]->loop_father->num, bb->index);
10118 : : split = true;
10119 : : }
10120 : 10552020 : else if (!bbs.is_empty ()
10121 : 9278890 : && bb->loop_father->header == bb
10122 : 481523 : && bb->loop_father->dont_vectorize)
10123 : : {
10124 : 6296 : if (dump_enabled_p ())
10125 : 72 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10126 : : "splitting region at dont-vectorize loop %d "
10127 : : "entry at bb%d\n",
10128 : : bb->loop_father->num, bb->index);
10129 : : split = true;
10130 : : }
10131 : :
10132 : 12215192 : if (split && !bbs.is_empty ())
10133 : : {
10134 : 834734 : r |= vect_slp_bbs (bbs, NULL);
10135 : 834734 : bbs.truncate (0);
10136 : : }
10137 : :
10138 : 11380458 : if (bbs.is_empty ())
10139 : : {
10140 : : /* We need to be able to insert at the head of the region which
10141 : : we cannot for region starting with a returns-twice call. */
10142 : 2107864 : if (gcall *first = safe_dyn_cast <gcall *> (first_stmt (bb)))
10143 : 411322 : if (gimple_call_flags (first) & ECF_RETURNS_TWICE)
10144 : : {
10145 : 294 : if (dump_enabled_p ())
10146 : 2 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10147 : : "skipping bb%d as start of region as it "
10148 : : "starts with returns-twice call\n",
10149 : : bb->index);
10150 : 27091 : continue;
10151 : : }
10152 : : /* If the loop this BB belongs to is marked as not to be vectorized
10153 : : honor that also for BB vectorization. */
10154 : 2107570 : if (bb->loop_father->dont_vectorize)
10155 : 26797 : continue;
10156 : : }
10157 : :
10158 : 11353367 : bbs.safe_push (bb);
10159 : :
10160 : : /* When we have a stmt ending this block and defining a
10161 : : value we have to insert on edges when inserting after it for
10162 : : a vector containing its definition. Avoid this for now. */
10163 : 22706734 : if (gimple *last = *gsi_last_bb (bb))
10164 : 8987439 : if (gimple_get_lhs (last)
10165 : 8987439 : && is_ctrl_altering_stmt (last))
10166 : : {
10167 : 333795 : if (dump_enabled_p ())
10168 : 2 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10169 : : "splitting region at control altering "
10170 : : "definition %G", last);
10171 : 333795 : r |= vect_slp_bbs (bbs, NULL);
10172 : 333795 : bbs.truncate (0);
10173 : : }
10174 : : }
10175 : :
10176 : 912251 : if (!bbs.is_empty ())
10177 : 912244 : r |= vect_slp_bbs (bbs, NULL);
10178 : :
10179 : 912251 : free (rpo);
10180 : :
10181 : 912251 : return r;
10182 : 912251 : }
10183 : :
10184 : : /* Build a variable-length vector in which the elements in ELTS are repeated
10185 : : to a fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
10186 : : RESULTS and add any new instructions to SEQ.
10187 : :
10188 : : The approach we use is:
10189 : :
10190 : : (1) Find a vector mode VM with integer elements of mode IM.
10191 : :
10192 : : (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
10193 : : ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
10194 : : from small vectors to IM.
10195 : :
10196 : : (3) Duplicate each ELTS'[I] into a vector of mode VM.
10197 : :
10198 : : (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
10199 : : correct byte contents.
10200 : :
10201 : : (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
10202 : :
10203 : : We try to find the largest IM for which this sequence works, in order
10204 : : to cut down on the number of interleaves. */
10205 : :
10206 : : void
10207 : 0 : duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
10208 : : const vec<tree> &elts, unsigned int nresults,
10209 : : vec<tree> &results)
10210 : : {
10211 : 0 : unsigned int nelts = elts.length ();
10212 : 0 : tree element_type = TREE_TYPE (vector_type);
10213 : :
10214 : : /* (1) Find a vector mode VM with integer elements of mode IM. */
10215 : 0 : unsigned int nvectors = 1;
10216 : 0 : tree new_vector_type;
10217 : 0 : tree permutes[2];
10218 : 0 : if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
10219 : : &nvectors, &new_vector_type,
10220 : : permutes))
10221 : 0 : gcc_unreachable ();
10222 : :
10223 : : /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
10224 : 0 : unsigned int partial_nelts = nelts / nvectors;
10225 : 0 : tree partial_vector_type = build_vector_type (element_type, partial_nelts);
10226 : :
10227 : 0 : tree_vector_builder partial_elts;
10228 : 0 : auto_vec<tree, 32> pieces (nvectors * 2);
10229 : 0 : pieces.quick_grow_cleared (nvectors * 2);
10230 : 0 : for (unsigned int i = 0; i < nvectors; ++i)
10231 : : {
10232 : : /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
10233 : : ELTS' has mode IM. */
10234 : 0 : partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
10235 : 0 : for (unsigned int j = 0; j < partial_nelts; ++j)
10236 : 0 : partial_elts.quick_push (elts[i * partial_nelts + j]);
10237 : 0 : tree t = gimple_build_vector (seq, &partial_elts);
10238 : 0 : t = gimple_build (seq, VIEW_CONVERT_EXPR,
10239 : 0 : TREE_TYPE (new_vector_type), t);
10240 : :
10241 : : /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
10242 : 0 : pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
10243 : : }
10244 : :
10245 : : /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
10246 : : correct byte contents.
10247 : :
10248 : : Conceptually, we need to repeat the following operation log2(nvectors)
10249 : : times, where hi_start = nvectors / 2:
10250 : :
10251 : : out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
10252 : : out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
10253 : :
10254 : : However, if each input repeats every N elements and the VF is
10255 : : a multiple of N * 2, the HI result is the same as the LO result.
10256 : : This will be true for the first N1 iterations of the outer loop,
10257 : : followed by N2 iterations for which both the LO and HI results
10258 : : are needed. I.e.:
10259 : :
10260 : : N1 + N2 = log2(nvectors)
10261 : :
10262 : : Each "N1 iteration" doubles the number of redundant vectors and the
10263 : : effect of the process as a whole is to have a sequence of nvectors/2**N1
10264 : : vectors that repeats 2**N1 times. Rather than generate these redundant
10265 : : vectors, we halve the number of vectors for each N1 iteration. */
10266 : : unsigned int in_start = 0;
10267 : : unsigned int out_start = nvectors;
10268 : : unsigned int new_nvectors = nvectors;
10269 : 0 : for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
10270 : : {
10271 : 0 : unsigned int hi_start = new_nvectors / 2;
10272 : 0 : unsigned int out_i = 0;
10273 : 0 : for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i)
10274 : : {
10275 : 0 : if ((in_i & 1) != 0
10276 : 0 : && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
10277 : : 2 * in_repeat))
10278 : 0 : continue;
10279 : :
10280 : 0 : tree output = make_ssa_name (new_vector_type);
10281 : 0 : tree input1 = pieces[in_start + (in_i / 2)];
10282 : 0 : tree input2 = pieces[in_start + (in_i / 2) + hi_start];
10283 : 0 : gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
10284 : : input1, input2,
10285 : : permutes[in_i & 1]);
10286 : 0 : gimple_seq_add_stmt (seq, stmt);
10287 : 0 : pieces[out_start + out_i] = output;
10288 : 0 : out_i += 1;
10289 : : }
10290 : 0 : std::swap (in_start, out_start);
10291 : 0 : new_nvectors = out_i;
10292 : : }
10293 : :
10294 : : /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
10295 : 0 : results.reserve (nresults);
10296 : 0 : for (unsigned int i = 0; i < nresults; ++i)
10297 : 0 : if (i < new_nvectors)
10298 : 0 : results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
10299 : 0 : pieces[in_start + i]));
10300 : : else
10301 : 0 : results.quick_push (results[i - new_nvectors]);
10302 : 0 : }
10303 : :
10304 : :
10305 : : /* For constant and loop invariant defs in OP_NODE this function creates
10306 : : vector defs that will be used in the vectorized stmts and stores them
10307 : : to SLP_TREE_VEC_DEFS of OP_NODE. */
10308 : :
10309 : : static void
10310 : 487937 : vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
10311 : : {
10312 : 487937 : unsigned HOST_WIDE_INT nunits;
10313 : 487937 : tree vec_cst;
10314 : 487937 : unsigned j, number_of_places_left_in_vector;
10315 : 487937 : tree vector_type;
10316 : 487937 : tree vop;
10317 : 487937 : int group_size = op_node->ops.length ();
10318 : 487937 : unsigned int vec_num, i;
10319 : 487937 : unsigned number_of_copies = 1;
10320 : 487937 : bool constant_p;
10321 : 487937 : gimple_seq ctor_seq = NULL;
10322 : 487937 : auto_vec<tree, 16> permute_results;
10323 : :
10324 : : /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
10325 : 487937 : vector_type = SLP_TREE_VECTYPE (op_node);
10326 : :
10327 : 487937 : unsigned int number_of_vectors = vect_get_num_copies (vinfo, op_node);
10328 : 487937 : SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
10329 : 487937 : auto_vec<tree> voprnds (number_of_vectors);
10330 : :
10331 : : /* NUMBER_OF_COPIES is the number of times we need to use the same values in
10332 : : created vectors. It is greater than 1 if unrolling is performed.
10333 : :
10334 : : For example, we have two scalar operands, s1 and s2 (e.g., group of
10335 : : strided accesses of size two), while NUNITS is four (i.e., four scalars
10336 : : of this type can be packed in a vector). The output vector will contain
10337 : : two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
10338 : : will be 2).
10339 : :
10340 : : If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
10341 : : containing the operands.
10342 : :
10343 : : For example, NUNITS is four as before, and the group size is 8
10344 : : (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
10345 : : {s5, s6, s7, s8}. */
10346 : :
10347 : : /* When using duplicate_and_interleave, we just need one element for
10348 : : each scalar statement. */
10349 : 487937 : if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
10350 : : nunits = group_size;
10351 : :
10352 : 487937 : number_of_copies = nunits * number_of_vectors / group_size;
10353 : :
10354 : 487937 : number_of_places_left_in_vector = nunits;
10355 : 487937 : constant_p = true;
10356 : 487937 : tree uniform_elt = NULL_TREE;
10357 : 487937 : tree_vector_builder elts (vector_type, nunits, 1);
10358 : 487937 : elts.quick_grow (nunits);
10359 : 487937 : stmt_vec_info insert_after = NULL;
10360 : 1364535 : for (j = 0; j < number_of_copies; j++)
10361 : : {
10362 : 876598 : tree op;
10363 : 3451768 : for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
10364 : : {
10365 : : /* Create 'vect_ = {op0,op1,...,opn}'. */
10366 : 1698572 : tree orig_op = op;
10367 : 1698572 : if (number_of_places_left_in_vector == nunits)
10368 : : uniform_elt = op;
10369 : 1086047 : else if (uniform_elt && operand_equal_p (uniform_elt, op))
10370 : 658382 : op = elts[number_of_places_left_in_vector];
10371 : : else
10372 : : uniform_elt = NULL_TREE;
10373 : 1698572 : number_of_places_left_in_vector--;
10374 : 1698572 : if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
10375 : : {
10376 : 285345 : if (CONSTANT_CLASS_P (op))
10377 : : {
10378 : 106312 : if (VECTOR_BOOLEAN_TYPE_P (vector_type))
10379 : : {
10380 : : /* Can't use VIEW_CONVERT_EXPR for booleans because
10381 : : of possibly different sizes of scalar value and
10382 : : vector element. */
10383 : 64 : if (integer_zerop (op))
10384 : 64 : op = build_int_cst (TREE_TYPE (vector_type), 0);
10385 : 0 : else if (integer_onep (op))
10386 : 0 : op = build_all_ones_cst (TREE_TYPE (vector_type));
10387 : : else
10388 : 0 : gcc_unreachable ();
10389 : : }
10390 : : else
10391 : 106248 : op = fold_unary (VIEW_CONVERT_EXPR,
10392 : : TREE_TYPE (vector_type), op);
10393 : 106312 : gcc_assert (op && CONSTANT_CLASS_P (op));
10394 : : }
10395 : : else
10396 : : {
10397 : 179033 : tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
10398 : 179033 : gimple *init_stmt;
10399 : 179033 : if (VECTOR_BOOLEAN_TYPE_P (vector_type))
10400 : : {
10401 : 397 : tree true_val
10402 : 397 : = build_all_ones_cst (TREE_TYPE (vector_type));
10403 : 397 : tree false_val
10404 : 397 : = build_zero_cst (TREE_TYPE (vector_type));
10405 : 397 : gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
10406 : 397 : init_stmt = gimple_build_assign (new_temp, COND_EXPR,
10407 : : op, true_val,
10408 : : false_val);
10409 : : }
10410 : : else
10411 : : {
10412 : 178636 : op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
10413 : : op);
10414 : 178636 : init_stmt
10415 : 178636 : = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
10416 : : op);
10417 : : }
10418 : 179033 : gimple_seq_add_stmt (&ctor_seq, init_stmt);
10419 : 179033 : op = new_temp;
10420 : : }
10421 : : }
10422 : 1698572 : elts[number_of_places_left_in_vector] = op;
10423 : 1698572 : if (!CONSTANT_CLASS_P (op))
10424 : 316200 : constant_p = false;
10425 : : /* For BB vectorization we have to compute an insert location
10426 : : when a def is inside the analyzed region since we cannot
10427 : : simply insert at the BB start in this case. */
10428 : 1698572 : stmt_vec_info opdef;
10429 : 1698572 : if (TREE_CODE (orig_op) == SSA_NAME
10430 : 178674 : && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
10431 : 165941 : && is_a <bb_vec_info> (vinfo)
10432 : 1812780 : && (opdef = vinfo->lookup_def (orig_op)))
10433 : : {
10434 : 91682 : if (!insert_after)
10435 : : insert_after = opdef;
10436 : : else
10437 : 50654 : insert_after = get_later_stmt (insert_after, opdef);
10438 : : }
10439 : :
10440 : 1698572 : if (number_of_places_left_in_vector == 0)
10441 : : {
10442 : 612525 : auto type_nunits = TYPE_VECTOR_SUBPARTS (vector_type);
10443 : 612525 : if (uniform_elt)
10444 : 626828 : vec_cst = gimple_build_vector_from_val (&ctor_seq, vector_type,
10445 : 313414 : elts[0]);
10446 : 598222 : else if (constant_p
10447 : 598222 : ? multiple_p (type_nunits, nunits)
10448 : 115173 : : known_eq (type_nunits, nunits))
10449 : 299111 : vec_cst = gimple_build_vector (&ctor_seq, &elts);
10450 : : else
10451 : : {
10452 : 0 : if (permute_results.is_empty ())
10453 : 0 : duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
10454 : : elts, number_of_vectors,
10455 : : permute_results);
10456 : 0 : vec_cst = permute_results[number_of_vectors - j - 1];
10457 : : }
10458 : 612525 : if (!gimple_seq_empty_p (ctor_seq))
10459 : : {
10460 : 140376 : if (insert_after)
10461 : : {
10462 : 41028 : gimple_stmt_iterator gsi;
10463 : 41028 : if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
10464 : : {
10465 : 853 : gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
10466 : 853 : gsi_insert_seq_before (&gsi, ctor_seq,
10467 : : GSI_CONTINUE_LINKING);
10468 : : }
10469 : 40175 : else if (!stmt_ends_bb_p (insert_after->stmt))
10470 : : {
10471 : 40175 : gsi = gsi_for_stmt (insert_after->stmt);
10472 : 40175 : gsi_insert_seq_after (&gsi, ctor_seq,
10473 : : GSI_CONTINUE_LINKING);
10474 : : }
10475 : : else
10476 : : {
10477 : : /* When we want to insert after a def where the
10478 : : defining stmt throws then insert on the fallthru
10479 : : edge. */
10480 : 0 : edge e = find_fallthru_edge
10481 : 0 : (gimple_bb (insert_after->stmt)->succs);
10482 : 0 : basic_block new_bb
10483 : 0 : = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
10484 : 0 : gcc_assert (!new_bb);
10485 : : }
10486 : : }
10487 : : else
10488 : 99348 : vinfo->insert_seq_on_entry (NULL, ctor_seq);
10489 : 140376 : ctor_seq = NULL;
10490 : : }
10491 : 612525 : voprnds.quick_push (vec_cst);
10492 : 612525 : insert_after = NULL;
10493 : 612525 : number_of_places_left_in_vector = nunits;
10494 : 612525 : constant_p = true;
10495 : 612525 : elts.new_vector (vector_type, nunits, 1);
10496 : 612525 : elts.quick_grow (nunits);
10497 : : }
10498 : : }
10499 : : }
10500 : :
10501 : : /* Since the vectors are created in the reverse order, we should invert
10502 : : them. */
10503 : 487937 : vec_num = voprnds.length ();
10504 : 1100462 : for (j = vec_num; j != 0; j--)
10505 : : {
10506 : 612525 : vop = voprnds[j - 1];
10507 : 612525 : SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
10508 : : }
10509 : :
10510 : : /* In case that VF is greater than the unrolling factor needed for the SLP
10511 : : group of stmts, NUMBER_OF_VECTORS to be created is greater than
10512 : : NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
10513 : : to replicate the vectors. */
10514 : 487937 : while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
10515 : 487937 : for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
10516 : : i++)
10517 : 0 : SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
10518 : 487937 : }
10519 : :
10520 : : /* Get the scalar definition of the Nth lane from SLP_NODE or NULL_TREE
10521 : : if there is no definition for it in the scalar IL or it is not known. */
10522 : :
10523 : : tree
10524 : 946 : vect_get_slp_scalar_def (slp_tree slp_node, unsigned n)
10525 : : {
10526 : 946 : if (SLP_TREE_DEF_TYPE (slp_node) == vect_internal_def)
10527 : : {
10528 : 946 : if (!SLP_TREE_SCALAR_STMTS (slp_node).exists ())
10529 : : return NULL_TREE;
10530 : 946 : stmt_vec_info def = SLP_TREE_SCALAR_STMTS (slp_node)[n];
10531 : 946 : if (!def)
10532 : : return NULL_TREE;
10533 : 946 : return gimple_get_lhs (STMT_VINFO_STMT (def));
10534 : : }
10535 : : else
10536 : 0 : return SLP_TREE_SCALAR_OPS (slp_node)[n];
10537 : : }
10538 : :
10539 : : /* Get the Ith vectorized definition from SLP_NODE. */
10540 : :
10541 : : tree
10542 : 143181 : vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
10543 : : {
10544 : 143181 : return SLP_TREE_VEC_DEFS (slp_node)[i];
10545 : : }
10546 : :
10547 : : /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
10548 : :
10549 : : void
10550 : 896639 : vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
10551 : : {
10552 : 1793278 : vec_defs->create (SLP_TREE_VEC_DEFS (slp_node).length ());
10553 : 896639 : vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
10554 : 896639 : }
10555 : :
10556 : : /* Get N vectorized definitions for SLP_NODE. */
10557 : :
10558 : : void
10559 : 2929 : vect_get_slp_defs (vec_info *,
10560 : : slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
10561 : : {
10562 : 2929 : if (n == -1U)
10563 : 2929 : n = SLP_TREE_CHILDREN (slp_node).length ();
10564 : :
10565 : 10194 : for (unsigned i = 0; i < n; ++i)
10566 : : {
10567 : 7265 : slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
10568 : 7265 : vec<tree> vec_defs = vNULL;
10569 : 7265 : vect_get_slp_defs (child, &vec_defs);
10570 : 7265 : vec_oprnds->quick_push (vec_defs);
10571 : : }
10572 : 2929 : }
10573 : :
10574 : : /* A subroutine of vect_transform_slp_perm_load with two extra arguments:
10575 : : - PERM gives the permutation that the caller wants to use for NODE,
10576 : : which might be different from SLP_LOAD_PERMUTATION.
10577 : : - DUMP_P controls whether the function dumps information. */
10578 : :
10579 : : static bool
10580 : 124623 : vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node,
10581 : : load_permutation_t &perm,
10582 : : const vec<tree> &dr_chain,
10583 : : gimple_stmt_iterator *gsi, poly_uint64 vf,
10584 : : bool analyze_only, bool dump_p,
10585 : : unsigned *n_perms, unsigned int *n_loads,
10586 : : bool dce_chain)
10587 : : {
10588 : 124623 : stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
10589 : 124623 : int vec_index = 0;
10590 : 124623 : tree vectype = SLP_TREE_VECTYPE (node);
10591 : 124623 : unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
10592 : 124623 : unsigned int mask_element;
10593 : 124623 : unsigned dr_group_size;
10594 : 124623 : machine_mode mode;
10595 : :
10596 : 124623 : if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
10597 : : dr_group_size = 1;
10598 : : else
10599 : : {
10600 : 123650 : stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
10601 : 123650 : dr_group_size = DR_GROUP_SIZE (stmt_info);
10602 : : }
10603 : :
10604 : 124623 : mode = TYPE_MODE (vectype);
10605 : 124623 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
10606 : 124623 : unsigned int nstmts = vect_get_num_copies (vinfo, node);
10607 : :
10608 : : /* Initialize the vect stmts of NODE to properly insert the generated
10609 : : stmts later. */
10610 : 124623 : if (! analyze_only)
10611 : 57587 : for (unsigned i = SLP_TREE_VEC_DEFS (node).length (); i < nstmts; i++)
10612 : 22663 : SLP_TREE_VEC_DEFS (node).quick_push (NULL_TREE);
10613 : :
10614 : : /* Generate permutation masks for every NODE. Number of masks for each NODE
10615 : : is equal to GROUP_SIZE.
10616 : : E.g., we have a group of three nodes with three loads from the same
10617 : : location in each node, and the vector size is 4. I.e., we have a
10618 : : a0b0c0a1b1c1... sequence and we need to create the following vectors:
10619 : : for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
10620 : : for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
10621 : : ...
10622 : :
10623 : : The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
10624 : : The last mask is illegal since we assume two operands for permute
10625 : : operation, and the mask element values can't be outside that range.
10626 : : Hence, the last mask must be converted into {2,5,5,5}.
10627 : : For the first two permutations we need the first and the second input
10628 : : vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
10629 : : we need the second and the third vectors: {b1,c1,a2,b2} and
10630 : : {c2,a3,b3,c3}. */
10631 : :
10632 : 124623 : int vect_stmts_counter = 0;
10633 : 124623 : unsigned int index = 0;
10634 : 124623 : int first_vec_index = -1;
10635 : 124623 : int second_vec_index = -1;
10636 : 124623 : bool noop_p = true;
10637 : 124623 : *n_perms = 0;
10638 : :
10639 : 124623 : vec_perm_builder mask;
10640 : 124623 : unsigned int nelts_to_build;
10641 : 124623 : unsigned int nvectors_per_build;
10642 : 124623 : unsigned int in_nlanes;
10643 : 124623 : bool repeating_p = (group_size == dr_group_size
10644 : 166056 : && multiple_p (nunits, group_size));
10645 : 124623 : if (repeating_p)
10646 : : {
10647 : : /* A single vector contains a whole number of copies of the node, so:
10648 : : (a) all permutes can use the same mask; and
10649 : : (b) the permutes only need a single vector input. */
10650 : 34631 : mask.new_vector (nunits, group_size, 3);
10651 : 34631 : nelts_to_build = mask.encoded_nelts ();
10652 : : /* It's possible to obtain zero nstmts during analyze_only, so make
10653 : : it at least one to ensure the later computation for n_perms
10654 : : proceed. */
10655 : 34631 : nvectors_per_build = nstmts > 0 ? nstmts : 1;
10656 : 34631 : in_nlanes = dr_group_size * 3;
10657 : : }
10658 : : else
10659 : : {
10660 : : /* We need to construct a separate mask for each vector statement. */
10661 : 89992 : unsigned HOST_WIDE_INT const_nunits, const_vf;
10662 : 89992 : if (!nunits.is_constant (&const_nunits)
10663 : 89992 : || !vf.is_constant (&const_vf))
10664 : : return false;
10665 : 89992 : mask.new_vector (const_nunits, const_nunits, 1);
10666 : 89992 : nelts_to_build = const_vf * group_size;
10667 : 89992 : nvectors_per_build = 1;
10668 : 89992 : in_nlanes = const_vf * dr_group_size;
10669 : : }
10670 : 124623 : auto_sbitmap used_in_lanes (in_nlanes);
10671 : 124623 : bitmap_clear (used_in_lanes);
10672 : 124623 : auto_bitmap used_defs;
10673 : :
10674 : 124623 : unsigned int count = mask.encoded_nelts ();
10675 : 124623 : mask.quick_grow (count);
10676 : 124623 : vec_perm_indices indices;
10677 : :
10678 : 679400 : for (unsigned int j = 0; j < nelts_to_build; j++)
10679 : : {
10680 : 565162 : unsigned int iter_num = j / group_size;
10681 : 565162 : unsigned int stmt_num = j % group_size;
10682 : 565162 : unsigned int i = (iter_num * dr_group_size + perm[stmt_num]);
10683 : 565162 : bitmap_set_bit (used_in_lanes, i);
10684 : 565162 : if (repeating_p)
10685 : : {
10686 : : first_vec_index = 0;
10687 : : mask_element = i;
10688 : : }
10689 : : else
10690 : : {
10691 : : /* Enforced before the loop when !repeating_p. */
10692 : 342028 : unsigned int const_nunits = nunits.to_constant ();
10693 : 342028 : vec_index = i / const_nunits;
10694 : 342028 : mask_element = i % const_nunits;
10695 : 342028 : if (vec_index == first_vec_index
10696 : 342028 : || first_vec_index == -1)
10697 : : {
10698 : : first_vec_index = vec_index;
10699 : : }
10700 : 131180 : else if (vec_index == second_vec_index
10701 : 131180 : || second_vec_index == -1)
10702 : : {
10703 : 124850 : second_vec_index = vec_index;
10704 : 124850 : mask_element += const_nunits;
10705 : : }
10706 : : else
10707 : : {
10708 : 6330 : if (dump_p)
10709 : 223 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10710 : : "permutation requires at "
10711 : : "least three vectors %G",
10712 : : stmt_info->stmt);
10713 : 6330 : gcc_assert (analyze_only);
10714 : : return false;
10715 : : }
10716 : :
10717 : 335698 : gcc_assert (mask_element < 2 * const_nunits);
10718 : : }
10719 : :
10720 : 558832 : if (mask_element != index)
10721 : 361695 : noop_p = false;
10722 : 558832 : mask[index++] = mask_element;
10723 : :
10724 : 558832 : if (index == count)
10725 : : {
10726 : 148730 : if (!noop_p)
10727 : : {
10728 : 197059 : indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
10729 : 119577 : if (!can_vec_perm_const_p (mode, mode, indices))
10730 : : {
10731 : 4055 : if (dump_p)
10732 : : {
10733 : 75 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10734 : : "unsupported vect permute { ");
10735 : 625 : for (i = 0; i < count; ++i)
10736 : : {
10737 : 550 : dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
10738 : 550 : dump_printf (MSG_MISSED_OPTIMIZATION, " ");
10739 : : }
10740 : 75 : dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
10741 : : }
10742 : 4055 : gcc_assert (analyze_only);
10743 : : return false;
10744 : : }
10745 : :
10746 : 115522 : tree mask_vec = NULL_TREE;
10747 : 115522 : if (!analyze_only)
10748 : 19716 : mask_vec = vect_gen_perm_mask_checked (vectype, indices);
10749 : :
10750 : 115522 : if (second_vec_index == -1)
10751 : 39699 : second_vec_index = first_vec_index;
10752 : :
10753 : 236734 : for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
10754 : : {
10755 : 121212 : ++*n_perms;
10756 : 121212 : if (analyze_only)
10757 : 99724 : continue;
10758 : : /* Generate the permute statement if necessary. */
10759 : 21488 : tree first_vec = dr_chain[first_vec_index + ri];
10760 : 21488 : tree second_vec = dr_chain[second_vec_index + ri];
10761 : 21488 : gassign *stmt = as_a<gassign *> (stmt_info->stmt);
10762 : 21488 : tree perm_dest
10763 : 21488 : = vect_create_destination_var (gimple_assign_lhs (stmt),
10764 : : vectype);
10765 : 21488 : perm_dest = make_ssa_name (perm_dest);
10766 : 21488 : gimple *perm_stmt
10767 : 21488 : = gimple_build_assign (perm_dest, VEC_PERM_EXPR, first_vec,
10768 : : second_vec, mask_vec);
10769 : 21488 : vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
10770 : : gsi);
10771 : 21488 : if (dce_chain)
10772 : : {
10773 : 20549 : bitmap_set_bit (used_defs, first_vec_index + ri);
10774 : 20549 : bitmap_set_bit (used_defs, second_vec_index + ri);
10775 : : }
10776 : :
10777 : : /* Store the vector statement in NODE. */
10778 : 21488 : SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = perm_dest;
10779 : : }
10780 : : }
10781 : 29153 : else if (!analyze_only)
10782 : : {
10783 : 2350 : for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
10784 : : {
10785 : 1175 : tree first_vec = dr_chain[first_vec_index + ri];
10786 : : /* If mask was NULL_TREE generate the requested
10787 : : identity transform. */
10788 : 1175 : if (dce_chain)
10789 : 1174 : bitmap_set_bit (used_defs, first_vec_index + ri);
10790 : :
10791 : : /* Store the vector statement in NODE. */
10792 : 1175 : SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = first_vec;
10793 : : }
10794 : : }
10795 : :
10796 : : index = 0;
10797 : : first_vec_index = -1;
10798 : : second_vec_index = -1;
10799 : : noop_p = true;
10800 : : }
10801 : : }
10802 : :
10803 : 114238 : if (n_loads)
10804 : : {
10805 : 0 : if (repeating_p)
10806 : 0 : *n_loads = nstmts;
10807 : : else
10808 : : {
10809 : : /* Enforced above when !repeating_p. */
10810 : 0 : unsigned int const_nunits = nunits.to_constant ();
10811 : 0 : *n_loads = 0;
10812 : 0 : bool load_seen = false;
10813 : 0 : for (unsigned i = 0; i < in_nlanes; ++i)
10814 : : {
10815 : 0 : if (i % const_nunits == 0)
10816 : : {
10817 : 0 : if (load_seen)
10818 : 0 : *n_loads += 1;
10819 : : load_seen = false;
10820 : : }
10821 : 0 : if (bitmap_bit_p (used_in_lanes, i))
10822 : 0 : load_seen = true;
10823 : : }
10824 : 0 : if (load_seen)
10825 : 0 : *n_loads += 1;
10826 : : }
10827 : : }
10828 : :
10829 : 114238 : if (dce_chain)
10830 : 175898 : for (unsigned i = 0; i < dr_chain.length (); ++i)
10831 : 34485 : if (!bitmap_bit_p (used_defs, i))
10832 : : {
10833 : 2764 : tree def = dr_chain[i];
10834 : 2855 : do
10835 : : {
10836 : 2855 : gimple *stmt = SSA_NAME_DEF_STMT (def);
10837 : 2855 : if (is_gimple_assign (stmt)
10838 : 2855 : && (gimple_assign_rhs_code (stmt) == VIEW_CONVERT_EXPR
10839 : 2855 : || gimple_assign_rhs_code (stmt) == CONSTRUCTOR))
10840 : 529 : def = single_ssa_tree_operand (stmt, SSA_OP_USE);
10841 : : else
10842 : : def = NULL;
10843 : 2855 : gimple_stmt_iterator rgsi = gsi_for_stmt (stmt);
10844 : 2855 : gsi_remove (&rgsi, true);
10845 : 2855 : release_defs (stmt);
10846 : : }
10847 : 2855 : while (def);
10848 : : }
10849 : :
10850 : : return true;
10851 : 124623 : }
10852 : :
10853 : : /* Generate vector permute statements from a list of loads in DR_CHAIN.
10854 : : If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
10855 : : permute statements for the SLP node NODE. Store the number of vector
10856 : : permute instructions in *N_PERMS and the number of vector load
10857 : : instructions in *N_LOADS. If DCE_CHAIN is true, remove all definitions
10858 : : that were not needed. */
10859 : :
10860 : : bool
10861 : 77977 : vect_transform_slp_perm_load (vec_info *vinfo,
10862 : : slp_tree node, const vec<tree> &dr_chain,
10863 : : gimple_stmt_iterator *gsi, poly_uint64 vf,
10864 : : bool analyze_only, unsigned *n_perms,
10865 : : unsigned int *n_loads, bool dce_chain)
10866 : : {
10867 : 77977 : return vect_transform_slp_perm_load_1 (vinfo, node,
10868 : 77977 : SLP_TREE_LOAD_PERMUTATION (node),
10869 : : dr_chain, gsi, vf, analyze_only,
10870 : : dump_enabled_p (), n_perms, n_loads,
10871 : 77977 : dce_chain);
10872 : : }
10873 : :
10874 : : /* Produce the next vector result for SLP permutation NODE by adding a vector
10875 : : statement at GSI. If MASK_VEC is nonnull, add:
10876 : :
10877 : : <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC>
10878 : :
10879 : : otherwise add:
10880 : :
10881 : : <new SSA name> = FIRST_DEF. */
10882 : :
10883 : : static void
10884 : 30384 : vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
10885 : : slp_tree node, tree first_def, tree second_def,
10886 : : tree mask_vec, poly_uint64 identity_offset)
10887 : : {
10888 : 30384 : tree vectype = SLP_TREE_VECTYPE (node);
10889 : :
10890 : : /* ??? We SLP match existing vector element extracts but
10891 : : allow punning which we need to re-instantiate at uses
10892 : : but have no good way of explicitly representing. */
10893 : 30384 : if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)), TYPE_SIZE (vectype))
10894 : 30384 : && !types_compatible_p (TREE_TYPE (first_def), vectype))
10895 : : {
10896 : 13 : gassign *conv_stmt
10897 : 13 : = gimple_build_assign (make_ssa_name (vectype),
10898 : : build1 (VIEW_CONVERT_EXPR, vectype, first_def));
10899 : 13 : vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
10900 : 13 : first_def = gimple_assign_lhs (conv_stmt);
10901 : : }
10902 : 30384 : gassign *perm_stmt;
10903 : 30384 : tree perm_dest = make_ssa_name (vectype);
10904 : 30384 : if (mask_vec)
10905 : : {
10906 : 27380 : if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)),
10907 : 27380 : TYPE_SIZE (vectype))
10908 : 27380 : && !types_compatible_p (TREE_TYPE (second_def), vectype))
10909 : : {
10910 : 8 : gassign *conv_stmt
10911 : 8 : = gimple_build_assign (make_ssa_name (vectype),
10912 : : build1 (VIEW_CONVERT_EXPR,
10913 : : vectype, second_def));
10914 : 8 : vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
10915 : 8 : second_def = gimple_assign_lhs (conv_stmt);
10916 : : }
10917 : 27380 : perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
10918 : : first_def, second_def,
10919 : : mask_vec);
10920 : : }
10921 : 3004 : else if (!types_compatible_p (TREE_TYPE (first_def), vectype))
10922 : : {
10923 : : /* For identity permutes we still need to handle the case
10924 : : of offsetted extracts or concats. */
10925 : 205 : unsigned HOST_WIDE_INT c;
10926 : 205 : auto first_def_nunits
10927 : 205 : = TYPE_VECTOR_SUBPARTS (TREE_TYPE (first_def));
10928 : 205 : if (known_le (TYPE_VECTOR_SUBPARTS (vectype), first_def_nunits))
10929 : : {
10930 : 201 : unsigned HOST_WIDE_INT elsz
10931 : 201 : = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (first_def))));
10932 : 402 : tree lowpart = build3 (BIT_FIELD_REF, vectype, first_def,
10933 : 201 : TYPE_SIZE (vectype),
10934 : 201 : bitsize_int (identity_offset * elsz));
10935 : 201 : perm_stmt = gimple_build_assign (perm_dest, lowpart);
10936 : : }
10937 : 4 : else if (constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype),
10938 : 4 : first_def_nunits, &c) && c == 2)
10939 : : {
10940 : 4 : tree ctor = build_constructor_va (vectype, 2, NULL_TREE, first_def,
10941 : : NULL_TREE, second_def);
10942 : 4 : perm_stmt = gimple_build_assign (perm_dest, ctor);
10943 : : }
10944 : : else
10945 : 0 : gcc_unreachable ();
10946 : : }
10947 : : else
10948 : : {
10949 : : /* We need a copy here in case the def was external. */
10950 : 2799 : perm_stmt = gimple_build_assign (perm_dest, first_def);
10951 : : }
10952 : 30384 : vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
10953 : : /* Store the vector statement in NODE. */
10954 : 30384 : node->push_vec_def (perm_stmt);
10955 : 30384 : }
10956 : :
10957 : : /* Subroutine of vectorizable_slp_permutation. Check whether the target
10958 : : can perform permutation PERM on the (1 or 2) input nodes in CHILDREN.
10959 : : If GSI is nonnull, emit the permutation there.
10960 : :
10961 : : When GSI is null, the only purpose of NODE is to give properties
10962 : : of the result, such as the vector type and number of SLP lanes.
10963 : : The node does not need to be a VEC_PERM_EXPR.
10964 : :
10965 : : If the target supports the operation, return the number of individual
10966 : : VEC_PERM_EXPRs needed, otherwise return -1. Print information to the
10967 : : dump file if DUMP_P is true. */
10968 : :
 10969 : : static int
 10970 : 425351 : vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
 10971 : : slp_tree node, lane_permutation_t &perm,
 10972 : : vec<slp_tree> &children, bool dump_p)
 10973 : : {
 10974 : 425351 : tree vectype = SLP_TREE_VECTYPE (node);
 10975 : :
 10976 : : /* ??? We currently only support all same vector input types
 10977 : : while the SLP IL should really do a concat + select and thus accept
 10978 : : arbitrary mismatches. */
 10979 : 425351 : slp_tree child;
 10980 : 425351 : unsigned i;
 10981 : 425351 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
 10982 : 425351 : bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node));
 10983 : : /* True if we're permuting a single input of 2N vectors down
 10984 : : to N vectors. This case doesn't generalize beyond 2 since
 10985 : : VEC_PERM_EXPR only takes 2 inputs. */
 10986 : 425351 : bool pack_p = false;
 10987 : : /* If we're permuting inputs of N vectors each into X*N outputs,
 10988 : : this is the value of X, otherwise it is 1. */
 10989 : 425351 : unsigned int unpack_factor = 1;
 10990 : 425351 : tree op_vectype = NULL_TREE;
 10991 : 426552 : FOR_EACH_VEC_ELT (children, i, child)
 10992 : 426473 : if (SLP_TREE_VECTYPE (child))
 10993 : : {
 10994 : : op_vectype = SLP_TREE_VECTYPE (child);
 10995 : : break;
 10996 : : }
 10997 : 425351 : if (!op_vectype)
 10998 : 79 : op_vectype = vectype;
 10999 : 905565 : FOR_EACH_VEC_ELT (children, i, child)
 11000 : : {
 11001 : 480214 : if ((SLP_TREE_DEF_TYPE (child) != vect_internal_def
 11002 : 9565 : && !vect_maybe_update_slp_op_vectype (child, op_vectype))
 11003 : 480214 : || !types_compatible_p (SLP_TREE_VECTYPE (child), op_vectype)
 11004 : 960428 : || !types_compatible_p (TREE_TYPE (vectype), TREE_TYPE (op_vectype)))
 11005 : : {
 11006 : 0 : if (dump_p)
 11007 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 11008 : : "Unsupported vector types in lane permutation\n");
 11009 : 0 : return -1;
 11010 : : }
 11011 : 480214 : auto op_nunits = TYPE_VECTOR_SUBPARTS (op_vectype);
 11012 : 480214 : unsigned int this_unpack_factor;
 11013 : : /* Detect permutations of external, pre-existing vectors. The external
 11014 : : node's SLP_TREE_LANES stores the total number of units in the vector,
 11015 : : or zero if the vector has variable length.
 11016 : :
 11017 : : We are expected to keep the original VEC_PERM_EXPR for such cases.
 11018 : : There is no repetition to model. */
 11019 : 480214 : if (SLP_TREE_DEF_TYPE (child) == vect_external_def
 11020 : 480214 : && SLP_TREE_SCALAR_OPS (child).is_empty ())
 11021 : : repeating_p = false;
 11022 : : /* Check whether the input has twice as many lanes per vector. */
 11023 : 472786 : else if (children.length () == 1
 11024 : 472786 : && known_eq (SLP_TREE_LANES (child) * nunits,
 11025 : : SLP_TREE_LANES (node) * op_nunits * 2))
 11026 : : pack_p = true;
 11027 : : /* Check whether the output has N times as many lanes per vector. */
 11028 : 480214 : else if (constant_multiple_p (SLP_TREE_LANES (node) * op_nunits,
 11029 : 430867 : SLP_TREE_LANES (child) * nunits,
 11030 : : &this_unpack_factor)
 11031 : 396179 : && (i == 0 || unpack_factor == this_unpack_factor))
 11032 : : unpack_factor = this_unpack_factor;
 11033 : : else
 11034 : : repeating_p = false;
 11035 : : }
 11036 : :
 11037 : 850702 : gcc_assert (perm.length () == SLP_TREE_LANES (node));
 11038 : :
 11039 : : /* Load-lanes permute. This permute only acts as a forwarder to
 11040 : : select the correct vector def of the load-lanes load which
 11041 : : has the permuted vectors in its vector defs like
 11042 : : { v0, w0, r0, v1, w1, r1 ... } for a ld3. All costs are
 11043 : : accounted for in the costing for the actual load so we
 11044 : : return zero here. */
 11045 : 425351 : if (node->ldst_lanes)
 11046 : : {
 11047 : 0 : gcc_assert (children.length () == 1);
 11048 : 0 : if (!gsi)
 11049 : : /* This is a trivial op always supported. */
 11050 : : return 0;
 11051 : 0 : slp_tree child = children[0];
 11052 : 0 : unsigned vec_idx = (SLP_TREE_LANE_PERMUTATION (node)[0].second
 11053 : 0 : / SLP_TREE_LANES (node));
 11054 : 0 : unsigned vec_num = SLP_TREE_LANES (child) / SLP_TREE_LANES (node);
 11055 : 0 : unsigned nvectors = vect_get_num_copies (vinfo, node);
 11056 : 0 : for (unsigned i = 0; i < nvectors; ++i)
 11057 : : {
 11058 : 0 : tree def = SLP_TREE_VEC_DEFS (child)[i * vec_num + vec_idx];
 11059 : 0 : node->push_vec_def (def);
 11060 : : }
 11061 : : return 0;
 11062 : : }
 11063 : :
 11064 : : /* Set REPEATING_P to true if the permutations are cyclical wrt UNPACK_FACTOR
 11065 : : and if we can generate the vectors in a vector-length agnostic way.
 11066 : : This requires UNPACK_STEP == NUNITS / UNPACK_FACTOR to be known at
 11067 : : compile time.
 11068 : :
 11069 : : The significance of UNPACK_STEP is that, when PACK_P is false,
 11070 : : output vector I operates on a window of UNPACK_STEP elements from each
 11071 : : input, starting at lane UNPACK_STEP * (I % UNPACK_FACTOR). For example,
 11072 : : when UNPACK_FACTOR is 2, the first output vector operates on lanes
 11073 : : [0, NUNITS / 2 - 1] of each input vector and the second output vector
 11074 : : operates on lanes [NUNITS / 2, NUNITS - 1] of each input vector.
 11075 : :
 11076 : : When REPEATING_P is true, NOUTPUTS holds the total number of outputs
 11077 : : that we actually need to generate. */
 11078 : 425351 : uint64_t noutputs = 0;
 11079 : 425351 : poly_uint64 unpack_step = 0;
 11080 : 425351 : loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo);
 11081 : 127803 : if (!linfo
 11082 : 463555 : || !multiple_p (nunits, unpack_factor, &unpack_step)
 11083 : 126914 : || !constant_multiple_p (LOOP_VINFO_VECT_FACTOR (linfo)
 11084 : 126914 : * SLP_TREE_LANES (node), nunits, &noutputs))
 11085 : : repeating_p = false;
 11086 : :
 11087 : : /* We can handle the conditions described for REPEATING_P above for
 11088 : : both variable- and constant-length vectors. The fallback requires
 11089 : : us to generate every element of every permute vector explicitly,
 11090 : : which is only possible for constant-length permute vectors.
 11091 : :
 11092 : : Set:
 11093 : :
 11094 : : - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
 11095 : : mask vectors that we want to build.
 11096 : :
 11097 : : - NCOPIES to the number of copies of PERM that we need in order
 11098 : : to build the necessary permute mask vectors. */
 11099 : 126914 : uint64_t npatterns;
 11100 : 126914 : unsigned nelts_per_pattern;
 11101 : 126914 : uint64_t ncopies;
 11102 : 126914 : if (repeating_p)
 11103 : : {
 11104 : : /* We need permute mask vectors that have the form:
 11105 : :
 11106 : : { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }
 11107 : :
 11108 : : In other words, the original n-element permute in PERM is
 11109 : : "unrolled" to fill a full vector. The stepped vector encoding
 11110 : : that we use for permutes requires 3n elements. */
 11111 : 88710 : npatterns = SLP_TREE_LANES (node);
 11112 : 88710 : nelts_per_pattern = ncopies = 3;
 11113 : : }
 11114 : : else
 11115 : : {
 11116 : : /* Calculate every element of every permute mask vector explicitly,
 11117 : : instead of relying on the pattern described above. */
 11118 : 336641 : if (!nunits.is_constant (&npatterns)
 11119 : 336641 : || !TYPE_VECTOR_SUBPARTS (op_vectype).is_constant ())
 11120 : : {
 11121 : : if (dump_p)
 11122 : : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 11123 : : "unsupported permutation %p on variable-length"
 11124 : : " vectors\n", (void *) node);
 11125 : : return -1;
 11126 : : }
 11127 : 336641 : nelts_per_pattern = ncopies = 1;
 11128 : 336641 : if (linfo && !LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
 11129 : : {
 11130 : : if (dump_p)
 11131 : : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 11132 : : "unsupported permutation %p for variable VF\n",
 11133 : : (void *) node);
 11134 : : return -1;
 11135 : : }
 11136 : : pack_p = false;
 11137 : : unpack_factor = 1;
 11138 : : }
 : : /* OLANES is the total number of { operand, lane } entries needed for
 : : the intermediate permutation sequence built below. */
 11139 : 425351 : unsigned olanes = unpack_factor * ncopies * SLP_TREE_LANES (node);
 11140 : 425351 : gcc_assert (repeating_p || multiple_p (olanes, nunits));
 11141 : :
 11142 : : /* Compute the { { SLP operand, vector index}, lane } permutation sequence
 11143 : : from the { SLP operand, scalar lane } permutation as recorded in the
 11144 : : SLP node as intermediate step. This part should already work
 11145 : : with SLP children with arbitrary number of lanes. */
 11146 : 425351 : auto_vec<std::pair<std::pair<unsigned, unsigned>, poly_uint64>> vperm;
 11147 : 425351 : auto_vec<poly_uint64> active_lane;
 11148 : 425351 : vperm.create (olanes);
 11149 : 425351 : active_lane.safe_grow_cleared (children.length (), true);
 11150 : 857597 : for (unsigned int ui = 0; ui < unpack_factor; ++ui)
 11151 : : {
 11152 : 1852452 : for (unsigned j = 0; j < children.length (); ++j)
 11153 : 493980 : active_lane[j] = ui * unpack_step;
 11154 : 1162351 : for (unsigned i = 0; i < ncopies; ++i)
 11155 : : {
 11156 : 4556132 : for (unsigned pi = 0; pi < perm.length (); ++pi)
 11157 : : {
 11158 : 1547961 : std::pair<unsigned, unsigned> p = perm[pi];
 11159 : 1547961 : tree vtype = SLP_TREE_VECTYPE (children[p.first]);
 11160 : 1547961 : if (repeating_p)
 11161 : 501747 : vperm.quick_push ({{p.first, 0},
 11162 : 501747 : p.second + active_lane[p.first]});
 11163 : : else
 11164 : : {
 11165 : : /* We checked above that the vectors are constant-length. */
 11166 : 1046214 : unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype)
 11167 : 1046214 : .to_constant ();
 11168 : 1046214 : unsigned lane = active_lane[p.first].to_constant ();
 11169 : 1046214 : unsigned vi = (lane + p.second) / vnunits;
 11170 : 1046214 : unsigned vl = (lane + p.second) % vnunits;
 11171 : 1046214 : vperm.quick_push ({{p.first, vi}, vl});
 11172 : : }
 11173 : : }
 11174 : : /* Advance to the next group. */
 11175 : 1564813 : for (unsigned j = 0; j < children.length (); ++j)
 11176 : 834708 : active_lane[j] += SLP_TREE_LANES (children[j]);
 11177 : : }
 11178 : : }
 11179 : :
 11180 : 425351 : if (dump_p)
 11181 : : {
 11182 : 8174 : dump_printf_loc (MSG_NOTE, vect_location,
 11183 : : "vectorizing permutation %p", (void *)node);
 11184 : 29573 : for (unsigned i = 0; i < perm.length (); ++i)
 11185 : 21399 : dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
 11186 : 8174 : if (repeating_p)
 11187 : 6932 : dump_printf (MSG_NOTE, " (repeat %d)", SLP_TREE_LANES (node));
 11188 : 8174 : dump_printf (MSG_NOTE, "\n");
 11189 : 8174 : dump_printf_loc (MSG_NOTE, vect_location, "as");
 11190 : 82683 : for (unsigned i = 0; i < vperm.length (); ++i)
 11191 : : {
 11192 : 74509 : if (i != 0
 11193 : 74509 : && (repeating_p
 11194 : 51427 : ? multiple_p (i, npatterns)
 11195 : 55128 : : multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype))))
 11196 : 22417 : dump_printf (MSG_NOTE, ",");
 11197 : 74509 : dump_printf (MSG_NOTE, " vops%u[%u][",
 11198 : 74509 : vperm[i].first.first, vperm[i].first.second);
 11199 : 74509 : dump_dec (MSG_NOTE, vperm[i].second);
 11200 : 74509 : dump_printf (MSG_NOTE, "]");
 11201 : : }
 11202 : 8174 : dump_printf (MSG_NOTE, "\n");
 11203 : : }
 11204 : :
 11205 : : /* We can only handle two-vector permutes, everything else should
 11206 : : be lowered on the SLP level. The following is closely inspired
 11207 : : by vect_transform_slp_perm_load and is supposed to eventually
 11208 : : replace it.
 11209 : : ??? As intermediate step do code-gen in the SLP tree representation
 11210 : : somehow? */
 11211 : 425351 : std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
 11212 : 425351 : std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
 11213 : 425351 : unsigned int index = 0;
 11214 : 425351 : poly_uint64 mask_element;
 11215 : 425351 : vec_perm_builder mask;
 11216 : 425351 : mask.new_vector (nunits, npatterns, nelts_per_pattern);
 11217 : 425351 : unsigned int count = mask.encoded_nelts ();
 11218 : 425351 : mask.quick_grow (count);
 11219 : 425351 : vec_perm_indices indices;
 11220 : 425351 : unsigned nperms = 0;
 11221 : : /* When REPEATING_P is true, we only have UNPACK_FACTOR unique permute
 11222 : : vectors to check during analysis, but we need to generate NOUTPUTS
 11223 : : vectors during transformation. */
 11224 : 425351 : unsigned total_nelts = olanes;
 11225 : 425351 : unsigned process_nelts = olanes;
 11226 : 425351 : if (repeating_p)
 11227 : : {
 11228 : 88710 : total_nelts = (total_nelts / unpack_factor) * noutputs;
 11229 : 88710 : if (gsi)
 11230 : 9532 : process_nelts = total_nelts;
 11231 : : }
 : : /* Index of the overall last element within one processed copy; used
 : : below so permutes past it contribute one fewer copy to NPERMS. */
 11232 : 425351 : unsigned last_ei = (total_nelts - 1) % process_nelts;
 11233 : 1981435 : for (unsigned i = 0; i < process_nelts; ++i)
 11234 : : {
 11235 : : /* VI is the input vector index when generating code for REPEATING_P. */
 11236 : 1563869 : unsigned vi = i / olanes * (pack_p ? 2 : 1);
 11237 : 1563869 : unsigned ei = i % olanes;
 11238 : 1563869 : mask_element = vperm[ei].second;
 11239 : 1563869 : if (pack_p)
 11240 : : {
 11241 : : /* In this case, we have N outputs and the single child provides 2N
 11242 : : inputs. Output X permutes inputs 2X and 2X+1.
 11243 : :
 11244 : : The mask indices are taken directly from the SLP permutation node.
 11245 : : Index X selects from the first vector if (X / NUNITS) % 2 == 0;
 11246 : : X selects from the second vector otherwise. These conditions
 11247 : : are only known at compile time for constant-length vectors. */
 11248 : : first_vec = std::make_pair (0, 0);
 11249 : : second_vec = std::make_pair (0, 1);
 11250 : : }
 11251 : 1405031 : else if (first_vec.first == -1U
 11252 : 1405031 : || first_vec == vperm[ei].first)
 11253 : 1227704 : first_vec = vperm[ei].first;
 11254 : 177327 : else if (second_vec.first == -1U
 11255 : 177327 : || second_vec == vperm[ei].first)
 11256 : : {
 11257 : 176939 : second_vec = vperm[ei].first;
 11258 : 176939 : mask_element += nunits;
 11259 : : }
 11260 : : else
 11261 : : {
 11262 : 388 : if (dump_p)
 11263 : 19 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 11264 : : "permutation requires at "
 11265 : : "least three vectors\n");
 11266 : 388 : gcc_assert (!gsi);
 11267 : : return -1;
 11268 : : }
 11269 : :
 11270 : 1563481 : mask[index++] = mask_element;
 11271 : :
 11272 : 1563481 : if (index == count)
 11273 : : {
 11274 : 702911 : indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
 11275 : : TYPE_VECTOR_SUBPARTS (op_vectype));
 11276 : 565241 : bool identity_p = (indices.series_p (0, 1, mask[0], 1)
 11277 : 879478 : && constant_multiple_p (mask[0], nunits));
 11278 : 565241 : machine_mode vmode = TYPE_MODE (vectype);
 11279 : 565241 : machine_mode op_vmode = TYPE_MODE (op_vectype);
 11280 : 565241 : unsigned HOST_WIDE_INT c;
 11281 : 565241 : if ((!identity_p
 11282 : 524970 : && !can_vec_perm_const_p (vmode, op_vmode, indices))
 11283 : 565241 : || (identity_p
 11284 : 40271 : && !known_le (nunits,
 11285 : : TYPE_VECTOR_SUBPARTS (op_vectype))
 11286 : 7405 : && (!constant_multiple_p (nunits,
 11287 : 8 : TYPE_VECTOR_SUBPARTS (op_vectype),
 11288 : 8 : &c) || c != 2)))
 11289 : : {
 11290 : 7397 : if (dump_p)
 11291 : : {
 11292 : 152 : dump_printf_loc (MSG_MISSED_OPTIMIZATION,
 11293 : : vect_location,
 11294 : : "unsupported vect permute { ");
 11295 : 1586 : for (i = 0; i < count; ++i)
 11296 : : {
 11297 : 1434 : dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
 11298 : 1434 : dump_printf (MSG_MISSED_OPTIMIZATION, " ");
 11299 : : }
 11300 : 152 : dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
 11301 : : }
 11302 : 7397 : gcc_assert (!gsi);
 11303 : 7785 : return -1;
 11304 : : }
 11305 : :
 11306 : 557844 : if (!identity_p)
 11307 : 517573 : nperms += CEIL (total_nelts, process_nelts) - (ei > last_ei);
 11308 : 557844 : if (gsi)
 11309 : : {
 11310 : 30384 : if (second_vec.first == -1U)
 11311 : 6800 : second_vec = first_vec;
 11312 : :
 11313 : 30384 : slp_tree
 11314 : 30384 : first_node = children[first_vec.first],
 11315 : 30384 : second_node = children[second_vec.first];
 11316 : :
 11317 : 30384 : tree mask_vec = NULL_TREE;
 11318 : 30384 : if (!identity_p)
 11319 : 27380 : mask_vec = vect_gen_perm_mask_checked (vectype, indices);
 11320 : :
 11321 : 30384 : tree first_def
 11322 : 30384 : = vect_get_slp_vect_def (first_node, first_vec.second + vi);
 11323 : 30384 : tree second_def
 11324 : 30384 : = vect_get_slp_vect_def (second_node, second_vec.second + vi);
 11325 : 30384 : vect_add_slp_permutation (vinfo, gsi, node, first_def,
 11326 : 30384 : second_def, mask_vec, mask[0]);
 11327 : : }
 11328 : :
 11329 : : index = 0;
 11330 : : first_vec = std::make_pair (-1U, -1U);
 11331 : : second_vec = std::make_pair (-1U, -1U);
 11332 : : }
 11333 : : }
 11334 : :
 11335 : 417566 : return nperms;
 11336 : 425351 : }
11337 : :
11338 : : /* Vectorize the SLP permutations in NODE as specified
11339 : : in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
11340 : : child number and lane number.
11341 : : Interleaving of two two-lane two-child SLP subtrees (not supported):
11342 : : [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
11343 : : A blend of two four-lane two-child SLP subtrees:
11344 : : [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
11345 : : Highpart of a four-lane one-child SLP subtree (not supported):
11346 : : [ { 0, 2 }, { 0, 3 } ]
11347 : : Where currently only a subset is supported by code generating below. */
11348 : :
 11349 : : bool
 11350 : 114926 : vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
 11351 : : slp_tree node, stmt_vector_for_cost *cost_vec)
 11352 : : {
 11353 : 114926 : tree vectype = SLP_TREE_VECTYPE (node);
 11354 : 114926 : lane_permutation_t &perm = SLP_TREE_LANE_PERMUTATION (node);
 11355 : 114926 : int nperms = vectorizable_slp_permutation_1 (vinfo, gsi, node, perm,
 11356 : 114926 : SLP_TREE_CHILDREN (node),
 11357 : : dump_enabled_p ());
 11358 : 114926 : if (nperms < 0)
 11359 : : return false;
 11360 : :
 : : /* During analysis (no GSI) record the number of VEC_PERM_EXPRs in
 : : the cost model; identity permutes (NPERMS == 0) cost nothing. */
 11361 : 113695 : if (!gsi && nperms != 0)
 11362 : 92759 : record_stmt_cost (cost_vec, nperms, vec_perm, node, vectype, 0, vect_body);
 11363 : :
 11364 : : return true;
 11365 : : }
11366 : :
11367 : : /* Vectorize SLP NODE. */
11368 : :
 11369 : : static void
 11370 : 1446484 : vect_schedule_slp_node (vec_info *vinfo,
 11371 : : slp_tree node, slp_instance instance)
 11372 : : {
 11373 : 1446484 : gimple_stmt_iterator si;
 11374 : 1446484 : int i;
 11375 : 1446484 : slp_tree child;
 11376 : :
 11377 : : /* Vectorize externals and constants. */
 11378 : 1446484 : if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
 11379 : 1446484 : || SLP_TREE_DEF_TYPE (node) == vect_external_def)
 11380 : : {
 11381 : : /* ??? vectorizable_shift can end up using a scalar operand which is
 11382 : : currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
 11383 : : node in this case. */
 11384 : 494646 : if (!SLP_TREE_VECTYPE (node))
 11385 : 494646 : return;
 11386 : :
 11387 : : /* There are two reasons vector defs might already exist. The first
 11388 : : is that we are vectorizing an existing vector def. The second is
 11389 : : when performing BB vectorization shared constant/external nodes
 11390 : : are not split apart during partitioning so during the code-gen
 11391 : : DFS walk we can end up visiting them twice. */
 11392 : 488627 : if (! SLP_TREE_VEC_DEFS (node).exists ())
 11393 : 487937 : vect_create_constant_vectors (vinfo, node);
 11394 : 488627 : return;
 11395 : : }
 11396 : :
 : : /* From here on NODE is an internal def; code generation is driven
 : : from the representative scalar stmt via vect_transform_stmt below,
 : : once the insertion point SI has been computed. */
 11397 : 951838 : stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
 11398 : :
 11399 : 951838 : gcc_assert (SLP_TREE_VEC_DEFS (node).is_empty ());
 11400 : 951838 : if (SLP_TREE_VECTYPE (node))
 11401 : 951832 : SLP_TREE_VEC_DEFS (node).create (vect_get_num_copies (vinfo, node));
 11402 : :
 11403 : 951838 : if (!SLP_TREE_PERMUTE_P (node) && STMT_VINFO_DATA_REF (stmt_info))
 11404 : : {
 11405 : : /* Vectorized loads go before the first scalar load to make it
 11406 : : ready early, vectorized stores go before the last scalar
 11407 : : stmt which is where all uses are ready. */
 11408 : 709760 : stmt_vec_info last_stmt_info = NULL;
 11409 : 709760 : if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
 11410 : 164099 : last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
 11411 : : else /* DR_IS_WRITE */
 11412 : 545661 : last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
 11413 : 709760 : si = gsi_for_stmt (last_stmt_info->stmt);
 11414 : : }
 11415 : 242078 : else if (!SLP_TREE_PERMUTE_P (node)
 11416 : 226119 : && (SLP_TREE_TYPE (node) == cycle_phi_info_type
 11417 : : || SLP_TREE_TYPE (node) == induc_vec_info_type
 11418 : : || SLP_TREE_TYPE (node) == phi_info_type))
 11419 : : {
 11420 : : /* For PHI node vectorization we do not use the insertion iterator. */
 11421 : 53970 : si = gsi_none ();
 11422 : : }
 11423 : : else
 11424 : : {
 11425 : : /* Emit other stmts after the children vectorized defs which is
 11426 : : earliest possible. */
 11427 : : gimple *last_stmt = NULL;
 11428 : : bool seen_vector_def = false;
 11429 : 522222 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
 11430 : 334114 : if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
 11431 : : {
 11432 : : /* For fold-left reductions we are retaining the scalar
 11433 : : reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
 11434 : : set so the representation isn't perfect. Resort to the
 11435 : : last scalar def here. */
 11436 : 268039 : if (SLP_TREE_VEC_DEFS (child).is_empty ())
 11437 : : {
 11438 : 847 : gcc_assert (SLP_TREE_TYPE (child) == cycle_phi_info_type);
 11439 : 847 : gphi *phi = as_a <gphi *>
 11440 : 847 : (vect_find_last_scalar_stmt_in_slp (child)->stmt);
 11441 : 847 : if (!last_stmt)
 11442 : : last_stmt = phi;
 11443 : 620 : else if (vect_stmt_dominates_stmt_p (last_stmt, phi))
 11444 : : last_stmt = phi;
 11445 : 609 : else if (vect_stmt_dominates_stmt_p (phi, last_stmt))
 11446 : : ;
 11447 : : else
 11448 : 0 : gcc_unreachable ();
 11449 : : }
 11450 : : /* We are emitting all vectorized stmts in the same place and
 11451 : : the last one is the last.
 11452 : : ??? Unless we have a load permutation applied and that
 11453 : : figures to re-use an earlier generated load. */
 11454 : : unsigned j;
 11455 : : tree vdef;
 11456 : 632143 : FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
 11457 : : {
 11458 : 364104 : gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
 11459 : 364104 : if (!last_stmt)
 11460 : : last_stmt = vstmt;
 11461 : 186307 : else if (vect_stmt_dominates_stmt_p (last_stmt, vstmt))
 11462 : : last_stmt = vstmt;
 11463 : 41027 : else if (vect_stmt_dominates_stmt_p (vstmt, last_stmt))
 11464 : : ;
 11465 : : else
 11466 : 0 : gcc_unreachable ();
 11467 : : }
 11468 : : }
 11469 : 66075 : else if (!SLP_TREE_VECTYPE (child))
 11470 : : {
 11471 : : /* For externals we use unvectorized at all scalar defs. */
 11472 : : unsigned j;
 11473 : : tree def;
 11474 : 12633 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
 11475 : 7198 : if (TREE_CODE (def) == SSA_NAME
 11476 : 7198 : && !SSA_NAME_IS_DEFAULT_DEF (def))
 11477 : : {
 11478 : 169 : gimple *stmt = SSA_NAME_DEF_STMT (def);
 11479 : 169 : if (gimple_uid (stmt) == -1u)
 11480 : : /* If the stmt is not inside the region do not
 11481 : : use it as possible insertion point. */
 11482 : : ;
 11483 : 161 : else if (!last_stmt)
 11484 : : last_stmt = stmt;
 11485 : 155 : else if (vect_stmt_dominates_stmt_p (last_stmt, stmt))
 11486 : : last_stmt = stmt;
 11487 : 155 : else if (vect_stmt_dominates_stmt_p (stmt, last_stmt))
 11488 : : ;
 11489 : : else
 11490 : 0 : gcc_unreachable ();
 11491 : : }
 11492 : : }
 11493 : : else
 11494 : : {
 11495 : : /* For externals we have to look at all defs since their
 11496 : : insertion place is decided per vector. But beware
 11497 : : of pre-existing vectors where we need to make sure
 11498 : : we do not insert before the region boundary. */
 11499 : 60640 : if (SLP_TREE_SCALAR_OPS (child).is_empty ()
 11500 : 553 : && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
 11501 : : seen_vector_def = true;
 11502 : : else
 11503 : : {
 11504 : : unsigned j;
 11505 : : tree vdef;
 11506 : 475048 : FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
 11507 : 80410 : if (TREE_CODE (vdef) == SSA_NAME
 11508 : 80410 : && !SSA_NAME_IS_DEFAULT_DEF (vdef))
 11509 : : {
 11510 : 17576 : gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
 11511 : 17576 : if (!last_stmt)
 11512 : : last_stmt = vstmt;
 11513 : 9219 : else if (vect_stmt_dominates_stmt_p (last_stmt, vstmt))
 11514 : : last_stmt = vstmt;
 11515 : 7001 : else if (vect_stmt_dominates_stmt_p (vstmt, last_stmt))
 11516 : : ;
 11517 : : else
 11518 : 0 : gcc_unreachable ();
 11519 : : }
 11520 : : }
 11521 : : }
 11522 : : /* This can happen when all children are pre-existing vectors or
 11523 : : constants. */
 11524 : 188108 : if (!last_stmt)
 11525 : 1721 : last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
 11526 : 1721 : if (!last_stmt)
 11527 : : {
 11528 : 0 : gcc_assert (seen_vector_def);
 11529 : 0 : si = gsi_after_labels (vinfo->bbs[0]);
 11530 : : }
 11531 : 188108 : else if (is_ctrl_altering_stmt (last_stmt))
 11532 : : {
 11533 : : /* We split regions to vectorize at control altering stmts
 11534 : : with a definition so this must be an external which
 11535 : : we can insert at the start of the region. */
 11536 : 0 : si = gsi_after_labels (vinfo->bbs[0]);
 11537 : : }
 11538 : 188108 : else if (is_a <bb_vec_info> (vinfo)
 11539 : 17678 : && !SLP_TREE_PERMUTE_P (node)
 11540 : 16312 : && gimple_bb (last_stmt) != gimple_bb (stmt_info->stmt)
 11541 : 189311 : && gimple_could_trap_p (stmt_info->stmt))
 11542 : : {
 11543 : : /* We've constrained possibly trapping operations to all come
 11544 : : from the same basic-block, if vectorized defs would allow earlier
 11545 : : scheduling still force vectorized stmts to the original block.
 11546 : : This is only necessary for BB vectorization since for loop vect
 11547 : : all operations are in a single BB and scalar stmt based
 11548 : : placement doesn't play well with epilogue vectorization. */
 11549 : 51 : gcc_assert (dominated_by_p (CDI_DOMINATORS,
 11550 : : gimple_bb (stmt_info->stmt),
 11551 : : gimple_bb (last_stmt)));
 11552 : 51 : si = gsi_after_labels (gimple_bb (stmt_info->stmt));
 11553 : : }
 11554 : 188057 : else if (is_a <gphi *> (last_stmt))
 11555 : 14386 : si = gsi_after_labels (gimple_bb (last_stmt));
 11556 : : else
 11557 : : {
 11558 : 173671 : si = gsi_for_stmt (last_stmt);
 11559 : 173671 : gsi_next (&si);
 11560 : :
 11561 : : /* Avoid scheduling internal defs outside of the loop when
 11562 : : we might have only implicitly tracked loop mask/len defs. */
 11563 : 173671 : if (auto loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
 11564 : 61 : if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
 11565 : 156236 : || LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
 11566 : : {
 11567 : 61 : gimple_stmt_iterator si2
 11568 : 61 : = gsi_after_labels (LOOP_VINFO_LOOP (loop_vinfo)->header);
 11569 : 61 : if ((gsi_end_p (si2)
 11570 : 0 : && (LOOP_VINFO_LOOP (loop_vinfo)->header
 11571 : 0 : != gimple_bb (last_stmt))
 11572 : 0 : && dominated_by_p (CDI_DOMINATORS,
 11573 : : LOOP_VINFO_LOOP (loop_vinfo)->header,
 11574 : 0 : gimple_bb (last_stmt)))
 11575 : 61 : || (!gsi_end_p (si2)
 11576 : 61 : && last_stmt != *si2
 11577 : 60 : && vect_stmt_dominates_stmt_p (last_stmt, *si2)))
 11578 : 3 : si = si2;
 11579 : : }
 11580 : : }
 11581 : : }
 11582 : :
 11583 : 951838 : if (dump_enabled_p ())
 11584 : : {
 11585 : 67601 : if (stmt_info)
 11586 : 67550 : dump_printf_loc (MSG_NOTE, vect_location,
 11587 : : "------>vectorizing SLP node starting from: %G",
 11588 : : stmt_info->stmt);
 11589 : : else
 11590 : : {
 11591 : 51 : dump_printf_loc (MSG_NOTE, vect_location,
 11592 : : "------>vectorizing SLP node:\n");
 11593 : 51 : vect_print_slp_tree (MSG_NOTE, vect_location, node);
 11594 : : }
 11595 : : }
 11596 : 951838 : vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
 11597 : : }
11598 : :
11599 : : /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
11600 : : For loop vectorization this is done in vectorizable_call, but for SLP
11601 : : it needs to be deferred until end of vect_schedule_slp, because multiple
11602 : : SLP instances may refer to the same scalar stmt. */
11603 : :
 11604 : : static void
 11605 : 559363 : vect_remove_slp_scalar_calls (vec_info *vinfo,
 11606 : : slp_tree node, hash_set<slp_tree> &visited)
 11607 : : {
 11608 : 559363 : gimple *new_stmt;
 11609 : 559363 : gimple_stmt_iterator gsi;
 11610 : 559363 : int i;
 11611 : 559363 : slp_tree child;
 11612 : 559363 : tree lhs;
 11613 : 559363 : stmt_vec_info stmt_info;
 11614 : :
 : : /* Only internal defs carry vectorized scalar stmts. */
 11615 : 559363 : if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
 11616 : 178277 : return;
 11617 : :
 : : /* SLP subtrees can be shared between instances; visit each once. */
 11618 : 422494 : if (visited.add (node))
 11619 : : return;
 11620 : :
 11621 : 852678 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
 11622 : 471592 : vect_remove_slp_scalar_calls (vinfo, child, visited);
 11623 : :
 11624 : 1214282 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
 11625 : : {
 11626 : 456185 : if (!stmt_info)
 11627 : 3699 : continue;
 11628 : 452486 : stmt_info = vect_orig_stmt (stmt_info);
 11629 : 452486 : gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
 11630 : 4753 : if (!stmt || gimple_bb (stmt) == NULL)
 11631 : 447745 : continue;
 11632 : 4741 : lhs = gimple_call_lhs (stmt);
 11633 : 4741 : if (lhs)
 11634 : 4194 : new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
 11635 : : else
 11636 : : {
 11637 : 547 : new_stmt = gimple_build_nop ();
 11638 : 547 : unlink_stmt_vdef (stmt_info->stmt);
 11639 : : }
 11640 : 4741 : gsi = gsi_for_stmt (stmt);
 11641 : 4741 : vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
 11642 : 4741 : if (lhs)
 11643 : 4194 : SSA_NAME_DEF_STMT (lhs) = new_stmt;
 11644 : : }
 11645 : : }
11646 : :
 : : /* Wrapper: walk the SLP tree rooted at NODE once, replacing already
 : : vectorized scalar calls (see the worker above). */
 11647 : : static void
 11648 : 87771 : vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
 11649 : : {
 11650 : 87771 : hash_set<slp_tree> visited;
 11651 : 87771 : vect_remove_slp_scalar_calls (vinfo, node, visited);
 11652 : 87771 : }
11653 : :
11654 : : /* Vectorize the instance root. */
11655 : :
11656 : : void
11657 : 10004 : vectorize_slp_instance_root_stmt (vec_info *vinfo, slp_tree node, slp_instance instance)
11658 : : {
11659 : 10004 : gassign *rstmt = NULL;
11660 : :
11661 : 10004 : if (instance->kind == slp_inst_kind_ctor)
11662 : : {
11663 : 4119 : if (SLP_TREE_VEC_DEFS (node).length () == 1)
11664 : : {
11665 : 4115 : tree vect_lhs = SLP_TREE_VEC_DEFS (node)[0];
11666 : 4115 : tree root_lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
11667 : 4115 : if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
11668 : 4115 : TREE_TYPE (vect_lhs)))
11669 : 0 : vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
11670 : : vect_lhs);
11671 : 4115 : rstmt = gimple_build_assign (root_lhs, vect_lhs);
11672 : : }
11673 : : else
11674 : : {
11675 : 4 : gcc_assert (SLP_TREE_VEC_DEFS (node).length () > 1);
11676 : 4 : tree child_def;
11677 : 4 : int j;
11678 : 4 : vec<constructor_elt, va_gc> *v;
11679 : 4 : vec_alloc (v, SLP_TREE_VEC_DEFS (node).length ());
11680 : :
11681 : : /* A CTOR can handle V16HI composition from VNx8HI so we
11682 : : do not need to convert vector elements if the types
11683 : : do not match. */
11684 : 12 : FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (node), j, child_def)
11685 : 8 : CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, child_def);
11686 : 4 : tree lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
11687 : 4 : tree rtype
11688 : 4 : = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmts[0]->stmt));
11689 : 4 : tree r_constructor = build_constructor (rtype, v);
11690 : 4 : rstmt = gimple_build_assign (lhs, r_constructor);
11691 : : }
11692 : : }
11693 : 5885 : else if (instance->kind == slp_inst_kind_bb_reduc)
11694 : : {
11695 : : /* Largely inspired by reduction chain epilogue handling in
11696 : : vect_create_epilog_for_reduction. */
11697 : 4265 : vec<tree> vec_defs = vNULL;
11698 : 4265 : vect_get_slp_defs (node, &vec_defs);
11699 : 4265 : enum tree_code reduc_code
11700 : 4265 : = gimple_assign_rhs_code (instance->root_stmts[0]->stmt);
11701 : : /* ??? We actually have to reflect signs somewhere. */
11702 : 4265 : if (reduc_code == MINUS_EXPR)
11703 : 0 : reduc_code = PLUS_EXPR;
11704 : 4265 : gimple_seq epilogue = NULL;
11705 : : /* We may end up with more than one vector result, reduce them
11706 : : to one vector. */
11707 : 4265 : tree vec_def = vec_defs[0];
11708 : 4265 : tree vectype = TREE_TYPE (vec_def);
11709 : 4265 : tree compute_vectype = vectype;
11710 : 4265 : bool pun_for_overflow_p = (ANY_INTEGRAL_TYPE_P (vectype)
11711 : 4061 : && TYPE_OVERFLOW_UNDEFINED (vectype)
11712 : 7188 : && operation_can_overflow (reduc_code));
11713 : 2795 : if (pun_for_overflow_p)
11714 : : {
11715 : 2795 : compute_vectype = unsigned_type_for (vectype);
11716 : 2795 : vec_def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
11717 : : compute_vectype, vec_def);
11718 : : }
11719 : 6634 : for (unsigned i = 1; i < vec_defs.length (); ++i)
11720 : : {
11721 : 2369 : tree def = vec_defs[i];
11722 : 2369 : if (pun_for_overflow_p)
11723 : 2273 : def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
11724 : : compute_vectype, def);
11725 : 2369 : vec_def = gimple_build (&epilogue, reduc_code, compute_vectype,
11726 : : vec_def, def);
11727 : : }
11728 : 4265 : vec_defs.release ();
11729 : : /* ??? Support other schemes than direct internal fn. */
11730 : 4265 : internal_fn reduc_fn;
11731 : 4265 : if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
11732 : 4265 : || reduc_fn == IFN_LAST)
11733 : 0 : gcc_unreachable ();
11734 : 4265 : tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
11735 : 4265 : TREE_TYPE (compute_vectype), vec_def);
11736 : 4265 : if (!SLP_INSTANCE_REMAIN_DEFS (instance).is_empty ())
11737 : : {
11738 : 2803 : tree rem_def = NULL_TREE;
11739 : 12383 : for (auto def : SLP_INSTANCE_REMAIN_DEFS (instance))
11740 : : {
11741 : 9580 : def = gimple_convert (&epilogue, TREE_TYPE (scalar_def), def);
11742 : 9580 : if (!rem_def)
11743 : : rem_def = def;
11744 : : else
11745 : 6777 : rem_def = gimple_build (&epilogue, reduc_code,
11746 : 6777 : TREE_TYPE (scalar_def),
11747 : : rem_def, def);
11748 : : }
11749 : 2803 : scalar_def = gimple_build (&epilogue, reduc_code,
11750 : 2803 : TREE_TYPE (scalar_def),
11751 : : scalar_def, rem_def);
11752 : : }
11753 : 4265 : scalar_def = gimple_convert (&epilogue,
11754 : 4265 : TREE_TYPE (vectype), scalar_def);
11755 : 4265 : gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
11756 : 4265 : gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT);
11757 : 4265 : gimple_assign_set_rhs_from_tree (&rgsi, scalar_def);
11758 : 4265 : update_stmt (gsi_stmt (rgsi));
11759 : 4265 : return;
11760 : : }
11761 : 1620 : else if (instance->kind == slp_inst_kind_gcond)
11762 : : {
11763 : : /* Only support a single root for now as we can't codegen CFG yet and so we
11764 : : can't support lane > 1 at this time. */
11765 : 1620 : gcc_assert (instance->root_stmts.length () == 1);
11766 : 1620 : auto root_stmt_info = instance->root_stmts[0];
11767 : 1620 : auto last_stmt = STMT_VINFO_STMT (vect_orig_stmt (root_stmt_info));
11768 : 1620 : gimple_stmt_iterator rgsi = gsi_for_stmt (last_stmt);
11769 : 1620 : gcc_assert (!SLP_TREE_VEC_DEFS (node).is_empty ());
11770 : 1620 : bool res = vectorizable_early_exit (as_a <loop_vec_info> (vinfo),
11771 : : root_stmt_info, &rgsi, node, NULL);
11772 : 1620 : gcc_assert (res);
11773 : 1620 : return;
11774 : : }
11775 : : else
11776 : 0 : gcc_unreachable ();
11777 : :
11778 : 4119 : gcc_assert (rstmt);
11779 : :
11780 : 4119 : gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
11781 : 4119 : gsi_replace (&rgsi, rstmt, true);
11782 : : }
11783 : :
11784 : : struct slp_scc_info
11785 : : {
11786 : : bool on_stack;
11787 : : int dfs;
11788 : : int lowlink;
11789 : : };
11790 : :
11791 : : /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */
11792 : :
11793 : : static void
11794 : 1446484 : vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
11795 : : hash_map<slp_tree, slp_scc_info> &scc_info,
11796 : : int &maxdfs, vec<slp_tree> &stack)
11797 : : {
11798 : 1446484 : bool existed_p;
11799 : 1446484 : slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
11800 : 1446484 : gcc_assert (!existed_p);
11801 : 1446484 : info->dfs = maxdfs;
11802 : 1446484 : info->lowlink = maxdfs;
11803 : 1446484 : maxdfs++;
11804 : :
11805 : : /* Leaf. */
11806 : 1446484 : if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
11807 : : {
11808 : 494646 : info->on_stack = false;
11809 : 494646 : vect_schedule_slp_node (vinfo, node, instance);
11810 : 1017474 : return;
11811 : : }
11812 : :
11813 : 951838 : info->on_stack = true;
11814 : 951838 : stack.safe_push (node);
11815 : :
11816 : 951838 : unsigned i;
11817 : 951838 : slp_tree child;
11818 : : /* DFS recurse. */
11819 : 1948174 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
11820 : : {
11821 : 996336 : if (!child)
11822 : 54025 : continue;
11823 : 942311 : slp_scc_info *child_info = scc_info.get (child);
11824 : 942311 : if (!child_info)
11825 : : {
11826 : 864082 : vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
11827 : : /* Recursion might have re-allocated the node. */
11828 : 864082 : info = scc_info.get (node);
11829 : 864082 : child_info = scc_info.get (child);
11830 : 864082 : info->lowlink = MIN (info->lowlink, child_info->lowlink);
11831 : : }
11832 : 78229 : else if (child_info->on_stack)
11833 : 24081 : info->lowlink = MIN (info->lowlink, child_info->dfs);
11834 : : }
11835 : 951838 : if (info->lowlink != info->dfs)
11836 : : return;
11837 : :
11838 : 923656 : auto_vec<slp_tree, 4> phis_to_fixup;
11839 : :
11840 : : /* Singleton. */
11841 : 923656 : if (stack.last () == node)
11842 : : {
11843 : 900907 : stack.pop ();
11844 : 900907 : info->on_stack = false;
11845 : 900907 : vect_schedule_slp_node (vinfo, node, instance);
11846 : 900907 : if (!SLP_TREE_PERMUTE_P (node)
11847 : 900907 : && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
11848 : 31257 : phis_to_fixup.quick_push (node);
11849 : : }
11850 : : else
11851 : : {
11852 : : /* SCC. */
11853 : 22749 : int last_idx = stack.length () - 1;
11854 : 50931 : while (stack[last_idx] != node)
11855 : 28182 : last_idx--;
11856 : : /* We can break the cycle at PHIs who have at least one child
11857 : : code generated. Then we could re-start the DFS walk until
11858 : : all nodes in the SCC are covered (we might have new entries
11859 : : for only back-reachable nodes). But it's simpler to just
11860 : : iterate and schedule those that are ready. */
11861 : 22749 : unsigned todo = stack.length () - last_idx;
11862 : 23069 : do
11863 : : {
11864 : 98787 : for (int idx = stack.length () - 1; idx >= last_idx; --idx)
11865 : : {
11866 : 52649 : slp_tree entry = stack[idx];
11867 : 52649 : if (!entry)
11868 : 934 : continue;
11869 : 51715 : bool phi = (!SLP_TREE_PERMUTE_P (entry)
11870 : 51715 : && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
11871 : 51715 : bool ready = !phi;
11872 : 129396 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
11873 : 101616 : if (!child)
11874 : : {
11875 : 21961 : gcc_assert (phi);
11876 : : ready = true;
11877 : : break;
11878 : : }
11879 : 79655 : else if (scc_info.get (child)->on_stack)
11880 : : {
11881 : 22983 : if (!phi)
11882 : : {
11883 : : ready = false;
11884 : : break;
11885 : : }
11886 : : }
11887 : : else
11888 : : {
11889 : 56672 : if (phi)
11890 : : {
11891 : : ready = true;
11892 : : break;
11893 : : }
11894 : : }
11895 : 29754 : if (ready)
11896 : : {
11897 : 50931 : vect_schedule_slp_node (vinfo, entry, instance);
11898 : 50931 : scc_info.get (entry)->on_stack = false;
11899 : 50931 : stack[idx] = NULL;
11900 : 50931 : todo--;
11901 : 50931 : if (phi)
11902 : 23183 : phis_to_fixup.safe_push (entry);
11903 : : }
11904 : : }
11905 : : }
11906 : 23069 : while (todo != 0);
11907 : :
11908 : : /* Pop the SCC. */
11909 : 22749 : stack.truncate (last_idx);
11910 : : }
11911 : :
11912 : : /* Now fixup the backedge def of the vectorized PHIs in this SCC. */
11913 : : slp_tree phi_node;
11914 : 1901752 : FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
11915 : : {
11916 : 54440 : gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
11917 : 54440 : edge_iterator ei;
11918 : 54440 : edge e;
11919 : 168601 : FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
11920 : : {
11921 : 114161 : unsigned dest_idx = e->dest_idx;
11922 : 114161 : child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
11923 : 114161 : if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
11924 : 64862 : continue;
11925 : 49299 : unsigned n = SLP_TREE_VEC_DEFS (phi_node).length ();
11926 : : /* Simply fill all args. */
11927 : 49299 : if (STMT_VINFO_DEF_TYPE (SLP_TREE_REPRESENTATIVE (phi_node))
11928 : : != vect_first_order_recurrence)
11929 : 106625 : for (unsigned i = 0; i < n; ++i)
11930 : : {
11931 : 57368 : tree phidef = SLP_TREE_VEC_DEFS (phi_node)[i];
11932 : 57368 : gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
11933 : 57368 : add_phi_arg (phi, vect_get_slp_vect_def (child, i),
11934 : : e, gimple_phi_arg_location (phi, dest_idx));
11935 : : }
11936 : : else
11937 : : {
11938 : : /* Unless it is a first order recurrence which needs
11939 : : args filled in for both the PHI node and the permutes. */
11940 : 42 : gimple *perm
11941 : 42 : = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[0]);
11942 : 42 : gimple *rphi = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (perm));
11943 : 42 : add_phi_arg (as_a <gphi *> (rphi),
11944 : : vect_get_slp_vect_def (child, n - 1),
11945 : : e, gimple_phi_arg_location (phi, dest_idx));
11946 : 121 : for (unsigned i = 0; i < n; ++i)
11947 : : {
11948 : 79 : gimple *perm
11949 : 79 : = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[i]);
11950 : 79 : if (i > 0)
11951 : 37 : gimple_assign_set_rhs1 (perm,
11952 : : vect_get_slp_vect_def (child, i - 1));
11953 : 79 : gimple_assign_set_rhs2 (perm,
11954 : : vect_get_slp_vect_def (child, i));
11955 : 79 : update_stmt (perm);
11956 : : }
11957 : : }
11958 : : }
11959 : : }
11960 : 923656 : }
11961 : :
11962 : : /* Generate vector code for SLP_INSTANCES in the loop/basic block. */
11963 : :
11964 : : void
11965 : 542755 : vect_schedule_slp (vec_info *vinfo, const vec<slp_instance> &slp_instances)
11966 : : {
11967 : 542755 : slp_instance instance;
11968 : 542755 : unsigned int i;
11969 : :
11970 : 542755 : hash_map<slp_tree, slp_scc_info> scc_info;
11971 : 542755 : int maxdfs = 0;
11972 : 1125336 : FOR_EACH_VEC_ELT (slp_instances, i, instance)
11973 : : {
11974 : 582581 : slp_tree node = SLP_INSTANCE_TREE (instance);
11975 : 582581 : if (dump_enabled_p ())
11976 : : {
11977 : 15627 : dump_printf_loc (MSG_NOTE, vect_location,
11978 : : "Vectorizing SLP tree:\n");
11979 : : /* ??? Dump all? */
11980 : 15627 : if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
11981 : 405 : dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
11982 : 405 : SLP_INSTANCE_ROOT_STMTS (instance)[0]->stmt);
11983 : 15627 : vect_print_slp_graph (MSG_NOTE, vect_location,
11984 : : SLP_INSTANCE_TREE (instance));
11985 : : }
11986 : : /* Schedule the tree of INSTANCE, scheduling SCCs in a way to
11987 : : have a PHI be the node breaking the cycle. */
11988 : 582581 : auto_vec<slp_tree> stack;
11989 : 582581 : if (!scc_info.get (node))
11990 : 582402 : vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
11991 : :
11992 : 582581 : if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
11993 : 10004 : vectorize_slp_instance_root_stmt (vinfo, node, instance);
11994 : :
11995 : 582581 : if (dump_enabled_p ())
11996 : 15627 : dump_printf_loc (MSG_NOTE, vect_location,
11997 : : "vectorizing stmts using SLP.\n");
11998 : 582581 : }
11999 : :
12000 : 1668091 : FOR_EACH_VEC_ELT (slp_instances, i, instance)
12001 : : {
12002 : 582581 : slp_tree root = SLP_INSTANCE_TREE (instance);
12003 : 582581 : stmt_vec_info store_info;
12004 : 582581 : unsigned int j;
12005 : :
12006 : : /* Remove scalar call stmts. Do not do this for basic-block
12007 : : vectorization as not all uses may be vectorized.
12008 : : ??? Why should this be necessary? DCE should be able to
12009 : : remove the stmts itself.
12010 : : ??? For BB vectorization we can as well remove scalar
12011 : : stmts starting from the SLP tree root if they have no
12012 : : uses. */
12013 : 582581 : if (is_a <loop_vec_info> (vinfo))
12014 : 87771 : vect_remove_slp_scalar_calls (vinfo, root);
12015 : :
12016 : : /* Remove vectorized stores original scalar stmts. */
12017 : 2606890 : for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
12018 : : {
12019 : 1478648 : if (!STMT_VINFO_DATA_REF (store_info)
12020 : 1449537 : || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
12021 : : break;
12022 : :
12023 : 1441728 : store_info = vect_orig_stmt (store_info);
12024 : : /* Free the attached stmt_vec_info and remove the stmt. */
12025 : 1441728 : vinfo->remove_stmt (store_info);
12026 : :
12027 : : /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it
12028 : : to not crash in vect_free_slp_tree later. */
12029 : 1441728 : if (SLP_TREE_REPRESENTATIVE (root) == store_info)
12030 : 545455 : SLP_TREE_REPRESENTATIVE (root) = NULL;
12031 : : }
12032 : : }
12033 : 542755 : }
|