Branch data Line data Source code
1 : : /* SLP - Basic Block Vectorization
2 : : Copyright (C) 2007-2024 Free Software Foundation, Inc.
3 : : Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 : : and Ira Rosen <irar@il.ibm.com>
5 : :
6 : : This file is part of GCC.
7 : :
8 : : GCC is free software; you can redistribute it and/or modify it under
9 : : the terms of the GNU General Public License as published by the Free
10 : : Software Foundation; either version 3, or (at your option) any later
11 : : version.
12 : :
13 : : GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 : : WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 : : FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 : : for more details.
17 : :
18 : : You should have received a copy of the GNU General Public License
19 : : along with GCC; see the file COPYING3. If not see
20 : : <http://www.gnu.org/licenses/>. */
21 : :
22 : : #include "config.h"
23 : : #define INCLUDE_ALGORITHM
24 : : #include "system.h"
25 : : #include "coretypes.h"
26 : : #include "backend.h"
27 : : #include "target.h"
28 : : #include "rtl.h"
29 : : #include "tree.h"
30 : : #include "gimple.h"
31 : : #include "tree-pass.h"
32 : : #include "ssa.h"
33 : : #include "optabs-tree.h"
34 : : #include "insn-config.h"
35 : : #include "recog.h" /* FIXME: for insn_data */
36 : : #include "fold-const.h"
37 : : #include "stor-layout.h"
38 : : #include "gimple-iterator.h"
39 : : #include "cfgloop.h"
40 : : #include "tree-vectorizer.h"
41 : : #include "langhooks.h"
42 : : #include "gimple-walk.h"
43 : : #include "dbgcnt.h"
44 : : #include "tree-vector-builder.h"
45 : : #include "vec-perm-indices.h"
46 : : #include "gimple-fold.h"
47 : : #include "internal-fn.h"
48 : : #include "dump-context.h"
49 : : #include "cfganal.h"
50 : : #include "tree-eh.h"
51 : : #include "tree-cfg.h"
52 : : #include "alloc-pool.h"
53 : : #include "sreal.h"
54 : : #include "predict.h"
55 : :
56 : : static bool vect_transform_slp_perm_load_1 (vec_info *, slp_tree,
57 : : load_permutation_t &,
58 : : const vec<tree> &,
59 : : gimple_stmt_iterator *,
60 : : poly_uint64, bool, bool,
61 : : unsigned *,
62 : : unsigned * = nullptr,
63 : : bool = false);
64 : : static int vectorizable_slp_permutation_1 (vec_info *, gimple_stmt_iterator *,
65 : : slp_tree, lane_permutation_t &,
66 : : vec<slp_tree> &, bool);
67 : : static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
68 : : slp_tree, stmt_vector_for_cost *);
69 : : static void vect_print_slp_tree (dump_flags_t, dump_location_t, slp_tree);
70 : :
71 : : static object_allocator<_slp_tree> *slp_tree_pool;
72 : : static slp_tree slp_first_node;
73 : :
74 : : void
75 : 1054561 : vect_slp_init (void)
76 : : {
77 : 1054561 : slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
78 : 1054561 : }
79 : :
80 : : void
81 : 1054561 : vect_slp_fini (void)
82 : : {
83 : 1109827 : while (slp_first_node)
84 : 55266 : delete slp_first_node;
85 : 2109122 : delete slp_tree_pool;
86 : 1054561 : slp_tree_pool = NULL;
87 : 1054561 : }
88 : :
89 : : void *
90 : 3938015 : _slp_tree::operator new (size_t n)
91 : : {
92 : 3938015 : gcc_assert (n == sizeof (_slp_tree));
93 : 3938015 : return slp_tree_pool->allocate_raw ();
94 : : }
95 : :
96 : : void
97 : 3938015 : _slp_tree::operator delete (void *node, size_t n)
98 : : {
99 : 3938015 : gcc_assert (n == sizeof (_slp_tree));
100 : 3938015 : slp_tree_pool->remove_raw (node);
101 : 3938015 : }
102 : :
103 : :
104 : : /* Initialize a SLP node. */
105 : :
106 : 3938015 : _slp_tree::_slp_tree ()
107 : : {
108 : 3938015 : this->prev_node = NULL;
109 : 3938015 : if (slp_first_node)
110 : 3355805 : slp_first_node->prev_node = this;
111 : 3938015 : this->next_node = slp_first_node;
112 : 3938015 : slp_first_node = this;
113 : 3938015 : SLP_TREE_SCALAR_STMTS (this) = vNULL;
114 : 3938015 : SLP_TREE_SCALAR_OPS (this) = vNULL;
115 : 3938015 : SLP_TREE_VEC_DEFS (this) = vNULL;
116 : 3938015 : SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
117 : 3938015 : SLP_TREE_CHILDREN (this) = vNULL;
118 : 3938015 : SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
119 : 3938015 : SLP_TREE_LANE_PERMUTATION (this) = vNULL;
120 : 3938015 : SLP_TREE_SIMD_CLONE_INFO (this) = vNULL;
121 : 3938015 : SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
122 : 3938015 : SLP_TREE_CODE (this) = ERROR_MARK;
123 : 3938015 : SLP_TREE_VECTYPE (this) = NULL_TREE;
124 : 3938015 : SLP_TREE_REPRESENTATIVE (this) = NULL;
125 : 3938015 : SLP_TREE_REF_COUNT (this) = 1;
126 : 3938015 : this->failed = NULL;
127 : 3938015 : this->max_nunits = 1;
128 : 3938015 : this->lanes = 0;
129 : 3938015 : }
130 : :
131 : : /* Tear down a SLP node. */
132 : :
133 : 3938015 : _slp_tree::~_slp_tree ()
134 : : {
135 : 3938015 : if (this->prev_node)
136 : 2612563 : this->prev_node->next_node = this->next_node;
137 : : else
138 : 1325452 : slp_first_node = this->next_node;
139 : 3938015 : if (this->next_node)
140 : 2753271 : this->next_node->prev_node = this->prev_node;
141 : 3938015 : SLP_TREE_CHILDREN (this).release ();
142 : 3938015 : SLP_TREE_SCALAR_STMTS (this).release ();
143 : 3938015 : SLP_TREE_SCALAR_OPS (this).release ();
144 : 3938015 : SLP_TREE_VEC_DEFS (this).release ();
145 : 3938015 : SLP_TREE_LOAD_PERMUTATION (this).release ();
146 : 3938015 : SLP_TREE_LANE_PERMUTATION (this).release ();
147 : 3938015 : SLP_TREE_SIMD_CLONE_INFO (this).release ();
148 : 3938015 : if (this->failed)
149 : 1670395 : free (failed);
150 : 3938015 : }
151 : :
152 : : /* Push the single SSA definition in DEF to the vector of vector defs. */
153 : :
154 : : void
155 : 173187 : _slp_tree::push_vec_def (gimple *def)
156 : : {
157 : 173187 : if (gphi *phi = dyn_cast <gphi *> (def))
158 : 23353 : vec_defs.quick_push (gimple_phi_result (phi));
159 : : else
160 : : {
161 : 149834 : def_operand_p defop = single_ssa_def_operand (def, SSA_OP_ALL_DEFS);
162 : 149834 : vec_defs.quick_push (get_def_from_ptr (defop));
163 : : }
164 : 173187 : }
165 : :
166 : : /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
167 : :
168 : : void
169 : 5093445 : vect_free_slp_tree (slp_tree node)
170 : : {
171 : 5093445 : int i;
172 : 5093445 : slp_tree child;
173 : :
174 : 5093445 : if (--SLP_TREE_REF_COUNT (node) != 0)
175 : 5093445 : return;
176 : :
177 : 5104250 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
178 : 1221501 : if (child)
179 : 1220019 : vect_free_slp_tree (child);
180 : :
181 : : /* If the node defines any SLP only patterns then those patterns are no
182 : : longer valid and should be removed. */
183 : 3882749 : stmt_vec_info rep_stmt_info = SLP_TREE_REPRESENTATIVE (node);
184 : 3882749 : if (rep_stmt_info && STMT_VINFO_SLP_VECT_ONLY_PATTERN (rep_stmt_info))
185 : : {
186 : 703 : stmt_vec_info stmt_info = vect_orig_stmt (rep_stmt_info);
187 : 703 : STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
188 : 703 : STMT_SLP_TYPE (stmt_info) = STMT_SLP_TYPE (rep_stmt_info);
189 : : }
190 : :
191 : 3882749 : delete node;
192 : : }
193 : :
194 : : /* Return a location suitable for dumpings related to the SLP instance. */
195 : :
196 : : dump_user_location_t
197 : 3287147 : _slp_instance::location () const
198 : : {
199 : 3287147 : if (!root_stmts.is_empty ())
200 : 315606 : return root_stmts[0]->stmt;
201 : : else
202 : 2971541 : return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
203 : : }
204 : :
205 : :
206 : : /* Free the memory allocated for the SLP instance. */
207 : :
208 : : void
209 : 770884 : vect_free_slp_instance (slp_instance instance)
210 : : {
211 : 770884 : vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
212 : 770884 : SLP_INSTANCE_LOADS (instance).release ();
213 : 770884 : SLP_INSTANCE_ROOT_STMTS (instance).release ();
214 : 770884 : SLP_INSTANCE_REMAIN_DEFS (instance).release ();
215 : 770884 : instance->subgraph_entries.release ();
216 : 770884 : instance->cost_vec.release ();
217 : 770884 : free (instance);
218 : 770884 : }
219 : :
220 : :
221 : : /* Create an SLP node for SCALAR_STMTS. */
222 : :
223 : : slp_tree
224 : 635 : vect_create_new_slp_node (unsigned nops, tree_code code)
225 : : {
226 : 635 : slp_tree node = new _slp_tree;
227 : 635 : SLP_TREE_SCALAR_STMTS (node) = vNULL;
228 : 635 : SLP_TREE_CHILDREN (node).create (nops);
229 : 635 : SLP_TREE_DEF_TYPE (node) = vect_internal_def;
230 : 635 : SLP_TREE_CODE (node) = code;
231 : 635 : return node;
232 : : }
233 : : /* Create an SLP node for SCALAR_STMTS. */
234 : :
235 : : static slp_tree
236 : 1144188 : vect_create_new_slp_node (slp_tree node,
237 : : vec<stmt_vec_info> scalar_stmts, unsigned nops)
238 : : {
239 : 1144188 : SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
240 : 1144188 : SLP_TREE_CHILDREN (node).create (nops);
241 : 1144188 : SLP_TREE_DEF_TYPE (node) = vect_internal_def;
242 : 1144188 : SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
243 : 1144188 : SLP_TREE_LANES (node) = scalar_stmts.length ();
244 : 1144188 : return node;
245 : : }
246 : :
247 : : /* Create an SLP node for SCALAR_STMTS. */
248 : :
249 : : static slp_tree
250 : 42 : vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
251 : : {
252 : 42 : return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
253 : : }
254 : :
255 : : /* Create an SLP node for OPS. */
256 : :
257 : : static slp_tree
258 : 1112881 : vect_create_new_slp_node (slp_tree node, vec<tree> ops)
259 : : {
260 : 1112881 : SLP_TREE_SCALAR_OPS (node) = ops;
261 : 1112881 : SLP_TREE_DEF_TYPE (node) = vect_external_def;
262 : 0 : SLP_TREE_LANES (node) = ops.length ();
263 : 1112881 : return node;
264 : : }
265 : :
266 : : /* Create an SLP node for OPS. */
267 : :
268 : : static slp_tree
269 : 1112881 : vect_create_new_slp_node (vec<tree> ops)
270 : : {
271 : 1112881 : return vect_create_new_slp_node (new _slp_tree, ops);
272 : : }
273 : :
274 : :
275 : : /* This structure is used in creation of an SLP tree. Each instance
276 : : corresponds to the same operand in a group of scalar stmts in an SLP
277 : : node. */
278 : : typedef struct _slp_oprnd_info
279 : : {
280 : : /* Def-stmts for the operands. */
281 : : vec<stmt_vec_info> def_stmts;
282 : : /* Operands. */
283 : : vec<tree> ops;
284 : : /* Information about the first statement, its vector def-type, type, the
285 : : operand itself in case it's constant, and an indication if it's a pattern
286 : : stmt and gather/scatter info. */
287 : : tree first_op_type;
288 : : enum vect_def_type first_dt;
289 : : bool any_pattern;
290 : : bool first_gs_p;
291 : : gather_scatter_info first_gs_info;
292 : : } *slp_oprnd_info;
293 : :
294 : :
295 : : /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
296 : : operand. */
297 : : static vec<slp_oprnd_info>
298 : 1257259 : vect_create_oprnd_info (int nops, int group_size)
299 : : {
300 : 1257259 : int i;
301 : 1257259 : slp_oprnd_info oprnd_info;
302 : 1257259 : vec<slp_oprnd_info> oprnds_info;
303 : :
304 : 1257259 : oprnds_info.create (nops);
305 : 4282180 : for (i = 0; i < nops; i++)
306 : : {
307 : 1767662 : oprnd_info = XNEW (struct _slp_oprnd_info);
308 : 1767662 : oprnd_info->def_stmts.create (group_size);
309 : 1767662 : oprnd_info->ops.create (group_size);
310 : 1767662 : oprnd_info->first_dt = vect_uninitialized_def;
311 : 1767662 : oprnd_info->first_op_type = NULL_TREE;
312 : 1767662 : oprnd_info->any_pattern = false;
313 : 1767662 : oprnd_info->first_gs_p = false;
314 : 1767662 : oprnds_info.quick_push (oprnd_info);
315 : : }
316 : :
317 : 1257259 : return oprnds_info;
318 : : }
319 : :
320 : :
321 : : /* Free operands info. */
322 : :
323 : : static void
324 : 1257259 : vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
325 : : {
326 : 1257259 : int i;
327 : 1257259 : slp_oprnd_info oprnd_info;
328 : :
329 : 3024921 : FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
330 : : {
331 : 1767662 : oprnd_info->def_stmts.release ();
332 : 1767662 : oprnd_info->ops.release ();
333 : 1767662 : XDELETE (oprnd_info);
334 : : }
335 : :
336 : 1257259 : oprnds_info.release ();
337 : 1257259 : }
338 : :
339 : : /* Return the execution frequency of NODE (so that a higher value indicates
340 : : a "more important" node when optimizing for speed). */
341 : :
342 : : static sreal
343 : 1132183 : vect_slp_node_weight (slp_tree node)
344 : : {
345 : 1132183 : stmt_vec_info stmt_info = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (node));
346 : 1132183 : basic_block bb = gimple_bb (stmt_info->stmt);
347 : 1132183 : return bb->count.to_sreal_scale (ENTRY_BLOCK_PTR_FOR_FN (cfun)->count);
348 : : }
349 : :
350 : : /* Return true if STMTS contains a pattern statement. */
351 : :
352 : : static bool
353 : 26028 : vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
354 : : {
355 : 26028 : stmt_vec_info stmt_info;
356 : 26028 : unsigned int i;
357 : 83888 : FOR_EACH_VEC_ELT (stmts, i, stmt_info)
358 : 60101 : if (is_pattern_stmt_p (stmt_info))
359 : : return true;
360 : : return false;
361 : : }
362 : :
363 : : /* Return true when all lanes in the external or constant NODE have
364 : : the same value. */
365 : :
366 : : static bool
367 : 585388 : vect_slp_tree_uniform_p (slp_tree node)
368 : : {
369 : 585388 : gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
370 : : || SLP_TREE_DEF_TYPE (node) == vect_external_def);
371 : :
372 : : /* Pre-exsting vectors. */
373 : 1024855 : if (SLP_TREE_SCALAR_OPS (node).is_empty ())
374 : : return false;
375 : :
376 : : unsigned i;
377 : : tree op, first = NULL_TREE;
378 : 1343227 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
379 : 1197306 : if (!first)
380 : : first = op;
381 : 611918 : else if (!operand_equal_p (first, op, 0))
382 : : return false;
383 : :
384 : : return true;
385 : : }
386 : :
387 : : /* Find the place of the data-ref in STMT_INFO in the interleaving chain
388 : : that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
389 : : of the chain. */
390 : :
391 : : int
392 : 426114 : vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
393 : : stmt_vec_info first_stmt_info)
394 : : {
395 : 426114 : stmt_vec_info next_stmt_info = first_stmt_info;
396 : 426114 : int result = 0;
397 : :
398 : 426114 : if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
399 : : return -1;
400 : :
401 : 1194171 : do
402 : : {
403 : 1194171 : if (next_stmt_info == stmt_info)
404 : 426114 : return result;
405 : 768057 : next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
406 : 768057 : if (next_stmt_info)
407 : 768057 : result += DR_GROUP_GAP (next_stmt_info);
408 : : }
409 : 768057 : while (next_stmt_info);
410 : :
411 : : return -1;
412 : : }
413 : :
414 : : /* Check whether it is possible to load COUNT elements of type ELT_TYPE
415 : : using the method implemented by duplicate_and_interleave. Return true
416 : : if so, returning the number of intermediate vectors in *NVECTORS_OUT
417 : : (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
418 : : (if nonnull). */
419 : :
420 : : bool
421 : 0 : can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
422 : : tree elt_type, unsigned int *nvectors_out,
423 : : tree *vector_type_out,
424 : : tree *permutes)
425 : : {
426 : 0 : tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
427 : 0 : if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
428 : 0 : return false;
429 : :
430 : 0 : machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
431 : 0 : poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
432 : 0 : unsigned int nvectors = 1;
433 : 0 : for (;;)
434 : : {
435 : 0 : scalar_int_mode int_mode;
436 : 0 : poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
437 : 0 : if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
438 : : {
439 : : /* Get the natural vector type for this SLP group size. */
440 : 0 : tree int_type = build_nonstandard_integer_type
441 : 0 : (GET_MODE_BITSIZE (int_mode), 1);
442 : 0 : tree vector_type
443 : 0 : = get_vectype_for_scalar_type (vinfo, int_type, count);
444 : 0 : poly_int64 half_nelts;
445 : 0 : if (vector_type
446 : 0 : && VECTOR_MODE_P (TYPE_MODE (vector_type))
447 : 0 : && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
448 : : GET_MODE_SIZE (base_vector_mode))
449 : 0 : && multiple_p (GET_MODE_NUNITS (TYPE_MODE (vector_type)),
450 : : 2, &half_nelts))
451 : : {
452 : : /* Try fusing consecutive sequences of COUNT / NVECTORS elements
453 : : together into elements of type INT_TYPE and using the result
454 : : to build NVECTORS vectors. */
455 : 0 : poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
456 : 0 : vec_perm_builder sel1 (nelts, 2, 3);
457 : 0 : vec_perm_builder sel2 (nelts, 2, 3);
458 : :
459 : 0 : for (unsigned int i = 0; i < 3; ++i)
460 : : {
461 : 0 : sel1.quick_push (i);
462 : 0 : sel1.quick_push (i + nelts);
463 : 0 : sel2.quick_push (half_nelts + i);
464 : 0 : sel2.quick_push (half_nelts + i + nelts);
465 : : }
466 : 0 : vec_perm_indices indices1 (sel1, 2, nelts);
467 : 0 : vec_perm_indices indices2 (sel2, 2, nelts);
468 : 0 : machine_mode vmode = TYPE_MODE (vector_type);
469 : 0 : if (can_vec_perm_const_p (vmode, vmode, indices1)
470 : 0 : && can_vec_perm_const_p (vmode, vmode, indices2))
471 : : {
472 : 0 : if (nvectors_out)
473 : 0 : *nvectors_out = nvectors;
474 : 0 : if (vector_type_out)
475 : 0 : *vector_type_out = vector_type;
476 : 0 : if (permutes)
477 : : {
478 : 0 : permutes[0] = vect_gen_perm_mask_checked (vector_type,
479 : : indices1);
480 : 0 : permutes[1] = vect_gen_perm_mask_checked (vector_type,
481 : : indices2);
482 : : }
483 : 0 : return true;
484 : : }
485 : 0 : }
486 : : }
487 : 0 : if (!multiple_p (elt_bytes, 2, &elt_bytes))
488 : : return false;
489 : 0 : nvectors *= 2;
490 : 0 : }
491 : : }
492 : :
493 : : /* Return true if DTA and DTB match. */
494 : :
495 : : static bool
496 : 16345810 : vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
497 : : {
498 : 16345810 : return (dta == dtb
499 : 146751 : || ((dta == vect_external_def || dta == vect_constant_def)
500 : 98894 : && (dtb == vect_external_def || dtb == vect_constant_def)));
501 : : }
502 : :
503 : : static const int cond_expr_maps[3][5] = {
504 : : { 4, -1, -2, 1, 2 },
505 : : { 4, -2, -1, 1, 2 },
506 : : { 4, -1, -2, 2, 1 }
507 : : };
508 : : static const int arg0_map[] = { 1, 0 };
509 : : static const int arg1_map[] = { 1, 1 };
510 : : static const int arg2_map[] = { 1, 2 };
511 : : static const int arg1_arg4_map[] = { 2, 1, 4 };
512 : : static const int arg3_arg2_map[] = { 2, 3, 2 };
513 : : static const int op1_op0_map[] = { 2, 1, 0 };
514 : : static const int off_map[] = { 1, -3 };
515 : : static const int off_op0_map[] = { 2, -3, 0 };
516 : : static const int off_arg2_map[] = { 2, -3, 2 };
517 : : static const int off_arg3_arg2_map[] = { 3, -3, 3, 2 };
518 : : static const int mask_call_maps[6][7] = {
519 : : { 1, 1, },
520 : : { 2, 1, 2, },
521 : : { 3, 1, 2, 3, },
522 : : { 4, 1, 2, 3, 4, },
523 : : { 5, 1, 2, 3, 4, 5, },
524 : : { 6, 1, 2, 3, 4, 5, 6 },
525 : : };
526 : :
527 : : /* For most SLP statements, there is a one-to-one mapping between
528 : : gimple arguments and child nodes. If that is not true for STMT,
529 : : return an array that contains:
530 : :
531 : : - the number of child nodes, followed by
532 : : - for each child node, the index of the argument associated with that node.
533 : : The special index -1 is the first operand of an embedded comparison and
534 : : the special index -2 is the second operand of an embedded comparison.
535 : : The special indes -3 is the offset of a gather as analyzed by
536 : : vect_check_gather_scatter.
537 : :
538 : : SWAP is as for vect_get_and_check_slp_defs. */
539 : :
540 : : static const int *
541 : 14056903 : vect_get_operand_map (const gimple *stmt, bool gather_scatter_p = false,
542 : : unsigned char swap = 0)
543 : : {
544 : 14056903 : if (auto assign = dyn_cast<const gassign *> (stmt))
545 : : {
546 : 13414260 : if (gimple_assign_rhs_code (assign) == COND_EXPR
547 : 13414260 : && COMPARISON_CLASS_P (gimple_assign_rhs1 (assign)))
548 : 1521 : return cond_expr_maps[swap];
549 : 13412739 : if (TREE_CODE_CLASS (gimple_assign_rhs_code (assign)) == tcc_comparison
550 : 13412739 : && swap)
551 : : return op1_op0_map;
552 : 13206896 : if (gather_scatter_p)
553 : 372 : return (TREE_CODE (gimple_assign_lhs (assign)) != SSA_NAME
554 : 372 : ? off_op0_map : off_map);
555 : : }
556 : 13849167 : gcc_assert (!swap);
557 : 13849167 : if (auto call = dyn_cast<const gcall *> (stmt))
558 : : {
559 : 87463 : if (gimple_call_internal_p (call))
560 : 30485 : switch (gimple_call_internal_fn (call))
561 : : {
562 : 60 : case IFN_MASK_LOAD:
563 : 120 : return gather_scatter_p ? off_arg2_map : arg2_map;
564 : :
565 : 0 : case IFN_GATHER_LOAD:
566 : 0 : return arg1_map;
567 : :
568 : 0 : case IFN_MASK_GATHER_LOAD:
569 : 0 : case IFN_MASK_LEN_GATHER_LOAD:
570 : 0 : return arg1_arg4_map;
571 : :
572 : 87 : case IFN_MASK_STORE:
573 : 174 : return gather_scatter_p ? off_arg3_arg2_map : arg3_arg2_map;
574 : :
575 : 51 : case IFN_MASK_CALL:
576 : 51 : {
577 : 51 : unsigned nargs = gimple_call_num_args (call);
578 : 51 : if (nargs >= 2 && nargs <= 7)
579 : 51 : return mask_call_maps[nargs-2];
580 : : else
581 : : return nullptr;
582 : : }
583 : :
584 : 24 : case IFN_CLZ:
585 : 24 : case IFN_CTZ:
586 : 24 : return arg0_map;
587 : :
588 : : default:
589 : : break;
590 : : }
591 : : }
592 : : return nullptr;
593 : : }
594 : :
595 : : /* Return the SLP node child index for operand OP of STMT. */
596 : :
597 : : int
598 : 1137431 : vect_slp_child_index_for_operand (const gimple *stmt, int op,
599 : : bool gather_scatter_p)
600 : : {
601 : 1137431 : const int *opmap = vect_get_operand_map (stmt, gather_scatter_p);
602 : 1137431 : if (!opmap)
603 : : return op;
604 : 117 : for (int i = 1; i < 1 + opmap[0]; ++i)
605 : 117 : if (opmap[i] == op)
606 : 72 : return i - 1;
607 : 0 : gcc_unreachable ();
608 : : }
609 : :
610 : : /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
611 : : they are of a valid type and that they match the defs of the first stmt of
612 : : the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
613 : : by swapping operands of STMTS[STMT_NUM] when possible. Non-zero SWAP
614 : : indicates swap is required for cond_expr stmts. Specifically, SWAP
615 : : is 1 if STMT is cond and operands of comparison need to be swapped;
616 : : SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
617 : :
618 : : If there was a fatal error return -1; if the error could be corrected by
619 : : swapping operands of father node of this one, return 1; if everything is
620 : : ok return 0. */
621 : : static int
622 : 10111686 : vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
623 : : bool *skip_args,
624 : : vec<stmt_vec_info> stmts, unsigned stmt_num,
625 : : vec<slp_oprnd_info> *oprnds_info)
626 : : {
627 : 10111686 : stmt_vec_info stmt_info = stmts[stmt_num];
628 : 10111686 : tree oprnd;
629 : 10111686 : unsigned int i, number_of_oprnds;
630 : 10111686 : enum vect_def_type dt = vect_uninitialized_def;
631 : 10111686 : slp_oprnd_info oprnd_info;
632 : 10111686 : gather_scatter_info gs_info;
633 : 10111686 : unsigned int gs_op = -1u;
634 : 10111686 : unsigned int commutative_op = -1U;
635 : 10111686 : bool first = stmt_num == 0;
636 : :
637 : 10111686 : if (!is_a<gcall *> (stmt_info->stmt)
638 : : && !is_a<gassign *> (stmt_info->stmt)
639 : : && !is_a<gphi *> (stmt_info->stmt))
640 : : return -1;
641 : :
642 : 10111686 : number_of_oprnds = gimple_num_args (stmt_info->stmt);
643 : 10111686 : const int *map
644 : 20223372 : = vect_get_operand_map (stmt_info->stmt,
645 : 10111686 : STMT_VINFO_GATHER_SCATTER_P (stmt_info), swap);
646 : 10111686 : if (map)
647 : 207025 : number_of_oprnds = *map++;
648 : 10111686 : if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
649 : : {
650 : 23443 : if (gimple_call_internal_p (stmt))
651 : : {
652 : 8870 : internal_fn ifn = gimple_call_internal_fn (stmt);
653 : 8870 : commutative_op = first_commutative_argument (ifn);
654 : : }
655 : : }
656 : 10088243 : else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
657 : : {
658 : 11627168 : if (commutative_tree_code (gimple_assign_rhs_code (stmt)))
659 : 10111686 : commutative_op = 0;
660 : : }
661 : :
662 : 10111686 : bool swapped = (swap != 0);
663 : 10111686 : bool backedge = false;
664 : 10111686 : enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
665 : 28304662 : for (i = 0; i < number_of_oprnds; i++)
666 : : {
667 : 18195383 : oprnd_info = (*oprnds_info)[i];
668 : 18195383 : int opno = map ? map[i] : int (i);
669 : 18195383 : if (opno == -3)
670 : : {
671 : 261 : gcc_assert (STMT_VINFO_GATHER_SCATTER_P (stmt_info));
672 : 261 : if (!is_a <loop_vec_info> (vinfo)
673 : 261 : || !vect_check_gather_scatter (stmt_info,
674 : : as_a <loop_vec_info> (vinfo),
675 : : first ? &oprnd_info->first_gs_info
676 : : : &gs_info))
677 : 2407 : return -1;
678 : :
679 : 261 : if (first)
680 : : {
681 : 111 : oprnd_info->first_gs_p = true;
682 : 111 : oprnd = oprnd_info->first_gs_info.offset;
683 : : }
684 : : else
685 : : {
686 : 150 : gs_op = i;
687 : 150 : oprnd = gs_info.offset;
688 : : }
689 : : }
690 : 18195122 : else if (opno < 0)
691 : 1710 : oprnd = TREE_OPERAND (gimple_arg (stmt_info->stmt, 0), -1 - opno);
692 : : else
693 : : {
694 : 18193412 : oprnd = gimple_arg (stmt_info->stmt, opno);
695 : 18193412 : if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
696 : : {
697 : 605924 : edge e = gimple_phi_arg_edge (stmt, opno);
698 : 1211848 : backedge = (is_a <bb_vec_info> (vinfo)
699 : 633366 : ? e->flags & EDGE_DFS_BACK
700 : 27442 : : dominated_by_p (CDI_DOMINATORS, e->src,
701 : 27442 : gimple_bb (stmt_info->stmt)));
702 : : }
703 : : }
704 : 18195383 : if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
705 : 1995 : oprnd = TREE_OPERAND (oprnd, 0);
706 : :
707 : 18195383 : stmt_vec_info def_stmt_info;
708 : 18195383 : if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
709 : : {
710 : 969 : if (dump_enabled_p ())
711 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
712 : : "Build SLP failed: can't analyze def for %T\n",
713 : : oprnd);
714 : :
715 : 969 : return -1;
716 : : }
717 : :
718 : 18194414 : if (skip_args[i])
719 : : {
720 : 18359 : oprnd_info->def_stmts.quick_push (NULL);
721 : 18359 : oprnd_info->ops.quick_push (NULL_TREE);
722 : 18359 : oprnd_info->first_dt = vect_uninitialized_def;
723 : 18359 : continue;
724 : : }
725 : :
726 : 18176055 : oprnd_info->def_stmts.quick_push (def_stmt_info);
727 : 18176055 : oprnd_info->ops.quick_push (oprnd);
728 : :
729 : 18176055 : if (def_stmt_info
730 : 18176055 : && is_pattern_stmt_p (def_stmt_info))
731 : : {
732 : 184896 : if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
733 : : != def_stmt_info)
734 : 147577 : oprnd_info->any_pattern = true;
735 : : else
736 : : /* If we promote this to external use the original stmt def. */
737 : 37319 : oprnd_info->ops.last ()
738 : 74638 : = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
739 : : }
740 : :
741 : : /* If there's a extern def on a backedge make sure we can
742 : : code-generate at the region start.
743 : : ??? This is another case that could be fixed by adjusting
744 : : how we split the function but at the moment we'd have conflicting
745 : : goals there. */
746 : 18176055 : if (backedge
747 : 58754 : && dts[i] == vect_external_def
748 : 1459 : && is_a <bb_vec_info> (vinfo)
749 : 1459 : && TREE_CODE (oprnd) == SSA_NAME
750 : 1438 : && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
751 : 18177493 : && !dominated_by_p (CDI_DOMINATORS,
752 : 1438 : as_a <bb_vec_info> (vinfo)->bbs[0],
753 : 1438 : gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
754 : : {
755 : 1438 : if (dump_enabled_p ())
756 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
757 : : "Build SLP failed: extern def %T only defined "
758 : : "on backedge\n", oprnd);
759 : 1438 : return -1;
760 : : }
761 : :
762 : 18174617 : if (first)
763 : : {
764 : 1761589 : tree type = TREE_TYPE (oprnd);
765 : 1761589 : dt = dts[i];
766 : :
767 : : /* For the swapping logic below force vect_reduction_def
768 : : for the reduction op in a SLP reduction group. */
769 : 1761589 : if (!STMT_VINFO_DATA_REF (stmt_info)
770 : 1096658 : && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
771 : 1176 : && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
772 : 1762171 : && def_stmt_info)
773 : 582 : dts[i] = dt = vect_reduction_def;
774 : :
775 : : /* Check the types of the definition. */
776 : 1761589 : switch (dt)
777 : : {
778 : 1761589 : case vect_external_def:
779 : 1761589 : case vect_constant_def:
780 : 1761589 : case vect_internal_def:
781 : 1761589 : case vect_reduction_def:
782 : 1761589 : case vect_induction_def:
783 : 1761589 : case vect_nested_cycle:
784 : 1761589 : case vect_first_order_recurrence:
785 : 1761589 : break;
786 : :
787 : 0 : default:
788 : : /* FORNOW: Not supported. */
789 : 0 : if (dump_enabled_p ())
790 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
791 : : "Build SLP failed: illegal type of def %T\n",
792 : : oprnd);
793 : 0 : return -1;
794 : : }
795 : :
796 : 1761589 : oprnd_info->first_dt = dt;
797 : 1761589 : oprnd_info->first_op_type = type;
798 : : }
799 : : }
800 : 10109279 : if (first)
801 : : return 0;
802 : :
803 : : /* Now match the operand definition types to that of the first stmt. */
804 : 25211484 : for (i = 0; i < number_of_oprnds;)
805 : : {
806 : 16421452 : if (skip_args[i])
807 : : {
808 : 15141 : ++i;
809 : 15141 : continue;
810 : : }
811 : :
812 : 16406311 : oprnd_info = (*oprnds_info)[i];
813 : 16406311 : dt = dts[i];
814 : 16406311 : stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
815 : 16406311 : oprnd = oprnd_info->ops[stmt_num];
816 : 16406311 : tree type = TREE_TYPE (oprnd);
817 : :
818 : 16406311 : if (!types_compatible_p (oprnd_info->first_op_type, type))
819 : : {
820 : 63333 : if (dump_enabled_p ())
821 : 115 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
822 : : "Build SLP failed: different operand types\n");
823 : 63333 : return 1;
824 : : }
825 : :
826 : 16342978 : if ((gs_op == i) != oprnd_info->first_gs_p)
827 : : {
828 : 0 : if (dump_enabled_p ())
829 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
830 : : "Build SLP failed: mixed gather and non-gather\n");
831 : 0 : return 1;
832 : : }
833 : 16342978 : else if (gs_op == i)
834 : : {
835 : 142 : if (!operand_equal_p (oprnd_info->first_gs_info.base,
836 : 142 : gs_info.base))
837 : : {
838 : 20 : if (dump_enabled_p ())
839 : 8 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
840 : : "Build SLP failed: different gather base\n");
841 : 20 : return 1;
842 : : }
843 : 122 : if (oprnd_info->first_gs_info.scale != gs_info.scale)
844 : : {
845 : 2 : if (dump_enabled_p ())
846 : 2 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
847 : : "Build SLP failed: different gather scale\n");
848 : 2 : return 1;
849 : : }
850 : : }
851 : :
852 : : /* Not first stmt of the group, check that the def-stmt/s match
853 : : the def-stmt/s of the first stmt. Allow different definition
854 : : types for reduction chains: the first stmt must be a
855 : : vect_reduction_def (a phi node), and the rest
856 : : end in the reduction chain. */
857 : 16342956 : if ((!vect_def_types_match (oprnd_info->first_dt, dt)
858 : 129004 : && !(oprnd_info->first_dt == vect_reduction_def
859 : 2013 : && !STMT_VINFO_DATA_REF (stmt_info)
860 : 2013 : && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
861 : 1675 : && def_stmt_info
862 : 1669 : && !STMT_VINFO_DATA_REF (def_stmt_info)
863 : 1663 : && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
864 : : == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
865 : 16215331 : || (!STMT_VINFO_DATA_REF (stmt_info)
866 : 15097845 : && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
867 : 3850 : && ((!def_stmt_info
868 : 3744 : || STMT_VINFO_DATA_REF (def_stmt_info)
869 : 6793 : || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
870 : : != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
871 : 3850 : != (oprnd_info->first_dt != vect_reduction_def))))
872 : : {
873 : : /* Try swapping operands if we got a mismatch. For BB
874 : : vectorization only in case it will clearly improve things. */
875 : 130035 : if (i == commutative_op && !swapped
876 : 127858 : && (!is_a <bb_vec_info> (vinfo)
877 : 1969 : || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
878 : 1969 : dts[i+1])
879 : 620 : && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
880 : : || vect_def_types_match
881 : 265 : ((*oprnds_info)[i+1]->first_dt, dts[i])))))
882 : : {
883 : 2177 : if (dump_enabled_p ())
884 : 351 : dump_printf_loc (MSG_NOTE, vect_location,
885 : : "trying swapped operands\n");
886 : 2177 : std::swap (dts[i], dts[i+1]);
887 : 2177 : std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
888 : 2177 : (*oprnds_info)[i+1]->def_stmts[stmt_num]);
889 : 2177 : std::swap ((*oprnds_info)[i]->ops[stmt_num],
890 : 2177 : (*oprnds_info)[i+1]->ops[stmt_num]);
891 : 2177 : swapped = true;
892 : 2177 : continue;
893 : : }
894 : :
895 : 125681 : if (is_a <bb_vec_info> (vinfo)
896 : 125681 : && !oprnd_info->any_pattern)
897 : : {
898 : : /* Now for commutative ops we should see whether we can
899 : : make the other operand matching. */
900 : 124645 : if (dump_enabled_p ())
901 : 328 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
902 : : "treating operand as external\n");
903 : 124645 : oprnd_info->first_dt = dt = vect_external_def;
904 : : }
905 : : else
906 : : {
907 : 1036 : if (dump_enabled_p ())
908 : 147 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
909 : : "Build SLP failed: different types\n");
910 : 1036 : return 1;
911 : : }
912 : : }
913 : :
914 : : /* Make sure to demote the overall operand to external. */
915 : 16339743 : if (dt == vect_external_def)
916 : 322221 : oprnd_info->first_dt = vect_external_def;
917 : : /* For a SLP reduction chain we want to duplicate the reduction to
918 : : each of the chain members. That gets us a sane SLP graph (still
919 : : the stmts are not 100% correct wrt the initial values). */
920 : 16017522 : else if ((dt == vect_internal_def
921 : 16017522 : || dt == vect_reduction_def)
922 : 15232145 : && oprnd_info->first_dt == vect_reduction_def
923 : 21778 : && !STMT_VINFO_DATA_REF (stmt_info)
924 : 21778 : && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
925 : 1800 : && !STMT_VINFO_DATA_REF (def_stmt_info)
926 : 16019322 : && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
927 : : == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
928 : : {
929 : 1800 : oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
930 : 1800 : oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
931 : : }
932 : :
933 : 16339743 : ++i;
934 : : }
935 : :
936 : : /* Swap operands. */
937 : 8790032 : if (swapped)
938 : : {
939 : 117751 : if (dump_enabled_p ())
940 : 1046 : dump_printf_loc (MSG_NOTE, vect_location,
941 : : "swapped operands to match def types in %G",
942 : : stmt_info->stmt);
943 : : }
944 : :
945 : : return 0;
946 : : }
947 : :
948 : : /* Return true if call statements CALL1 and CALL2 are similar enough
949 : : to be combined into the same SLP group. */
950 : :
951 : : bool
952 : 25219 : compatible_calls_p (gcall *call1, gcall *call2)
953 : : {
954 : 25219 : unsigned int nargs = gimple_call_num_args (call1);
955 : 25219 : if (nargs != gimple_call_num_args (call2))
956 : : return false;
957 : :
958 : 22188 : if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
959 : : return false;
960 : :
961 : 22188 : if (gimple_call_internal_p (call1))
962 : : {
963 : 6722 : if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
964 : 6722 : TREE_TYPE (gimple_call_lhs (call2))))
965 : : return false;
966 : 13541 : for (unsigned int i = 0; i < nargs; ++i)
967 : 6819 : if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
968 : 6819 : TREE_TYPE (gimple_call_arg (call2, i))))
969 : : return false;
970 : : }
971 : : else
972 : : {
973 : 15466 : if (!operand_equal_p (gimple_call_fn (call1),
974 : 15466 : gimple_call_fn (call2), 0))
975 : : return false;
976 : :
977 : 31056 : if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
978 : : return false;
979 : : }
980 : :
981 : : /* Check that any unvectorized arguments are equal. */
982 : 17074 : if (const int *map = vect_get_operand_map (call1))
983 : : {
984 : 15 : unsigned int nkept = *map++;
985 : 15 : unsigned int mapi = 0;
986 : 57 : for (unsigned int i = 0; i < nargs; ++i)
987 : 42 : if (mapi < nkept && map[mapi] == int (i))
988 : 27 : mapi += 1;
989 : 15 : else if (!operand_equal_p (gimple_call_arg (call1, i),
990 : 15 : gimple_call_arg (call2, i)))
991 : : return false;
992 : : }
993 : :
994 : : return true;
995 : : }
996 : :
997 : : /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
998 : : caller's attempt to find the vector type in STMT_INFO with the narrowest
999 : : element type. Return true if VECTYPE is nonnull and if it is valid
1000 : : for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
1001 : : number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
1002 : : vect_build_slp_tree. */
1003 : :
1004 : : static bool
1005 : 13904045 : vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
1006 : : unsigned int group_size,
1007 : : tree vectype, poly_uint64 *max_nunits)
1008 : : {
1009 : 13904045 : if (!vectype)
1010 : : {
1011 : 0 : if (dump_enabled_p ())
1012 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1013 : : "Build SLP failed: unsupported data-type in %G\n",
1014 : : stmt_info->stmt);
1015 : : /* Fatal mismatch. */
1016 : 0 : return false;
1017 : : }
1018 : :
1019 : : /* If populating the vector type requires unrolling then fail
1020 : : before adjusting *max_nunits for basic-block vectorization. */
1021 : 13904045 : if (is_a <bb_vec_info> (vinfo)
1022 : 13904045 : && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
1023 : : {
1024 : 616527 : if (dump_enabled_p ())
1025 : 233 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1026 : : "Build SLP failed: unrolling required "
1027 : : "in basic block SLP\n");
1028 : : /* Fatal mismatch. */
1029 : 616527 : return false;
1030 : : }
1031 : :
1032 : : /* In case of multiple types we need to detect the smallest type. */
1033 : 13287518 : vect_update_max_nunits (max_nunits, vectype);
1034 : 13287518 : return true;
1035 : : }
1036 : :
/* Verify if the scalar stmts STMTS are isomorphic, require data
   permutation or are of unsupported types of operation.  Return
   true if they are, otherwise return false and indicate in *MATCHES
   which stmts are not isomorphic to the first one.  If MATCHES[0]
   is false then this indicates the comparison could not be
   carried out or the stmts will never be vectorized by SLP.

   Note COND_EXPR is possibly isomorphic to another one after swapping its
   operands.  Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
   the first stmt by swapping the two operands of comparison; set SWAP[i]
   to 2 if stmt I is isormorphic to the first stmt by inverting the code
   of comparison.  Take A1 >= B1 ? X1 : Y1 as an exmple, it can be swapped
   to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1.  */

static bool
vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
		       vec<stmt_vec_info> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits, bool *matches,
		       bool *two_operators, tree *node_vectype)
{
  unsigned int i;
  stmt_vec_info first_stmt_info = stmts[0];
  code_helper first_stmt_code = ERROR_MARK;
  code_helper alt_stmt_code = ERROR_MARK;
  code_helper rhs_code = ERROR_MARK;
  code_helper first_cond_code = ERROR_MARK;
  tree lhs;
  bool need_same_oprnds = false;
  tree vectype = NULL_TREE, first_op1 = NULL_TREE;
  stmt_vec_info first_load = NULL, prev_first_load = NULL;
  bool first_stmt_ldst_p = false, ldst_p = false;
  bool first_stmt_phi_p = false, phi_p = false;
  bool maybe_soft_fail = false;
  tree soft_fail_nunits_vectype = NULL_TREE;

  /* For every stmt in NODE find its def stmt/s.  */
  stmt_vec_info stmt_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    {
      gimple *stmt = stmt_info->stmt;
      swap[i] = 0;
      /* Assume lane I mismatches until it survives all checks below.  */
      matches[i] = false;

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);

      /* Fail to vectorize statements marked as unvectorizable, throw
	 or are volatile.  */
      if (!STMT_VINFO_VECTORIZABLE (stmt_info)
	  || stmt_can_throw_internal (cfun, stmt)
	  || gimple_has_volatile_ops (stmt))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: unvectorizable statement %G",
			     stmt);
	  /* ??? For BB vectorization we want to commutate operands in a way
	     to shuffle all unvectorizable defs into one operand and have
	     the other still vectorized.  The following doesn't reliably
	     work for this though but it's the easiest we can do here.  */
	  if (is_a <bb_vec_info> (vinfo) && i != 0)
	    continue;
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      gcall *call_stmt = dyn_cast <gcall *> (stmt);
      lhs = gimple_get_lhs (stmt);
      /* A missing LHS is only OK for internal store functions.  */
      if (lhs == NULL_TREE
	  && (!call_stmt
	      || !gimple_call_internal_p (stmt)
	      || !internal_store_fn_p (gimple_call_internal_fn (stmt))))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: not GIMPLE_ASSIGN nor "
			     "GIMPLE_CALL %G", stmt);
	  if (is_a <bb_vec_info> (vinfo) && i != 0)
	    continue;
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      tree nunits_vectype;
      if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
					   &nunits_vectype, group_size))
	{
	  if (is_a <bb_vec_info> (vinfo) && i != 0)
	    continue;
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}
      /* Record nunits required but continue analysis, producing matches[]
	 as if nunits was not an issue.  This allows splitting of groups
	 to happen.  */
      if (nunits_vectype
	  && !vect_record_max_nunits (vinfo, stmt_info, group_size,
				      nunits_vectype, max_nunits))
	{
	  gcc_assert (is_a <bb_vec_info> (vinfo));
	  maybe_soft_fail = true;
	  soft_fail_nunits_vectype = nunits_vectype;
	}

      gcc_assert (vectype);

      /* Classify the statement: derive RHS_CODE and whether it is a
	 load/store (LDST_P) or a PHI (PHI_P).  */
      if (call_stmt)
	{
	  combined_fn cfn = gimple_call_combined_fn (call_stmt);
	  if (cfn != CFN_LAST && cfn != CFN_MASK_CALL)
	    rhs_code = cfn;
	  else
	    rhs_code = CALL_EXPR;

	  if (cfn == CFN_MASK_LOAD
	      || cfn == CFN_GATHER_LOAD
	      || cfn == CFN_MASK_GATHER_LOAD
	      || cfn == CFN_MASK_LEN_GATHER_LOAD)
	    ldst_p = true;
	  else if (cfn == CFN_MASK_STORE)
	    {
	      ldst_p = true;
	      rhs_code = CFN_MASK_STORE;
	    }
	  else if ((cfn != CFN_LAST
		    && cfn != CFN_MASK_CALL
		    && internal_fn_p (cfn)
		    && !vectorizable_internal_fn_p (as_internal_fn (cfn)))
		   || gimple_call_tail_p (call_stmt)
		   || gimple_call_noreturn_p (call_stmt)
		   || gimple_call_chain (call_stmt))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: unsupported call type %G",
				 (gimple *) call_stmt);
	      if (is_a <bb_vec_info> (vinfo) && i != 0)
		continue;
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }
	}
      else if (gimple_code (stmt) == GIMPLE_PHI)
	{
	  rhs_code = ERROR_MARK;
	  phi_p = true;
	}
      else
	{
	  rhs_code = gimple_assign_rhs_code (stmt);
	  ldst_p = STMT_VINFO_DATA_REF (stmt_info) != nullptr;
	}

      /* Check the operation.  */
      if (i == 0)
	{
	  /* Lane 0 establishes the reference operation all other lanes
	     are compared against.  */
	  *node_vectype = vectype;
	  first_stmt_code = rhs_code;
	  first_stmt_ldst_p = ldst_p;
	  first_stmt_phi_p = phi_p;

	  /* Shift arguments should be equal in all the packed stmts for a
	     vector shift with scalar shift operand.  */
	  if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
	      || rhs_code == LROTATE_EXPR
	      || rhs_code == RROTATE_EXPR)
	    {
	      /* First see if we have a vector/vector shift.  */
	      if (!directly_supported_p (rhs_code, vectype, optab_vector))
		{
		  /* No vector/vector shift, try for a vector/scalar shift.  */
		  if (!directly_supported_p (rhs_code, vectype, optab_scalar))
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
					 "Build SLP failed: "
					 "op not supported by target.\n");
		      if (is_a <bb_vec_info> (vinfo) && i != 0)
			continue;
		      /* Fatal mismatch.  */
		      matches[0] = false;
		      return false;
		    }
		  need_same_oprnds = true;
		  first_op1 = gimple_assign_rhs2 (stmt);
		}
	    }
	  else if (rhs_code == WIDEN_LSHIFT_EXPR)
	    {
	      need_same_oprnds = true;
	      first_op1 = gimple_assign_rhs2 (stmt);
	    }
	  else if (!ldst_p
		   && rhs_code == BIT_FIELD_REF)
	    {
	      tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
	      if (!is_a <bb_vec_info> (vinfo)
		  || TREE_CODE (vec) != SSA_NAME
		  /* When the element types are not compatible we pun the
		     source to the target vectype which requires equal size.  */
		  || ((!VECTOR_TYPE_P (TREE_TYPE (vec))
		       || !types_compatible_p (TREE_TYPE (vectype),
					       TREE_TYPE (TREE_TYPE (vec))))
		      && !operand_equal_p (TYPE_SIZE (vectype),
					   TYPE_SIZE (TREE_TYPE (vec)))))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: "
				     "BIT_FIELD_REF not supported\n");
		  /* Fatal mismatch.  */
		  matches[0] = false;
		  return false;
		}
	    }
	  else if (rhs_code == CFN_DIV_POW2)
	    {
	      need_same_oprnds = true;
	      first_op1 = gimple_call_arg (call_stmt, 1);
	    }
	}
      else
	{
	  /* Remember the first operation code differing from lane 0;
	     a PLUS/MINUS mix against it may still form a two-operator
	     node (checked after the loop).  */
	  if (first_stmt_code != rhs_code
	      && alt_stmt_code == ERROR_MARK)
	    alt_stmt_code = rhs_code;
	  if ((first_stmt_code != rhs_code
	       && (first_stmt_code != IMAGPART_EXPR
		   || rhs_code != REALPART_EXPR)
	       && (first_stmt_code != REALPART_EXPR
		   || rhs_code != IMAGPART_EXPR)
	       /* Handle mismatches in plus/minus by computing both
		  and merging the results.  */
	       && !((first_stmt_code == PLUS_EXPR
		     || first_stmt_code == MINUS_EXPR)
		    && (alt_stmt_code == PLUS_EXPR
			|| alt_stmt_code == MINUS_EXPR)
		    && rhs_code == alt_stmt_code)
	       /* A comparison swapped relative to lane 0 is still
		  isomorphic (operands get swapped later).  */
	       && !(first_stmt_code.is_tree_code ()
		    && rhs_code.is_tree_code ()
		    && (TREE_CODE_CLASS (tree_code (first_stmt_code))
			== tcc_comparison)
		    && (swap_tree_comparison (tree_code (first_stmt_code))
			== tree_code (rhs_code)))
	       /* Different memory-reference shapes within one grouped
		  access are tolerated.  */
	       && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
		    && (first_stmt_code == ARRAY_REF
			|| first_stmt_code == BIT_FIELD_REF
			|| first_stmt_code == INDIRECT_REF
			|| first_stmt_code == COMPONENT_REF
			|| first_stmt_code == MEM_REF)
		    && (rhs_code == ARRAY_REF
			|| rhs_code == BIT_FIELD_REF
			|| rhs_code == INDIRECT_REF
			|| rhs_code == COMPONENT_REF
			|| rhs_code == MEM_REF)))
	      || (ldst_p
		  && (STMT_VINFO_GROUPED_ACCESS (stmt_info)
		      != STMT_VINFO_GROUPED_ACCESS (first_stmt_info)))
	      || (ldst_p
		  && (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
		      != STMT_VINFO_GATHER_SCATTER_P (first_stmt_info)))
	      || first_stmt_ldst_p != ldst_p
	      || first_stmt_phi_p != phi_p)
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: different operation "
				   "in stmt %G", stmt);
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "original stmt %G", first_stmt_info->stmt);
		}
	      /* Mismatch.  */
	      continue;
	    }

	  /* BIT_FIELD_REFs must all extract from the same source.  */
	  if (!ldst_p
	      && first_stmt_code == BIT_FIELD_REF
	      && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
		  != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different BIT_FIELD_REF "
				 "arguments in %G", stmt);
	      /* Mismatch.  */
	      continue;
	    }

	  if (call_stmt
	      && first_stmt_code != CFN_MASK_LOAD
	      && first_stmt_code != CFN_MASK_STORE)
	    {
	      if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
				       call_stmt))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different calls in %G",
				     stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }

	  /* PHIs and possibly-trapping stmts must stay in the block
	     they were in; mixing blocks could change trap behavior.  */
	  if ((phi_p || gimple_could_trap_p (stmt_info->stmt))
	      && (gimple_bb (first_stmt_info->stmt)
		  != gimple_bb (stmt_info->stmt)))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different BB for PHI "
				 "or possibly trapping operation in %G", stmt);
	      /* Mismatch.  */
	      continue;
	    }

	  if (need_same_oprnds)
	    {
	      tree other_op1 = gimple_arg (stmt, 1);
	      if (!operand_equal_p (first_op1, other_op1, 0))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different shift "
				     "arguments in %G", stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }

	  if (!types_compatible_p (vectype, *node_vectype))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different vector type "
				 "in %G", stmt);
	      /* Mismatch.  */
	      continue;
	    }
	}

      /* Grouped store or load.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
	{
	  gcc_assert (ldst_p);
	  if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
	    {
	      /* Store.  */
	      gcc_assert (rhs_code == CFN_MASK_STORE
			  || REFERENCE_CLASS_P (lhs)
			  || DECL_P (lhs));
	    }
	  else
	    {
	      /* Load.  */
	      first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
	      if (prev_first_load)
		{
		  /* Check that there are no loads from different interleaving
		     chains in the same node.  */
		  if (prev_first_load != first_load)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION,
					 vect_location,
					 "Build SLP failed: different "
					 "interleaving chains in one node %G",
					 stmt);
		      /* Mismatch.  */
		      continue;
		    }
		}
	      else
		prev_first_load = first_load;
	    }
	}
      /* Non-grouped store or load.  */
      else if (ldst_p)
	{
	  if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))
	      && rhs_code != CFN_GATHER_LOAD
	      && rhs_code != CFN_MASK_GATHER_LOAD
	      && rhs_code != CFN_MASK_LEN_GATHER_LOAD
	      && !STMT_VINFO_GATHER_SCATTER_P (stmt_info)
	      /* Not grouped loads are handled as externals for BB
		 vectorization.  For loop vectorization we can handle
		 splats the same we handle single element interleaving.  */
	      && (is_a <bb_vec_info> (vinfo)
		  || stmt_info != first_stmt_info))
	    {
	      /* Not grouped load.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: not grouped load %G", stmt);

	      if (i != 0)
		continue;
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }
	}
      /* Not memory operation.  */
      else
	{
	  if (!phi_p
	      && rhs_code.is_tree_code ()
	      && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_binary
	      && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_unary
	      && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_expression
	      && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_comparison
	      && rhs_code != VIEW_CONVERT_EXPR
	      && rhs_code != CALL_EXPR
	      && rhs_code != BIT_FIELD_REF)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: operation unsupported %G",
				 stmt);
	      if (is_a <bb_vec_info> (vinfo) && i != 0)
		continue;
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }

	  if (rhs_code == COND_EXPR)
	    {
	      tree cond_expr = gimple_assign_rhs1 (stmt);
	      enum tree_code cond_code = TREE_CODE (cond_expr);
	      enum tree_code swap_code = ERROR_MARK;
	      enum tree_code invert_code = ERROR_MARK;

	      if (i == 0)
		first_cond_code = TREE_CODE (cond_expr);
	      else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
		{
		  bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
		  swap_code = swap_tree_comparison (cond_code);
		  invert_code = invert_tree_comparison (cond_code, honor_nans);
		}

	      if (first_cond_code == cond_code)
		;
	      /* Isomorphic can be achieved by swapping.  */
	      else if (first_cond_code == swap_code)
		swap[i] = 1;
	      /* Isomorphic can be achieved by inverting.  */
	      else if (first_cond_code == invert_code)
		swap[i] = 2;
	      else
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different"
				     " operation %G", stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }

	  /* A swapped comparison is isomorphic after operand swap.  */
	  if (rhs_code.is_tree_code ()
	      && TREE_CODE_CLASS ((tree_code)rhs_code) == tcc_comparison
	      && (swap_tree_comparison ((tree_code)first_stmt_code)
		  == (tree_code)rhs_code))
	    swap[i] = 1;
	}

      /* Lane I is isomorphic to lane 0.  */
      matches[i] = true;
    }

  /* Fail if any lane did not match the reference lane.  */
  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      return false;

  /* If we allowed a two-operation SLP node verify the target can cope
     with the permute we are going to use.  */
  if (alt_stmt_code != ERROR_MARK
      && (!alt_stmt_code.is_tree_code ()
	  || (TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_reference
	      && TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_comparison)))
    {
      *two_operators = true;
    }

  if (maybe_soft_fail)
    {
      unsigned HOST_WIDE_INT const_nunits;
      if (!TYPE_VECTOR_SUBPARTS
	    (soft_fail_nunits_vectype).is_constant (&const_nunits)
	  || const_nunits > group_size)
	matches[0] = false;
      else
	{
	  /* With constant vector elements simulate a mismatch at the
	     point we need to split.  */
	  unsigned tail = group_size & (const_nunits - 1);
	  memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
	}
      return false;
    }

  return true;
}
1546 : :
/* Traits for the hash_set to record failed SLP builds for a stmt set.
   Note we never remove apart from at destruction time so we do not
   need a special value for deleted that differs from empty.  */
struct bst_traits
{
  typedef vec <stmt_vec_info> value_type;
  typedef vec <stmt_vec_info> compare_type;
  static inline hashval_t hash (value_type);
  static inline bool equal (value_type existing, value_type candidate);
  /* A slot is unoccupied iff its stmt vector was never allocated.  */
  static inline bool is_empty (value_type x) { return !x.exists (); }
  /* Deleted uses the same representation as empty (see comment above).  */
  static inline bool is_deleted (value_type x) { return !x.exists (); }
  static const bool empty_zero_p = true;
  /* Marking a slot empty/deleted frees the owned stmt vector.  */
  static inline void mark_empty (value_type &x) { x.release (); }
  static inline void mark_deleted (value_type &x) { x.release (); }
  /* Called on table destruction; releases the owned stmt vector.  */
  static inline void remove (value_type &x) { x.release (); }
};
1563 : : inline hashval_t
1564 : 25624966 : bst_traits::hash (value_type x)
1565 : : {
1566 : 25624966 : inchash::hash h;
1567 : 548547134 : for (unsigned i = 0; i < x.length (); ++i)
1568 : 248648601 : h.add_int (gimple_uid (x[i]->stmt));
1569 : 25624966 : return h.end ();
1570 : : }
1571 : : inline bool
1572 : 22093586 : bst_traits::equal (value_type existing, value_type candidate)
1573 : : {
1574 : 66280758 : if (existing.length () != candidate.length ())
1575 : : return false;
1576 : 24172757 : for (unsigned i = 0; i < existing.length (); ++i)
1577 : 24065717 : if (existing[i] != candidate[i])
1578 : : return false;
1579 : : return true;
1580 : : }
1581 : :
/* One element of a linearized associatable expression chain.

   ??? This was std::pair<std::pair<tree_code, vect_def_type>, tree>
   but then vec::insert does memmove and that's not compatible with
   std::pair.  */
struct chain_op_t
{
  chain_op_t (tree_code code_, vect_def_type dt_, tree op_)
      : code (code_), dt (dt_), op (op_) {}
  /* Operation applying OP to the chain result (e.g. PLUS_EXPR or
     MINUS_EXPR for an additive chain).  */
  tree_code code;
  /* Vectorizer definition type of OP.  */
  vect_def_type dt;
  /* The chain operand itself.  */
  tree op;
};
1593 : :
1594 : : /* Comparator for sorting associatable chains. */
1595 : :
1596 : : static int
1597 : 7399599 : dt_sort_cmp (const void *op1_, const void *op2_, void *)
1598 : : {
1599 : 7399599 : auto *op1 = (const chain_op_t *) op1_;
1600 : 7399599 : auto *op2 = (const chain_op_t *) op2_;
1601 : 7399599 : if (op1->dt != op2->dt)
1602 : 857165 : return (int)op1->dt - (int)op2->dt;
1603 : 6542434 : return (int)op1->code - (int)op2->code;
1604 : : }
1605 : :
/* Linearize the associatable expression chain at START with the
   associatable operation CODE (where PLUS_EXPR also allows MINUS_EXPR),
   filling CHAIN with the result and using WORKLIST as intermediate storage.
   CODE_STMT and ALT_CODE_STMT are filled with the first stmt using CODE
   or MINUS_EXPR.  *CHAIN_STMTS if not NULL is filled with all computation
   stmts, starting with START.  */

static void
vect_slp_linearize_chain (vec_info *vinfo,
			  vec<std::pair<tree_code, gimple *> > &worklist,
			  vec<chain_op_t> &chain,
			  enum tree_code code, gimple *start,
			  gimple *&code_stmt, gimple *&alt_code_stmt,
			  vec<gimple *> *chain_stmts)
{
  /* For each lane linearize the addition/subtraction (or other
     uniform associatable operation) expression tree.  */
  worklist.safe_push (std::make_pair (code, start));
  while (!worklist.is_empty ())
    {
      /* Each worklist entry pairs a stmt with the effective operation
	 (CODE or MINUS_EXPR) applying its result to the chain.  */
      auto entry = worklist.pop ();
      gassign *stmt = as_a <gassign *> (entry.second);
      enum tree_code in_code = entry.first;
      enum tree_code this_code = gimple_assign_rhs_code (stmt);
      /* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE.  */
      if (!code_stmt
	  && gimple_assign_rhs_code (stmt) == code)
	code_stmt = stmt;
      else if (!alt_code_stmt
	       && gimple_assign_rhs_code (stmt) == MINUS_EXPR)
	alt_code_stmt = stmt;
      if (chain_stmts)
	chain_stmts->safe_push (stmt);
      for (unsigned opnum = 1; opnum <= 2; ++opnum)
	{
	  tree op = gimple_op (stmt, opnum);
	  vect_def_type dt;
	  stmt_vec_info def_stmt_info;
	  bool res = vect_is_simple_use (op, vinfo, &dt, &def_stmt_info);
	  gcc_assert (res);
	  /* For pattern stmts follow the pattern replacement def.  */
	  if (dt == vect_internal_def
	      && is_pattern_stmt_p (def_stmt_info))
	    op = gimple_get_lhs (def_stmt_info->stmt);
	  gimple *use_stmt;
	  use_operand_p use_p;
	  /* Recurse into a single-use internal def that continues the
	     chain with CODE (or MINUS_EXPR when CODE is PLUS_EXPR);
	     otherwise record the operand as a chain leaf.  */
	  if (dt == vect_internal_def
	      && single_imm_use (op, &use_p, &use_stmt)
	      && is_gimple_assign (def_stmt_info->stmt)
	      && (gimple_assign_rhs_code (def_stmt_info->stmt) == code
		  || (code == PLUS_EXPR
		      && (gimple_assign_rhs_code (def_stmt_info->stmt)
			  == MINUS_EXPR))))
	    {
	      tree_code op_def_code = this_code;
	      /* The first operand of a MINUS is added, not subtracted.  */
	      if (op_def_code == MINUS_EXPR && opnum == 1)
		op_def_code = PLUS_EXPR;
	      /* A subtracted sub-expression negates the operation
		 applied to each of its operands.  */
	      if (in_code == MINUS_EXPR)
		op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
	      worklist.safe_push (std::make_pair (op_def_code,
						  def_stmt_info->stmt));
	    }
	  else
	    {
	      tree_code op_def_code = this_code;
	      /* See the sign-propagation comments above.  */
	      if (op_def_code == MINUS_EXPR && opnum == 1)
		op_def_code = PLUS_EXPR;
	      if (in_code == MINUS_EXPR)
		op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
	      chain.safe_push (chain_op_t (op_def_code, dt, op));
	    }
	}
    }
}
1679 : :
1680 : : typedef hash_map <vec <stmt_vec_info>, slp_tree,
1681 : : simple_hashmap_traits <bst_traits, slp_tree> >
1682 : : scalar_stmts_to_slp_tree_map_t;
1683 : :
1684 : : static slp_tree
1685 : : vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1686 : : vec<stmt_vec_info> stmts, unsigned int group_size,
1687 : : poly_uint64 *max_nunits,
1688 : : bool *matches, unsigned *limit, unsigned *tree_size,
1689 : : scalar_stmts_to_slp_tree_map_t *bst_map);
1690 : :
 : : /* Entry point for SLP discovery of the scalar stmt group STMTS of
 : : size GROUP_SIZE.  Memoizes results in BST_MAP: a successful
 : : discovery caches the built tree, a failed one caches the per-lane
 : : failure mask so repeated queries for the same stmt set are cheap.
 : : On success returns the tree with an extra reference taken and
 : : updates *MAX_NUNITS; on failure returns NULL with MATCHES filled
 : : with the (mis)match state of each lane.  *LIMIT bounds the total
 : : number of discovery attempts and is decremented per attempt. */
1691 : : static slp_tree
1692 : 2921197 : vect_build_slp_tree (vec_info *vinfo,
1693 : : vec<stmt_vec_info> stmts, unsigned int group_size,
1694 : : poly_uint64 *max_nunits,
1695 : : bool *matches, unsigned *limit, unsigned *tree_size,
1696 : : scalar_stmts_to_slp_tree_map_t *bst_map)
1697 : : {
 : : /* First consult the cache: a hit may be a previously built tree
 : : or a previously recorded failure for this exact stmt set. */
1698 : 2921197 : if (slp_tree *leader = bst_map->get (stmts))
1699 : : {
1700 : 106694 : if (dump_enabled_p ())
1701 : 787 : dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
1702 : 787 : !(*leader)->failed ? "" : "failed ",
1703 : : (void *) *leader);
1704 : 106694 : if (!(*leader)->failed)
1705 : : {
 : : /* Hand out another reference to the shared tree; the
 : : caller's STMTS vector is no longer needed since the
 : : cached node carries its own copy, so release it here. */
1706 : 60583 : SLP_TREE_REF_COUNT (*leader)++;
1707 : 60583 : vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
1708 : 60583 : stmts.release ();
1709 : 60583 : return *leader;
1710 : : }
 : : /* Cached failure: replay the recorded per-lane mask into
 : : MATCHES so the caller sees the same mismatch info. */
1711 : 46111 : memcpy (matches, (*leader)->failed, sizeof (bool) * group_size);
1712 : 46111 : return NULL;
1713 : : }
1714 : :
1715 : : /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
1716 : : so we can pick up backedge destinations during discovery. */
1717 : 2814503 : slp_tree res = new _slp_tree;
1718 : 2814503 : SLP_TREE_DEF_TYPE (res) = vect_internal_def;
1719 : 2814503 : SLP_TREE_SCALAR_STMTS (res) = stmts;
 : : /* The map owns its own copy of the stmt vector as key. */
1720 : 2814503 : bst_map->put (stmts.copy (), res);
1721 : :
1722 : 2814503 : if (*limit == 0)
1723 : : {
1724 : 23690 : if (dump_enabled_p ())
1725 : 20 : dump_printf_loc (MSG_NOTE, vect_location,
1726 : : "SLP discovery limit exceeded\n");
1727 : : /* Mark the node invalid so we can detect those when still in use
1728 : : as backedge destinations. */
1729 : 23690 : SLP_TREE_SCALAR_STMTS (res) = vNULL;
1730 : 23690 : SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1731 : 23690 : res->failed = XNEWVEC (bool, group_size);
1732 : 23690 : memset (res->failed, 0, sizeof (bool) * group_size);
1733 : 23690 : memset (matches, 0, sizeof (bool) * group_size);
1734 : 23690 : return NULL;
1735 : : }
 : : /* Account this discovery attempt against the global budget. */
1736 : 2790813 : --*limit;
1737 : :
1738 : 2790813 : if (dump_enabled_p ())
1739 : 19254 : dump_printf_loc (MSG_NOTE, vect_location,
1740 : : "starting SLP discovery for node %p\n", (void *) res);
1741 : :
 : : /* Do the actual (recursive) discovery into the stub node. */
1742 : 2790813 : poly_uint64 this_max_nunits = 1;
1743 : 2790813 : slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
1744 : : &this_max_nunits,
1745 : : matches, limit, tree_size, bst_map);
1746 : 2790813 : if (!res_)
1747 : : {
1748 : 1646705 : if (dump_enabled_p ())
1749 : 4885 : dump_printf_loc (MSG_NOTE, vect_location,
1750 : : "SLP discovery for node %p failed\n", (void *) res);
1751 : : /* Mark the node invalid so we can detect those when still in use
1752 : : as backedge destinations. */
1753 : 1646705 : SLP_TREE_SCALAR_STMTS (res) = vNULL;
1754 : 1646705 : SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1755 : 1646705 : res->failed = XNEWVEC (bool, group_size);
1756 : 1646705 : if (flag_checking)
1757 : : {
 : : /* On failure at least one lane must be flagged as
 : : mismatched; verify that under checking. */
1758 : : unsigned i;
1759 : 3133543 : for (i = 0; i < group_size; ++i)
1760 : 3133543 : if (!matches[i])
1761 : : break;
1762 : 1646705 : gcc_assert (i < group_size);
1763 : : }
 : : /* Preserve the failure mask for future cache hits. */
1764 : 1646705 : memcpy (res->failed, matches, sizeof (bool) * group_size);
1765 : : }
1766 : : else
1767 : : {
1768 : 1144108 : if (dump_enabled_p ())
1769 : 14369 : dump_printf_loc (MSG_NOTE, vect_location,
1770 : : "SLP discovery for node %p succeeded\n",
1771 : : (void *) res);
1772 : 1144108 : gcc_assert (res_ == res);
1773 : 1144108 : res->max_nunits = this_max_nunits;
1774 : 1144108 : vect_update_max_nunits (max_nunits, this_max_nunits);
1775 : : /* Keep a reference for the bst_map use. */
1776 : 1144108 : SLP_TREE_REF_COUNT (res)++;
1777 : : }
1778 : : return res_;
1779 : : }
1780 : :
1781 : : /* Helper for building an associated SLP node chain. */
1782 : :
 : : /* Turn PERM into a VEC_PERM_EXPR node of VECTYPE over two freshly
 : : built internal nodes.  Both children take OP0 and OP1 as operands;
 : : CHILD1 is represented by the stmt OPER1 and CHILD2 by OPER2, so
 : : each child computes all lanes with one of the two operations.
 : : LPERM (ownership transferred to PERM) then selects, per lane,
 : : which child supplies that lane's value. */
1783 : : static void
1784 : 162 : vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype,
1785 : : slp_tree op0, slp_tree op1,
1786 : : stmt_vec_info oper1, stmt_vec_info oper2,
1787 : : vec<std::pair<unsigned, unsigned> > lperm)
1788 : : {
1789 : 162 : unsigned group_size = SLP_TREE_LANES (op1);
1790 : :
 : : /* First child: the OPER1 operation applied to OP0/OP1. */
1791 : 162 : slp_tree child1 = new _slp_tree;
1792 : 162 : SLP_TREE_DEF_TYPE (child1) = vect_internal_def;
1793 : 162 : SLP_TREE_VECTYPE (child1) = vectype;
1794 : 162 : SLP_TREE_LANES (child1) = group_size;
1795 : 162 : SLP_TREE_CHILDREN (child1).create (2);
1796 : 162 : SLP_TREE_CHILDREN (child1).quick_push (op0);
1797 : 162 : SLP_TREE_CHILDREN (child1).quick_push (op1);
1798 : 162 : SLP_TREE_REPRESENTATIVE (child1) = oper1;
1799 : :
 : : /* Second child: the OPER2 operation on the same operands.  OP0
 : : and OP1 gain a second user here, so bump their refcounts. */
1800 : 162 : slp_tree child2 = new _slp_tree;
1801 : 162 : SLP_TREE_DEF_TYPE (child2) = vect_internal_def;
1802 : 162 : SLP_TREE_VECTYPE (child2) = vectype;
1803 : 162 : SLP_TREE_LANES (child2) = group_size;
1804 : 162 : SLP_TREE_CHILDREN (child2).create (2);
1805 : 162 : SLP_TREE_CHILDREN (child2).quick_push (op0);
1806 : 162 : SLP_TREE_REF_COUNT (op0)++;
1807 : 162 : SLP_TREE_CHILDREN (child2).quick_push (op1);
1808 : 162 : SLP_TREE_REF_COUNT (op1)++;
1809 : 162 : SLP_TREE_REPRESENTATIVE (child2) = oper2;
1810 : :
 : : /* Finally wire PERM up as the lane-selecting blend of the two. */
1811 : 162 : SLP_TREE_DEF_TYPE (perm) = vect_internal_def;
1812 : 162 : SLP_TREE_CODE (perm) = VEC_PERM_EXPR;
1813 : 162 : SLP_TREE_VECTYPE (perm) = vectype;
1814 : 162 : SLP_TREE_LANES (perm) = group_size;
1815 : : /* ??? We should set this NULL but that's not expected. */
1816 : 162 : SLP_TREE_REPRESENTATIVE (perm) = oper1;
1817 : 162 : SLP_TREE_LANE_PERMUTATION (perm) = lperm;
1818 : 162 : SLP_TREE_CHILDREN (perm).quick_push (child1);
1819 : 162 : SLP_TREE_CHILDREN (perm).quick_push (child2);
1820 : 162 : }
1821 : :
1822 : : /* Recursively build an SLP tree starting from NODE.
1823 : : Fail (and return NULL, with MATCHES recording per lane which
1824 : : scalar stmts mismatched) if def-stmts are not isomorphic,
1825 : : require data permutation or are of unsupported types of
1826 : : operation. Otherwise, return the built SLP tree (equal to
1827 : : NODE on success). */
1828 : :
1829 : : static slp_tree
1830 : 2790813 : vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1831 : : vec<stmt_vec_info> stmts, unsigned int group_size,
1832 : : poly_uint64 *max_nunits,
1833 : : bool *matches, unsigned *limit, unsigned *tree_size,
1834 : : scalar_stmts_to_slp_tree_map_t *bst_map)
1835 : : {
1836 : 2790813 : unsigned nops, i, this_tree_size = 0;
1837 : 2790813 : poly_uint64 this_max_nunits = *max_nunits;
1838 : :
1839 : 2790813 : matches[0] = false;
1840 : :
1841 : 2790813 : stmt_vec_info stmt_info = stmts[0];
1842 : 2790813 : if (!is_a<gcall *> (stmt_info->stmt)
1843 : : && !is_a<gassign *> (stmt_info->stmt)
1844 : : && !is_a<gphi *> (stmt_info->stmt))
1845 : : return NULL;
1846 : :
1847 : 2790712 : nops = gimple_num_args (stmt_info->stmt);
1848 : 2790712 : if (const int *map = vect_get_operand_map (stmt_info->stmt,
1849 : 2790712 : STMT_VINFO_GATHER_SCATTER_P
1850 : : (stmt_info)))
1851 : 846 : nops = map[0];
1852 : :
1853 : : /* If the SLP node is a PHI (induction or reduction), terminate
1854 : : the recursion. */
1855 : 2790712 : bool *skip_args = XALLOCAVEC (bool, nops);
1856 : 2790712 : memset (skip_args, 0, sizeof (bool) * nops);
1857 : 2790712 : if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
1858 : 60872 : if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
1859 : : {
1860 : 2391 : tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1861 : 2391 : tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
1862 : : group_size);
1863 : 2391 : if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
1864 : : max_nunits))
1865 : : return NULL;
1866 : :
1867 : 2391 : vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
1868 : 2391 : if (def_type == vect_induction_def)
1869 : : {
1870 : : /* Induction PHIs are not cycles but walk the initial
1871 : : value. Only for inner loops through, for outer loops
1872 : : we need to pick up the value from the actual PHIs
1873 : : to more easily support peeling and epilogue vectorization. */
1874 : 560 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1875 : 560 : if (!nested_in_vect_loop_p (loop, stmt_info))
1876 : 548 : skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1877 : : else
1878 : : loop = loop->inner;
1879 : 560 : skip_args[loop_latch_edge (loop)->dest_idx] = true;
1880 : : }
1881 : 1831 : else if (def_type == vect_reduction_def
1882 : : || def_type == vect_double_reduction_def
1883 : : || def_type == vect_nested_cycle
1884 : 1831 : || def_type == vect_first_order_recurrence)
1885 : : {
1886 : : /* Else def types have to match. */
1887 : : stmt_vec_info other_info;
1888 : : bool all_same = true;
1889 : 12134 : FOR_EACH_VEC_ELT (stmts, i, other_info)
1890 : : {
1891 : 10350 : if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
1892 : 1417862 : return NULL;
1893 : 10350 : if (other_info != stmt_info)
1894 : 7082 : all_same = false;
1895 : : }
1896 : 1784 : class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1897 : : /* Reduction initial values are not explicitely represented. */
1898 : 1784 : if (def_type != vect_first_order_recurrence
1899 : 1784 : && !nested_in_vect_loop_p (loop, stmt_info))
1900 : 1738 : skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1901 : : /* Reduction chain backedge defs are filled manually.
1902 : : ??? Need a better way to identify a SLP reduction chain PHI.
1903 : : Or a better overall way to SLP match those. */
1904 : 1784 : if (all_same && def_type == vect_reduction_def)
1905 : 372 : skip_args[loop_latch_edge (loop)->dest_idx] = true;
1906 : : }
1907 : 47 : else if (def_type != vect_internal_def)
1908 : : return NULL;
1909 : : }
1910 : :
1911 : :
1912 : 2790712 : bool two_operators = false;
1913 : 2790712 : unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1914 : 2790712 : tree vectype = NULL_TREE;
1915 : 2790712 : if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
1916 : : &this_max_nunits, matches, &two_operators,
1917 : : &vectype))
1918 : : return NULL;
1919 : :
1920 : : /* If the SLP node is a load, terminate the recursion unless masked. */
1921 : 1431064 : if (STMT_VINFO_DATA_REF (stmt_info)
1922 : 835690 : && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1923 : : {
1924 : 170897 : if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1925 : : gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
1926 : : else
1927 : : {
1928 : 170786 : *max_nunits = this_max_nunits;
1929 : 170786 : (*tree_size)++;
1930 : 170786 : node = vect_create_new_slp_node (node, stmts, 0);
1931 : 170786 : SLP_TREE_VECTYPE (node) = vectype;
1932 : : /* And compute the load permutation. Whether it is actually
1933 : : a permutation depends on the unrolling factor which is
1934 : : decided later. */
1935 : 170786 : vec<unsigned> load_permutation;
1936 : 170786 : int j;
1937 : 170786 : stmt_vec_info load_info;
1938 : 170786 : load_permutation.create (group_size);
1939 : 170786 : stmt_vec_info first_stmt_info
1940 : 170786 : = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
1941 : 170786 : bool any_permute = false;
1942 : 597330 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1943 : : {
1944 : 426544 : int load_place;
1945 : 426544 : if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1946 : 421655 : load_place = vect_get_place_in_interleaving_chain
1947 : 421655 : (load_info, first_stmt_info);
1948 : : else
1949 : : load_place = 0;
1950 : 421655 : gcc_assert (load_place != -1);
1951 : 426544 : any_permute |= load_place != j;
1952 : 426544 : load_permutation.quick_push (load_place);
1953 : : }
1954 : :
1955 : 170786 : if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
1956 : : {
1957 : 38 : gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD)
1958 : : || gimple_call_internal_p (stmt, IFN_GATHER_LOAD)
1959 : : || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD)
1960 : : || gimple_call_internal_p (stmt,
1961 : : IFN_MASK_LEN_GATHER_LOAD));
1962 : 38 : load_permutation.release ();
1963 : : /* We cannot handle permuted masked loads, see PR114375. */
1964 : 38 : if (any_permute
1965 : 6 : || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1966 : 6 : && DR_GROUP_SIZE (first_stmt_info) != group_size)
1967 : 44 : || STMT_VINFO_STRIDED_P (stmt_info))
1968 : : {
1969 : 32 : matches[0] = false;
1970 : 170780 : return NULL;
1971 : : }
1972 : : }
1973 : : else
1974 : : {
1975 : 170748 : SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
1976 : 170748 : return node;
1977 : : }
1978 : : }
1979 : : }
1980 : 1260167 : else if (gimple_assign_single_p (stmt_info->stmt)
1981 : 1337858 : && !gimple_vuse (stmt_info->stmt)
1982 : 1264908 : && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
1983 : : {
1984 : : /* vect_build_slp_tree_2 determined all BIT_FIELD_REFs reference
1985 : : the same SSA name vector of a compatible type to vectype. */
1986 : 2726 : vec<std::pair<unsigned, unsigned> > lperm = vNULL;
1987 : 2726 : tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
1988 : 2726 : stmt_vec_info estmt_info;
1989 : 8772 : FOR_EACH_VEC_ELT (stmts, i, estmt_info)
1990 : : {
1991 : 6050 : gassign *estmt = as_a <gassign *> (estmt_info->stmt);
1992 : 6050 : tree bfref = gimple_assign_rhs1 (estmt);
1993 : 6050 : HOST_WIDE_INT lane;
1994 : 6050 : if (!known_eq (bit_field_size (bfref),
1995 : : tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
1996 : 18142 : || !constant_multiple_p (bit_field_offset (bfref),
1997 : 6046 : bit_field_size (bfref), &lane))
1998 : : {
1999 : 4 : lperm.release ();
2000 : 4 : matches[0] = false;
2001 : 4 : return NULL;
2002 : : }
2003 : 6046 : lperm.safe_push (std::make_pair (0, (unsigned)lane));
2004 : : }
2005 : 2722 : slp_tree vnode = vect_create_new_slp_node (vNULL);
2006 : 2722 : if (operand_equal_p (TYPE_SIZE (vectype), TYPE_SIZE (TREE_TYPE (vec))))
2007 : : /* ??? We record vectype here but we hide eventually necessary
2008 : : punning and instead rely on code generation to materialize
2009 : : VIEW_CONVERT_EXPRs as necessary. We instead should make
2010 : : this explicit somehow. */
2011 : 654 : SLP_TREE_VECTYPE (vnode) = vectype;
2012 : : else
2013 : : {
2014 : : /* For different size but compatible elements we can still
2015 : : use VEC_PERM_EXPR without punning. */
2016 : 2068 : gcc_assert (VECTOR_TYPE_P (TREE_TYPE (vec))
2017 : : && types_compatible_p (TREE_TYPE (vectype),
2018 : : TREE_TYPE (TREE_TYPE (vec))));
2019 : 2068 : SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
2020 : : }
2021 : 2722 : auto nunits = TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (vnode));
2022 : 2722 : unsigned HOST_WIDE_INT const_nunits;
2023 : 2722 : if (nunits.is_constant (&const_nunits))
2024 : 2722 : SLP_TREE_LANES (vnode) = const_nunits;
2025 : 2722 : SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
2026 : : /* We are always building a permutation node even if it is an identity
2027 : : permute to shield the rest of the vectorizer from the odd node
2028 : : representing an actual vector without any scalar ops.
2029 : : ??? We could hide it completely with making the permute node
2030 : : external? */
2031 : 2722 : node = vect_create_new_slp_node (node, stmts, 1);
2032 : 2722 : SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2033 : 2722 : SLP_TREE_LANE_PERMUTATION (node) = lperm;
2034 : 2722 : SLP_TREE_VECTYPE (node) = vectype;
2035 : 2722 : SLP_TREE_CHILDREN (node).quick_push (vnode);
2036 : 2722 : return node;
2037 : : }
2038 : : /* When discovery reaches an associatable operation see whether we can
2039 : : improve that to match up lanes in a way superior to the operand
2040 : : swapping code which at most looks at two defs.
2041 : : ??? For BB vectorization we cannot do the brute-force search
2042 : : for matching as we can succeed by means of builds from scalars
2043 : : and have no good way to "cost" one build against another. */
2044 : 1257441 : else if (is_a <loop_vec_info> (vinfo)
2045 : : /* ??? We don't handle !vect_internal_def defs below. */
2046 : 38380 : && STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2047 : 32743 : && is_gimple_assign (stmt_info->stmt)
2048 : 32608 : && (associative_tree_code (gimple_assign_rhs_code (stmt_info->stmt))
2049 : 22846 : || gimple_assign_rhs_code (stmt_info->stmt) == MINUS_EXPR)
2050 : 1268441 : && ((FLOAT_TYPE_P (vectype) && flag_associative_math)
2051 : 7158 : || (INTEGRAL_TYPE_P (TREE_TYPE (vectype))
2052 : 5482 : && TYPE_OVERFLOW_WRAPS (TREE_TYPE (vectype)))))
2053 : : {
2054 : : /* See if we have a chain of (mixed) adds or subtracts or other
2055 : : associatable ops. */
2056 : 6878 : enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
2057 : 6878 : if (code == MINUS_EXPR)
2058 : 520 : code = PLUS_EXPR;
2059 : 6878 : stmt_vec_info other_op_stmt_info = NULL;
2060 : 6878 : stmt_vec_info op_stmt_info = NULL;
2061 : 6878 : unsigned chain_len = 0;
2062 : 6878 : auto_vec<chain_op_t> chain;
2063 : 6878 : auto_vec<std::pair<tree_code, gimple *> > worklist;
2064 : 6878 : auto_vec<vec<chain_op_t> > chains (group_size);
2065 : 6878 : auto_vec<slp_tree, 4> children;
2066 : 6878 : bool hard_fail = true;
2067 : 7665 : for (unsigned lane = 0; lane < group_size; ++lane)
2068 : : {
2069 : : /* For each lane linearize the addition/subtraction (or other
2070 : : uniform associatable operation) expression tree. */
2071 : 7382 : gimple *op_stmt = NULL, *other_op_stmt = NULL;
2072 : 7382 : vect_slp_linearize_chain (vinfo, worklist, chain, code,
2073 : 7382 : stmts[lane]->stmt, op_stmt, other_op_stmt,
2074 : : NULL);
2075 : 7382 : if (!op_stmt_info && op_stmt)
2076 : 6393 : op_stmt_info = vinfo->lookup_stmt (op_stmt);
2077 : 7382 : if (!other_op_stmt_info && other_op_stmt)
2078 : 670 : other_op_stmt_info = vinfo->lookup_stmt (other_op_stmt);
2079 : 7382 : if (chain.length () == 2)
2080 : : {
2081 : : /* In a chain of just two elements resort to the regular
2082 : : operand swapping scheme. If we run into a length
2083 : : mismatch still hard-FAIL. */
2084 : 6595 : if (chain_len == 0)
2085 : : hard_fail = false;
2086 : : else
2087 : : {
2088 : 25 : matches[lane] = false;
2089 : : /* ??? We might want to process the other lanes, but
2090 : : make sure to not give false matching hints to the
2091 : : caller for lanes we did not process. */
2092 : 25 : if (lane != group_size - 1)
2093 : 15 : matches[0] = false;
2094 : : }
2095 : 6595 : break;
2096 : : }
2097 : 787 : else if (chain_len == 0)
2098 : 308 : chain_len = chain.length ();
2099 : 958 : else if (chain.length () != chain_len)
2100 : : {
2101 : : /* ??? Here we could slip in magic to compensate with
2102 : : neutral operands. */
2103 : 0 : matches[lane] = false;
2104 : 0 : if (lane != group_size - 1)
2105 : 0 : matches[0] = false;
2106 : : break;
2107 : : }
2108 : 787 : chains.quick_push (chain.copy ());
2109 : 787 : chain.truncate (0);
2110 : : }
2111 : 13756 : if (chains.length () == group_size)
2112 : : {
2113 : : /* We cannot yet use SLP_TREE_CODE to communicate the operation. */
2114 : 283 : if (!op_stmt_info)
2115 : : {
2116 : 3 : hard_fail = false;
2117 : 3 : goto out;
2118 : : }
2119 : : /* Now we have a set of chains with the same length. */
2120 : : /* 1. pre-sort according to def_type and operation. */
2121 : 1036 : for (unsigned lane = 0; lane < group_size; ++lane)
2122 : 1512 : chains[lane].stablesort (dt_sort_cmp, vinfo);
2123 : 280 : if (dump_enabled_p ())
2124 : : {
2125 : 106 : dump_printf_loc (MSG_NOTE, vect_location,
2126 : : "pre-sorted chains of %s\n",
2127 : : get_tree_code_name (code));
2128 : 471 : for (unsigned lane = 0; lane < group_size; ++lane)
2129 : : {
2130 : 1686 : for (unsigned opnum = 0; opnum < chain_len; ++opnum)
2131 : 2642 : dump_printf (MSG_NOTE, "%s %T ",
2132 : 1321 : get_tree_code_name (chains[lane][opnum].code),
2133 : 1321 : chains[lane][opnum].op);
2134 : 365 : dump_printf (MSG_NOTE, "\n");
2135 : : }
2136 : : }
2137 : : /* 2. try to build children nodes, associating as necessary. */
2138 : 1131 : for (unsigned n = 0; n < chain_len; ++n)
2139 : : {
2140 : 873 : vect_def_type dt = chains[0][n].dt;
2141 : 873 : unsigned lane;
2142 : 3329 : for (lane = 0; lane < group_size; ++lane)
2143 : 2456 : if (chains[lane][n].dt != dt)
2144 : : {
2145 : 0 : if (dt == vect_constant_def
2146 : 0 : && chains[lane][n].dt == vect_external_def)
2147 : : dt = vect_external_def;
2148 : 0 : else if (dt == vect_external_def
2149 : 0 : && chains[lane][n].dt == vect_constant_def)
2150 : : ;
2151 : : else
2152 : : break;
2153 : : }
2154 : 873 : if (lane != group_size)
2155 : : {
2156 : 0 : if (dump_enabled_p ())
2157 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
2158 : : "giving up on chain due to mismatched "
2159 : : "def types\n");
2160 : 0 : matches[lane] = false;
2161 : 0 : if (lane != group_size - 1)
2162 : 0 : matches[0] = false;
2163 : 0 : goto out;
2164 : : }
2165 : 873 : if (dt == vect_constant_def
2166 : 873 : || dt == vect_external_def)
2167 : : {
2168 : : /* Check whether we can build the invariant. If we can't
2169 : : we never will be able to. */
2170 : 90 : tree type = TREE_TYPE (chains[0][n].op);
2171 : 90 : if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
2172 : : && (TREE_CODE (type) == BOOLEAN_TYPE
2173 : : || !can_duplicate_and_interleave_p (vinfo, group_size,
2174 : : type)))
2175 : : {
2176 : : matches[0] = false;
2177 : : goto out;
2178 : : }
2179 : 90 : vec<tree> ops;
2180 : 90 : ops.create (group_size);
2181 : 465 : for (lane = 0; lane < group_size; ++lane)
2182 : 285 : ops.quick_push (chains[lane][n].op);
2183 : 90 : slp_tree child = vect_create_new_slp_node (ops);
2184 : 90 : SLP_TREE_DEF_TYPE (child) = dt;
2185 : 90 : children.safe_push (child);
2186 : : }
2187 : 783 : else if (dt != vect_internal_def)
2188 : : {
2189 : : /* Not sure, we might need sth special.
2190 : : gcc.dg/vect/pr96854.c,
2191 : : gfortran.dg/vect/fast-math-pr37021.f90
2192 : : and gfortran.dg/vect/pr61171.f trigger. */
2193 : : /* Soft-fail for now. */
2194 : 6 : hard_fail = false;
2195 : 6 : goto out;
2196 : : }
2197 : : else
2198 : : {
2199 : 777 : vec<stmt_vec_info> op_stmts;
2200 : 777 : op_stmts.create (group_size);
2201 : 777 : slp_tree child = NULL;
2202 : : /* Brute-force our way. We have to consider a lane
2203 : : failing after fixing an earlier fail up in the
2204 : : SLP discovery recursion. So track the current
2205 : : permute per lane. */
2206 : 777 : unsigned *perms = XALLOCAVEC (unsigned, group_size);
2207 : 777 : memset (perms, 0, sizeof (unsigned) * group_size);
2208 : 832 : do
2209 : : {
2210 : 832 : op_stmts.truncate (0);
2211 : 4041 : for (lane = 0; lane < group_size; ++lane)
2212 : 2377 : op_stmts.quick_push
2213 : 2377 : (vinfo->lookup_def (chains[lane][n].op));
2214 : 832 : child = vect_build_slp_tree (vinfo, op_stmts,
2215 : : group_size, &this_max_nunits,
2216 : : matches, limit,
2217 : : &this_tree_size, bst_map);
2218 : : /* ??? We're likely getting too many fatal mismatches
2219 : : here so maybe we want to ignore them (but then we
2220 : : have no idea which lanes fatally mismatched). */
2221 : 832 : if (child || !matches[0])
2222 : : break;
2223 : : /* Swap another lane we have not yet matched up into
2224 : : lanes that did not match. If we run out of
2225 : : permute possibilities for a lane terminate the
2226 : : search. */
2227 : 246 : bool term = false;
2228 : 246 : for (lane = 1; lane < group_size; ++lane)
2229 : 191 : if (!matches[lane])
2230 : : {
2231 : 135 : if (n + perms[lane] + 1 == chain_len)
2232 : : {
2233 : : term = true;
2234 : : break;
2235 : : }
2236 : 238 : std::swap (chains[lane][n],
2237 : 119 : chains[lane][n + perms[lane] + 1]);
2238 : 119 : perms[lane]++;
2239 : : }
2240 : 71 : if (term)
2241 : : break;
2242 : : }
2243 : : while (1);
2244 : 777 : if (!child)
2245 : : {
2246 : 16 : if (dump_enabled_p ())
2247 : 11 : dump_printf_loc (MSG_NOTE, vect_location,
2248 : : "failed to match up op %d\n", n);
2249 : 16 : op_stmts.release ();
2250 : 16 : if (lane != group_size - 1)
2251 : 12 : matches[0] = false;
2252 : : else
2253 : 4 : matches[lane] = false;
2254 : 16 : goto out;
2255 : : }
2256 : 761 : if (dump_enabled_p ())
2257 : : {
2258 : 293 : dump_printf_loc (MSG_NOTE, vect_location,
2259 : : "matched up op %d to\n", n);
2260 : 293 : vect_print_slp_tree (MSG_NOTE, vect_location, child);
2261 : : }
2262 : 761 : children.safe_push (child);
2263 : : }
2264 : : }
2265 : : /* 3. build SLP nodes to combine the chain. */
2266 : 902 : for (unsigned lane = 0; lane < group_size; ++lane)
2267 : 654 : if (chains[lane][0].code != code)
2268 : : {
2269 : : /* See if there's any alternate all-PLUS entry. */
2270 : : unsigned n;
2271 : 10 : for (n = 1; n < chain_len; ++n)
2272 : : {
2273 : 54 : for (lane = 0; lane < group_size; ++lane)
2274 : 44 : if (chains[lane][n].code != code)
2275 : : break;
2276 : 10 : if (lane == group_size)
2277 : : break;
2278 : : }
2279 : 10 : if (n != chain_len)
2280 : : {
2281 : : /* Swap that in at first position. */
2282 : 10 : std::swap (children[0], children[n]);
2283 : 54 : for (lane = 0; lane < group_size; ++lane)
2284 : 44 : std::swap (chains[lane][0], chains[lane][n]);
2285 : : }
2286 : : else
2287 : : {
2288 : : /* ??? When this triggers and we end up with two
2289 : : vect_constant/external_def up-front things break (ICE)
2290 : : spectacularly finding an insertion place for the
2291 : : all-constant op. We should have a fully
2292 : : vect_internal_def operand though(?) so we can swap
2293 : : that into first place and then prepend the all-zero
2294 : : constant. */
2295 : 0 : if (dump_enabled_p ())
2296 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
2297 : : "inserting constant zero to compensate "
2298 : : "for (partially) negated first "
2299 : : "operand\n");
2300 : 0 : chain_len++;
2301 : 0 : for (lane = 0; lane < group_size; ++lane)
2302 : 0 : chains[lane].safe_insert
2303 : 0 : (0, chain_op_t (code, vect_constant_def, NULL_TREE));
2304 : 0 : vec<tree> zero_ops;
2305 : 0 : zero_ops.create (group_size);
2306 : 0 : zero_ops.quick_push (build_zero_cst (TREE_TYPE (vectype)));
2307 : 0 : for (lane = 1; lane < group_size; ++lane)
2308 : 0 : zero_ops.quick_push (zero_ops[0]);
2309 : 0 : slp_tree zero = vect_create_new_slp_node (zero_ops);
2310 : 0 : SLP_TREE_DEF_TYPE (zero) = vect_constant_def;
2311 : 0 : children.safe_insert (0, zero);
2312 : : }
2313 : : break;
2314 : : }
2315 : 1668 : for (unsigned i = 1; i < children.length (); ++i)
2316 : : {
2317 : 576 : slp_tree op0 = children[i - 1];
2318 : 576 : slp_tree op1 = children[i];
2319 : 576 : bool this_two_op = false;
2320 : 1984 : for (unsigned lane = 0; lane < group_size; ++lane)
2321 : 1570 : if (chains[lane][i].code != chains[0][i].code)
2322 : : {
2323 : : this_two_op = true;
2324 : : break;
2325 : : }
2326 : 576 : slp_tree child;
2327 : 576 : if (i == children.length () - 1)
2328 : 258 : child = vect_create_new_slp_node (node, stmts, 2);
2329 : : else
2330 : 318 : child = vect_create_new_slp_node (2, ERROR_MARK);
2331 : 576 : if (this_two_op)
2332 : : {
2333 : 162 : vec<std::pair<unsigned, unsigned> > lperm;
2334 : 162 : lperm.create (group_size);
2335 : 594 : for (unsigned lane = 0; lane < group_size; ++lane)
2336 : 432 : lperm.quick_push (std::make_pair
2337 : 432 : (chains[lane][i].code != chains[0][i].code, lane));
2338 : 324 : vect_slp_build_two_operator_nodes (child, vectype, op0, op1,
2339 : 162 : (chains[0][i].code == code
2340 : : ? op_stmt_info
2341 : : : other_op_stmt_info),
2342 : 162 : (chains[0][i].code == code
2343 : : ? other_op_stmt_info
2344 : : : op_stmt_info),
2345 : : lperm);
2346 : : }
2347 : : else
2348 : : {
2349 : 414 : SLP_TREE_DEF_TYPE (child) = vect_internal_def;
2350 : 414 : SLP_TREE_VECTYPE (child) = vectype;
2351 : 414 : SLP_TREE_LANES (child) = group_size;
2352 : 414 : SLP_TREE_CHILDREN (child).quick_push (op0);
2353 : 414 : SLP_TREE_CHILDREN (child).quick_push (op1);
2354 : 414 : SLP_TREE_REPRESENTATIVE (child)
2355 : 828 : = (chains[0][i].code == code
2356 : 414 : ? op_stmt_info : other_op_stmt_info);
2357 : : }
2358 : 576 : children[i] = child;
2359 : : }
2360 : 258 : *tree_size += this_tree_size + 1;
2361 : 258 : *max_nunits = this_max_nunits;
2362 : 1245 : while (!chains.is_empty ())
2363 : 688 : chains.pop ().release ();
2364 : : return node;
2365 : : }
2366 : 13215 : out:
2367 : 6637 : while (!children.is_empty ())
2368 : 17 : vect_free_slp_tree (children.pop ());
2369 : 6719 : while (!chains.is_empty ())
2370 : 99 : chains.pop ().release ();
2371 : : /* Hard-fail, otherwise we might run into quadratic processing of the
2372 : : chains starting one stmt into the chain again. */
2373 : 6620 : if (hard_fail)
2374 : : return NULL;
2375 : : /* Fall thru to normal processing. */
2376 : 6878 : }
2377 : :
2378 : : /* Get at the operands, verifying they are compatible. */
2379 : 1257259 : vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
2380 : 1257259 : slp_oprnd_info oprnd_info;
2381 : 11366538 : FOR_EACH_VEC_ELT (stmts, i, stmt_info)
2382 : : {
2383 : 10111686 : int res = vect_get_and_check_slp_defs (vinfo, swap[i], skip_args,
2384 : : stmts, i, &oprnds_info);
2385 : 10111686 : if (res != 0)
2386 : 131189 : matches[(res == -1) ? 0 : i] = false;
2387 : 10111686 : if (!matches[0])
2388 : : break;
2389 : : }
2390 : 11299704 : for (i = 0; i < group_size; ++i)
2391 : 10100558 : if (!matches[i])
2392 : : {
2393 : 58113 : vect_free_oprnd_info (oprnds_info);
2394 : 58113 : return NULL;
2395 : : }
2396 : 1199146 : swap = NULL;
2397 : :
2398 : 2398292 : auto_vec<slp_tree, 4> children;
2399 : :
2400 : 1199146 : stmt_info = stmts[0];
2401 : :
2402 : : /* Create SLP_TREE nodes for the definition node/s. */
2403 : 2744063 : FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2404 : : {
2405 : 1644361 : slp_tree child = nullptr;
2406 : 1644361 : unsigned int j;
2407 : :
2408 : : /* We're skipping certain operands from processing, for example
2409 : : outer loop reduction initial defs. */
2410 : 1644361 : if (skip_args[i])
2411 : : {
2412 : 3218 : children.safe_push (NULL);
2413 : 1544917 : continue;
2414 : : }
2415 : :
2416 : 1641143 : if (oprnd_info->first_dt == vect_uninitialized_def)
2417 : : {
2418 : : /* COND_EXPR have one too many eventually if the condition
2419 : : is a SSA name. */
2420 : 0 : gcc_assert (i == 3 && nops == 4);
2421 : 0 : continue;
2422 : : }
2423 : :
2424 : 1641143 : if (is_a <bb_vec_info> (vinfo)
2425 : 1592873 : && oprnd_info->first_dt == vect_internal_def
2426 : 2520495 : && !oprnd_info->any_pattern)
2427 : : {
2428 : : /* For BB vectorization, if all defs are the same do not
2429 : : bother to continue the build along the single-lane
2430 : : graph but use a splat of the scalar value. */
2431 : 835335 : stmt_vec_info first_def = oprnd_info->def_stmts[0];
2432 : 892530 : for (j = 1; j < group_size; ++j)
2433 : 845116 : if (oprnd_info->def_stmts[j] != first_def)
2434 : : break;
2435 : 835335 : if (j == group_size
2436 : : /* But avoid doing this for loads where we may be
2437 : : able to CSE things, unless the stmt is not
2438 : : vectorizable. */
2439 : 835335 : && (!STMT_VINFO_VECTORIZABLE (first_def)
2440 : 59242 : || !gimple_vuse (first_def->stmt)))
2441 : : {
2442 : 37643 : if (dump_enabled_p ())
2443 : 101 : dump_printf_loc (MSG_NOTE, vect_location,
2444 : : "Using a splat of the uniform operand %G",
2445 : : first_def->stmt);
2446 : 37643 : oprnd_info->first_dt = vect_external_def;
2447 : : }
2448 : : }
2449 : :
2450 : 1641143 : if (oprnd_info->first_dt == vect_external_def
2451 : 1641143 : || oprnd_info->first_dt == vect_constant_def)
2452 : : {
2453 : 759368 : if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ())
2454 : : {
2455 : : tree op0;
2456 : : tree uniform_val = op0 = oprnd_info->ops[0];
2457 : : for (j = 1; j < oprnd_info->ops.length (); ++j)
2458 : : if (!operand_equal_p (uniform_val, oprnd_info->ops[j]))
2459 : : {
2460 : : uniform_val = NULL_TREE;
2461 : : break;
2462 : : }
2463 : : if (!uniform_val
2464 : : && !can_duplicate_and_interleave_p (vinfo,
2465 : : oprnd_info->ops.length (),
2466 : : TREE_TYPE (op0)))
2467 : : {
2468 : : matches[j] = false;
2469 : : if (dump_enabled_p ())
2470 : : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2471 : : "Build SLP failed: invalid type of def "
2472 : : "for variable-length SLP %T\n", op0);
2473 : : goto fail;
2474 : : }
2475 : : }
2476 : 759368 : slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
2477 : 759368 : SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
2478 : 759368 : oprnd_info->ops = vNULL;
2479 : 759368 : children.safe_push (invnode);
2480 : 759368 : continue;
2481 : 759368 : }
2482 : :
2483 : 881775 : if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2484 : : group_size, &this_max_nunits,
2485 : : matches, limit,
2486 : : &this_tree_size, bst_map)) != NULL)
2487 : : {
2488 : 428361 : oprnd_info->def_stmts = vNULL;
2489 : 428361 : children.safe_push (child);
2490 : 428361 : continue;
2491 : : }
2492 : :
2493 : : /* If the SLP build for operand zero failed and operand zero
2494 : : and one can be commutated try that for the scalar stmts
2495 : : that failed the match. */
2496 : 453414 : if (i == 0
2497 : : /* A first scalar stmt mismatch signals a fatal mismatch. */
2498 : 315808 : && matches[0]
2499 : : /* ??? For COND_EXPRs we can swap the comparison operands
2500 : : as well as the arms under some constraints. */
2501 : 193123 : && nops == 2
2502 : 110983 : && oprnds_info[1]->first_dt == vect_internal_def
2503 : 67230 : && is_gimple_assign (stmt_info->stmt)
2504 : : /* Swapping operands for reductions breaks assumptions later on. */
2505 : 52048 : && STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
2506 : 505450 : && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2507 : : {
2508 : : /* See whether we can swap the matching or the non-matching
2509 : : stmt operands. */
2510 : : bool swap_not_matching = true;
2511 : 65091 : do
2512 : : {
2513 : 7052660 : for (j = 0; j < group_size; ++j)
2514 : : {
2515 : 7011941 : if (matches[j] != !swap_not_matching)
2516 : 72051 : continue;
2517 : 6939890 : stmt_vec_info stmt_info = stmts[j];
2518 : : /* Verify if we can swap operands of this stmt. */
2519 : 6939890 : gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
2520 : 6939890 : if (!stmt
2521 : 6939890 : || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
2522 : : {
2523 : 24372 : if (!swap_not_matching)
2524 : 11317 : goto fail;
2525 : : swap_not_matching = false;
2526 : : break;
2527 : : }
2528 : : }
2529 : : }
2530 : 53774 : while (j != group_size);
2531 : :
2532 : : /* Swap mismatched definition stmts. */
2533 : 40719 : if (dump_enabled_p ())
2534 : 295 : dump_printf_loc (MSG_NOTE, vect_location,
2535 : : "Re-trying with swapped operands of stmts ");
2536 : 7012986 : for (j = 0; j < group_size; ++j)
2537 : 6972267 : if (matches[j] == !swap_not_matching)
2538 : : {
2539 : 13829272 : std::swap (oprnds_info[0]->def_stmts[j],
2540 : 6914636 : oprnds_info[1]->def_stmts[j]);
2541 : 13829272 : std::swap (oprnds_info[0]->ops[j],
2542 : 6914636 : oprnds_info[1]->ops[j]);
2543 : 6914636 : if (dump_enabled_p ())
2544 : 952 : dump_printf (MSG_NOTE, "%d ", j);
2545 : : }
2546 : 40719 : if (dump_enabled_p ())
2547 : 295 : dump_printf (MSG_NOTE, "\n");
2548 : : /* After swapping some operands we lost track whether an
2549 : : operand has any pattern defs so be conservative here. */
2550 : 78594 : if (oprnds_info[0]->any_pattern || oprnds_info[1]->any_pattern)
2551 : 2856 : oprnds_info[0]->any_pattern = oprnds_info[1]->any_pattern = true;
2552 : : /* And try again with scratch 'matches' ... */
2553 : 40719 : bool *tem = XALLOCAVEC (bool, group_size);
2554 : 40719 : if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2555 : : group_size, &this_max_nunits,
2556 : : tem, limit,
2557 : : &this_tree_size, bst_map)) != NULL)
2558 : : {
2559 : 4665 : oprnd_info->def_stmts = vNULL;
2560 : 4665 : children.safe_push (child);
2561 : 4665 : continue;
2562 : : }
2563 : : }
2564 : 448749 : fail:
2565 : :
2566 : : /* If the SLP build failed and we analyze a basic-block
2567 : : simply treat nodes we fail to build as externally defined
2568 : : (and thus build vectors from the scalar defs).
2569 : : The cost model will reject outright expensive cases.
2570 : : ??? This doesn't treat cases where permutation ultimatively
2571 : : fails (or we don't try permutation below). Ideally we'd
2572 : : even compute a permutation that will end up with the maximum
2573 : : SLP tree size... */
2574 : 448749 : if (is_a <bb_vec_info> (vinfo)
2575 : : /* ??? Rejecting patterns this way doesn't work. We'd have to
2576 : : do extra work to cancel the pattern so the uses see the
2577 : : scalar version. */
2578 : 437453 : && !is_pattern_stmt_p (stmt_info)
2579 : 862255 : && !oprnd_info->any_pattern)
2580 : : {
2581 : : /* But if there's a leading vector sized set of matching stmts
2582 : : fail here so we can split the group. This matches the condition
2583 : : vect_analyze_slp_instance uses. */
2584 : : /* ??? We might want to split here and combine the results to support
2585 : : multiple vector sizes better. */
2586 : 676828 : for (j = 0; j < group_size; ++j)
2587 : 676828 : if (!matches[j])
2588 : : break;
2589 : 413080 : if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype)))
2590 : : {
2591 : 349305 : if (dump_enabled_p ())
2592 : 474 : dump_printf_loc (MSG_NOTE, vect_location,
2593 : : "Building vector operands from scalars\n");
2594 : 349305 : this_tree_size++;
2595 : 349305 : child = vect_create_new_slp_node (oprnd_info->ops);
2596 : 349305 : children.safe_push (child);
2597 : 349305 : oprnd_info->ops = vNULL;
2598 : 349305 : continue;
2599 : : }
2600 : : }
2601 : :
2602 : 99444 : gcc_assert (child == NULL);
2603 : 111653 : FOR_EACH_VEC_ELT (children, j, child)
2604 : 12209 : if (child)
2605 : 12209 : vect_free_slp_tree (child);
2606 : 99444 : vect_free_oprnd_info (oprnds_info);
2607 : 99444 : return NULL;
2608 : : }
2609 : :
2610 : 1099702 : vect_free_oprnd_info (oprnds_info);
2611 : :
2612 : : /* If we have all children of a child built up from uniform scalars
2613 : : or does more than one possibly expensive vector construction then
2614 : : just throw that away, causing it built up from scalars.
2615 : : The exception is the SLP node for the vector store. */
2616 : 1099702 : if (is_a <bb_vec_info> (vinfo)
2617 : 1073991 : && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
2618 : : /* ??? Rejecting patterns this way doesn't work. We'd have to
2619 : : do extra work to cancel the pattern so the uses see the
2620 : : scalar version. */
2621 : 1546283 : && !is_pattern_stmt_p (stmt_info))
2622 : : {
2623 : : slp_tree child;
2624 : : unsigned j;
2625 : : bool all_uniform_p = true;
2626 : : unsigned n_vector_builds = 0;
2627 : 1237940 : FOR_EACH_VEC_ELT (children, j, child)
2628 : : {
2629 : 817699 : if (!child)
2630 : : ;
2631 : 817699 : else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2632 : : all_uniform_p = false;
2633 : 582894 : else if (!vect_slp_tree_uniform_p (child))
2634 : : {
2635 : 438071 : all_uniform_p = false;
2636 : 438071 : if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
2637 : 406620 : n_vector_builds++;
2638 : : }
2639 : : }
2640 : 420241 : if (all_uniform_p
2641 : 420241 : || n_vector_builds > 1
2642 : 717256 : || (n_vector_builds == children.length ()
2643 : 32437 : && is_a <gphi *> (stmt_info->stmt)))
2644 : : {
2645 : : /* Roll back. */
2646 : 129322 : matches[0] = false;
2647 : 401231 : FOR_EACH_VEC_ELT (children, j, child)
2648 : 271909 : if (child)
2649 : 271909 : vect_free_slp_tree (child);
2650 : :
2651 : 129322 : if (dump_enabled_p ())
2652 : 122 : dump_printf_loc (MSG_NOTE, vect_location,
2653 : : "Building parent vector operands from "
2654 : : "scalars instead\n");
2655 : 129322 : return NULL;
2656 : : }
2657 : : }
2658 : :
2659 : 970380 : *tree_size += this_tree_size + 1;
2660 : 970380 : *max_nunits = this_max_nunits;
2661 : :
2662 : 970380 : if (two_operators)
2663 : : {
2664 : : /* ??? We'd likely want to either cache in bst_map sth like
2665 : : { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
2666 : : the true { a+b, a+b, a+b, a+b } ... but there we don't have
2667 : : explicit stmts to put in so the keying on 'stmts' doesn't
2668 : : work (but we have the same issue with nodes that use 'ops'). */
2669 : 4815 : slp_tree one = new _slp_tree;
2670 : 4815 : slp_tree two = new _slp_tree;
2671 : 4815 : SLP_TREE_DEF_TYPE (one) = vect_internal_def;
2672 : 4815 : SLP_TREE_DEF_TYPE (two) = vect_internal_def;
2673 : 4815 : SLP_TREE_VECTYPE (one) = vectype;
2674 : 4815 : SLP_TREE_VECTYPE (two) = vectype;
2675 : 4815 : SLP_TREE_CHILDREN (one).safe_splice (children);
2676 : 4815 : SLP_TREE_CHILDREN (two).safe_splice (children);
2677 : 4815 : slp_tree child;
2678 : 19260 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
2679 : 9630 : SLP_TREE_REF_COUNT (child)++;
2680 : :
2681 : : /* Here we record the original defs since this
2682 : : node represents the final lane configuration. */
2683 : 4815 : node = vect_create_new_slp_node (node, stmts, 2);
2684 : 4815 : SLP_TREE_VECTYPE (node) = vectype;
2685 : 4815 : SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2686 : 4815 : SLP_TREE_CHILDREN (node).quick_push (one);
2687 : 4815 : SLP_TREE_CHILDREN (node).quick_push (two);
2688 : 4815 : gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
2689 : 4815 : enum tree_code code0 = gimple_assign_rhs_code (stmt);
2690 : 4815 : enum tree_code ocode = ERROR_MARK;
2691 : 4815 : stmt_vec_info ostmt_info;
2692 : 4815 : unsigned j = 0;
2693 : 17215 : FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
2694 : : {
2695 : 12400 : gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
2696 : 12400 : if (gimple_assign_rhs_code (ostmt) != code0)
2697 : : {
2698 : 6203 : SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
2699 : 6203 : ocode = gimple_assign_rhs_code (ostmt);
2700 : 6203 : j = i;
2701 : : }
2702 : : else
2703 : 6197 : SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
2704 : : }
2705 : 4815 : SLP_TREE_CODE (one) = code0;
2706 : 4815 : SLP_TREE_CODE (two) = ocode;
2707 : 4815 : SLP_TREE_LANES (one) = stmts.length ();
2708 : 4815 : SLP_TREE_LANES (two) = stmts.length ();
2709 : 4815 : SLP_TREE_REPRESENTATIVE (one) = stmts[0];
2710 : 4815 : SLP_TREE_REPRESENTATIVE (two) = stmts[j];
2711 : 4815 : return node;
2712 : : }
2713 : :
2714 : 965565 : node = vect_create_new_slp_node (node, stmts, nops);
2715 : 965565 : SLP_TREE_VECTYPE (node) = vectype;
2716 : 965565 : SLP_TREE_CHILDREN (node).splice (children);
2717 : 965565 : return node;
2718 : : }
2719 : :
2720 : : /* Dump a single SLP tree NODE. */
2721 : :
2722 : : static void
2723 : 32725 : vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
2724 : : slp_tree node)
2725 : : {
2726 : 32725 : unsigned i, j;
2727 : 32725 : slp_tree child;
2728 : 32725 : stmt_vec_info stmt_info;
2729 : 32725 : tree op;
2730 : :
2731 : 32725 : dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
2732 : 32725 : dump_user_location_t user_loc = loc.get_user_location ();
2733 : 32725 : dump_printf_loc (metadata, user_loc,
2734 : : "node%s %p (max_nunits=" HOST_WIDE_INT_PRINT_UNSIGNED
2735 : : ", refcnt=%u)",
2736 : 32725 : SLP_TREE_DEF_TYPE (node) == vect_external_def
2737 : : ? " (external)"
2738 : : : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
2739 : 30493 : ? " (constant)"
2740 : : : ""), (void *) node,
2741 : 32725 : estimated_poly_value (node->max_nunits),
2742 : : SLP_TREE_REF_COUNT (node));
2743 : 32725 : if (SLP_TREE_VECTYPE (node))
2744 : 29247 : dump_printf (metadata, " %T", SLP_TREE_VECTYPE (node));
2745 : 32725 : dump_printf (metadata, "\n");
2746 : 32725 : if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
2747 : : {
2748 : 26977 : if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
2749 : 999 : dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
2750 : : else
2751 : 25978 : dump_printf_loc (metadata, user_loc, "op template: %G",
2752 : 25978 : SLP_TREE_REPRESENTATIVE (node)->stmt);
2753 : : }
2754 : 32725 : if (SLP_TREE_SCALAR_STMTS (node).exists ())
2755 : 142411 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2756 : 117305 : dump_printf_loc (metadata, user_loc, "\tstmt %u %G", i, stmt_info->stmt);
2757 : : else
2758 : : {
2759 : 7619 : dump_printf_loc (metadata, user_loc, "\t{ ");
2760 : 39945 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
2761 : 24707 : dump_printf (metadata, "%T%s ", op,
2762 : 24707 : i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
2763 : 7619 : dump_printf (metadata, "}\n");
2764 : : }
2765 : 32725 : if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2766 : : {
2767 : 5503 : dump_printf_loc (metadata, user_loc, "\tload permutation {");
2768 : 35892 : FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
2769 : 24886 : dump_printf (dump_kind, " %u", j);
2770 : 5503 : dump_printf (dump_kind, " }\n");
2771 : : }
2772 : 32725 : if (SLP_TREE_LANE_PERMUTATION (node).exists ())
2773 : : {
2774 : 1027 : dump_printf_loc (metadata, user_loc, "\tlane permutation {");
2775 : 13949 : for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
2776 : 5434 : dump_printf (dump_kind, " %u[%u]",
2777 : 5434 : SLP_TREE_LANE_PERMUTATION (node)[i].first,
2778 : 5434 : SLP_TREE_LANE_PERMUTATION (node)[i].second);
2779 : 1027 : dump_printf (dump_kind, " }\n");
2780 : : }
2781 : 32725 : if (SLP_TREE_CHILDREN (node).is_empty ())
2782 : 13318 : return;
2783 : 19407 : dump_printf_loc (metadata, user_loc, "\tchildren");
2784 : 69103 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2785 : 30289 : dump_printf (dump_kind, " %p", (void *)child);
2786 : 19407 : dump_printf (dump_kind, "\n");
2787 : : }
2788 : :
2789 : : DEBUG_FUNCTION void
2790 : 0 : debug (slp_tree node)
2791 : : {
2792 : 0 : debug_dump_context ctx;
2793 : 0 : vect_print_slp_tree (MSG_NOTE,
2794 : : dump_location_t::from_location_t (UNKNOWN_LOCATION),
2795 : : node);
2796 : 0 : }
2797 : :
2798 : : /* Recursive helper for the dot producer below. */
2799 : :
2800 : : static void
2801 : 0 : dot_slp_tree (FILE *f, slp_tree node, hash_set<slp_tree> &visited)
2802 : : {
2803 : 0 : if (visited.add (node))
2804 : : return;
2805 : :
2806 : 0 : fprintf (f, "\"%p\" [label=\"", (void *)node);
2807 : 0 : vect_print_slp_tree (MSG_NOTE,
2808 : : dump_location_t::from_location_t (UNKNOWN_LOCATION),
2809 : : node);
2810 : 0 : fprintf (f, "\"];\n");
2811 : :
2812 : :
2813 : 0 : for (slp_tree child : SLP_TREE_CHILDREN (node))
2814 : 0 : fprintf (f, "\"%p\" -> \"%p\";", (void *)node, (void *)child);
2815 : :
2816 : 0 : for (slp_tree child : SLP_TREE_CHILDREN (node))
2817 : 0 : if (child)
2818 : 0 : dot_slp_tree (f, child, visited);
2819 : : }
2820 : :
2821 : : DEBUG_FUNCTION void
2822 : 0 : dot_slp_tree (const char *fname, slp_tree node)
2823 : : {
2824 : 0 : FILE *f = fopen (fname, "w");
2825 : 0 : fprintf (f, "digraph {\n");
2826 : 0 : fflush (f);
2827 : 0 : {
2828 : 0 : debug_dump_context ctx (f);
2829 : 0 : hash_set<slp_tree> visited;
2830 : 0 : dot_slp_tree (f, node, visited);
2831 : 0 : }
2832 : 0 : fflush (f);
2833 : 0 : fprintf (f, "}\n");
2834 : 0 : fclose (f);
2835 : 0 : }
2836 : :
2837 : : /* Dump a slp tree NODE using flags specified in DUMP_KIND. */
2838 : :
2839 : : static void
2840 : 35363 : vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2841 : : slp_tree node, hash_set<slp_tree> &visited)
2842 : : {
2843 : 35363 : unsigned i;
2844 : 35363 : slp_tree child;
2845 : :
2846 : 35363 : if (visited.add (node))
2847 : 35363 : return;
2848 : :
2849 : 32432 : vect_print_slp_tree (dump_kind, loc, node);
2850 : :
2851 : 94804 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2852 : 29940 : if (child)
2853 : 29150 : vect_print_slp_graph (dump_kind, loc, child, visited);
2854 : : }
2855 : :
2856 : : static void
2857 : 6091 : vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
2858 : : slp_tree entry)
2859 : : {
2860 : 6091 : hash_set<slp_tree> visited;
2861 : 6091 : vect_print_slp_graph (dump_kind, loc, entry, visited);
2862 : 6091 : }
2863 : :
2864 : : /* Mark the tree rooted at NODE with PURE_SLP. */
2865 : :
2866 : : static void
2867 : 2301587 : vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
2868 : : {
2869 : 2301587 : int i;
2870 : 2301587 : stmt_vec_info stmt_info;
2871 : 2301587 : slp_tree child;
2872 : :
2873 : 2301587 : if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2874 : : return;
2875 : :
2876 : 1384141 : if (visited.add (node))
2877 : : return;
2878 : :
2879 : 4313064 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2880 : 3026855 : STMT_SLP_TYPE (stmt_info) = pure_slp;
2881 : :
2882 : 2827648 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2883 : 1541439 : if (child)
2884 : 1539081 : vect_mark_slp_stmts (child, visited);
2885 : : }
2886 : :
2887 : : static void
2888 : 762506 : vect_mark_slp_stmts (slp_tree node)
2889 : : {
2890 : 762506 : hash_set<slp_tree> visited;
2891 : 762506 : vect_mark_slp_stmts (node, visited);
2892 : 762506 : }
2893 : :
2894 : : /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
2895 : :
2896 : : static void
2897 : 2251485 : vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
2898 : : {
2899 : 2251485 : int i;
2900 : 2251485 : stmt_vec_info stmt_info;
2901 : 2251485 : slp_tree child;
2902 : :
2903 : 2251485 : if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2904 : : return;
2905 : :
2906 : 1342185 : if (visited.add (node))
2907 : : return;
2908 : :
2909 : 4145980 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2910 : : {
2911 : 2898484 : gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
2912 : : || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
2913 : 2898484 : STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
2914 : : }
2915 : :
2916 : 2748077 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2917 : 1500581 : if (child)
2918 : 1500581 : vect_mark_slp_stmts_relevant (child, visited);
2919 : : }
2920 : :
2921 : : static void
2922 : 750904 : vect_mark_slp_stmts_relevant (slp_tree node)
2923 : : {
2924 : 750904 : hash_set<slp_tree> visited;
2925 : 750904 : vect_mark_slp_stmts_relevant (node, visited);
2926 : 750904 : }
2927 : :
2928 : :
2929 : : /* Gather loads in the SLP graph NODE and populate the INST loads array. */
2930 : :
2931 : : static void
2932 : 2340374 : vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
2933 : : hash_set<slp_tree> &visited)
2934 : : {
2935 : 2340374 : if (!node || visited.add (node))
2936 : 128372 : return;
2937 : :
2938 : 2212002 : if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
2939 : : return;
2940 : :
2941 : 1309476 : if (SLP_TREE_CODE (node) != VEC_PERM_EXPR)
2942 : : {
2943 : 1279041 : stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
2944 : 1279041 : if (STMT_VINFO_DATA_REF (stmt_info)
2945 : 822862 : && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2946 : 185105 : loads.safe_push (node);
2947 : : }
2948 : :
2949 : : unsigned i;
2950 : : slp_tree child;
2951 : 2878966 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2952 : 1569490 : vect_gather_slp_loads (loads, child, visited);
2953 : : }
2954 : :
2955 : :
2956 : : /* Find the last store in SLP INSTANCE. */
2957 : :
2958 : : stmt_vec_info
2959 : 2605070 : vect_find_last_scalar_stmt_in_slp (slp_tree node)
2960 : : {
2961 : 2605070 : stmt_vec_info last = NULL;
2962 : 2605070 : stmt_vec_info stmt_vinfo;
2963 : :
2964 : 9385067 : for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2965 : : {
2966 : 6779997 : stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2967 : 6779997 : last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
2968 : : }
2969 : :
2970 : 2605070 : return last;
2971 : : }
2972 : :
2973 : : /* Find the first stmt in NODE. */
2974 : :
2975 : : stmt_vec_info
2976 : 520581 : vect_find_first_scalar_stmt_in_slp (slp_tree node)
2977 : : {
2978 : 520581 : stmt_vec_info first = NULL;
2979 : 520581 : stmt_vec_info stmt_vinfo;
2980 : :
2981 : 1789707 : for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
2982 : : {
2983 : 1269126 : stmt_vinfo = vect_orig_stmt (stmt_vinfo);
2984 : 1269126 : if (!first
2985 : 1269126 : || get_later_stmt (stmt_vinfo, first) == first)
2986 : : first = stmt_vinfo;
2987 : : }
2988 : :
2989 : 520581 : return first;
2990 : : }
2991 : :
2992 : : /* Splits a group of stores, currently beginning at FIRST_VINFO, into
2993 : : two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
2994 : : (also containing the first GROUP1_SIZE stmts, since stores are
2995 : : consecutive), the second containing the remainder.
2996 : : Return the first stmt in the second group. */
2997 : :
2998 : : static stmt_vec_info
2999 : 119229 : vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
3000 : : {
3001 : 119229 : gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
3002 : 119229 : gcc_assert (group1_size > 0);
3003 : 119229 : int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
3004 : 119229 : gcc_assert (group2_size > 0);
3005 : 119229 : DR_GROUP_SIZE (first_vinfo) = group1_size;
3006 : :
3007 : 119229 : stmt_vec_info stmt_info = first_vinfo;
3008 : 400122 : for (unsigned i = group1_size; i > 1; i--)
3009 : : {
3010 : 280893 : stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
3011 : 280893 : gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
3012 : : }
3013 : : /* STMT is now the last element of the first group. */
3014 : 119229 : stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
3015 : 119229 : DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
3016 : :
3017 : 119229 : DR_GROUP_SIZE (group2) = group2_size;
3018 : 367929 : for (stmt_info = group2; stmt_info;
3019 : 248700 : stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
3020 : : {
3021 : 248700 : DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
3022 : 248700 : gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
3023 : : }
3024 : :
3025 : : /* For the second group, the DR_GROUP_GAP is that before the original group,
3026 : : plus skipping over the first vector. */
3027 : 119229 : DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
3028 : :
3029 : : /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
3030 : 119229 : DR_GROUP_GAP (first_vinfo) += group2_size;
3031 : :
3032 : 119229 : if (dump_enabled_p ())
3033 : 114 : dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
3034 : : group1_size, group2_size);
3035 : :
3036 : 119229 : return group2;
3037 : : }
3038 : :
3039 : : /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
3040 : : statements and a vector of NUNITS elements. */
3041 : :
3042 : : static poly_uint64
3043 : 770884 : calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
3044 : : {
3045 : 770884 : return exact_div (common_multiple (nunits, group_size), group_size);
3046 : : }
3047 : :
3048 : : /* Helper that checks to see if a node is a load node. */
3049 : :
3050 : : static inline bool
3051 : 320 : vect_is_slp_load_node (slp_tree root)
3052 : : {
3053 : 320 : return SLP_TREE_DEF_TYPE (root) == vect_internal_def
3054 : 42 : && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
3055 : 360 : && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root)));
3056 : : }
3057 : :
3058 : :
3059 : : /* Helper function of optimize_load_redistribution that performs the operation
3060 : : recursively. */
3061 : :
3062 : : static slp_tree
3063 : 20676 : optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
3064 : : vec_info *vinfo, unsigned int group_size,
3065 : : hash_map<slp_tree, slp_tree> *load_map,
3066 : : slp_tree root)
3067 : : {
3068 : 20676 : if (slp_tree *leader = load_map->get (root))
3069 : 3366 : return *leader;
3070 : :
3071 : 17310 : slp_tree node;
3072 : 17310 : unsigned i;
3073 : :
3074 : : /* For now, we don't know anything about externals so do not do anything. */
3075 : 17310 : if (!root || SLP_TREE_DEF_TYPE (root) != vect_internal_def)
3076 : : return NULL;
3077 : 12675 : else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR)
3078 : : {
3079 : : /* First convert this node into a load node and add it to the leaves
3080 : : list and flatten the permute from a lane to a load one. If it's
3081 : : unneeded it will be elided later. */
3082 : 300 : vec<stmt_vec_info> stmts;
3083 : 300 : stmts.create (SLP_TREE_LANES (root));
3084 : 300 : lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
3085 : 680 : for (unsigned j = 0; j < lane_perm.length (); j++)
3086 : : {
3087 : 320 : std::pair<unsigned, unsigned> perm = lane_perm[j];
3088 : 320 : node = SLP_TREE_CHILDREN (root)[perm.first];
3089 : :
3090 : 320 : if (!vect_is_slp_load_node (node)
3091 : 320 : || SLP_TREE_CHILDREN (node).exists ())
3092 : : {
3093 : 280 : stmts.release ();
3094 : 280 : goto next;
3095 : : }
3096 : :
3097 : 40 : stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
3098 : : }
3099 : :
3100 : 20 : if (dump_enabled_p ())
3101 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
3102 : : "converting stmts on permute node %p\n",
3103 : : (void *) root);
3104 : :
3105 : 20 : bool *matches = XALLOCAVEC (bool, group_size);
3106 : 20 : poly_uint64 max_nunits = 1;
3107 : 20 : unsigned tree_size = 0, limit = 1;
3108 : 20 : node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
3109 : : matches, &limit, &tree_size, bst_map);
3110 : 20 : if (!node)
3111 : 0 : stmts.release ();
3112 : :
3113 : 20 : load_map->put (root, node);
3114 : 20 : return node;
3115 : : }
3116 : :
3117 : 12375 : next:
3118 : 12655 : load_map->put (root, NULL);
3119 : :
3120 : 29851 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
3121 : : {
3122 : 17196 : slp_tree value
3123 : 17196 : = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
3124 : : node);
3125 : 17196 : if (value)
3126 : : {
3127 : 20 : SLP_TREE_REF_COUNT (value)++;
3128 : 20 : SLP_TREE_CHILDREN (root)[i] = value;
3129 : : /* ??? We know the original leafs of the replaced nodes will
3130 : : be referenced by bst_map, only the permutes created by
3131 : : pattern matching are not. */
3132 : 20 : if (SLP_TREE_REF_COUNT (node) == 1)
3133 : 20 : load_map->remove (node);
3134 : 20 : vect_free_slp_tree (node);
3135 : : }
3136 : : }
3137 : :
3138 : : return NULL;
3139 : : }
3140 : :
3141 : : /* Temporary workaround for loads not being CSEd during SLP build. This
3142 : : function will traverse the SLP tree rooted in ROOT for INSTANCE and find
3143 : : VEC_PERM nodes that blend vectors from multiple nodes that all read from the
3144 : : same DR such that the final operation is equal to a permuted load. Such
3145 : : NODES are then directly converted into LOADS themselves. The nodes are
3146 : : CSEd using BST_MAP. */
3147 : :
3148 : : static void
3149 : 2692 : optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
3150 : : vec_info *vinfo, unsigned int group_size,
3151 : : hash_map<slp_tree, slp_tree> *load_map,
3152 : : slp_tree root)
3153 : : {
3154 : 2692 : slp_tree node;
3155 : 2692 : unsigned i;
3156 : :
3157 : 6172 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
3158 : : {
3159 : 3480 : slp_tree value
3160 : 3480 : = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
3161 : : node);
3162 : 3480 : if (value)
3163 : : {
3164 : 0 : SLP_TREE_REF_COUNT (value)++;
3165 : 0 : SLP_TREE_CHILDREN (root)[i] = value;
3166 : : /* ??? We know the original leafs of the replaced nodes will
3167 : : be referenced by bst_map, only the permutes created by
3168 : : pattern matching are not. */
3169 : 0 : if (SLP_TREE_REF_COUNT (node) == 1)
3170 : 0 : load_map->remove (node);
3171 : 0 : vect_free_slp_tree (node);
3172 : : }
3173 : : }
3174 : 2692 : }
3175 : :
3176 : : /* Helper function of vect_match_slp_patterns.
3177 : :
3178 : : Attempts to match patterns against the slp tree rooted in REF_NODE using
3179 : : VINFO. Patterns are matched in post-order traversal.
3180 : :
3181 : : If matching is successful the value in REF_NODE is updated and returned, if
3182 : : not then it is returned unchanged. */
3183 : :
3184 : : static bool
3185 : 2029275 : vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
3186 : : slp_tree_to_load_perm_map_t *perm_cache,
3187 : : slp_compat_nodes_map_t *compat_cache,
3188 : : hash_set<slp_tree> *visited)
3189 : : {
3190 : 2029275 : unsigned i;
3191 : 2029275 : slp_tree node = *ref_node;
3192 : 2029275 : bool found_p = false;
3193 : 2029275 : if (!node || visited->add (node))
3194 : 65715 : return false;
3195 : :
3196 : : slp_tree child;
3197 : 3221951 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3198 : 1258391 : found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
3199 : : vinfo, perm_cache, compat_cache,
3200 : : visited);
3201 : :
3202 : 5890680 : for (unsigned x = 0; x < num__slp_patterns; x++)
3203 : : {
3204 : 3927120 : vect_pattern *pattern
3205 : 3927120 : = slp_patterns[x] (perm_cache, compat_cache, ref_node);
3206 : 3927120 : if (pattern)
3207 : : {
3208 : 1059 : pattern->build (vinfo);
3209 : 1059 : delete pattern;
3210 : 1059 : found_p = true;
3211 : : }
3212 : : }
3213 : :
3214 : : return found_p;
3215 : : }
3216 : :
3217 : : /* Applies pattern matching to the given SLP tree rooted in REF_NODE using
3218 : : vec_info VINFO.
3219 : :
3220 : : The modified tree is returned. Patterns are tried in order and multiple
3221 : : patterns may match. */
3222 : :
3223 : : static bool
3224 : 770884 : vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
3225 : : hash_set<slp_tree> *visited,
3226 : : slp_tree_to_load_perm_map_t *perm_cache,
3227 : : slp_compat_nodes_map_t *compat_cache)
3228 : : {
3229 : 770884 : DUMP_VECT_SCOPE ("vect_match_slp_patterns");
3230 : 770884 : slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
3231 : :
3232 : 770884 : if (dump_enabled_p ())
3233 : 3700 : dump_printf_loc (MSG_NOTE, vect_location,
3234 : : "Analyzing SLP tree %p for patterns\n",
3235 : 3700 : (void *) SLP_INSTANCE_TREE (instance));
3236 : :
3237 : 770884 : return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, compat_cache,
3238 : 770884 : visited);
3239 : : }
3240 : :
3241 : : /* STMT_INFO is a store group of size GROUP_SIZE that we are considering
3242 : : splitting into two, with the first split group having size NEW_GROUP_SIZE.
3243 : : Return true if we could use IFN_STORE_LANES instead and if that appears
3244 : : to be the better approach. */
3245 : :
3246 : : static bool
3247 : 554 : vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
3248 : : unsigned int group_size,
3249 : : unsigned int new_group_size)
3250 : : {
3251 : 554 : tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3252 : 554 : tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
3253 : 554 : if (!vectype)
3254 : : return false;
3255 : : /* Allow the split if one of the two new groups would operate on full
3256 : : vectors *within* rather than across one scalar loop iteration.
3257 : : This is purely a heuristic, but it should work well for group
3258 : : sizes of 3 and 4, where the possible splits are:
3259 : :
3260 : : 3->2+1: OK if the vector has exactly two elements
3261 : : 4->2+2: Likewise
3262 : : 4->3+1: Less clear-cut. */
3263 : 554 : if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
3264 : 466 : || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
3265 : 187 : return false;
3266 : 367 : return vect_store_lanes_supported (vectype, group_size, false) != IFN_LAST;
3267 : : }
3268 : :
3269 : : /* Analyze an SLP instance starting from a group of grouped stores. Call
3270 : : vect_build_slp_tree to build a tree of packed stmts if possible.
3271 : : Return FALSE if it's impossible to SLP any stmt in the loop. */
3272 : :
3273 : : static bool
3274 : : vect_analyze_slp_instance (vec_info *vinfo,
3275 : : scalar_stmts_to_slp_tree_map_t *bst_map,
3276 : : stmt_vec_info stmt_info, slp_instance_kind kind,
3277 : : unsigned max_tree_size, unsigned *limit);
3278 : :
3279 : : /* Analyze an SLP instance starting from SCALAR_STMTS which are a group
3280 : : of KIND. Return true if successful. */
3281 : :
3282 : : static bool
3283 : 1997851 : vect_build_slp_instance (vec_info *vinfo,
3284 : : slp_instance_kind kind,
3285 : : vec<stmt_vec_info> &scalar_stmts,
3286 : : vec<stmt_vec_info> &root_stmt_infos,
3287 : : vec<tree> &remain,
3288 : : unsigned max_tree_size, unsigned *limit,
3289 : : scalar_stmts_to_slp_tree_map_t *bst_map,
3290 : : /* ??? We need stmt_info for group splitting. */
3291 : : stmt_vec_info stmt_info_)
3292 : : {
3293 : 1997851 : if (kind == slp_inst_kind_ctor)
3294 : : {
3295 : 17134 : if (dump_enabled_p ())
3296 : 54 : dump_printf_loc (MSG_NOTE, vect_location,
3297 : : "Analyzing vectorizable constructor: %G\n",
3298 : 27 : root_stmt_infos[0]->stmt);
3299 : : }
3300 : :
3301 : 1997851 : if (dump_enabled_p ())
3302 : : {
3303 : 6681 : dump_printf_loc (MSG_NOTE, vect_location,
3304 : : "Starting SLP discovery for\n");
3305 : 59892 : for (unsigned i = 0; i < scalar_stmts.length (); ++i)
3306 : 46530 : dump_printf_loc (MSG_NOTE, vect_location,
3307 : 23265 : " %G", scalar_stmts[i]->stmt);
3308 : : }
3309 : :
3310 : : /* Build the tree for the SLP instance. */
3311 : 1997851 : unsigned int group_size = scalar_stmts.length ();
3312 : 1997851 : bool *matches = XALLOCAVEC (bool, group_size);
3313 : 1997851 : poly_uint64 max_nunits = 1;
3314 : 1997851 : unsigned tree_size = 0;
3315 : 1997851 : unsigned i;
3316 : 1997851 : slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
3317 : : &max_nunits, matches, limit,
3318 : 1997851 : &tree_size, bst_map);
3319 : 1997851 : if (node != NULL)
3320 : : {
3321 : : /* Calculate the unrolling factor based on the smallest type. */
3322 : 770884 : poly_uint64 unrolling_factor
3323 : 770884 : = calculate_unrolling_factor (max_nunits, group_size);
3324 : :
3325 : 770884 : if (maybe_ne (unrolling_factor, 1U)
3326 : 770884 : && is_a <bb_vec_info> (vinfo))
3327 : : {
3328 : 0 : unsigned HOST_WIDE_INT const_max_nunits;
3329 : 0 : if (!max_nunits.is_constant (&const_max_nunits)
3330 : 0 : || const_max_nunits > group_size)
3331 : : {
3332 : 0 : if (dump_enabled_p ())
3333 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3334 : : "Build SLP failed: store group "
3335 : : "size not a multiple of the vector size "
3336 : : "in basic block SLP\n");
3337 : 0 : vect_free_slp_tree (node);
3338 : 0 : return false;
3339 : : }
3340 : : /* Fatal mismatch. */
3341 : 0 : if (dump_enabled_p ())
3342 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
3343 : : "SLP discovery succeeded but node needs "
3344 : : "splitting\n");
3345 : 0 : memset (matches, true, group_size);
3346 : 0 : matches[group_size / const_max_nunits * const_max_nunits] = false;
3347 : 0 : vect_free_slp_tree (node);
3348 : : }
3349 : : else
3350 : : {
3351 : : /* Create a new SLP instance. */
3352 : 770884 : slp_instance new_instance = XNEW (class _slp_instance);
3353 : 770884 : SLP_INSTANCE_TREE (new_instance) = node;
3354 : 770884 : SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
3355 : 770884 : SLP_INSTANCE_LOADS (new_instance) = vNULL;
3356 : 770884 : SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
3357 : 770884 : SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
3358 : 770884 : SLP_INSTANCE_KIND (new_instance) = kind;
3359 : 770884 : new_instance->reduc_phis = NULL;
3360 : 770884 : new_instance->cost_vec = vNULL;
3361 : 770884 : new_instance->subgraph_entries = vNULL;
3362 : :
3363 : 770884 : if (dump_enabled_p ())
3364 : 3700 : dump_printf_loc (MSG_NOTE, vect_location,
3365 : : "SLP size %u vs. limit %u.\n",
3366 : : tree_size, max_tree_size);
3367 : :
3368 : : /* Fixup SLP reduction chains. */
3369 : 770884 : if (kind == slp_inst_kind_reduc_chain)
3370 : : {
3371 : : /* If this is a reduction chain with a conversion in front
3372 : : amend the SLP tree with a node for that. */
3373 : 346 : gimple *scalar_def
3374 : 346 : = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
3375 : 346 : if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
3376 : : {
3377 : : /* Get at the conversion stmt - we know it's the single use
3378 : : of the last stmt of the reduction chain. */
3379 : 42 : use_operand_p use_p;
3380 : 42 : bool r = single_imm_use (gimple_assign_lhs (scalar_def),
3381 : : &use_p, &scalar_def);
3382 : 42 : gcc_assert (r);
3383 : 42 : stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
3384 : 42 : next_info = vect_stmt_to_vectorize (next_info);
3385 : 42 : scalar_stmts = vNULL;
3386 : 42 : scalar_stmts.create (group_size);
3387 : 132 : for (unsigned i = 0; i < group_size; ++i)
3388 : 90 : scalar_stmts.quick_push (next_info);
3389 : 42 : slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
3390 : 42 : SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
3391 : 42 : SLP_TREE_CHILDREN (conv).quick_push (node);
3392 : 42 : SLP_INSTANCE_TREE (new_instance) = conv;
3393 : : /* We also have to fake this conversion stmt as SLP reduction
3394 : : group so we don't have to mess with too much code
3395 : : elsewhere. */
3396 : 42 : REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
3397 : 42 : REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
3398 : : }
3399 : : /* Fill the backedge child of the PHI SLP node. The
3400 : : general matching code cannot find it because the
3401 : : scalar code does not reflect how we vectorize the
3402 : : reduction. */
3403 : 346 : use_operand_p use_p;
3404 : 346 : imm_use_iterator imm_iter;
3405 : 346 : class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
3406 : 1101 : FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
3407 : : gimple_get_lhs (scalar_def))
3408 : : /* There are exactly two non-debug uses, the reduction
3409 : : PHI and the loop-closed PHI node. */
3410 : 755 : if (!is_gimple_debug (USE_STMT (use_p))
3411 : 755 : && gimple_bb (USE_STMT (use_p)) == loop->header)
3412 : : {
3413 : 346 : auto_vec<stmt_vec_info, 64> phis (group_size);
3414 : 346 : stmt_vec_info phi_info
3415 : 346 : = vinfo->lookup_stmt (USE_STMT (use_p));
3416 : 2072 : for (unsigned i = 0; i < group_size; ++i)
3417 : 1726 : phis.quick_push (phi_info);
3418 : 346 : slp_tree *phi_node = bst_map->get (phis);
3419 : 346 : unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
3420 : 692 : SLP_TREE_CHILDREN (*phi_node)[dest_idx]
3421 : 346 : = SLP_INSTANCE_TREE (new_instance);
3422 : 346 : SLP_INSTANCE_TREE (new_instance)->refcnt++;
3423 : 346 : }
3424 : : }
3425 : :
3426 : 770884 : vinfo->slp_instances.safe_push (new_instance);
3427 : :
3428 : : /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
3429 : : the number of scalar stmts in the root in a few places.
3430 : : Verify that assumption holds. */
3431 : 1541768 : gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
3432 : : .length () == group_size);
3433 : :
3434 : 770884 : if (dump_enabled_p ())
3435 : : {
3436 : 3700 : dump_printf_loc (MSG_NOTE, vect_location,
3437 : : "Final SLP tree for instance %p:\n",
3438 : : (void *) new_instance);
3439 : 3700 : vect_print_slp_graph (MSG_NOTE, vect_location,
3440 : : SLP_INSTANCE_TREE (new_instance));
3441 : : }
3442 : :
3443 : 770884 : return true;
3444 : : }
3445 : : }
3446 : : else
3447 : : {
3448 : : /* Failed to SLP. */
3449 : : /* Free the allocated memory. */
3450 : 1226967 : scalar_stmts.release ();
3451 : : }
3452 : :
3453 : 1226967 : stmt_vec_info stmt_info = stmt_info_;
3454 : : /* Try to break the group up into pieces. */
3455 : 1226967 : if (kind == slp_inst_kind_store)
3456 : : {
3457 : : /* ??? We could delay all the actual splitting of store-groups
3458 : : until after SLP discovery of the original group completed.
3459 : : Then we can recurse to vect_build_slp_instance directly. */
3460 : 912862 : for (i = 0; i < group_size; i++)
3461 : 912862 : if (!matches[i])
3462 : : break;
3463 : :
3464 : : /* For basic block SLP, try to break the group up into multiples of
3465 : : a vector size. */
3466 : 314254 : if (is_a <bb_vec_info> (vinfo)
3467 : 314254 : && (i > 1 && i < group_size))
3468 : : {
3469 : 116358 : tree scalar_type
3470 : 116358 : = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3471 : 232716 : tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
3472 : 116358 : 1 << floor_log2 (i));
3473 : 116358 : unsigned HOST_WIDE_INT const_nunits;
3474 : 116358 : if (vectype
3475 : 116358 : && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
3476 : : {
3477 : : /* Split into two groups at the first vector boundary. */
3478 : 116358 : gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
3479 : 116358 : unsigned group1_size = i & ~(const_nunits - 1);
3480 : :
3481 : 116358 : if (dump_enabled_p ())
3482 : 68 : dump_printf_loc (MSG_NOTE, vect_location,
3483 : : "Splitting SLP group at stmt %u\n", i);
3484 : 116358 : stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3485 : : group1_size);
3486 : 116358 : bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3487 : : kind, max_tree_size,
3488 : : limit);
3489 : : /* Split the rest at the failure point and possibly
3490 : : re-analyze the remaining matching part if it has
3491 : : at least two lanes. */
3492 : 116358 : if (group1_size < i
3493 : 4459 : && (i + 1 < group_size
3494 : 2166 : || i - group1_size > 1))
3495 : : {
3496 : 2317 : stmt_vec_info rest2 = rest;
3497 : 2317 : rest = vect_split_slp_store_group (rest, i - group1_size);
3498 : 2317 : if (i - group1_size > 1)
3499 : 62 : res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
3500 : : kind, max_tree_size,
3501 : : limit);
3502 : : }
3503 : : /* Re-analyze the non-matching tail if it has at least
3504 : : two lanes. */
3505 : 116358 : if (i + 1 < group_size)
3506 : 21206 : res |= vect_analyze_slp_instance (vinfo, bst_map,
3507 : : rest, kind, max_tree_size,
3508 : : limit);
3509 : 116358 : return res;
3510 : : }
3511 : : }
3512 : :
3513 : : /* For loop vectorization split into arbitrary pieces of size > 1. */
3514 : 197896 : if (is_a <loop_vec_info> (vinfo)
3515 : 5253 : && (i > 1 && i < group_size)
3516 : 198450 : && !vect_slp_prefer_store_lanes_p (vinfo, stmt_info, group_size, i))
3517 : : {
3518 : 554 : unsigned group1_size = i;
3519 : :
3520 : 554 : if (dump_enabled_p ())
3521 : 44 : dump_printf_loc (MSG_NOTE, vect_location,
3522 : : "Splitting SLP group at stmt %u\n", i);
3523 : :
3524 : 554 : stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3525 : : group1_size);
3526 : : /* Loop vectorization cannot handle gaps in stores, make sure
3527 : : the split group appears as strided. */
3528 : 554 : STMT_VINFO_STRIDED_P (rest) = 1;
3529 : 554 : DR_GROUP_GAP (rest) = 0;
3530 : 554 : STMT_VINFO_STRIDED_P (stmt_info) = 1;
3531 : 554 : DR_GROUP_GAP (stmt_info) = 0;
3532 : :
3533 : 554 : bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3534 : : kind, max_tree_size, limit);
3535 : 554 : if (i + 1 < group_size)
3536 : 330 : res |= vect_analyze_slp_instance (vinfo, bst_map,
3537 : : rest, kind, max_tree_size, limit);
3538 : :
3539 : 554 : return res;
3540 : : }
3541 : :
3542 : : /* Even though the first vector did not all match, we might be able to SLP
3543 : : (some) of the remainder. FORNOW ignore this possibility. */
3544 : : }
3545 : :
3546 : : /* Failed to SLP. */
3547 : 1110055 : if (dump_enabled_p ())
3548 : 2869 : dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
3549 : : return false;
3550 : : }
3551 : :
3552 : :
3553 : : /* Analyze an SLP instance starting from a group of grouped stores. Call
3554 : : vect_build_slp_tree to build a tree of packed stmts if possible.
3555 : : Return FALSE if it's impossible to SLP any stmt in the loop. */
3556 : :
3557 : : static bool
3558 : 955931 : vect_analyze_slp_instance (vec_info *vinfo,
3559 : : scalar_stmts_to_slp_tree_map_t *bst_map,
3560 : : stmt_vec_info stmt_info,
3561 : : slp_instance_kind kind,
3562 : : unsigned max_tree_size, unsigned *limit)
3563 : : {
3564 : 955931 : unsigned int i;
3565 : 955931 : vec<stmt_vec_info> scalar_stmts;
3566 : :
3567 : 955931 : if (is_a <bb_vec_info> (vinfo))
3568 : 936408 : vect_location = stmt_info->stmt;
3569 : :
3570 : 955931 : stmt_vec_info next_info = stmt_info;
3571 : 955931 : if (kind == slp_inst_kind_store)
3572 : : {
3573 : : /* Collect the stores and store them in scalar_stmts. */
3574 : 952011 : scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
3575 : 4722620 : while (next_info)
3576 : : {
3577 : 2818598 : scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3578 : 2818598 : next_info = DR_GROUP_NEXT_ELEMENT (next_info);
3579 : : }
3580 : : }
3581 : 3920 : else if (kind == slp_inst_kind_reduc_chain)
3582 : : {
3583 : : /* Collect the reduction stmts and store them in scalar_stmts. */
3584 : 614 : scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info));
3585 : 3715 : while (next_info)
3586 : : {
3587 : 2487 : scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
3588 : 2487 : next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
3589 : : }
3590 : : /* Mark the first element of the reduction chain as reduction to properly
3591 : : transform the node. In the reduction analysis phase only the last
3592 : : element of the chain is marked as reduction. */
3593 : 614 : STMT_VINFO_DEF_TYPE (stmt_info)
3594 : 614 : = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
3595 : 614 : STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
3596 : 648 : = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
3597 : : }
3598 : 3306 : else if (kind == slp_inst_kind_reduc_group)
3599 : : {
3600 : : /* Collect reduction statements. */
3601 : 3306 : const vec<stmt_vec_info> &reductions
3602 : 3306 : = as_a <loop_vec_info> (vinfo)->reductions;
3603 : 3306 : scalar_stmts.create (reductions.length ());
3604 : 26789 : for (i = 0; reductions.iterate (i, &next_info); i++)
3605 : 20177 : if ((STMT_VINFO_RELEVANT_P (next_info)
3606 : 0 : || STMT_VINFO_LIVE_P (next_info))
3607 : : /* ??? Make sure we didn't skip a conversion around a reduction
3608 : : path. In that case we'd have to reverse engineer that conversion
3609 : : stmt following the chain using reduc_idx and from the PHI
3610 : : using reduc_def. */
3611 : 20177 : && STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def)
3612 : 20105 : scalar_stmts.quick_push (next_info);
3613 : : /* If less than two were relevant/live there's nothing to SLP. */
3614 : 3306 : if (scalar_stmts.length () < 2)
3615 : : return false;
3616 : : }
3617 : : else
3618 : 0 : gcc_unreachable ();
3619 : :
3620 : 955925 : vec<stmt_vec_info> roots = vNULL;
3621 : 955925 : vec<tree> remain = vNULL;
3622 : : /* Build the tree for the SLP instance. */
3623 : 955925 : bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
3624 : : roots, remain,
3625 : : max_tree_size, limit, bst_map,
3626 : : kind == slp_inst_kind_store
3627 : : ? stmt_info : NULL);
3628 : :
3629 : : /* ??? If this is slp_inst_kind_store and the above succeeded here's
3630 : : where we should do store group splitting. */
3631 : :
3632 : 955925 : return res;
3633 : : }
3634 : :
3635 : : /* Check if there are stmts in the loop can be vectorized using SLP. Build SLP
3636 : : trees of packed scalar stmts if SLP is possible. */
3637 : :
3638 : : opt_result
3639 : 748366 : vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
3640 : : {
3641 : 748366 : unsigned int i;
3642 : 748366 : stmt_vec_info first_element;
3643 : 748366 : slp_instance instance;
3644 : :
3645 : 748366 : DUMP_VECT_SCOPE ("vect_analyze_slp");
3646 : :
3647 : 748366 : unsigned limit = max_tree_size;
3648 : :
3649 : 748366 : scalar_stmts_to_slp_tree_map_t *bst_map
3650 : 748366 : = new scalar_stmts_to_slp_tree_map_t ();
3651 : :
3652 : : /* Find SLP sequences starting from groups of grouped stores. */
3653 : 2310233 : FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
3654 : 813501 : vect_analyze_slp_instance (vinfo, bst_map, first_element,
3655 : : slp_inst_kind_store, max_tree_size, &limit);
3656 : :
3657 : 748366 : if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
3658 : : {
3659 : 2971898 : for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
3660 : : {
3661 : 1041926 : vect_location = bb_vinfo->roots[i].roots[0]->stmt;
3662 : : /* Apply patterns. */
3663 : 6572224 : for (unsigned j = 0; j < bb_vinfo->roots[i].stmts.length (); ++j)
3664 : 4488372 : bb_vinfo->roots[i].stmts[j]
3665 : 2295145 : = vect_stmt_to_vectorize (bb_vinfo->roots[i].stmts[j]);
3666 : 1041926 : if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
3667 : 1041926 : bb_vinfo->roots[i].stmts,
3668 : 1041926 : bb_vinfo->roots[i].roots,
3669 : 1041926 : bb_vinfo->roots[i].remain,
3670 : : max_tree_size, &limit, bst_map, NULL))
3671 : : {
3672 : 131875 : bb_vinfo->roots[i].stmts = vNULL;
3673 : 131875 : bb_vinfo->roots[i].roots = vNULL;
3674 : 131875 : bb_vinfo->roots[i].remain = vNULL;
3675 : : }
3676 : : }
3677 : : }
3678 : :
3679 : 748366 : if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3680 : : {
3681 : : /* Find SLP sequences starting from reduction chains. */
3682 : 171269 : FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
3683 : 618 : if (! STMT_VINFO_RELEVANT_P (first_element)
3684 : 4 : && ! STMT_VINFO_LIVE_P (first_element))
3685 : : ;
3686 : 614 : else if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
3687 : : slp_inst_kind_reduc_chain,
3688 : : max_tree_size, &limit))
3689 : : {
3690 : : /* Dissolve reduction chain group. */
3691 : 268 : stmt_vec_info vinfo = first_element;
3692 : 268 : stmt_vec_info last = NULL;
3693 : 1029 : while (vinfo)
3694 : : {
3695 : 761 : stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
3696 : 761 : REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
3697 : 761 : REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
3698 : 761 : last = vinfo;
3699 : 761 : vinfo = next;
3700 : : }
3701 : 268 : STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
3702 : : /* It can be still vectorized as part of an SLP reduction. */
3703 : 268 : loop_vinfo->reductions.safe_push (last);
3704 : : }
3705 : :
3706 : : /* Find SLP sequences starting from groups of reductions. */
3707 : 170651 : if (loop_vinfo->reductions.length () > 1)
3708 : 3306 : vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
3709 : : slp_inst_kind_reduc_group, max_tree_size,
3710 : : &limit);
3711 : : }
3712 : :
3713 : 748366 : hash_set<slp_tree> visited_patterns;
3714 : 748366 : slp_tree_to_load_perm_map_t perm_cache;
3715 : 748366 : slp_compat_nodes_map_t compat_cache;
3716 : :
3717 : : /* See if any patterns can be found in the SLP tree. */
3718 : 748366 : bool pattern_found = false;
3719 : 2267616 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3720 : 770884 : pattern_found |= vect_match_slp_patterns (instance, vinfo,
3721 : : &visited_patterns, &perm_cache,
3722 : : &compat_cache);
3723 : :
3724 : : /* If any were found optimize permutations of loads. */
3725 : 748366 : if (pattern_found)
3726 : : {
3727 : 532 : hash_map<slp_tree, slp_tree> load_map;
3728 : 3756 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3729 : : {
3730 : 2692 : slp_tree root = SLP_INSTANCE_TREE (instance);
3731 : 2692 : optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
3732 : : &load_map, root);
3733 : : }
3734 : 532 : }
3735 : :
3736 : :
3737 : :
3738 : : /* The map keeps a reference on SLP nodes built, release that. */
3739 : 3562869 : for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
3740 : 6377372 : it != bst_map->end (); ++it)
3741 : 2814503 : if ((*it).second)
3742 : 2814503 : vect_free_slp_tree ((*it).second);
3743 : 748366 : delete bst_map;
3744 : :
3745 : 748366 : if (pattern_found && dump_enabled_p ())
3746 : : {
3747 : 13 : dump_printf_loc (MSG_NOTE, vect_location,
3748 : : "Pattern matched SLP tree\n");
3749 : 13 : hash_set<slp_tree> visited;
3750 : 58 : FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
3751 : 19 : vect_print_slp_graph (MSG_NOTE, vect_location,
3752 : : SLP_INSTANCE_TREE (instance), visited);
3753 : 13 : }
3754 : :
3755 : 748366 : return opt_result::success ();
3756 : 748366 : }
3757 : :
3758 : : /* Estimates the cost of inserting layout changes into the SLP graph.
3759 : : It can also say that the insertion is impossible. */
3760 : :
3761 : : struct slpg_layout_cost
3762 : : {
3763 : 3777390 : slpg_layout_cost () = default;
3764 : : slpg_layout_cost (sreal, bool);
3765 : :
3766 : 375716 : static slpg_layout_cost impossible () { return { sreal::max (), 0 }; }
3767 : 4214833 : bool is_possible () const { return depth != sreal::max (); }
3768 : :
3769 : : bool operator== (const slpg_layout_cost &) const;
3770 : : bool operator!= (const slpg_layout_cost &) const;
3771 : :
3772 : : bool is_better_than (const slpg_layout_cost &, bool) const;
3773 : :
3774 : : void add_parallel_cost (const slpg_layout_cost &);
3775 : : void add_serial_cost (const slpg_layout_cost &);
3776 : : void split (unsigned int);
3777 : :
3778 : : /* The longest sequence of layout changes needed during any traversal
3779 : : of the partition dag, weighted by execution frequency.
3780 : :
3781 : : This is the most important metric when optimizing for speed, since
3782 : : it helps to ensure that we keep the number of operations on
3783 : : critical paths to a minimum. */
3784 : : sreal depth = 0;
3785 : :
3786 : : /* An estimate of the total number of operations needed. It is weighted by
3787 : : execution frequency when optimizing for speed but not when optimizing for
3788 : : size. In order to avoid double-counting, a node with a fanout of N will
3789 : : distribute 1/N of its total cost to each successor.
3790 : :
3791 : : This is the most important metric when optimizing for size, since
3792 : : it helps to keep the total number of operations to a minimum, */
3793 : : sreal total = 0;
3794 : : };
3795 : :
3796 : : /* Construct costs for a node with weight WEIGHT. A higher weight
3797 : : indicates more frequent execution. IS_FOR_SIZE is true if we are
3798 : : optimizing for size rather than speed. */
3799 : :
3800 : 967433 : slpg_layout_cost::slpg_layout_cost (sreal weight, bool is_for_size)
3801 : 968296 : : depth (weight), total (is_for_size && weight > 0 ? 1 : weight)
3802 : : {
3803 : 967433 : }
3804 : :
3805 : : bool
3806 : 0 : slpg_layout_cost::operator== (const slpg_layout_cost &other) const
3807 : : {
3808 : 0 : return depth == other.depth && total == other.total;
3809 : : }
3810 : :
3811 : : bool
3812 : 0 : slpg_layout_cost::operator!= (const slpg_layout_cost &other) const
3813 : : {
3814 : 0 : return !operator== (other);
3815 : : }
3816 : :
3817 : : /* Return true if these costs are better than OTHER. IS_FOR_SIZE is
3818 : : true if we are optimizing for size rather than speed. */
3819 : :
3820 : : bool
3821 : 264633 : slpg_layout_cost::is_better_than (const slpg_layout_cost &other,
3822 : : bool is_for_size) const
3823 : : {
3824 : 264633 : if (is_for_size)
3825 : : {
3826 : 417 : if (total != other.total)
3827 : 186 : return total < other.total;
3828 : 231 : return depth < other.depth;
3829 : : }
3830 : : else
3831 : : {
3832 : 264216 : if (depth != other.depth)
3833 : 120324 : return depth < other.depth;
3834 : 143892 : return total < other.total;
3835 : : }
3836 : : }
3837 : :
3838 : : /* Increase the costs to account for something with cost INPUT_COST
3839 : : happening in parallel with the current costs. */
3840 : :
3841 : : void
3842 : 284901 : slpg_layout_cost::add_parallel_cost (const slpg_layout_cost &input_cost)
3843 : : {
3844 : 284901 : depth = std::max (depth, input_cost.depth);
3845 : 284901 : total += input_cost.total;
3846 : 284901 : }
3847 : :
3848 : : /* Increase the costs to account for something with cost INPUT_COST
3849 : : happening in series with the current costs. */
3850 : :
3851 : : void
3852 : 1204984 : slpg_layout_cost::add_serial_cost (const slpg_layout_cost &other)
3853 : : {
3854 : 1204984 : depth += other.depth;
3855 : 1204984 : total += other.total;
3856 : 1204984 : }
3857 : :
3858 : : /* Split the total cost among TIMES successors or predecessors. */
3859 : :
3860 : : void
3861 : 950335 : slpg_layout_cost::split (unsigned int times)
3862 : : {
3863 : 950335 : if (times > 1)
3864 : 351283 : total /= times;
3865 : 950335 : }
3866 : :
3867 : : /* Information about one node in the SLP graph, for use during
3868 : : vect_optimize_slp_pass. */
3869 : :
3870 : : struct slpg_vertex
3871 : : {
3872 : 1960729 : slpg_vertex (slp_tree node_) : node (node_) {}
3873 : :
3874 : : /* The node itself. */
3875 : : slp_tree node;
3876 : :
3877 : : /* Which partition the node belongs to, or -1 if none. Nodes outside of
3878 : : partitions are flexible; they can have whichever layout consumers
3879 : : want them to have. */
3880 : : int partition = -1;
3881 : :
3882 : : /* The number of nodes that directly use the result of this one
3883 : : (i.e. the number of nodes that count this one as a child). */
3884 : : unsigned int out_degree = 0;
3885 : :
3886 : : /* The execution frequency of the node. */
3887 : : sreal weight = 0;
3888 : :
3889 : : /* The total execution frequency of all nodes that directly use the
3890 : : result of this one. */
3891 : : sreal out_weight = 0;
3892 : : };
3893 : :
3894 : : /* Information about one partition of the SLP graph, for use during
3895 : : vect_optimize_slp_pass. */
3896 : :
3897 : 1125440 : struct slpg_partition_info
3898 : : {
3899 : : /* The nodes in the partition occupy indices [NODE_BEGIN, NODE_END)
3900 : : of m_partitioned_nodes. */
3901 : : unsigned int node_begin = 0;
3902 : : unsigned int node_end = 0;
3903 : :
3904 : : /* Which layout we've chosen to use for this partition, or -1 if
3905 : : we haven't picked one yet. */
3906 : : int layout = -1;
3907 : :
3908 : : /* The number of predecessors and successors in the partition dag.
3909 : : The predecessors always have lower partition numbers and the
3910 : : successors always have higher partition numbers.
3911 : :
3912 : : Note that the directions of these edges are not necessarily the
3913 : : same as in the data flow graph. For example, if an SCC has separate
3914 : : partitions for an inner loop and an outer loop, the inner loop's
3915 : : partition will have at least two incoming edges from the outer loop's
3916 : : partition: one for a live-in value and one for a live-out value.
3917 : : In data flow terms, one of these edges would also be from the outer loop
3918 : : to the inner loop, but the other would be in the opposite direction. */
3919 : : unsigned int in_degree = 0;
3920 : : unsigned int out_degree = 0;
3921 : : };
3922 : :
3923 : : /* Information about the costs of using a particular layout for a
3924 : : particular partition. It can also say that the combination is
3925 : : impossible. */
3926 : :
3927 : : struct slpg_partition_layout_costs
3928 : : {
3929 : 1208643 : bool is_possible () const { return internal_cost.is_possible (); }
3930 : 37905 : void mark_impossible () { internal_cost = slpg_layout_cost::impossible (); }
3931 : :
3932 : : /* The costs inherited from predecessor partitions. */
3933 : : slpg_layout_cost in_cost;
3934 : :
3935 : : /* The inherent cost of the layout within the node itself. For example,
3936 : : this is nonzero for a load if choosing a particular layout would require
3937 : : the load to permute the loaded elements. It is nonzero for a
3938 : : VEC_PERM_EXPR if the permutation cannot be eliminated or converted
3939 : : to full-vector moves. */
3940 : : slpg_layout_cost internal_cost;
3941 : :
3942 : : /* The costs inherited from successor partitions. */
3943 : : slpg_layout_cost out_cost;
3944 : : };
3945 : :
3946 : : /* This class tries to optimize the layout of vectors in order to avoid
3947 : : unnecessary shuffling. At the moment, the set of possible layouts are
3948 : : restricted to bijective permutations.
3949 : :
3950 : : The goal of the pass depends on whether we're optimizing for size or
3951 : : for speed. When optimizing for size, the goal is to reduce the overall
3952 : : number of layout changes (including layout changes implied by things
3953 : : like load permutations). When optimizing for speed, the goal is to
3954 : : reduce the maximum latency attributable to layout changes on any
3955 : : non-cyclical path through the data flow graph.
3956 : :
3957 : : For example, when optimizing a loop nest for speed, we will prefer
3958 : : to make layout changes outside of a loop rather than inside of a loop,
3959 : : and will prefer to make layout changes in parallel rather than serially,
3960 : : even if that increases the overall number of layout changes.
3961 : :
3962 : : The high-level procedure is:
3963 : :
3964 : : (1) Build a graph in which edges go from uses (parents) to definitions
3965 : : (children).
3966 : :
3967 : : (2) Divide the graph into a dag of strongly-connected components (SCCs).
3968 : :
3969 : : (3) When optimizing for speed, partition the nodes in each SCC based
3970 : : on their containing cfg loop. When optimizing for size, treat
3971 : : each SCC as a single partition.
3972 : :
3973 : : This gives us a dag of partitions. The goal is now to assign a
3974 : : layout to each partition.
3975 : :
3976 : : (4) Construct a set of vector layouts that are worth considering.
3977 : : Record which nodes must keep their current layout.
3978 : :
3979 : : (5) Perform a forward walk over the partition dag (from loads to stores)
3980 : : accumulating the "forward" cost of using each layout. When visiting
3981 : : each partition, assign a tentative choice of layout to the partition
3982 : : and use that choice when calculating the cost of using a different
3983 : : layout in successor partitions.
3984 : :
3985 : : (6) Perform a backward walk over the partition dag (from stores to loads),
3986 : : accumulating the "backward" cost of using each layout. When visiting
3987 : : each partition, make a final choice of layout for that partition based
3988 : : on the accumulated forward costs (from (5)) and backward costs
3989 : : (from (6)).
3990 : :
3991 : : (7) Apply the chosen layouts to the SLP graph.
3992 : :
3993 : : For example, consider the SLP statements:
3994 : :
3995 : : S1: a_1 = load
3996 : : loop:
3997 : : S2: a_2 = PHI<a_1, a_3>
3998 : : S3: b_1 = load
3999 : : S4: a_3 = a_2 + b_1
4000 : : exit:
4001 : : S5: a_4 = PHI<a_3>
4002 : : S6: store a_4
4003 : :
4004 : : S2 and S4 form an SCC and are part of the same loop. Every other
4005 : : statement is in a singleton SCC. In this example there is a one-to-one
4006 : : mapping between SCCs and partitions and the partition dag looks like this;
4007 : :
4008 : : S1 S3
4009 : : \ /
4010 : : S2+S4
4011 : : |
4012 : : S5
4013 : : |
4014 : : S6
4015 : :
4016 : : S2, S3 and S4 will have a higher execution frequency than the other
4017 : : statements, so when optimizing for speed, the goal is to avoid any
4018 : : layout changes:
4019 : :
4020 : : - within S3
4021 : : - within S2+S4
4022 : : - on the S3->S2+S4 edge
4023 : :
4024 : : For example, if S3 was originally a reversing load, the goal of the
4025 : : pass is to make it an unreversed load and change the layout on the
4026 : : S1->S2+S4 and S2+S4->S5 edges to compensate. (Changing the layout
4027 : : on S1->S2+S4 and S5->S6 would also be acceptable.)
4028 : :
4029 : : The difference between SCCs and partitions becomes important if we
4030 : : add an outer loop:
4031 : :
4032 : : S1: a_1 = ...
4033 : : loop1:
4034 : : S2: a_2 = PHI<a_1, a_6>
4035 : : S3: b_1 = load
4036 : : S4: a_3 = a_2 + b_1
4037 : : loop2:
4038 : : S5: a_4 = PHI<a_3, a_5>
4039 : : S6: c_1 = load
4040 : : S7: a_5 = a_4 + c_1
4041 : : exit2:
4042 : : S8: a_6 = PHI<a_5>
4043 : : S9: store a_6
4044 : : exit1:
4045 : :
4046 : : Here, S2, S4, S5, S7 and S8 form a single SCC. However, when optimizing
4047 : : for speed, we usually do not want restrictions in the outer loop to "infect"
4048 : : the decision for the inner loop. For example, if an outer-loop node
4049 : : in the SCC contains a statement with a fixed layout, that should not
4050 : : prevent the inner loop from using a different layout. Conversely,
4051 : : the inner loop should not dictate a layout to the outer loop: if the
4052 : : outer loop does a lot of computation, then it may not be efficient to
4053 : : do all of that computation in the inner loop's preferred layout.
4054 : :
4055 : : So when optimizing for speed, we partition the SCC into S2+S4+S8 (outer)
4056 : : and S5+S7 (inner). We also try to arrange partitions so that:
4057 : :
4058 : : - the partition for an outer loop comes before the partition for
4059 : : an inner loop
4060 : :
4061 : : - if a sibling loop A dominates a sibling loop B, A's partition
4062 : : comes before B's
4063 : :
4064 : : This gives the following partition dag for the example above:
4065 : :
4066 : : S1 S3
4067 : : \ /
4068 : : S2+S4+S8 S6
4069 : : | \\ /
4070 : : | S5+S7
4071 : : |
4072 : : S9
4073 : :
4074 : : There are two edges from S2+S4+S8 to S5+S7: one for the edge S4->S5 and
4075 : : one for a reversal of the edge S7->S8.
4076 : :
4077 : : The backward walk picks a layout for S5+S7 before S2+S4+S8. The choice
4078 : : for S2+S4+S8 therefore has to balance the cost of using the outer loop's
4079 : : preferred layout against the cost of changing the layout on entry to the
4080 : : inner loop (S4->S5) and on exit from the inner loop (S7->S8 reversed).
4081 : :
4082 : : Although this works well when optimizing for speed, it has the downside
4083 : : when optimizing for size that the choice of layout for S5+S7 is completely
4084 : : independent of S9, which lessens the chance of reducing the overall number
4085 : : of permutations. We therefore do not partition SCCs when optimizing
4086 : : for size.
4087 : :
4088 : : To give a concrete example of the difference between optimizing
4089 : : for size and speed, consider:
4090 : :
4091 : : a[0] = (b[1] << c[3]) - d[1];
4092 : : a[1] = (b[0] << c[2]) - d[0];
4093 : : a[2] = (b[3] << c[1]) - d[3];
4094 : : a[3] = (b[2] << c[0]) - d[2];
4095 : :
4096 : : There are three different layouts here: one for a, one for b and d,
4097 : : and one for c. When optimizing for speed it is better to permute each
4098 : : of b, c and d into the order required by a, since those permutations
4099 : : happen in parallel. But when optimizing for size, it is better to:
4100 : :
4101 : : - permute c into the same order as b
4102 : : - do the arithmetic
4103 : : - permute the result into the order required by a
4104 : :
4105 : : This gives 2 permutations rather than 3. */
4106 : :
class vect_optimize_slp_pass
{
public:
  /* Prepare to optimize the SLP graph of VINFO, as described in the
     comment above.  */
  vect_optimize_slp_pass (vec_info *vinfo) : m_vinfo (vinfo) {}
  /* Run the pass: build the graph, choose layouts, and materialize
     the chosen layout changes.  */
  void run ();

private:
  /* Graph building.  */
  struct loop *containing_loop (slp_tree);
  bool is_cfg_latch_edge (graph_edge *);
  void build_vertices (hash_set<slp_tree> &, slp_tree);
  void build_vertices ();
  void build_graph ();

  /* Partitioning.  */
  void create_partitions ();
  template<typename T> void for_each_partition_edge (unsigned int, T);

  /* Layout selection.  */
  bool is_compatible_layout (slp_tree, unsigned int);
  int change_layout_cost (slp_tree, unsigned int, unsigned int);
  slpg_partition_layout_costs &partition_layout_costs (unsigned int,
						       unsigned int);
  void change_vec_perm_layout (slp_tree, lane_permutation_t &,
			       int, unsigned int);
  int internal_node_cost (slp_tree, int, unsigned int);
  void start_choosing_layouts ();

  /* Cost propagation.  */
  slpg_layout_cost edge_layout_cost (graph_edge *, unsigned int,
				     unsigned int, unsigned int);
  slpg_layout_cost total_in_cost (unsigned int);
  slpg_layout_cost forward_cost (graph_edge *, unsigned int, unsigned int);
  slpg_layout_cost backward_cost (graph_edge *, unsigned int, unsigned int);
  void forward_pass ();
  void backward_pass ();

  /* Rematerialization.  */
  slp_tree get_result_with_layout (slp_tree, unsigned int);
  void materialize ();

  /* Clean-up.  */
  void remove_redundant_permutations ();

  /* Dump the partitioning and layout information for debugging.  */
  void dump ();

  /* The vec_info whose SLP graph is being optimized.  */
  vec_info *m_vinfo;

  /* True if we should optimize the graph for size, false if we should
     optimize it for speed.  (It wouldn't be easy to make this decision
     more locally.)  */
  bool m_optimize_size;

  /* A graph of all SLP nodes, with edges leading from uses to definitions.
     In other words, a node's predecessors are its slp_tree parents and
     a node's successors are its slp_tree children.  */
  graph *m_slpg = nullptr;

  /* The vertices of M_SLPG, indexed by slp_tree::vertex.  */
  auto_vec<slpg_vertex> m_vertices;

  /* The list of all leaves of M_SLPG, such as external definitions, constants,
     and loads.  */
  auto_vec<int> m_leafs;

  /* This array has one entry for every vector layout that we're considering.
     Element 0 is null and indicates "no change".  Other entries describe
     permutations that are inherent in the current graph and that we would
     like to reverse if possible.

     For example, a permutation { 1, 2, 3, 0 } means that something has
     effectively been permuted in that way, such as a load group
     { a[1], a[2], a[3], a[0] } (viewed as a permutation of a[0:3]).
     We'd then like to apply the reverse permutation { 3, 0, 1, 2 }
     in order to put things "back" in order.  */
  auto_vec<vec<unsigned> > m_perms;

  /* A partitioning of the nodes for which a layout must be chosen.
     Each partition represents an <SCC, cfg loop> pair; that is,
     nodes in different SCCs belong to different partitions, and nodes
     within an SCC can be further partitioned according to a containing
     cfg loop.  Partition <SCC1, L1> comes before <SCC2, L2> if:

     - SCC1 != SCC2 and SCC1 is a predecessor of SCC2 in a forward walk
       from leaves (such as loads) to roots (such as stores).

     - SCC1 == SCC2 and L1's header strictly dominates L2's header.  */
  auto_vec<slpg_partition_info> m_partitions;

  /* The list of all nodes for which a layout must be chosen.  Nodes for
     partition P come before the nodes for partition P+1.  Nodes within a
     partition are in reverse postorder.  */
  auto_vec<unsigned int> m_partitioned_nodes;

  /* Index P * num-layouts + L contains the cost of using layout L
     for partition P.  */
  auto_vec<slpg_partition_layout_costs> m_partition_layout_costs;

  /* Index N * num-layouts + L, if nonnull, is a node that provides the
     original output of node N adjusted to have layout L.  */
  auto_vec<slp_tree> m_node_layouts;
};
4209 : :
4210 : : /* Fill the vertices and leafs vector with all nodes in the SLP graph.
4211 : : Also record whether we should optimize anything for speed rather
4212 : : than size. */
4213 : :
4214 : : void
4215 : 2022023 : vect_optimize_slp_pass::build_vertices (hash_set<slp_tree> &visited,
4216 : : slp_tree node)
4217 : : {
4218 : 2022023 : unsigned i;
4219 : 2022023 : slp_tree child;
4220 : :
4221 : 2022023 : if (visited.add (node))
4222 : 2022023 : return;
4223 : :
4224 : 1960729 : if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
4225 : : {
4226 : 1154483 : basic_block bb = gimple_bb (vect_orig_stmt (rep)->stmt);
4227 : 1132183 : if (optimize_bb_for_speed_p (bb))
4228 : 1088180 : m_optimize_size = false;
4229 : : }
4230 : :
4231 : 1960729 : node->vertex = m_vertices.length ();
4232 : 1960729 : m_vertices.safe_push (slpg_vertex (node));
4233 : :
4234 : 1960729 : bool leaf = true;
4235 : 1960729 : bool force_leaf = false;
4236 : 3214174 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
4237 : 1253445 : if (child)
4238 : : {
4239 : 1251139 : leaf = false;
4240 : 1251139 : build_vertices (visited, child);
4241 : : }
4242 : : else
4243 : : force_leaf = true;
4244 : : /* Since SLP discovery works along use-def edges all cycles have an
4245 : : entry - but there's the exception of cycles where we do not handle
4246 : : the entry explicitely (but with a NULL SLP node), like some reductions
4247 : : and inductions. Force those SLP PHIs to act as leafs to make them
4248 : : backwards reachable. */
4249 : 1960729 : if (leaf || force_leaf)
4250 : 995345 : m_leafs.safe_push (node->vertex);
4251 : : }
4252 : :
4253 : : /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
4254 : :
4255 : : void
4256 : 302376 : vect_optimize_slp_pass::build_vertices ()
4257 : : {
4258 : 302376 : hash_set<slp_tree> visited;
4259 : 302376 : unsigned i;
4260 : 302376 : slp_instance instance;
4261 : 1678012 : FOR_EACH_VEC_ELT (m_vinfo->slp_instances, i, instance)
4262 : 770884 : build_vertices (visited, SLP_INSTANCE_TREE (instance));
4263 : 302376 : }
4264 : :
/* Apply (reverse) bijective PERM to VEC.  */
4266 : :
template <class T>
static void
vect_slp_permute (vec<unsigned> perm,
		  vec<T> &vec, bool reverse)
{
  /* Take a snapshot of VEC so that it can be permuted in place.  */
  auto_vec<T, 64> saved;
  saved.create (vec.length ());
  for (unsigned i = 0; i < vec.length (); ++i)
    saved.quick_push (vec[i]);

  if (reverse)
    {
      /* Apply the inverse of PERM: element I moves to position PERM[I].  */
      for (unsigned i = 0; i < vec.length (); ++i)
	vec[perm[i]] = saved[i];
      /* Check that PERM was a true (bijective) permutation; a duplicated
	 index would have silently dropped an element above.  */
      for (unsigned i = 0; i < vec.length (); ++i)
	gcc_assert (vec[perm[i]] == saved[i]);
    }
  else
    {
      /* Apply PERM directly: position I receives element PERM[I].  */
      for (unsigned i = 0; i < vec.length (); ++i)
	vec[i] = saved[perm[i]];
      /* Likewise verify the result against the snapshot.  */
      for (unsigned i = 0; i < vec.length (); ++i)
	gcc_assert (vec[i] == saved[perm[i]]);
    }
}
4292 : :
4293 : : /* Return the cfg loop that contains NODE. */
4294 : :
4295 : : struct loop *
4296 : 1138875 : vect_optimize_slp_pass::containing_loop (slp_tree node)
4297 : : {
4298 : 1138875 : stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
4299 : 1138875 : if (!rep)
4300 : 2726 : return ENTRY_BLOCK_PTR_FOR_FN (cfun)->loop_father;
4301 : 1158459 : return gimple_bb (vect_orig_stmt (rep)->stmt)->loop_father;
4302 : : }
4303 : :
4304 : : /* Return true if UD (an edge from a use to a definition) is associated
4305 : : with a loop latch edge in the cfg. */
4306 : :
4307 : : bool
4308 : 1251139 : vect_optimize_slp_pass::is_cfg_latch_edge (graph_edge *ud)
4309 : : {
4310 : 1251139 : slp_tree use = m_vertices[ud->src].node;
4311 : 1251139 : slp_tree def = m_vertices[ud->dest].node;
4312 : 1251139 : if ((SLP_TREE_DEF_TYPE (use) != vect_internal_def
4313 : 1251139 : || SLP_TREE_CODE (use) == VEC_PERM_EXPR)
4314 : 1240621 : || SLP_TREE_DEF_TYPE (def) != vect_internal_def)
4315 : : return false;
4316 : :
4317 : 413286 : stmt_vec_info use_rep = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (use));
4318 : 413286 : return (is_a<gphi *> (use_rep->stmt)
4319 : 88581 : && bb_loop_header_p (gimple_bb (use_rep->stmt))
4320 : 425530 : && containing_loop (def) == containing_loop (use));
4321 : : }
4322 : :
4323 : : /* Build the graph. Mark edges that correspond to cfg loop latch edges with
4324 : : a nonnull data field. */
4325 : :
4326 : : void
4327 : 302376 : vect_optimize_slp_pass::build_graph ()
4328 : : {
4329 : 302376 : m_optimize_size = true;
4330 : 302376 : build_vertices ();
4331 : :
4332 : 604752 : m_slpg = new_graph (m_vertices.length ());
4333 : 2867857 : for (slpg_vertex &v : m_vertices)
4334 : 5148512 : for (slp_tree child : SLP_TREE_CHILDREN (v.node))
4335 : 1253445 : if (child)
4336 : : {
4337 : 1251139 : graph_edge *ud = add_edge (m_slpg, v.node->vertex, child->vertex);
4338 : 1251139 : if (is_cfg_latch_edge (ud))
4339 : 9595 : ud->data = this;
4340 : : }
4341 : 302376 : }
4342 : :
4343 : : /* Return true if E corresponds to a loop latch edge in the cfg. */
4344 : :
4345 : : static bool
4346 : 1260734 : skip_cfg_latch_edges (graph_edge *e)
4347 : : {
4348 : 1260734 : return e->data;
4349 : : }
4350 : :
4351 : : /* Create the node partitions. */
4352 : :
void
vect_optimize_slp_pass::create_partitions ()
{
  /* Calculate a postorder of the graph, ignoring edges that correspond
     to natural latch edges in the cfg.  Reading the vector from the end
     to the beginning gives the reverse postorder.  */
  auto_vec<int> initial_rpo;
  graphds_dfs (m_slpg, &m_leafs[0], m_leafs.length (), &initial_rpo,
	       false, NULL, skip_cfg_latch_edges);
  gcc_assert (initial_rpo.length () == m_vertices.length ());

  /* Calculate the strongly connected components of the graph.  */
  auto_vec<int> scc_grouping;
  unsigned int num_sccs = graphds_scc (m_slpg, NULL, NULL, &scc_grouping);

  /* Create a new index order in which all nodes from the same SCC are
     consecutive.  Use scc_pos to record the index of the first node in
     each SCC.  */
  auto_vec<unsigned int> scc_pos (num_sccs);
  int last_component = -1;
  unsigned int node_count = 0;
  /* graphds_scc groups nodes of the same SCC consecutively, so a change
     in component number marks the start of a new SCC.  */
  for (unsigned int node_i : scc_grouping)
    {
      if (last_component != m_slpg->vertices[node_i].component)
	{
	  last_component = m_slpg->vertices[node_i].component;
	  gcc_assert (last_component == int (scc_pos.length ()));
	  scc_pos.quick_push (node_count);
	}
      node_count += 1;
    }
  gcc_assert (node_count == initial_rpo.length ()
	      && last_component + 1 == int (num_sccs));

  /* Use m_partitioned_nodes to group nodes into SCC order, with the nodes
     inside each SCC following the RPO we calculated above.  The fact that
     we ignored natural latch edges when calculating the RPO should ensure
     that, for natural loop nests:

     - the first node that we encounter in a cfg loop is the loop header phi
     - the loop header phis are in dominance order

     Arranging for this is an optimization (see below) rather than a
     correctness issue.  Unnatural loops with a tangled mess of backedges
     will still work correctly, but might give poorer results.

     Also update scc_pos so that it gives 1 + the index of the last node
     in the SCC.  (scc_pos starts as the index of the first node and is
     post-incremented as each node of the SCC is placed.)  */
  m_partitioned_nodes.safe_grow (node_count);
  for (unsigned int old_i = initial_rpo.length (); old_i-- > 0;)
    {
      unsigned int node_i = initial_rpo[old_i];
      unsigned int new_i = scc_pos[m_slpg->vertices[node_i].component]++;
      m_partitioned_nodes[new_i] = node_i;
    }

  /* When optimizing for speed, partition each SCC based on the containing
     cfg loop.  The order we constructed above should ensure that, for natural
     cfg loops, we'll create sub-SCC partitions for outer loops before
     the corresponding sub-SCC partitions for inner loops.  Similarly,
     when one sibling loop A dominates another sibling loop B, we should
     create a sub-SCC partition for A before a sub-SCC partition for B.

     As above, nothing depends for correctness on whether this achieves
     a natural nesting, but we should get better results when it does.  */
  m_partitions.reserve (m_vertices.length ());
  unsigned int next_partition_i = 0;
  hash_map<struct loop *, int> loop_partitions;
  unsigned int rpo_begin = 0;
  unsigned int num_partitioned_nodes = 0;
  /* scc_pos now holds, for each SCC, 1 + the index of its last node,
     so each iteration covers the node range of one SCC.  */
  for (unsigned int rpo_end : scc_pos)
    {
      loop_partitions.empty ();
      unsigned int partition_i = next_partition_i;
      for (unsigned int rpo_i = rpo_begin; rpo_i < rpo_end; ++rpo_i)
	{
	  /* Handle externals and constants optimistically throughout.
	     But treat existing vectors as fixed since we do not handle
	     permuting them.  */
	  unsigned int node_i = m_partitioned_nodes[rpo_i];
	  auto &vertex = m_vertices[node_i];
	  if ((SLP_TREE_DEF_TYPE (vertex.node) == vect_external_def
	       && !SLP_TREE_VEC_DEFS (vertex.node).exists ())
	      || SLP_TREE_DEF_TYPE (vertex.node) == vect_constant_def)
	    vertex.partition = -1;
	  else
	    {
	      bool existed;
	      /* When optimizing for size, the whole SCC forms a single
		 partition; reuse it once it has been created.  */
	      if (m_optimize_size)
		existed = next_partition_i > partition_i;
	      else
		{
		  struct loop *loop = containing_loop (vertex.node);
		  auto &entry = loop_partitions.get_or_insert (loop, &existed);
		  if (!existed)
		    entry = next_partition_i;
		  partition_i = entry;
		}
	      if (!existed)
		{
		  m_partitions.quick_push (slpg_partition_info ());
		  next_partition_i += 1;
		}
	      vertex.partition = partition_i;
	      num_partitioned_nodes += 1;
	      /* node_end is used as a node counter here; it is converted
		 to a range bound below.  */
	      m_partitions[partition_i].node_end += 1;
	    }
	}
      rpo_begin = rpo_end;
    }

  /* Assign ranges of consecutive node indices to each partition,
     in partition order.  Start with node_end being the same as
     node_begin so that the next loop can use it as a counter.  */
  unsigned int node_begin = 0;
  for (auto &partition : m_partitions)
    {
      partition.node_begin = node_begin;
      node_begin += partition.node_end;
      partition.node_end = partition.node_begin;
    }
  gcc_assert (node_begin == num_partitioned_nodes);

  /* Finally build the list of nodes in partition order.  */
  m_partitioned_nodes.truncate (num_partitioned_nodes);
  for (unsigned int node_i = 0; node_i < m_vertices.length (); ++node_i)
    {
      int partition_i = m_vertices[node_i].partition;
      if (partition_i >= 0)
	{
	  unsigned int order_i = m_partitions[partition_i].node_end++;
	  m_partitioned_nodes[order_i] = node_i;
	}
    }
}
4488 : :
4489 : : /* Look for edges from earlier partitions into node NODE_I and edges from
4490 : : node NODE_I into later partitions. Call:
4491 : :
4492 : : FN (ud, other_node_i)
4493 : :
4494 : : for each such use-to-def edge ud, where other_node_i is the node at the
4495 : : other end of the edge. */
4496 : :
4497 : : template<typename T>
4498 : : void
4499 : 1467002 : vect_optimize_slp_pass::for_each_partition_edge (unsigned int node_i, T fn)
4500 : : {
4501 : 1467002 : int partition_i = m_vertices[node_i].partition;
4502 : 1467002 : for (graph_edge *pred = m_slpg->vertices[node_i].pred;
4503 : 2191481 : pred; pred = pred->pred_next)
4504 : : {
4505 : 724479 : int src_partition_i = m_vertices[pred->src].partition;
4506 : 724479 : if (src_partition_i >= 0 && src_partition_i != partition_i)
4507 : 695649 : fn (pred, pred->src);
4508 : : }
4509 : 1467002 : for (graph_edge *succ = m_slpg->vertices[node_i].succ;
4510 : 3139073 : succ; succ = succ->succ_next)
4511 : : {
4512 : 1672071 : int dest_partition_i = m_vertices[succ->dest].partition;
4513 : 1672071 : if (dest_partition_i >= 0 && dest_partition_i != partition_i)
4514 : 698059 : fn (succ, succ->dest);
4515 : : }
4516 : 1467002 : }
4517 : :
4518 : : /* Return true if layout LAYOUT_I is compatible with the number of SLP lanes
4519 : : that NODE would operate on. This test is independent of NODE's actual
4520 : : operation. */
4521 : :
4522 : : bool
4523 : 1286401 : vect_optimize_slp_pass::is_compatible_layout (slp_tree node,
4524 : : unsigned int layout_i)
4525 : : {
4526 : 1286401 : if (layout_i == 0)
4527 : : return true;
4528 : :
4529 : 758024 : if (SLP_TREE_LANES (node) != m_perms[layout_i].length ())
4530 : 9070 : return false;
4531 : :
4532 : : return true;
4533 : : }
4534 : :
/* Return the cost (in arbitrary units) of going from layout FROM_LAYOUT_I
4536 : : to layout TO_LAYOUT_I for a node like NODE. Return -1 if either of the
4537 : : layouts is incompatible with NODE or if the change is not possible for
4538 : : some other reason.
4539 : :
4540 : : The properties taken from NODE include the number of lanes and the
4541 : : vector type. The actual operation doesn't matter. */
4542 : :
int
vect_optimize_slp_pass::change_layout_cost (slp_tree node,
					    unsigned int from_layout_i,
					    unsigned int to_layout_i)
{
  if (!is_compatible_layout (node, from_layout_i)
      || !is_compatible_layout (node, to_layout_i))
    return -1;

  /* Keeping the same layout is free.  */
  if (from_layout_i == to_layout_i)
    return 0;

  /* Cost the change as a single-input VEC_PERM_EXPR that maps
     FROM_LAYOUT_I to TO_LAYOUT_I, using NODE itself as the input.  */
  auto_vec<slp_tree, 1> children (1);
  children.quick_push (node);
  auto_lane_permutation_t perm (SLP_TREE_LANES (node));
  /* Start with the identity under FROM_LAYOUT_I...  */
  if (from_layout_i > 0)
    for (unsigned int i : m_perms[from_layout_i])
      perm.quick_push ({ 0, i });
  else
    for (unsigned int i = 0; i < SLP_TREE_LANES (node); ++i)
      perm.quick_push ({ 0, i });
  /* ...then reorder the selects so that the output has TO_LAYOUT_I.  */
  if (to_layout_i > 0)
    vect_slp_permute (m_perms[to_layout_i], perm, true);
  /* Ask the target how many permutation instructions this needs,
     without generating any code.  */
  auto count = vectorizable_slp_permutation_1 (m_vinfo, nullptr, node, perm,
					       children, false);
  if (count >= 0)
    return MAX (count, 1);

  /* ??? In principle we could try changing via layout 0, giving two
     layout changes rather than 1.  Doing that would require
     corresponding support in get_result_with_layout.  */
  return -1;
}
4576 : :
4577 : : /* Return the costs of assigning layout LAYOUT_I to partition PARTITION_I. */
4578 : :
4579 : : inline slpg_partition_layout_costs &
4580 : 823010 : vect_optimize_slp_pass::partition_layout_costs (unsigned int partition_i,
4581 : : unsigned int layout_i)
4582 : : {
4583 : 1646020 : return m_partition_layout_costs[partition_i * m_perms.length () + layout_i];
4584 : : }
4585 : :
4586 : : /* Change PERM in one of two ways:
4587 : :
4588 : : - if IN_LAYOUT_I < 0, accept input operand I in the layout that has been
4589 : : chosen for child I of NODE.
4590 : :
   - if IN_LAYOUT_I >= 0, accept all input operands with that layout.

   In both cases, arrange for the output to have layout OUT_LAYOUT_I.  */
4594 : :
4595 : : void
4596 : 28846 : vect_optimize_slp_pass::
4597 : : change_vec_perm_layout (slp_tree node, lane_permutation_t &perm,
4598 : : int in_layout_i, unsigned int out_layout_i)
4599 : : {
4600 : 157382 : for (auto &entry : perm)
4601 : : {
4602 : 70844 : int this_in_layout_i = in_layout_i;
4603 : 70844 : if (this_in_layout_i < 0)
4604 : : {
4605 : 52386 : slp_tree in_node = SLP_TREE_CHILDREN (node)[entry.first];
4606 : 52386 : unsigned int in_partition_i = m_vertices[in_node->vertex].partition;
4607 : 52386 : this_in_layout_i = m_partitions[in_partition_i].layout;
4608 : : }
4609 : 70844 : if (this_in_layout_i > 0)
4610 : 13818 : entry.second = m_perms[this_in_layout_i][entry.second];
4611 : : }
4612 : 28846 : if (out_layout_i > 0)
4613 : 5757 : vect_slp_permute (m_perms[out_layout_i], perm, true);
4614 : 28846 : }
4615 : :
4616 : : /* Check whether the target allows NODE to be rearranged so that the node's
4617 : : output has layout OUT_LAYOUT_I. Return the cost of the change if so,
4618 : : in the same arbitrary units as for change_layout_cost. Return -1 otherwise.
4619 : :
4620 : : If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I < 0, also check whether
4621 : : NODE can adapt to the layout changes that have (perhaps provisionally)
4622 : : been chosen for NODE's children, so that no extra permutations are
4623 : : needed on either the input or the output of NODE.
4624 : :
4625 : : If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I >= 0, instead assume
4626 : : that all inputs will be forced into layout IN_LAYOUT_I beforehand.
4627 : :
4628 : : IN_LAYOUT_I has no meaning for other types of node.
4629 : :
4630 : : Keeping the node as-is is always valid. If the target doesn't appear
4631 : : to support the node as-is, but might realistically support other layouts,
4632 : : then layout 0 instead has the cost of a worst-case permutation. On the
4633 : : one hand, this ensures that every node has at least one valid layout,
4634 : : avoiding what would otherwise be an awkward special case. On the other,
4635 : : it still encourages the pass to change an invalid pre-existing layout
4636 : : choice into a valid one. */
4637 : :
int
vect_optimize_slp_pass::internal_node_cost (slp_tree node, int in_layout_i,
					    unsigned int out_layout_i)
{
  /* Nominal cost used when we keep a node that can't be costed exactly
     but might still support other layouts.  */
  const int fallback_cost = 1;

  if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
    {
      auto_lane_permutation_t tmp_perm;
      tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));

      /* Check that the child nodes support the chosen layout.  Checking
	 the first child is enough, since any second child would have the
	 same shape.  */
      auto first_child = SLP_TREE_CHILDREN (node)[0];
      if (in_layout_i > 0
	  && !is_compatible_layout (first_child, in_layout_i))
	return -1;

      /* Rewrite the permutation to account for the input and output
	 layouts, then ask the target to cost it.  */
      change_vec_perm_layout (node, tmp_perm, in_layout_i, out_layout_i);
      int count = vectorizable_slp_permutation_1 (m_vinfo, nullptr,
						  node, tmp_perm,
						  SLP_TREE_CHILDREN (node),
						  false);
      if (count < 0)
	{
	  if (in_layout_i == 0 && out_layout_i == 0)
	    {
	      /* Use the fallback cost if the node could in principle support
		 some nonzero layout for both the inputs and the outputs.
		 Otherwise assume that the node will be rejected later
		 and rebuilt from scalars.  */
	      if (SLP_TREE_LANES (node) == SLP_TREE_LANES (first_child))
		return fallback_cost;
	      return 0;
	    }
	  return -1;
	}

      /* We currently have no way of telling whether the new layout is cheaper
	 or more expensive than the old one.  But at least in principle,
	 it should be worth making zero permutations (whole-vector shuffles)
	 cheaper than real permutations, in case the pass is able to remove
	 the latter.  */
      return count == 0 ? 0 : 1;
    }

  /* The other internal case is a load with a load permutation.  */
  stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
  if (rep
      && STMT_VINFO_DATA_REF (rep)
      && DR_IS_READ (STMT_VINFO_DATA_REF (rep))
      && SLP_TREE_LOAD_PERMUTATION (node).exists ())
    {
      /* Compose the load permutation with the requested output layout
	 and check whether the target can code-generate it.  */
      auto_load_permutation_t tmp_perm;
      tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
      if (out_layout_i > 0)
	vect_slp_permute (m_perms[out_layout_i], tmp_perm, true);

      poly_uint64 vf = 1;
      if (auto loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo))
	vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
      unsigned int n_perms;
      if (!vect_transform_slp_perm_load_1 (m_vinfo, node, tmp_perm, vNULL,
					   nullptr, vf, true, false, &n_perms))
	{
	  auto rep = SLP_TREE_REPRESENTATIVE (node);
	  if (out_layout_i == 0)
	    {
	      /* Use the fallback cost if the load is an N-to-N permutation.
		 Otherwise assume that the node will be rejected later
		 and rebuilt from scalars.  */
	      if (STMT_VINFO_GROUPED_ACCESS (rep)
		  && (DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (rep))
		      == SLP_TREE_LANES (node)))
		return fallback_cost;
	      return 0;
	    }
	  return -1;
	}

      /* See the comment above the corresponding VEC_PERM_EXPR handling.  */
      return n_perms == 0 ? 0 : 1;
    }

  /* Other nodes are agnostic to the layout of their inputs.  */
  return 0;
}
4724 : :
4725 : : /* Decide which element layouts we should consider using. Calculate the
4726 : : weights associated with inserting layout changes on partition edges.
4727 : : Also mark partitions that cannot change layout, by setting their
4728 : : layout to zero. */
4729 : :
4730 : : void
4731 : 302376 : vect_optimize_slp_pass::start_choosing_layouts ()
4732 : : {
4733 : : /* Used to assign unique permutation indices. */
4734 : 302376 : using perm_hash = unbounded_hashmap_traits<
4735 : : vec_free_hash_base<int_hash_base<unsigned>>,
4736 : : int_hash<int, -1, -2>
4737 : : >;
4738 : 302376 : hash_map<vec<unsigned>, int, perm_hash> layout_ids;
4739 : :
4740 : : /* Layout 0 is "no change". */
4741 : 302376 : m_perms.safe_push (vNULL);
4742 : :
4743 : : /* Create layouts from existing permutations. */
4744 : 302376 : auto_load_permutation_t tmp_perm;
4745 : 2042041 : for (unsigned int node_i : m_partitioned_nodes)
4746 : : {
4747 : : /* Leafs also double as entries to the reverse graph. Allow the
4748 : : layout of those to be changed. */
4749 : 1134913 : auto &vertex = m_vertices[node_i];
4750 : 1134913 : auto &partition = m_partitions[vertex.partition];
4751 : 1134913 : if (!m_slpg->vertices[node_i].succ)
4752 : 168265 : partition.layout = 0;
4753 : :
4754 : : /* Loads and VEC_PERM_EXPRs are the only things generating permutes. */
4755 : 1134913 : slp_tree node = vertex.node;
4756 : 1134913 : stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
4757 : 1134913 : slp_tree child;
4758 : 1134913 : unsigned HOST_WIDE_INT imin, imax = 0;
4759 : 1134913 : bool any_permute = false;
4760 : 1134913 : tmp_perm.truncate (0);
4761 : 1134913 : if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
4762 : : {
4763 : : /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the node
4764 : : unpermuted, record a layout that reverses this permutation.
4765 : :
4766 : : We would need more work to cope with loads that are internally
4767 : : permuted and also have inputs (such as masks for
4768 : : IFN_MASK_LOADs). */
4769 : 165014 : gcc_assert (partition.layout == 0 && !m_slpg->vertices[node_i].succ);
4770 : 165014 : if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
4771 : : {
4772 : 492 : partition.layout = -1;
4773 : 1121062 : continue;
4774 : : }
4775 : 164522 : dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
4776 : 164522 : imin = DR_GROUP_SIZE (dr_stmt) + 1;
4777 : 164522 : tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
4778 : : }
4779 : 1937076 : else if (SLP_TREE_CODE (node) == VEC_PERM_EXPR
4780 : 6620 : && SLP_TREE_CHILDREN (node).length () == 1
4781 : 2722 : && (child = SLP_TREE_CHILDREN (node)[0])
4782 : 972621 : && (TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (child))
4783 : 2722 : .is_constant (&imin)))
4784 : : {
4785 : : /* If the child has the same vector size as this node,
4786 : : reversing the permutation can make the permutation a no-op.
4787 : : In other cases it can change a true permutation into a
4788 : : full-vector extract. */
4789 : 2722 : tmp_perm.reserve (SLP_TREE_LANES (node));
4790 : 8768 : for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4791 : 6046 : tmp_perm.quick_push (SLP_TREE_LANE_PERMUTATION (node)[j].second);
4792 : : }
4793 : : else
4794 : 967177 : continue;
4795 : :
4796 : 579883 : for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4797 : : {
4798 : 412639 : unsigned idx = tmp_perm[j];
4799 : 412639 : imin = MIN (imin, idx);
4800 : 412639 : imax = MAX (imax, idx);
4801 : 412639 : if (idx - tmp_perm[0] != j)
4802 : 53812 : any_permute = true;
4803 : : }
4804 : : /* If the span doesn't match we'd disrupt VF computation, avoid
4805 : : that for now. */
4806 : 167244 : if (imax - imin + 1 != SLP_TREE_LANES (node))
4807 : 19644 : continue;
4808 : : /* If there's no permute no need to split one out. In this case
4809 : : we can consider turning a load into a permuted load, if that
4810 : : turns out to be cheaper than alternatives. */
4811 : 147600 : if (!any_permute)
4812 : : {
4813 : 133686 : partition.layout = -1;
4814 : 133686 : continue;
4815 : : }
4816 : :
4817 : : /* For now only handle true permutes, like
4818 : : vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
4819 : : when permuting constants and invariants keeping the permute
4820 : : bijective. */
4821 : 13914 : auto_sbitmap load_index (SLP_TREE_LANES (node));
4822 : 13914 : bitmap_clear (load_index);
4823 : 54936 : for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4824 : 41022 : bitmap_set_bit (load_index, tmp_perm[j] - imin);
4825 : : unsigned j;
4826 : 54401 : for (j = 0; j < SLP_TREE_LANES (node); ++j)
4827 : 40550 : if (!bitmap_bit_p (load_index, j))
4828 : : break;
4829 : 13914 : if (j != SLP_TREE_LANES (node))
4830 : 63 : continue;
4831 : :
4832 : 13851 : vec<unsigned> perm = vNULL;
4833 : 13851 : perm.safe_grow (SLP_TREE_LANES (node), true);
4834 : 54243 : for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
4835 : 40392 : perm[j] = tmp_perm[j] - imin;
4836 : :
4837 : 27702 : if (int (m_perms.length ()) >= param_vect_max_layout_candidates)
4838 : : {
4839 : : /* Continue to use existing layouts, but don't add any more. */
4840 : 0 : int *entry = layout_ids.get (perm);
4841 : 0 : partition.layout = entry ? *entry : 0;
4842 : 0 : perm.release ();
4843 : : }
4844 : : else
4845 : : {
4846 : 13851 : bool existed;
4847 : 13851 : int &layout_i = layout_ids.get_or_insert (perm, &existed);
4848 : 13851 : if (existed)
4849 : 4074 : perm.release ();
4850 : : else
4851 : : {
4852 : 9777 : layout_i = m_perms.length ();
4853 : 9777 : m_perms.safe_push (perm);
4854 : : }
4855 : 13851 : partition.layout = layout_i;
4856 : : }
4857 : 13914 : }
4858 : :
4859 : : /* Initially assume that every layout is possible and has zero cost
4860 : : in every partition. */
4861 : 302376 : m_partition_layout_costs.safe_grow_cleared (m_partitions.length ()
4862 : 604752 : * m_perms.length ());
4863 : :
4864 : : /* We have to mark outgoing permutations facing non-associating-reduction
4865 : : graph entries that are not represented as to be materialized.
4866 : : slp_inst_kind_bb_reduc currently only covers associatable reductions. */
4867 : 1678012 : for (slp_instance instance : m_vinfo->slp_instances)
4868 : 770884 : if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor)
4869 : : {
4870 : 5492 : unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
4871 : 5492 : m_partitions[m_vertices[node_i].partition].layout = 0;
4872 : : }
4873 : 765392 : else if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_reduc_chain)
4874 : : {
4875 : 346 : stmt_vec_info stmt_info
4876 : 346 : = SLP_TREE_REPRESENTATIVE (SLP_INSTANCE_TREE (instance));
4877 : 346 : stmt_vec_info reduc_info = info_for_reduction (m_vinfo, stmt_info);
4878 : 346 : if (needs_fold_left_reduction_p (TREE_TYPE
4879 : : (gimple_get_lhs (stmt_info->stmt)),
4880 : : STMT_VINFO_REDUC_CODE (reduc_info)))
4881 : : {
4882 : 80 : unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
4883 : 80 : m_partitions[m_vertices[node_i].partition].layout = 0;
4884 : : }
4885 : : }
4886 : :
4887 : : /* Check which layouts each node and partition can handle. Calculate the
4888 : : weights associated with inserting layout changes on edges. */
4889 : 2042041 : for (unsigned int node_i : m_partitioned_nodes)
4890 : : {
4891 : 1134913 : auto &vertex = m_vertices[node_i];
4892 : 1134913 : auto &partition = m_partitions[vertex.partition];
4893 : 1134913 : slp_tree node = vertex.node;
4894 : :
4895 : 1134913 : if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
4896 : : {
4897 : 1132183 : vertex.weight = vect_slp_node_weight (node);
4898 : :
4899 : : /* We do not handle stores with a permutation, so all
4900 : : incoming permutations must have been materialized.
4901 : :
4902 : : We also don't handle masked grouped loads, which lack a
4903 : : permutation vector. In this case the memory locations
4904 : : form an implicit second input to the loads, on top of the
4905 : : explicit mask input, and the memory input's layout cannot
4906 : : be changed.
4907 : :
4908 : : On the other hand, we do support permuting gather loads and
4909 : : masked gather loads, where each scalar load is independent
4910 : : of the others. This can be useful if the address/index input
4911 : : benefits from permutation. */
4912 : 1132183 : if (STMT_VINFO_DATA_REF (rep)
4913 : 802823 : && STMT_VINFO_GROUPED_ACCESS (rep)
4914 : 1934468 : && !SLP_TREE_LOAD_PERMUTATION (node).exists ())
4915 : 637763 : partition.layout = 0;
4916 : :
4917 : : /* We cannot change the layout of an operation that is
4918 : : not independent on lanes. Note this is an explicit
4919 : : negative list since that's much shorter than the respective
4920 : : positive one but it's critical to keep maintaining it. */
4921 : 1132183 : if (is_gimple_call (STMT_VINFO_STMT (rep)))
4922 : 5457 : switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep)))
4923 : : {
4924 : 1049 : case CFN_COMPLEX_ADD_ROT90:
4925 : 1049 : case CFN_COMPLEX_ADD_ROT270:
4926 : 1049 : case CFN_COMPLEX_MUL:
4927 : 1049 : case CFN_COMPLEX_MUL_CONJ:
4928 : 1049 : case CFN_VEC_ADDSUB:
4929 : 1049 : case CFN_VEC_FMADDSUB:
4930 : 1049 : case CFN_VEC_FMSUBADD:
4931 : 1049 : partition.layout = 0;
4932 : : default:;
4933 : : }
4934 : : }
4935 : :
4936 : 1944119 : auto process_edge = [&](graph_edge *ud, unsigned int other_node_i)
4937 : : {
4938 : 809206 : auto &other_vertex = m_vertices[other_node_i];
4939 : :
4940 : : /* Count the number of edges from earlier partitions and the number
4941 : : of edges to later partitions. */
4942 : 809206 : if (other_vertex.partition < vertex.partition)
4943 : 404603 : partition.in_degree += 1;
4944 : : else
4945 : 404603 : partition.out_degree += 1;
4946 : :
4947 : : /* If the current node uses the result of OTHER_NODE_I, accumulate
4948 : : the effects of that. */
4949 : 809206 : if (ud->src == int (node_i))
4950 : : {
4951 : 404603 : other_vertex.out_weight += vertex.weight;
4952 : 404603 : other_vertex.out_degree += 1;
4953 : : }
4954 : 1944119 : };
4955 : 1134913 : for_each_partition_edge (node_i, process_edge);
4956 : : }
4957 : 302376 : }
4958 : :
4959 : : /* Return the incoming costs for node NODE_I, assuming that each input keeps
4960 : : its current (provisional) choice of layout. The inputs do not necessarily
4961 : : have the same layout as each other. */
4962 : :
4963 : : slpg_layout_cost
4964 : 3868 : vect_optimize_slp_pass::total_in_cost (unsigned int node_i)
4965 : : {
4966 : 3868 : auto &vertex = m_vertices[node_i];
4967 : 3868 : slpg_layout_cost cost;
 : : /* Walk every cross-partition edge of NODE_I; only edges coming from
 : :    earlier partitions represent inputs, so later-partition edges are
 : :    ignored here. */
4968 : 13777 : auto add_cost = [&](graph_edge *, unsigned int other_node_i)
4969 : : {
4970 : 9909 : auto &other_vertex = m_vertices[other_node_i];
4971 : 9909 : if (other_vertex.partition < vertex.partition)
4972 : : {
4973 : 6323 : auto &other_partition = m_partitions[other_vertex.partition];
4974 : 12646 : auto &other_costs = partition_layout_costs (other_vertex.partition,
4975 : 6323 : other_partition.layout);
 : : /* The predecessor's own input + internal cost is divided by its
 : :    out-degree so that a multi-consumer definition is not counted
 : :    once per consumer; independent inputs then combine "in
 : :    parallel" rather than serially. */
4976 : 6323 : slpg_layout_cost this_cost = other_costs.in_cost;
4977 : 6323 : this_cost.add_serial_cost (other_costs.internal_cost);
4978 : 6323 : this_cost.split (other_partition.out_degree);
4979 : 6323 : cost.add_parallel_cost (this_cost);
4980 : : }
4981 : 13777 : };
4982 : 3868 : for_each_partition_edge (node_i, add_cost);
4983 : 3868 : return cost;
4984 : : }
4985 : :
4986 : : /* Return the cost of switching between layout LAYOUT1_I (at node NODE1_I)
4987 : : and layout LAYOUT2_I on cross-partition use-to-def edge UD. Return
4988 : : slpg_layout_cost::impossible () if the change isn't possible. */
4989 : :
4990 : : slpg_layout_cost
4991 : 555576 : vect_optimize_slp_pass::
4992 : : edge_layout_cost (graph_edge *ud, unsigned int node1_i, unsigned int layout1_i,
4993 : : unsigned int layout2_i)
4994 : : {
 : : /* UD->dest is always the definition end of the edge and UD->src the
 : :    use end; NODE1_I may be either, so map LAYOUT1_I/LAYOUT2_I onto
 : :    the def/use sides accordingly. */
4995 : 555576 : auto &def_vertex = m_vertices[ud->dest];
4996 : 555576 : auto &use_vertex = m_vertices[ud->src];
4997 : 555576 : auto def_layout_i = ud->dest == int (node1_i) ? layout1_i : layout2_i;
4998 : 555576 : auto use_layout_i = ud->dest == int (node1_i) ? layout2_i : layout1_i;
4999 : 555576 : auto factor = change_layout_cost (def_vertex.node, def_layout_i,
5000 : : use_layout_i);
 : : /* A negative factor means the target cannot convert between the two
 : :    layouts for this node's vector type. */
5001 : 555576 : if (factor < 0)
5002 : 7444 : return slpg_layout_cost::impossible ();
5003 : :
5004 : : /* We have a choice of putting the layout change at the site of the
5005 : : definition or at the site of the use. Prefer the former when
5006 : : optimizing for size or when the execution frequency of the
5007 : : definition is no greater than the combined execution frequencies of
5008 : : the uses. When putting the layout change at the site of the definition,
5009 : : divvy up the cost among all consumers. */
5010 : 548132 : if (m_optimize_size || def_vertex.weight <= def_vertex.out_weight)
5011 : : {
5012 : 533009 : slpg_layout_cost cost = { def_vertex.weight * factor, m_optimize_size };
5013 : 533009 : cost.split (def_vertex.out_degree);
5014 : 533009 : return cost;
5015 : : }
 : : /* Otherwise the change is cheaper at the (less frequently executed)
 : :    use site and is paid in full there. */
5016 : 15123 : return { use_vertex.weight * factor, m_optimize_size };
5017 : : }
5018 : :
5019 : : /* UD represents a use-def link between FROM_NODE_I and a node in a later
5020 : : partition; FROM_NODE_I could be the definition node or the use node.
5021 : : The node at the other end of the link wants to use layout TO_LAYOUT_I.
5022 : : Return the cost of any necessary fix-ups on edge UD, or return
5023 : : slpg_layout_cost::impossible () if the change isn't possible.
5024 : :
5025 : : At this point, FROM_NODE_I's partition has chosen the cheapest
5026 : : layout based on the information available so far, but this choice
5027 : : is only provisional. */
5028 : :
5029 : : slpg_layout_cost
5030 : 143265 : vect_optimize_slp_pass::forward_cost (graph_edge *ud, unsigned int from_node_i,
5031 : : unsigned int to_layout_i)
5032 : : {
5033 : 143265 : auto &from_vertex = m_vertices[from_node_i];
5034 : 143265 : unsigned int from_partition_i = from_vertex.partition;
5035 : 143265 : slpg_partition_info &from_partition = m_partitions[from_partition_i];
5036 : 143265 : gcc_assert (from_partition.layout >= 0);
5037 : :
5038 : : /* First calculate the cost on the assumption that FROM_PARTITION sticks
5039 : : with its current layout preference. */
5040 : 143265 : slpg_layout_cost cost = slpg_layout_cost::impossible ();
5041 : 143265 : auto edge_cost = edge_layout_cost (ud, from_node_i,
5042 : 143265 : from_partition.layout, to_layout_i);
5043 : 143265 : if (edge_cost.is_possible ())
5044 : : {
 : : /* Cost of reaching FROM_PARTITION with its current layout, shared
 : :    across its consumers, plus the conversion cost on edge UD. */
5045 : 279006 : auto &from_costs = partition_layout_costs (from_partition_i,
5046 : 139503 : from_partition.layout);
5047 : 139503 : cost = from_costs.in_cost;
5048 : 139503 : cost.add_serial_cost (from_costs.internal_cost);
5049 : 139503 : cost.split (from_partition.out_degree);
5050 : 139503 : cost.add_serial_cost (edge_cost);
5051 : : }
5052 : 3762 : else if (from_partition.layout == 0)
5053 : : /* We must allow the source partition to have layout 0 as a fallback,
5054 : : in case all other options turn out to be impossible. */
5055 : 3762 : return cost;
5056 : :
5057 : : /* Take the minimum of that cost and the cost that applies if
5058 : : FROM_PARTITION instead switches to TO_LAYOUT_I. */
5059 : 139503 : auto &direct_layout_costs = partition_layout_costs (from_partition_i,
5060 : : to_layout_i);
5061 : 139503 : if (direct_layout_costs.is_possible ())
5062 : : {
 : : /* Switching avoids any conversion on UD, so only the cost of
 : :    producing FROM_PARTITION in TO_LAYOUT_I remains. */
5063 : 132425 : slpg_layout_cost direct_cost = direct_layout_costs.in_cost;
5064 : 132425 : direct_cost.add_serial_cost (direct_layout_costs.internal_cost);
5065 : 132425 : direct_cost.split (from_partition.out_degree);
5066 : 132425 : if (!cost.is_possible ()
5067 : 132425 : || direct_cost.is_better_than (cost, m_optimize_size))
5068 : 37990 : cost = direct_cost;
5069 : : }
5070 : :
5071 : 139503 : return cost;
5072 : : }
5073 : :
5074 : : /* UD represents a use-def link between TO_NODE_I and a node in an earlier
5075 : : partition; TO_NODE_I could be the definition node or the use node.
5076 : : The node at the other end of the link wants to use layout FROM_LAYOUT_I;
5077 : : return the cost of any necessary fix-ups on edge UD, or
5078 : : slpg_layout_cost::impossible () if the choice cannot be made.
5079 : :
5080 : : At this point, TO_NODE_I's partition has a fixed choice of layout. */
5081 : :
5082 : : slpg_layout_cost
5083 : 139075 : vect_optimize_slp_pass::backward_cost (graph_edge *ud, unsigned int to_node_i,
5084 : : unsigned int from_layout_i)
5085 : : {
5086 : 139075 : auto &to_vertex = m_vertices[to_node_i];
5087 : 139075 : unsigned int to_partition_i = to_vertex.partition;
5088 : 139075 : slpg_partition_info &to_partition = m_partitions[to_partition_i];
5089 : 139075 : gcc_assert (to_partition.layout >= 0);
5090 : :
5091 : : /* If TO_NODE_I is a VEC_PERM_EXPR consumer, see whether it can be
5092 : : adjusted for this input having layout FROM_LAYOUT_I. Assume that
5093 : : any other inputs keep their current choice of layout. */
5094 : 139075 : auto &to_costs = partition_layout_costs (to_partition_i,
5095 : : to_partition.layout);
5096 : 139075 : if (ud->src == int (to_node_i)
5097 : 138945 : && SLP_TREE_CODE (to_vertex.node) == VEC_PERM_EXPR)
5098 : : {
 : : /* Temporarily pretend the input partition uses FROM_LAYOUT_I so
 : :    that internal_node_cost sees the candidate input layout; restore
 : :    the original layout immediately afterwards. */
5099 : 10943 : auto &from_partition = m_partitions[m_vertices[ud->dest].partition];
5100 : 10943 : auto old_layout = from_partition.layout;
5101 : 10943 : from_partition.layout = from_layout_i;
5102 : 21886 : int factor = internal_node_cost (to_vertex.node, -1,
5103 : 10943 : to_partition.layout);
5104 : 10943 : from_partition.layout = old_layout;
5105 : 10943 : if (factor >= 0)
5106 : : {
 : : /* The permute absorbs the layout change, so only the adjusted
 : :    internal cost (shared across TO_PARTITION's inputs) applies. */
5107 : 9793 : slpg_layout_cost cost = to_costs.out_cost;
5108 : 19586 : cost.add_serial_cost ({ to_vertex.weight * factor,
5109 : 9793 : m_optimize_size });
5110 : 9793 : cost.split (to_partition.in_degree);
5111 : 9793 : return cost;
5112 : : }
5113 : : }
5114 : :
5115 : : /* Compute the cost if we insert any necessary layout change on edge UD. */
5116 : 129282 : auto edge_cost = edge_layout_cost (ud, to_node_i,
5117 : 129282 : to_partition.layout, from_layout_i);
5118 : 129282 : if (edge_cost.is_possible ())
5119 : : {
5120 : 129282 : slpg_layout_cost cost = to_costs.out_cost;
5121 : 129282 : cost.add_serial_cost (to_costs.internal_cost);
5122 : 129282 : cost.split (to_partition.in_degree);
5123 : 129282 : cost.add_serial_cost (edge_cost);
5124 : 129282 : return cost;
5125 : : }
5126 : :
5127 : 0 : return slpg_layout_cost::impossible ();
5128 : : }
5129 : :
5130 : : /* Make a forward pass through the partitions, accumulating input costs.
5131 : : Make a tentative (provisional) choice of layout for each partition,
5132 : : ensuring that this choice still allows later partitions to keep
5133 : : their original layout. */
5134 : :
5135 : : void
5136 : 9255 : vect_optimize_slp_pass::forward_pass ()
5137 : : {
 : : /* Partitions are visited in index order, which puts every partition
 : :    after all of its predecessors, so the in_cost of each earlier
 : :    partition is final by the time it is consumed here. */
5138 : 205612 : for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
5139 : : ++partition_i)
5140 : : {
5141 : 93551 : auto &partition = m_partitions[partition_i];
5142 : :
5143 : : /* If the partition consists of a single VEC_PERM_EXPR, precompute
5144 : : the incoming cost that would apply if every predecessor partition
5145 : : keeps its current layout. This is used within the loop below. */
5146 : 93551 : slpg_layout_cost in_cost;
5147 : 93551 : slp_tree single_node = nullptr;
5148 : 93551 : if (partition.node_end == partition.node_begin + 1)
5149 : : {
5150 : 92444 : unsigned int node_i = m_partitioned_nodes[partition.node_begin];
5151 : 92444 : single_node = m_vertices[node_i].node;
5152 : 92444 : if (SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
5153 : 3868 : in_cost = total_in_cost (node_i);
5154 : : }
5155 : :
5156 : : /* Go through the possible layouts. Decide which ones are valid
5157 : : for this partition and record which of the valid layouts has
5158 : : the lowest cost. */
5159 : 93551 : unsigned int min_layout_i = 0;
5160 : 93551 : slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
5161 : 576638 : for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5162 : : {
5163 : 194768 : auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5164 : 194768 : if (!layout_costs.is_possible ())
5165 : 37905 : continue;
5166 : :
5167 : : /* If the recorded layout is already 0 then the layout cannot
5168 : : change. */
5169 : 194768 : if (partition.layout == 0 && layout_i != 0)
5170 : : {
5171 : 25816 : layout_costs.mark_impossible ();
5172 : 25816 : continue;
5173 : : }
5174 : :
5175 : 168952 : bool is_possible = true;
5176 : 330878 : for (unsigned int order_i = partition.node_begin;
5177 : 330878 : order_i < partition.node_end; ++order_i)
5178 : : {
5179 : 171437 : unsigned int node_i = m_partitioned_nodes[order_i];
5180 : 171437 : auto &vertex = m_vertices[node_i];
5181 : :
5182 : : /* Reject the layout if it is individually incompatible
5183 : : with any node in the partition. */
5184 : 171437 : if (!is_compatible_layout (vertex.node, layout_i))
5185 : : {
5186 : 7124 : is_possible = false;
5187 : 9511 : break;
5188 : : }
5189 : :
5190 : 451351 : auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
5191 : : {
5192 : 287038 : auto &other_vertex = m_vertices[other_node_i];
5193 : 287038 : if (other_vertex.partition < vertex.partition)
5194 : : {
5195 : : /* Accumulate the incoming costs from earlier
5196 : : partitions, plus the cost of any layout changes
5197 : : on UD itself. */
5198 : 143265 : auto cost = forward_cost (ud, other_node_i, layout_i);
5199 : 143265 : if (!cost.is_possible ())
5200 : 3762 : is_possible = false;
5201 : : else
5202 : 139503 : layout_costs.in_cost.add_parallel_cost (cost);
5203 : : }
5204 : : else
5205 : : /* Reject the layout if it would make layout 0 impossible
5206 : : for later partitions. This amounts to testing that the
5207 : : target supports reversing the layout change on edges
5208 : : to later partitions.
5209 : :
5210 : : In principle, it might be possible to push a layout
5211 : : change all the way down a graph, so that it never
5212 : : needs to be reversed and so that the target doesn't
5213 : : need to support the reverse operation. But it would
5214 : : be awkward to bail out if we hit a partition that
5215 : : does not support the new layout, especially since
5216 : : we are not dealing with a lattice. */
5217 : 143773 : is_possible &= edge_layout_cost (ud, other_node_i, 0,
5218 : 143773 : layout_i).is_possible ();
5219 : 451351 : };
5220 : 164313 : for_each_partition_edge (node_i, add_cost);
5221 : :
5222 : : /* Accumulate the cost of using LAYOUT_I within NODE,
5223 : : both for the inputs and the outputs. */
5224 : 164313 : int factor = internal_node_cost (vertex.node, layout_i,
5225 : : layout_i);
5226 : 164313 : if (factor < 0)
5227 : : {
5228 : 2387 : is_possible = false;
5229 : 2387 : break;
5230 : : }
5231 : 161926 : else if (factor)
5232 : 28208 : layout_costs.internal_cost.add_serial_cost
5233 : 28208 : ({ vertex.weight * factor, m_optimize_size });
5234 : : }
5235 : 168952 : if (!is_possible)
5236 : : {
5237 : 12089 : layout_costs.mark_impossible ();
5238 : 12089 : continue;
5239 : : }
5240 : :
5241 : : /* Combine the incoming and partition-internal costs. */
5242 : 156863 : slpg_layout_cost combined_cost = layout_costs.in_cost;
5243 : 156863 : combined_cost.add_serial_cost (layout_costs.internal_cost);
5244 : :
5245 : : /* If this partition consists of a single VEC_PERM_EXPR, see
5246 : : if the VEC_PERM_EXPR can be changed to support output layout
5247 : : LAYOUT_I while keeping all the provisional choices of input
5248 : : layout. */
5249 : 156863 : if (single_node
5250 : 154774 : && SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
5251 : : {
5252 : 6504 : int factor = internal_node_cost (single_node, -1, layout_i);
5253 : 6504 : if (factor >= 0)
5254 : : {
5255 : 5584 : auto weight = m_vertices[single_node->vertex].weight;
5256 : 5584 : slpg_layout_cost internal_cost
5257 : 5584 : = { weight * factor, m_optimize_size };
5258 : :
 : : /* If absorbing the inputs into the permute beats inserting
 : :    explicit layout changes, record the cheaper split. */
5259 : 5584 : slpg_layout_cost alt_cost = in_cost;
5260 : 5584 : alt_cost.add_serial_cost (internal_cost);
5261 : 5584 : if (alt_cost.is_better_than (combined_cost, m_optimize_size))
5262 : : {
5263 : 1864 : combined_cost = alt_cost;
5264 : 1864 : layout_costs.in_cost = in_cost;
5265 : 1864 : layout_costs.internal_cost = internal_cost;
5266 : : }
5267 : : }
5268 : : }
5269 : :
5270 : : /* Record the layout with the lowest cost. Prefer layout 0 in
5271 : : the event of a tie between it and another layout. */
5272 : 156863 : if (!min_layout_cost.is_possible ()
5273 : 63312 : || combined_cost.is_better_than (min_layout_cost,
5274 : 63312 : m_optimize_size))
5275 : : {
5276 : 108171 : min_layout_i = layout_i;
5277 : 108171 : min_layout_cost = combined_cost;
5278 : : }
5279 : : }
5280 : :
5281 : : /* This loop's handling of earlier partitions should ensure that
5282 : : choosing the original layout for the current partition is no
5283 : : less valid than it was in the original graph, even with the
5284 : : provisional layout choices for those earlier partitions. */
5285 : 93551 : gcc_assert (min_layout_cost.is_possible ());
5286 : 93551 : partition.layout = min_layout_i;
5287 : : }
5288 : 9255 : }
5289 : :
5290 : : /* Make a backward pass through the partitions, accumulating output costs.
5291 : : Make a final choice of layout for each partition. */
5292 : :
5293 : : void
5294 : 9255 : vect_optimize_slp_pass::backward_pass ()
5295 : : {
 : : /* Visit partitions in reverse order so that every successor's final
 : :    out_cost is available when a partition is processed. */
5296 : 112061 : for (unsigned int partition_i = m_partitions.length (); partition_i-- > 0;)
5297 : : {
5298 : 93551 : auto &partition = m_partitions[partition_i];
5299 : :
5300 : 93551 : unsigned int min_layout_i = 0;
5301 : 93551 : slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
5302 : 576638 : for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5303 : : {
 : : /* Layouts already ruled out by the forward pass stay ruled out. */
5304 : 194768 : auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5305 : 194768 : if (!layout_costs.is_possible ())
5306 : 37905 : continue;
5307 : :
5308 : : /* Accumulate the costs from successor partitions. */
5309 : 156863 : bool is_possible = true;
5310 : 316166 : for (unsigned int order_i = partition.node_begin;
5311 : 316166 : order_i < partition.node_end; ++order_i)
5312 : : {
5313 : 159303 : unsigned int node_i = m_partitioned_nodes[order_i];
5314 : 159303 : auto &vertex = m_vertices[node_i];
5315 : 437634 : auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
5316 : : {
5317 : 278331 : auto &other_vertex = m_vertices[other_node_i];
5318 : 278331 : auto &other_partition = m_partitions[other_vertex.partition];
5319 : 278331 : if (other_vertex.partition > vertex.partition)
5320 : : {
5321 : : /* Accumulate the incoming costs from later
5322 : : partitions, plus the cost of any layout changes
5323 : : on UD itself. */
5324 : 139075 : auto cost = backward_cost (ud, other_node_i, layout_i);
5325 : 139075 : if (!cost.is_possible ())
5326 : 0 : is_possible = false;
5327 : : else
5328 : 139075 : layout_costs.out_cost.add_parallel_cost (cost);
5329 : : }
5330 : : else
5331 : : /* Make sure that earlier partitions can (if necessary
5332 : : or beneficial) keep the layout that they chose in
5333 : : the forward pass. This ensures that there is at
5334 : : least one valid choice of layout. */
5335 : 139256 : is_possible &= edge_layout_cost (ud, other_node_i,
5336 : 139256 : other_partition.layout,
5337 : 139256 : layout_i).is_possible ();
5338 : 437634 : };
5339 : 159303 : for_each_partition_edge (node_i, add_cost);
5340 : : }
5341 : 156863 : if (!is_possible)
5342 : : {
5343 : 0 : layout_costs.mark_impossible ();
5344 : 0 : continue;
5345 : : }
5346 : :
5347 : : /* Locally combine the costs from the forward and backward passes.
5348 : : (This combined cost is not passed on, since that would lead
5349 : : to double counting.) */
5350 : 156863 : slpg_layout_cost combined_cost = layout_costs.in_cost;
5351 : 156863 : combined_cost.add_serial_cost (layout_costs.internal_cost);
5352 : 156863 : combined_cost.add_serial_cost (layout_costs.out_cost);
5353 : :
5354 : : /* Record the layout with the lowest cost. Prefer layout 0 in
5355 : : the event of a tie between it and another layout. */
5356 : 156863 : if (!min_layout_cost.is_possible ()
5357 : 63312 : || combined_cost.is_better_than (min_layout_cost,
5358 : 63312 : m_optimize_size))
5359 : : {
5360 : 104793 : min_layout_i = layout_i;
5361 : 104793 : min_layout_cost = combined_cost;
5362 : : }
5363 : : }
5364 : :
 : : /* The forward pass guarantees at least one feasible layout, so this
 : :    final choice always succeeds. */
5365 : 93551 : gcc_assert (min_layout_cost.is_possible ());
5366 : 93551 : partition.layout = min_layout_i;
5367 : : }
5368 : 9255 : }
5369 : :
5370 : : /* Return a node that applies layout TO_LAYOUT_I to the original form of NODE.
5371 : : NODE already has the layout that was selected for its partition. */
5372 : :
5373 : : slp_tree
5374 : 109745 : vect_optimize_slp_pass::get_result_with_layout (slp_tree node,
5375 : : unsigned int to_layout_i)
5376 : : {
 : : /* Results are memoized per (node, layout) in m_node_layouts so that a
 : :    node consumed with the same layout by several users is only
 : :    converted once. */
5377 : 109745 : unsigned int result_i = node->vertex * m_perms.length () + to_layout_i;
5378 : 109745 : slp_tree result = m_node_layouts[result_i];
5379 : 109745 : if (result)
5380 : : return result;
5381 : :
5382 : 109378 : if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
5383 : 109378 : || (SLP_TREE_DEF_TYPE (node) == vect_external_def
5384 : : /* We can't permute vector defs in place. */
5385 : 18583 : && SLP_TREE_VEC_DEFS (node).is_empty ()))
5386 : : {
5387 : : /* If the vector is uniform or unchanged, there's nothing to do. */
5388 : 34999 : if (to_layout_i == 0 || vect_slp_tree_uniform_p (node))
5389 : : result = node;
5390 : : else
5391 : : {
 : : /* NOTE(review): the permute runs after vect_create_new_slp_node;
 : :    this appears to rely on the new node sharing SCALAR_OPS'
 : :    underlying storage (vec handles are reference-like) — confirm
 : :    against vect_create_new_slp_node's ownership semantics. */
5392 : 1396 : auto scalar_ops = SLP_TREE_SCALAR_OPS (node).copy ();
5393 : 1396 : result = vect_create_new_slp_node (scalar_ops);
5394 : 1396 : vect_slp_permute (m_perms[to_layout_i], scalar_ops, true);
5395 : : }
5396 : : }
5397 : : else
5398 : : {
5399 : 74379 : unsigned int partition_i = m_vertices[node->vertex].partition;
5400 : 74379 : unsigned int from_layout_i = m_partitions[partition_i].layout;
5401 : 74379 : if (from_layout_i == to_layout_i)
5402 : 74082 : return node;
5403 : :
5404 : : /* If NODE is itself a VEC_PERM_EXPR, try to create a parallel
5405 : : permutation instead of a serial one. Leave the new permutation
5406 : : in TMP_PERM on success. */
5407 : 297 : auto_lane_permutation_t tmp_perm;
5408 : 297 : unsigned int num_inputs = 1;
5409 : 297 : if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5410 : : {
5411 : 25 : tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
5412 : 25 : if (from_layout_i != 0)
5413 : 25 : vect_slp_permute (m_perms[from_layout_i], tmp_perm, false);
5414 : 25 : if (to_layout_i != 0)
5415 : 4 : vect_slp_permute (m_perms[to_layout_i], tmp_perm, true);
 : : /* Only keep the combined permutation if the target can actually
 : :    code-generate it; otherwise fall back to a serial permute
 : :    node below. */
5416 : 25 : if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
5417 : : tmp_perm,
5418 : 25 : SLP_TREE_CHILDREN (node),
5419 : : false) >= 0)
5420 : 25 : num_inputs = SLP_TREE_CHILDREN (node).length ();
5421 : : else
5422 : 0 : tmp_perm.truncate (0);
5423 : : }
5424 : :
5425 : 297 : if (dump_enabled_p ())
5426 : : {
5427 : 67 : if (tmp_perm.length () > 0)
5428 : 6 : dump_printf_loc (MSG_NOTE, vect_location,
5429 : : "duplicating permutation node %p with"
5430 : : " layout %d\n",
5431 : : (void *) node, to_layout_i);
5432 : : else
5433 : 61 : dump_printf_loc (MSG_NOTE, vect_location,
5434 : : "inserting permutation node in place of %p\n",
5435 : : (void *) node);
5436 : : }
5437 : :
5438 : 297 : unsigned int num_lanes = SLP_TREE_LANES (node);
5439 : 297 : result = vect_create_new_slp_node (num_inputs, VEC_PERM_EXPR);
5440 : 297 : if (SLP_TREE_SCALAR_STMTS (node).length ())
5441 : : {
 : : /* Undo NODE's selected layout on the scalar stmts, then apply
 : :    the requested one, so RESULT's stmts match TO_LAYOUT_I. */
5442 : 297 : auto &stmts = SLP_TREE_SCALAR_STMTS (result);
5443 : 297 : stmts.safe_splice (SLP_TREE_SCALAR_STMTS (node));
5444 : 297 : if (from_layout_i != 0)
5445 : 251 : vect_slp_permute (m_perms[from_layout_i], stmts, false);
5446 : 297 : if (to_layout_i != 0)
5447 : 50 : vect_slp_permute (m_perms[to_layout_i], stmts, true);
5448 : : }
5449 : 297 : SLP_TREE_REPRESENTATIVE (result) = SLP_TREE_REPRESENTATIVE (node);
5450 : 297 : SLP_TREE_LANES (result) = num_lanes;
5451 : 297 : SLP_TREE_VECTYPE (result) = SLP_TREE_VECTYPE (node);
 : : /* RESULT is not part of the partitioned graph, so it has no vertex. */
5452 : 297 : result->vertex = -1;
5453 : :
5454 : 297 : auto &lane_perm = SLP_TREE_LANE_PERMUTATION (result);
5455 : 297 : if (tmp_perm.length ())
5456 : : {
 : : /* Parallel case: RESULT replaces NODE's permute entirely and
 : :    takes over its children. */
5457 : 25 : lane_perm.safe_splice (tmp_perm);
5458 : 25 : SLP_TREE_CHILDREN (result).safe_splice (SLP_TREE_CHILDREN (node));
5459 : : }
5460 : : else
5461 : : {
 : : /* Serial case: RESULT is a layout-changing permute layered on
 : :    top of NODE itself. */
5462 : 272 : lane_perm.create (num_lanes);
5463 : 958 : for (unsigned j = 0; j < num_lanes; ++j)
5464 : 686 : lane_perm.quick_push ({ 0, j });
5465 : 272 : if (from_layout_i != 0)
5466 : 226 : vect_slp_permute (m_perms[from_layout_i], lane_perm, false);
5467 : 272 : if (to_layout_i != 0)
5468 : 46 : vect_slp_permute (m_perms[to_layout_i], lane_perm, true);
5469 : 272 : SLP_TREE_CHILDREN (result).safe_push (node);
5470 : : }
5471 : 1192 : for (slp_tree child : SLP_TREE_CHILDREN (result))
5472 : 301 : child->refcnt++;
5473 : 297 : }
5474 : 35296 : m_node_layouts[result_i] = result;
5475 : 35296 : return result;
5476 : : }
5477 : :
5478 : : /* Apply the chosen vector layouts to the SLP graph. */
5479 : :
5480 : : void
5481 : 9255 : vect_optimize_slp_pass::materialize ()
5482 : : {
5483 : : /* We no longer need the costs, so avoid having two O(N * P) arrays
5484 : : live at the same time. */
5485 : 9255 : m_partition_layout_costs.release ();
5486 : 27765 : m_node_layouts.safe_grow_cleared (m_vertices.length () * m_perms.length ());
5487 : :
5488 : 18510 : auto_sbitmap fully_folded (m_vertices.length ());
5489 : 9255 : bitmap_clear (fully_folded);
5490 : 122610 : for (unsigned int node_i : m_partitioned_nodes)
5491 : : {
5492 : 94845 : auto &vertex = m_vertices[node_i];
5493 : 94845 : slp_tree node = vertex.node;
5494 : 94845 : int layout_i = m_partitions[vertex.partition].layout;
5495 : 94845 : gcc_assert (layout_i >= 0);
5496 : :
5497 : : /* Rearrange the scalar statements to match the chosen layout. */
5498 : 94845 : if (layout_i > 0)
5499 : 11474 : vect_slp_permute (m_perms[layout_i],
5500 : 11474 : SLP_TREE_SCALAR_STMTS (node), true);
5501 : :
5502 : : /* Update load and lane permutations. */
5503 : 94845 : if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5504 : : {
5505 : : /* First try to absorb the input vector layouts. If that fails,
5506 : : force the inputs to have layout LAYOUT_I too. We checked that
5507 : : that was possible before deciding to use nonzero output layouts.
5508 : : (Note that at this stage we don't really have any guarantee that
5509 : : the target supports the original VEC_PERM_EXPR.) */
5510 : 3880 : auto &perm = SLP_TREE_LANE_PERMUTATION (node);
5511 : 3880 : auto_lane_permutation_t tmp_perm;
5512 : 3880 : tmp_perm.safe_splice (perm);
5513 : 3880 : change_vec_perm_layout (node, tmp_perm, -1, layout_i);
5514 : 3880 : if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
5515 : : tmp_perm,
5516 : 3880 : SLP_TREE_CHILDREN (node),
5517 : : false) >= 0)
5518 : : {
5519 : 3080 : if (dump_enabled_p ()
5520 : 3881 : && !std::equal (tmp_perm.begin (), tmp_perm.end (),
5521 : : perm.begin ()))
5522 : 56 : dump_printf_loc (MSG_NOTE, vect_location,
5523 : : "absorbing input layouts into %p\n",
5524 : : (void *) node);
5525 : 12320 : std::copy (tmp_perm.begin (), tmp_perm.end (), perm.begin ());
5526 : 3080 : bitmap_set_bit (fully_folded, node_i);
5527 : : }
5528 : : else
5529 : : {
5530 : : /* Not MSG_MISSED because it would make no sense to users. */
5531 : 800 : if (dump_enabled_p ())
5532 : 161 : dump_printf_loc (MSG_NOTE, vect_location,
5533 : : "failed to absorb input layouts into %p\n",
5534 : : (void *) node);
5535 : 800 : change_vec_perm_layout (nullptr, perm, layout_i, layout_i);
5536 : : }
5537 : 3880 : }
5538 : : else
5539 : : {
5540 : 90965 : gcc_assert (!SLP_TREE_LANE_PERMUTATION (node).exists ());
5541 : 90965 : auto &load_perm = SLP_TREE_LOAD_PERMUTATION (node);
5542 : 90965 : if (layout_i > 0)
5543 : : /* ??? When we handle non-bijective permutes the idea
5544 : : is that we can force the load-permutation to be
5545 : : { min, min + 1, min + 2, ... max }. But then the
5546 : : scalar defs might no longer match the lane content
5547 : : which means wrong-code with live lane vectorization.
5548 : : So we possibly have to have NULL entries for those. */
5549 : 11352 : vect_slp_permute (m_perms[layout_i], load_perm, true);
5550 : : }
5551 : : }
5552 : :
5553 : : /* Do this before any nodes disappear, since it involves a walk
5554 : : over the leaves. */
5555 : 9255 : remove_redundant_permutations ();
5556 : :
5557 : : /* Replace each child with a correctly laid-out version. */
5558 : 122610 : for (unsigned int node_i : m_partitioned_nodes)
5559 : : {
5560 : : /* Skip nodes that have already been handled above. */
5561 : 94845 : if (bitmap_bit_p (fully_folded, node_i))
5562 : 3080 : continue;
5563 : :
5564 : 91765 : auto &vertex = m_vertices[node_i];
5565 : 91765 : int in_layout_i = m_partitions[vertex.partition].layout;
5566 : 91765 : gcc_assert (in_layout_i >= 0);
5567 : :
5568 : : unsigned j;
5569 : : slp_tree child;
5570 : 271913 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (vertex.node), j, child)
5571 : : {
5572 : 110521 : if (!child)
5573 : 776 : continue;
5574 : :
5575 : 109745 : slp_tree new_child = get_result_with_layout (child, in_layout_i);
5576 : 109745 : if (new_child != child)
5577 : : {
5578 : 1766 : vect_free_slp_tree (child);
5579 : 1766 : SLP_TREE_CHILDREN (vertex.node)[j] = new_child;
5580 : 1766 : new_child->refcnt += 1;
5581 : : }
5582 : : }
5583 : : }
5584 : 9255 : }
5585 : :
5586 : : /* Elide load permutations that are not necessary. Such permutations might
5587 : : be pre-existing, rather than created by the layout optimizations. */
5588 : :
5589 : : void
5590 : 302376 : vect_optimize_slp_pass::remove_redundant_permutations ()
5591 : : {
5592 : 1902473 : for (unsigned int node_i : m_leafs)
5593 : : {
5594 : 995345 : slp_tree node = m_vertices[node_i].node;
5595 : 995345 : if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
5596 : 830331 : continue;
5597 : :
5598 : : /* In basic block vectorization we allow any subchain of an interleaving
5599 : : chain.
5600 : : FORNOW: not in loop SLP because of realignment complications. */
5601 : 165014 : if (is_a <bb_vec_info> (m_vinfo))
5602 : : {
5603 : 492492 : bool subchain_p = true;
5604 : : stmt_vec_info next_load_info = NULL;
5605 : : stmt_vec_info load_info;
5606 : : unsigned j;
5607 : 492492 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
5608 : : {
5609 : 360988 : if (j != 0
5610 : 360988 : && (next_load_info != load_info
5611 : 192642 : || DR_GROUP_GAP (load_info) != 1))
5612 : : {
5613 : : subchain_p = false;
5614 : : break;
5615 : : }
5616 : 339402 : next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
5617 : : }
5618 : 153090 : if (subchain_p)
5619 : : {
5620 : 131504 : SLP_TREE_LOAD_PERMUTATION (node).release ();
5621 : 131504 : continue;
5622 : : }
5623 : : }
5624 : : else
5625 : : {
5626 : 11924 : loop_vec_info loop_vinfo = as_a<loop_vec_info> (m_vinfo);
5627 : 11924 : stmt_vec_info load_info;
5628 : 11924 : bool this_load_permuted = false;
5629 : 11924 : unsigned j;
5630 : 47650 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
5631 : 28554 : if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
5632 : : {
5633 : : this_load_permuted = true;
5634 : : break;
5635 : : }
5636 : : /* When this isn't a grouped access we know it's single element
5637 : : and contiguous. */
5638 : 11924 : if (!STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (node)[0]))
5639 : : {
5640 : 492 : if (!this_load_permuted
5641 : 492 : && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
5642 : 38 : || SLP_TREE_LANES (node) == 1))
5643 : 38 : SLP_TREE_LOAD_PERMUTATION (node).release ();
5644 : 492 : continue;
5645 : : }
5646 : 11432 : stmt_vec_info first_stmt_info
5647 : 11432 : = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
5648 : 17810 : if (!this_load_permuted
5649 : : /* The load requires permutation when unrolling exposes
5650 : : a gap either because the group is larger than the SLP
5651 : : group-size or because there is a gap between the groups. */
5652 : 11432 : && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
5653 : 4140 : || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
5654 : 3384 : && DR_GROUP_GAP (first_stmt_info) == 0)))
5655 : : {
5656 : 6378 : SLP_TREE_LOAD_PERMUTATION (node).release ();
5657 : 6378 : continue;
5658 : : }
5659 : : }
5660 : : }
5661 : 302376 : }
5662 : :
5663 : : /* Print the partition graph and layout information to the dump file. */
5664 : :
5665 : : void
5666 : 606 : vect_optimize_slp_pass::dump ()
5667 : : {
5668 : 606 : dump_printf_loc (MSG_NOTE, vect_location,
5669 : : "SLP optimize permutations:\n");
5670 : 2456 : for (unsigned int layout_i = 1; layout_i < m_perms.length (); ++layout_i)
5671 : : {
5672 : 622 : dump_printf_loc (MSG_NOTE, vect_location, " %d: { ", layout_i);
5673 : 622 : const char *sep = "";
5674 : 4918 : for (unsigned int idx : m_perms[layout_i])
5675 : : {
5676 : 3052 : dump_printf (MSG_NOTE, "%s%d", sep, idx);
5677 : 3052 : sep = ", ";
5678 : : }
5679 : 622 : dump_printf (MSG_NOTE, " }\n");
5680 : : }
5681 : 606 : dump_printf_loc (MSG_NOTE, vect_location,
5682 : : "SLP optimize partitions:\n");
5683 : 10014 : for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
5684 : : ++partition_i)
5685 : : {
5686 : 4401 : auto &partition = m_partitions[partition_i];
5687 : 4401 : dump_printf_loc (MSG_NOTE, vect_location, " -------------\n");
5688 : 4401 : dump_printf_loc (MSG_NOTE, vect_location,
5689 : : " partition %d (layout %d):\n",
5690 : : partition_i, partition.layout);
5691 : 4401 : dump_printf_loc (MSG_NOTE, vect_location, " nodes:\n");
5692 : 9006 : for (unsigned int order_i = partition.node_begin;
5693 : 9006 : order_i < partition.node_end; ++order_i)
5694 : : {
5695 : 4605 : auto &vertex = m_vertices[m_partitioned_nodes[order_i]];
5696 : 9210 : dump_printf_loc (MSG_NOTE, vect_location, " - %p:\n",
5697 : 4605 : (void *) vertex.node);
5698 : 4605 : dump_printf_loc (MSG_NOTE, vect_location,
5699 : : " weight: %f\n",
5700 : : vertex.weight.to_double ());
5701 : 4605 : if (vertex.out_degree)
5702 : 3595 : dump_printf_loc (MSG_NOTE, vect_location,
5703 : : " out weight: %f (degree %d)\n",
5704 : : vertex.out_weight.to_double (),
5705 : : vertex.out_degree);
5706 : 4605 : if (SLP_TREE_CODE (vertex.node) == VEC_PERM_EXPR)
5707 : 428 : dump_printf_loc (MSG_NOTE, vect_location,
5708 : : " op: VEC_PERM_EXPR\n");
5709 : 4177 : else if (auto rep = SLP_TREE_REPRESENTATIVE (vertex.node))
5710 : 4159 : dump_printf_loc (MSG_NOTE, vect_location,
5711 : : " op template: %G", rep->stmt);
5712 : : }
5713 : 4401 : dump_printf_loc (MSG_NOTE, vect_location, " edges:\n");
5714 : 9006 : for (unsigned int order_i = partition.node_begin;
5715 : 9006 : order_i < partition.node_end; ++order_i)
5716 : : {
5717 : 4605 : unsigned int node_i = m_partitioned_nodes[order_i];
5718 : 4605 : auto &vertex = m_vertices[node_i];
5719 : 13829 : auto print_edge = [&](graph_edge *, unsigned int other_node_i)
5720 : : {
5721 : 9224 : auto &other_vertex = m_vertices[other_node_i];
5722 : 9224 : if (other_vertex.partition < vertex.partition)
5723 : 4612 : dump_printf_loc (MSG_NOTE, vect_location,
5724 : : " - %p [%d] --> %p\n",
5725 : 4612 : (void *) other_vertex.node,
5726 : : other_vertex.partition,
5727 : 4612 : (void *) vertex.node);
5728 : : else
5729 : 4612 : dump_printf_loc (MSG_NOTE, vect_location,
5730 : : " - %p --> [%d] %p\n",
5731 : 4612 : (void *) vertex.node,
5732 : : other_vertex.partition,
5733 : 4612 : (void *) other_vertex.node);
5734 : 13829 : };
5735 : 4605 : for_each_partition_edge (node_i, print_edge);
5736 : : }
5737 : :
5738 : 26942 : for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
5739 : : {
5740 : 9070 : auto &layout_costs = partition_layout_costs (partition_i, layout_i);
5741 : 9070 : if (layout_costs.is_possible ())
5742 : : {
5743 : 7246 : dump_printf_loc (MSG_NOTE, vect_location,
5744 : : " layout %d:%s\n", layout_i,
5745 : 7246 : partition.layout == int (layout_i)
5746 : : ? " (*)" : "");
5747 : 7246 : slpg_layout_cost combined_cost = layout_costs.in_cost;
5748 : 7246 : combined_cost.add_serial_cost (layout_costs.internal_cost);
5749 : 7246 : combined_cost.add_serial_cost (layout_costs.out_cost);
5750 : : #define TEMPLATE "{depth: %f, total: %f}"
5751 : 7246 : dump_printf_loc (MSG_NOTE, vect_location,
5752 : : " " TEMPLATE "\n",
5753 : : layout_costs.in_cost.depth.to_double (),
5754 : : layout_costs.in_cost.total.to_double ());
5755 : 7246 : dump_printf_loc (MSG_NOTE, vect_location,
5756 : : " + " TEMPLATE "\n",
5757 : : layout_costs.internal_cost.depth.to_double (),
5758 : : layout_costs.internal_cost.total.to_double ());
5759 : 7246 : dump_printf_loc (MSG_NOTE, vect_location,
5760 : : " + " TEMPLATE "\n",
5761 : : layout_costs.out_cost.depth.to_double (),
5762 : : layout_costs.out_cost.total.to_double ());
5763 : 7246 : dump_printf_loc (MSG_NOTE, vect_location,
5764 : : " = " TEMPLATE "\n",
5765 : : combined_cost.depth.to_double (),
5766 : : combined_cost.total.to_double ());
5767 : : #undef TEMPLATE
5768 : : }
5769 : : else
5770 : 1824 : dump_printf_loc (MSG_NOTE, vect_location,
5771 : : " layout %d: rejected\n", layout_i);
5772 : : }
5773 : : }
5774 : 606 : }
5775 : :
5776 : : /* Main entry point for the SLP graph optimization pass. */
5777 : :
5778 : : void
5779 : 302376 : vect_optimize_slp_pass::run ()
5780 : : {
5781 : 302376 : build_graph ();
5782 : 302376 : create_partitions ();
5783 : 302376 : start_choosing_layouts ();
5784 : 302376 : if (m_perms.length () > 1)
5785 : : {
5786 : 9255 : forward_pass ();
5787 : 9255 : backward_pass ();
5788 : 9255 : if (dump_enabled_p ())
5789 : 606 : dump ();
5790 : 9255 : materialize ();
5791 : 37542 : while (!m_perms.is_empty ())
5792 : 19032 : m_perms.pop ().release ();
5793 : : }
5794 : : else
5795 : 293121 : remove_redundant_permutations ();
5796 : 302376 : free_graph (m_slpg);
5797 : 302376 : }
5798 : :
5799 : : /* Optimize the SLP graph of VINFO. */
5800 : :
5801 : : void
5802 : 586931 : vect_optimize_slp (vec_info *vinfo)
5803 : : {
5804 : 586931 : if (vinfo->slp_instances.is_empty ())
5805 : : return;
5806 : 302376 : vect_optimize_slp_pass (vinfo).run ();
5807 : : }
5808 : :
5809 : : /* Gather loads reachable from the individual SLP graph entries. */
5810 : :
5811 : : void
5812 : 586931 : vect_gather_slp_loads (vec_info *vinfo)
5813 : : {
5814 : 586931 : unsigned i;
5815 : 586931 : slp_instance instance;
5816 : 1357815 : FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
5817 : : {
5818 : 770884 : hash_set<slp_tree> visited;
5819 : 770884 : vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
5820 : : SLP_INSTANCE_TREE (instance), visited);
5821 : 770884 : }
5822 : 586931 : }
5823 : :
5824 : :
5825 : : /* For each possible SLP instance decide whether to SLP it and calculate overall
5826 : : unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
5827 : : least one instance. */
5828 : :
5829 : : bool
5830 : 170651 : vect_make_slp_decision (loop_vec_info loop_vinfo)
5831 : : {
5832 : 170651 : unsigned int i;
5833 : 170651 : poly_uint64 unrolling_factor = 1;
5834 : 170651 : const vec<slp_instance> &slp_instances
5835 : : = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
5836 : 170651 : slp_instance instance;
5837 : 170651 : int decided_to_slp = 0;
5838 : :
5839 : 170651 : DUMP_VECT_SCOPE ("vect_make_slp_decision");
5840 : :
5841 : 182253 : FOR_EACH_VEC_ELT (slp_instances, i, instance)
5842 : : {
5843 : : /* FORNOW: SLP if you can. */
5844 : : /* All unroll factors have the form:
5845 : :
5846 : : GET_MODE_SIZE (vinfo->vector_mode) * X
5847 : :
5848 : : for some rational X, so they must have a common multiple. */
5849 : 11602 : unrolling_factor
5850 : 11602 : = force_common_multiple (unrolling_factor,
5851 : 11602 : SLP_INSTANCE_UNROLLING_FACTOR (instance));
5852 : :
5853 : : /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
5854 : : call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
5855 : : loop-based vectorization. Such stmts will be marked as HYBRID. */
5856 : 11602 : vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
5857 : 11602 : decided_to_slp++;
5858 : : }
5859 : :
5860 : 170651 : LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
5861 : :
5862 : 170651 : if (decided_to_slp && dump_enabled_p ())
5863 : : {
5864 : 2062 : dump_printf_loc (MSG_NOTE, vect_location,
5865 : : "Decided to SLP %d instances. Unrolling factor ",
5866 : : decided_to_slp);
5867 : 2062 : dump_dec (MSG_NOTE, unrolling_factor);
5868 : 2062 : dump_printf (MSG_NOTE, "\n");
5869 : : }
5870 : :
5871 : 170651 : return (decided_to_slp > 0);
5872 : : }
5873 : :
5874 : : /* Private data for vect_detect_hybrid_slp. */
5875 : : struct vdhs_data
5876 : : {
5877 : : loop_vec_info loop_vinfo;
5878 : : vec<stmt_vec_info> *worklist;
5879 : : };
5880 : :
5881 : : /* Walker for walk_gimple_op. */
5882 : :
5883 : : static tree
5884 : 62649 : vect_detect_hybrid_slp (tree *tp, int *, void *data)
5885 : : {
5886 : 62649 : walk_stmt_info *wi = (walk_stmt_info *)data;
5887 : 62649 : vdhs_data *dat = (vdhs_data *)wi->info;
5888 : :
5889 : 62649 : if (wi->is_lhs)
5890 : : return NULL_TREE;
5891 : :
5892 : 14781 : stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
5893 : 14781 : if (!def_stmt_info)
5894 : : return NULL_TREE;
5895 : 4941 : def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
5896 : 4941 : if (PURE_SLP_STMT (def_stmt_info))
5897 : : {
5898 : 1054 : if (dump_enabled_p ())
5899 : 698 : dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
5900 : : def_stmt_info->stmt);
5901 : 1054 : STMT_SLP_TYPE (def_stmt_info) = hybrid;
5902 : 1054 : dat->worklist->safe_push (def_stmt_info);
5903 : : }
5904 : :
5905 : : return NULL_TREE;
5906 : : }
5907 : :
5908 : : /* Look if STMT_INFO is consumed by SLP indirectly and mark it pure_slp
5909 : : if so, otherwise pushing it to WORKLIST. */
5910 : :
5911 : : static void
5912 : 8972 : maybe_push_to_hybrid_worklist (vec_info *vinfo,
5913 : : vec<stmt_vec_info> &worklist,
5914 : : stmt_vec_info stmt_info)
5915 : : {
5916 : 8972 : if (dump_enabled_p ())
5917 : 1817 : dump_printf_loc (MSG_NOTE, vect_location,
5918 : : "Processing hybrid candidate : %G", stmt_info->stmt);
5919 : 8972 : stmt_vec_info orig_info = vect_orig_stmt (stmt_info);
5920 : 8972 : imm_use_iterator iter2;
5921 : 8972 : ssa_op_iter iter1;
5922 : 8972 : use_operand_p use_p;
5923 : 8972 : def_operand_p def_p;
5924 : 8972 : bool any_def = false;
5925 : 19565 : FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_info->stmt, iter1, SSA_OP_DEF)
5926 : : {
5927 : 3765 : any_def = true;
5928 : 5410 : FOR_EACH_IMM_USE_FAST (use_p, iter2, DEF_FROM_PTR (def_p))
5929 : : {
5930 : 3789 : if (is_gimple_debug (USE_STMT (use_p)))
5931 : 8 : continue;
5932 : 3781 : stmt_vec_info use_info = vinfo->lookup_stmt (USE_STMT (use_p));
5933 : : /* An out-of loop use means this is a loop_vect sink. */
5934 : 3781 : if (!use_info)
5935 : : {
5936 : 206 : if (dump_enabled_p ())
5937 : 43 : dump_printf_loc (MSG_NOTE, vect_location,
5938 : : "Found loop_vect sink: %G", stmt_info->stmt);
5939 : 206 : worklist.safe_push (stmt_info);
5940 : 7557 : return;
5941 : : }
5942 : 3664 : else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info)))
5943 : : {
5944 : 1938 : if (dump_enabled_p ())
5945 : 855 : dump_printf_loc (MSG_NOTE, vect_location,
5946 : : "Found loop_vect use: %G", use_info->stmt);
5947 : 1938 : worklist.safe_push (stmt_info);
5948 : 1938 : return;
5949 : : }
5950 : : }
5951 : : }
5952 : : /* No def means this is a loo_vect sink. */
5953 : 6828 : if (!any_def)
5954 : : {
5955 : 5207 : if (dump_enabled_p ())
5956 : 418 : dump_printf_loc (MSG_NOTE, vect_location,
5957 : : "Found loop_vect sink: %G", stmt_info->stmt);
5958 : 5207 : worklist.safe_push (stmt_info);
5959 : 5207 : return;
5960 : : }
5961 : 1621 : if (dump_enabled_p ())
5962 : 501 : dump_printf_loc (MSG_NOTE, vect_location,
5963 : : "Marked SLP consumed stmt pure: %G", stmt_info->stmt);
5964 : 1621 : STMT_SLP_TYPE (stmt_info) = pure_slp;
5965 : : }
5966 : :
5967 : : /* Find stmts that must be both vectorized and SLPed. */
5968 : :
5969 : : void
5970 : 9216 : vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
5971 : : {
5972 : 9216 : DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
5973 : :
5974 : : /* All stmts participating in SLP are marked pure_slp, all other
5975 : : stmts are loop_vect.
5976 : : First collect all loop_vect stmts into a worklist.
5977 : : SLP patterns cause not all original scalar stmts to appear in
5978 : : SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp.
5979 : : Rectify this here and do a backward walk over the IL only considering
5980 : : stmts as loop_vect when they are used by a loop_vect stmt and otherwise
5981 : : mark them as pure_slp. */
5982 : 9216 : auto_vec<stmt_vec_info> worklist;
5983 : 27813 : for (int i = LOOP_VINFO_LOOP (loop_vinfo)->num_nodes - 1; i >= 0; --i)
5984 : : {
5985 : 18597 : basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
5986 : 47180 : for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
5987 : 28583 : gsi_next (&gsi))
5988 : : {
5989 : 28583 : gphi *phi = gsi.phi ();
5990 : 28583 : stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
5991 : 28583 : if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
5992 : 203 : maybe_push_to_hybrid_worklist (loop_vinfo,
5993 : : worklist, stmt_info);
5994 : : }
5995 : 18597 : for (gimple_stmt_iterator gsi = gsi_last_bb (bb); !gsi_end_p (gsi);
5996 : 482413 : gsi_prev (&gsi))
5997 : : {
5998 : 231908 : gimple *stmt = gsi_stmt (gsi);
5999 : 231908 : if (is_gimple_debug (stmt))
6000 : 34728 : continue;
6001 : 197180 : stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
6002 : 197180 : if (STMT_VINFO_IN_PATTERN_P (stmt_info))
6003 : : {
6004 : 6661 : for (gimple_stmt_iterator gsi2
6005 : 6661 : = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
6006 : 15079 : !gsi_end_p (gsi2); gsi_next (&gsi2))
6007 : : {
6008 : 8418 : stmt_vec_info patt_info
6009 : 8418 : = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
6010 : 8418 : if (!STMT_SLP_TYPE (patt_info)
6011 : 2623 : && STMT_VINFO_RELEVANT (patt_info))
6012 : 67 : maybe_push_to_hybrid_worklist (loop_vinfo,
6013 : : worklist, patt_info);
6014 : : }
6015 : 6661 : stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6016 : : }
6017 : 197180 : if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
6018 : 8702 : maybe_push_to_hybrid_worklist (loop_vinfo,
6019 : : worklist, stmt_info);
6020 : : }
6021 : : }
6022 : :
6023 : : /* Now we have a worklist of non-SLP stmts, follow use->def chains and
6024 : : mark any SLP vectorized stmt as hybrid.
6025 : : ??? We're visiting def stmts N times (once for each non-SLP and
6026 : : once for each hybrid-SLP use). */
6027 : 9216 : walk_stmt_info wi;
6028 : 9216 : vdhs_data dat;
6029 : 9216 : dat.worklist = &worklist;
6030 : 9216 : dat.loop_vinfo = loop_vinfo;
6031 : 9216 : memset (&wi, 0, sizeof (wi));
6032 : 9216 : wi.info = (void *)&dat;
6033 : 18928 : while (!worklist.is_empty ())
6034 : : {
6035 : 8405 : stmt_vec_info stmt_info = worklist.pop ();
6036 : : /* Since SSA operands are not set up for pattern stmts we need
6037 : : to use walk_gimple_op. */
6038 : 8405 : wi.is_lhs = 0;
6039 : 8405 : walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
6040 : : /* For gather/scatter make sure to walk the offset operand, that
6041 : : can be a scaling and conversion away. */
6042 : 8405 : gather_scatter_info gs_info;
6043 : 8405 : if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
6044 : 8405 : && vect_check_gather_scatter (stmt_info, loop_vinfo, &gs_info))
6045 : : {
6046 : 12 : int dummy;
6047 : 12 : vect_detect_hybrid_slp (&gs_info.offset, &dummy, &wi);
6048 : : }
6049 : : }
6050 : 9216 : }
6051 : :
6052 : :
6053 : : /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
6054 : :
6055 : 2285374 : _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
6056 : : : vec_info (vec_info::bb, shared),
6057 : 2285374 : bbs (_bbs),
6058 : 2285374 : roots (vNULL)
6059 : : {
6060 : 31795084 : for (unsigned i = 0; i < bbs.length (); ++i)
6061 : : {
6062 : 13612168 : if (i != 0)
6063 : 17417795 : for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
6064 : 6091001 : gsi_next (&si))
6065 : : {
6066 : 6091001 : gphi *phi = si.phi ();
6067 : 6091001 : gimple_set_uid (phi, 0);
6068 : 6091001 : add_stmt (phi);
6069 : : }
6070 : 27224336 : for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
6071 : 106251004 : !gsi_end_p (gsi); gsi_next (&gsi))
6072 : : {
6073 : 92638836 : gimple *stmt = gsi_stmt (gsi);
6074 : 92638836 : gimple_set_uid (stmt, 0);
6075 : 92638836 : if (is_gimple_debug (stmt))
6076 : 53075948 : continue;
6077 : 39562888 : add_stmt (stmt);
6078 : : }
6079 : : }
6080 : 2285374 : }
6081 : :
6082 : :
6083 : : /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
6084 : : stmts in the basic block. */
6085 : :
6086 : 2285374 : _bb_vec_info::~_bb_vec_info ()
6087 : : {
6088 : : /* Reset region marker. */
6089 : 31795084 : for (unsigned i = 0; i < bbs.length (); ++i)
6090 : : {
6091 : 13612168 : if (i != 0)
6092 : 17439356 : for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
6093 : 6112562 : gsi_next (&si))
6094 : : {
6095 : 6112562 : gphi *phi = si.phi ();
6096 : 6112562 : gimple_set_uid (phi, -1);
6097 : : }
6098 : 27224336 : for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
6099 : 106285801 : !gsi_end_p (gsi); gsi_next (&gsi))
6100 : : {
6101 : 92673633 : gimple *stmt = gsi_stmt (gsi);
6102 : 92673633 : gimple_set_uid (stmt, -1);
6103 : : }
6104 : : }
6105 : :
6106 : 4679557 : for (unsigned i = 0; i < roots.length (); ++i)
6107 : : {
6108 : 1041926 : roots[i].stmts.release ();
6109 : 1041926 : roots[i].roots.release ();
6110 : 1041926 : roots[i].remain.release ();
6111 : : }
6112 : 2285374 : roots.release ();
6113 : 2285374 : }
6114 : :
6115 : : /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
6116 : : given then that child nodes have already been processed, and that
6117 : : their def types currently match their SLP node's def type. */
6118 : :
6119 : : static bool
6120 : 1207115 : vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
6121 : : slp_instance node_instance,
6122 : : stmt_vector_for_cost *cost_vec)
6123 : : {
6124 : 1207115 : stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
6125 : :
6126 : : /* Calculate the number of vector statements to be created for the
6127 : : scalar stmts in this node. For SLP reductions it is equal to the
6128 : : number of vector statements in the children (which has already been
6129 : : calculated by the recursive call). Otherwise it is the number of
6130 : : scalar elements in one scalar iteration (DR_GROUP_SIZE) multiplied by
6131 : : VF divided by the number of elements in a vector. */
6132 : 1207115 : if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
6133 : 1178246 : && !STMT_VINFO_DATA_REF (stmt_info)
6134 : 1595824 : && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6135 : : {
6136 : 670 : for (unsigned i = 0; i < SLP_TREE_CHILDREN (node).length (); ++i)
6137 : 335 : if (SLP_TREE_DEF_TYPE (SLP_TREE_CHILDREN (node)[i]) == vect_internal_def)
6138 : : {
6139 : 312 : SLP_TREE_NUMBER_OF_VEC_STMTS (node)
6140 : 312 : = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[i]);
6141 : 312 : break;
6142 : : }
6143 : : }
6144 : : else
6145 : : {
6146 : 1206803 : poly_uint64 vf;
6147 : 1206803 : if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
6148 : 34100 : vf = loop_vinfo->vectorization_factor;
6149 : : else
6150 : : vf = 1;
6151 : 1206803 : unsigned int group_size = SLP_TREE_LANES (node);
6152 : 1206803 : tree vectype = SLP_TREE_VECTYPE (node);
6153 : 1206803 : SLP_TREE_NUMBER_OF_VEC_STMTS (node)
6154 : 1206803 : = vect_get_num_vectors (vf * group_size, vectype);
6155 : : }
6156 : :
6157 : : /* Handle purely internal nodes. */
6158 : 1207115 : if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
6159 : : {
6160 : 28869 : if (!vectorizable_slp_permutation (vinfo, NULL, node, cost_vec))
6161 : : return false;
6162 : :
6163 : : stmt_vec_info slp_stmt_info;
6164 : : unsigned int i;
6165 : 86027 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
6166 : : {
6167 : 58152 : if (STMT_VINFO_LIVE_P (slp_stmt_info)
6168 : 58152 : && !vectorizable_live_operation (vinfo, slp_stmt_info, node,
6169 : : node_instance, i,
6170 : : false, cost_vec))
6171 : : return false;
6172 : : }
6173 : : return true;
6174 : : }
6175 : :
6176 : 1178246 : bool dummy;
6177 : 1178246 : return vect_analyze_stmt (vinfo, stmt_info, &dummy,
6178 : : node, node_instance, cost_vec);
6179 : : }
6180 : :
6181 : : /* Try to build NODE from scalars, returning true on success.
6182 : : NODE_INSTANCE is the SLP instance that contains NODE. */
6183 : :
6184 : : static bool
6185 : 86678 : vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
6186 : : slp_instance node_instance)
6187 : : {
6188 : 86678 : stmt_vec_info stmt_info;
6189 : 86678 : unsigned int i;
6190 : :
6191 : 86678 : if (!is_a <bb_vec_info> (vinfo)
6192 : 84150 : || node == SLP_INSTANCE_TREE (node_instance)
6193 : 26096 : || !SLP_TREE_SCALAR_STMTS (node).exists ()
6194 : 26028 : || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node))
6195 : : /* Force the mask use to be built from scalars instead. */
6196 : 110465 : || VECTOR_BOOLEAN_TYPE_P (SLP_TREE_VECTYPE (node)))
6197 : : return false;
6198 : :
6199 : 23602 : if (dump_enabled_p ())
6200 : 154 : dump_printf_loc (MSG_NOTE, vect_location,
6201 : : "Building vector operands of %p from scalars instead\n",
6202 : : (void *) node);
6203 : :
6204 : : /* Don't remove and free the child nodes here, since they could be
6205 : : referenced by other structures. The analysis and scheduling phases
6206 : : (need to) ignore child nodes of anything that isn't vect_internal_def. */
6207 : 23602 : unsigned int group_size = SLP_TREE_LANES (node);
6208 : 23602 : SLP_TREE_DEF_TYPE (node) = vect_external_def;
6209 : : /* Invariants get their vector type from the uses. */
6210 : 23602 : SLP_TREE_VECTYPE (node) = NULL_TREE;
6211 : 23602 : SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
6212 : 23602 : SLP_TREE_LOAD_PERMUTATION (node).release ();
6213 : 80760 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6214 : : {
6215 : 57158 : tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
6216 : 57158 : SLP_TREE_SCALAR_OPS (node)[i] = lhs;
6217 : : }
6218 : : return true;
6219 : : }
6220 : :
6221 : : /* Return true if all elements of the slice are the same. */
6222 : : bool
6223 : 417890 : vect_scalar_ops_slice::all_same_p () const
6224 : : {
6225 : 456924 : for (unsigned int i = 1; i < length; ++i)
6226 : 358840 : if (!operand_equal_p (op (0), op (i)))
6227 : : return false;
6228 : : return true;
6229 : : }
6230 : :
6231 : : hashval_t
6232 : 594594 : vect_scalar_ops_slice_hash::hash (const value_type &s)
6233 : : {
6234 : 594594 : hashval_t hash = 0;
6235 : 1861334 : for (unsigned i = 0; i < s.length; ++i)
6236 : 1266740 : hash = iterative_hash_expr (s.op (i), hash);
6237 : 594594 : return hash;
6238 : : }
6239 : :
6240 : : bool
6241 : 363240 : vect_scalar_ops_slice_hash::equal (const value_type &s1,
6242 : : const compare_type &s2)
6243 : : {
6244 : 363240 : if (s1.length != s2.length)
6245 : : return false;
6246 : 529097 : for (unsigned i = 0; i < s1.length; ++i)
6247 : 458528 : if (!operand_equal_p (s1.op (i), s2.op (i)))
6248 : : return false;
6249 : : return true;
6250 : : }
6251 : :
6252 : : /* Compute the prologue cost for invariant or constant operands represented
6253 : : by NODE. */
6254 : :
6255 : : static void
6256 : 738433 : vect_prologue_cost_for_slp (slp_tree node,
6257 : : stmt_vector_for_cost *cost_vec)
6258 : : {
6259 : : /* There's a special case of an existing vector, that costs nothing. */
6260 : 738433 : if (SLP_TREE_SCALAR_OPS (node).length () == 0
6261 : 738433 : && !SLP_TREE_VEC_DEFS (node).is_empty ())
6262 : 1854 : return;
6263 : : /* Without looking at the actual initializer a vector of
6264 : : constants can be implemented as load from the constant pool.
6265 : : When all elements are the same we can use a splat. */
6266 : 736579 : tree vectype = SLP_TREE_VECTYPE (node);
6267 : 736579 : unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
6268 : 736579 : unsigned HOST_WIDE_INT const_nunits;
6269 : 736579 : unsigned nelt_limit;
6270 : 736579 : auto ops = &SLP_TREE_SCALAR_OPS (node);
6271 : 736579 : auto_vec<unsigned int> starts (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
6272 : 736579 : if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
6273 : 736579 : && ! multiple_p (const_nunits, group_size))
6274 : : {
6275 : 84376 : nelt_limit = const_nunits;
6276 : 84376 : hash_set<vect_scalar_ops_slice_hash> vector_ops;
6277 : 348086 : for (unsigned int i = 0; i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); ++i)
6278 : 263710 : if (!vector_ops.add ({ ops, i * const_nunits, const_nunits }))
6279 : 193141 : starts.quick_push (i * const_nunits);
6280 : 84376 : }
6281 : : else
6282 : : {
6283 : : /* If either the vector has variable length or the vectors
6284 : : are composed of repeated whole groups we only need to
6285 : : cost construction once. All vectors will be the same. */
6286 : 652203 : nelt_limit = group_size;
6287 : 652203 : starts.quick_push (0);
6288 : : }
6289 : : /* ??? We're just tracking whether vectors in a single node are the same.
6290 : : Ideally we'd do something more global. */
6291 : 736579 : bool passed = false;
6292 : 3055081 : for (unsigned int start : starts)
6293 : : {
6294 : 845344 : vect_cost_for_stmt kind;
6295 : 845344 : if (SLP_TREE_DEF_TYPE (node) == vect_constant_def)
6296 : : kind = vector_load;
6297 : 417890 : else if (vect_scalar_ops_slice { ops, start, nelt_limit }.all_same_p ())
6298 : : kind = scalar_to_vec;
6299 : : else
6300 : 319806 : kind = vec_construct;
6301 : : /* The target cost hook has no idea which part of the SLP node
6302 : : we are costing so avoid passing it down more than once. Pass
6303 : : it to the first vec_construct or scalar_to_vec part since for those
6304 : : the x86 backend tries to account for GPR to XMM register moves. */
6305 : 845344 : record_stmt_cost (cost_vec, 1, kind,
6306 : 845344 : (kind != vector_load && !passed) ? node : nullptr,
6307 : : vectype, 0, vect_prologue);
6308 : 845344 : if (kind != vector_load)
6309 : 417890 : passed = true;
6310 : : }
6311 : 736579 : }
6312 : :
6313 : : /* Analyze statements contained in SLP tree NODE after recursively analyzing
6314 : : the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
6315 : :
6316 : : Return true if the operations are supported. */
6317 : :
6318 : : static bool
6319 : 2189786 : vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
6320 : : slp_instance node_instance,
6321 : : hash_set<slp_tree> &visited_set,
6322 : : vec<slp_tree> &visited_vec,
6323 : : stmt_vector_for_cost *cost_vec)
6324 : : {
6325 : 2189786 : int i, j;
6326 : 2189786 : slp_tree child;
6327 : :
6328 : : /* Assume we can code-generate all invariants. */
6329 : 2189786 : if (!node
6330 : 2187492 : || SLP_TREE_DEF_TYPE (node) == vect_constant_def
6331 : 1741981 : || SLP_TREE_DEF_TYPE (node) == vect_external_def)
6332 : : return true;
6333 : :
     : : /* An uninitialized def-type indicates a cyclic SLP reference that
     : : was never resolved; such a node cannot be analyzed. */
6334 : 1310347 : if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
6335 : : {
6336 : 8 : if (dump_enabled_p ())
6337 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
6338 : : "Failed cyclic SLP reference in %p\n", (void *) node);
6339 : 8 : return false;
6340 : : }
6341 : 1310339 : gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
6342 : :
6343 : : /* If we already analyzed the exact same set of scalar stmts we're done.
6344 : : We share the generated vector stmts for those. */
6345 : 1310339 : if (visited_set.add (node))
6346 : : return true;
6347 : 1211300 : visited_vec.safe_push (node);
6348 : :
     : : /* Remember watermarks so we can roll back the visited bookkeeping
     : : and any recorded costs if analysis of this subtree fails. */
6349 : 1211300 : bool res = true;
6350 : 1211300 : unsigned visited_rec_start = visited_vec.length ();
6351 : 1211300 : unsigned cost_vec_rec_start = cost_vec->length ();
6352 : 1211300 : bool seen_non_constant_child = false;
6353 : 2635087 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6354 : : {
6355 : 1427972 : res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
6356 : : visited_set, visited_vec,
6357 : : cost_vec);
6358 : 1427972 : if (!res)
6359 : : break;
6360 : 1423787 : if (child && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
6361 : 1423787 : seen_non_constant_child = true;
6362 : : }
6363 : : /* We're having difficulties scheduling nodes with just constant
6364 : : operands and no scalar stmts since we then cannot compute a stmt
6365 : : insertion place. */
6366 : 1211300 : if (!seen_non_constant_child && SLP_TREE_SCALAR_STMTS (node).is_empty ())
6367 : : {
6368 : 38 : if (dump_enabled_p ())
6369 : 35 : dump_printf_loc (MSG_NOTE, vect_location,
6370 : : "Cannot vectorize all-constant op node %p\n",
6371 : : (void *) node);
6372 : : res = false;
6373 : : }
6374 : :
6375 : 1211262 : if (res)
6376 : 1207115 : res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
6377 : : cost_vec);
6378 : : /* If analysis failed we have to pop all recursive visited nodes
6379 : : plus ourselves. */
6380 : 1211300 : if (!res)
6381 : : {
6382 : 482448 : while (visited_vec.length () >= visited_rec_start)
6383 : 154546 : visited_set.remove (visited_vec.pop ());
6384 : 86678 : cost_vec->truncate (cost_vec_rec_start);
6385 : : }
6386 : :
6387 : : /* When the node can be vectorized cost invariant nodes it references.
6388 : : This is not done in DFS order to allow the referring node
6389 : : vectorizable_* calls to nail down the invariant nodes vector type
6390 : : and possibly unshare it if it needs a different vector type than
6391 : : other referrers. */
6392 : 1211300 : if (res)
6393 : 2411273 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
6394 : 1286651 : if (child
6395 : 1284497 : && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
6396 : 1284497 : || SLP_TREE_DEF_TYPE (child) == vect_external_def)
6397 : : /* Perform usual caching, note code-generation still
6398 : : code-gens these nodes multiple times but we expect
6399 : : to CSE them later. */
6400 : 2071390 : && !visited_set.add (child))
6401 : : {
6402 : 760406 : visited_vec.safe_push (child);
6403 : : /* ??? After auditing more code paths make a "default"
6404 : : and push the vector type from NODE to all children
6405 : : if it is not already set. */
6406 : : /* Compute the number of vectors to be generated. */
6407 : 760406 : tree vector_type = SLP_TREE_VECTYPE (child);
6408 : 760406 : if (!vector_type)
6409 : : {
6410 : : /* For shifts with a scalar argument we don't need
6411 : : to cost or code-generate anything.
6412 : : ??? Represent this more explicitly. */
6413 : 21973 : gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
6414 : : == shift_vec_info_type)
6415 : : && j == 1);
6416 : 21973 : continue;
6417 : : }
6418 : 738433 : unsigned group_size = SLP_TREE_LANES (child);
6419 : 738433 : poly_uint64 vf = 1;
6420 : 738433 : if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
6421 : 6734 : vf = loop_vinfo->vectorization_factor;
6422 : 738433 : SLP_TREE_NUMBER_OF_VEC_STMTS (child)
6423 : 738433 : = vect_get_num_vectors (vf * group_size, vector_type);
6424 : : /* And cost them. */
6425 : 738433 : vect_prologue_cost_for_slp (child, cost_vec);
6426 : : }
6427 : :
6428 : : /* If this node or any of its children can't be vectorized, try pruning
6429 : : the tree here rather than felling the whole thing. */
6430 : 86678 : if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
6431 : : {
6432 : : /* We'll need to revisit this for invariant costing and number
6433 : : of vectorized stmt setting. */
6434 : : res = true;
6435 : : }
6436 : :
6437 : : return res;
6438 : : }
6439 : :
6440 : : /* Given a definition DEF, analyze if it will have any live scalar use after
6441 : : performing SLP vectorization whose information is represented by BB_VINFO,
6442 : : and record result into hash map SCALAR_USE_MAP as cache for later fast
6443 : : check. If recursion DEPTH exceeds a limit, stop analysis and make a
6444 : : conservative assumption. Return 0 if no scalar use, 1 if there is, -1
6445 : : means recursion is limited. */
6446 : :
6447 : : static int
6448 : 567378 : vec_slp_has_scalar_use (bb_vec_info bb_vinfo, tree def,
6449 : : hash_map<tree, int> &scalar_use_map,
6450 : : int depth = 0)
6451 : : {
6452 : 567378 : const int depth_limit = 2;
6453 : 567378 : imm_use_iterator use_iter;
6454 : 567378 : gimple *use_stmt;
6455 : :
     : : /* Fast path: result already cached (or pre-seeded by the caller). */
6456 : 567378 : if (int *res = scalar_use_map.get (def))
6457 : 21325 : return *res;
6458 : :
     : : /* Conservatively assume a scalar use until proven otherwise. */
6459 : 546053 : int scalar_use = 1;
6460 : :
6461 : 1280929 : FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
6462 : : {
6463 : 836794 : if (is_gimple_debug (use_stmt))
6464 : 214078 : continue;
6465 : :
6466 : 622716 : stmt_vec_info use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
6467 : :
6468 : 622716 : if (!use_stmt_info)
6469 : : break;
6470 : :
6471 : 623574 : if (PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
6472 : 518464 : continue;
6473 : :
6474 : : /* Do not step forward when encountering a PHI statement, since it may
6475 : : involve cyclic reference and cause infinite recursive invocation. */
6476 : 97967 : if (gimple_code (use_stmt) == GIMPLE_PHI)
6477 : : break;
6478 : :
6479 : : /* When pattern recognition is involved, a statement whose definition is
6480 : : consumed in some pattern, may not be included in the final replacement
6481 : : pattern statements, so would be skipped when building SLP graph.
6482 : :
6483 : : * Original
6484 : : char a_c = *(char *) a;
6485 : : char b_c = *(char *) b;
6486 : : unsigned short a_s = (unsigned short) a_c;
6487 : : int a_i = (int) a_s;
6488 : : int b_i = (int) b_c;
6489 : : int r_i = a_i - b_i;
6490 : :
6491 : : * After pattern replacement
6492 : : a_s = (unsigned short) a_c;
6493 : : a_i = (int) a_s;
6494 : :
6495 : : patt_b_s = (unsigned short) b_c; // b_i = (int) b_c
6496 : : patt_b_i = (int) patt_b_s; // b_i = (int) b_c
6497 : :
6498 : : patt_r_s = widen_minus(a_c, b_c); // r_i = a_i - b_i
6499 : : patt_r_i = (int) patt_r_s; // r_i = a_i - b_i
6500 : :
6501 : : The definitions of a_i(original statement) and b_i(pattern statement)
6502 : : are related to, but actually not part of widen_minus pattern.
6503 : : Vectorizing the pattern does not cause these definition statements to
6504 : : be marked as PURE_SLP. For this case, we need to recursively check
6505 : : whether their uses are all absorbed into vectorized code. But there
6506 : : is an exception that some use may participate in a vectorized
6507 : : operation via an external SLP node containing that use as an element.
6508 : : The parameter "scalar_use_map" tags such kind of SSA as having scalar
6509 : : use in advance. */
6510 : 83604 : tree lhs = gimple_get_lhs (use_stmt);
6511 : :
6512 : 83604 : if (!lhs || TREE_CODE (lhs) != SSA_NAME)
6513 : : break;
6514 : :
     : : /* Recursion budget exhausted: give up without caching (-1). */
6515 : 51585 : if (depth_limit && depth >= depth_limit)
6516 : 9879 : return -1;
6517 : :
6518 : 41706 : if ((scalar_use = vec_slp_has_scalar_use (bb_vinfo, lhs, scalar_use_map,
6519 : : depth + 1)))
6520 : : break;
6521 : 546053 : }
6522 : :
     : : /* Only if we scanned every immediate use without breaking out can
     : : we conclude there is no remaining scalar use. */
6523 : 536174 : if (end_imm_use_stmt_p (&use_iter))
6524 : 444135 : scalar_use = 0;
6525 : :
6526 : : /* If recursion is limited, do not cache result for non-root defs. */
6527 : 536174 : if (!depth || scalar_use >= 0)
6528 : : {
6529 : 526295 : bool added = scalar_use_map.put (def, scalar_use);
6530 : 526295 : gcc_assert (!added);
6531 : : }
6532 : :
6533 : 536174 : return scalar_use;
6534 : : }
6535 : :
6536 : : /* Mark lanes of NODE that are live outside of the basic-block vectorized
6537 : : region and that can be vectorized using vectorizable_live_operation
6538 : : with STMT_VINFO_LIVE_P. Not handled live operations will cause the
6539 : : scalar code computing it to be retained. */
6540 : :
6541 : : static void
6542 : 877360 : vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
6543 : : slp_instance instance,
6544 : : stmt_vector_for_cost *cost_vec,
6545 : : hash_map<tree, int> &scalar_use_map,
6546 : : hash_set<stmt_vec_info> &svisited,
6547 : : hash_set<slp_tree> &visited)
6548 : : {
6549 : 877360 : if (visited.add (node))
6550 : 29486 : return;
6551 : :
6552 : 847874 : unsigned i;
6553 : 847874 : stmt_vec_info stmt_info;
     : : /* Used below as the conservative insertion point for lane extracts. */
6554 : 847874 : stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
6555 : 3009263 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6556 : : {
6557 : 2161389 : if (svisited.contains (stmt_info))
6558 : 27628 : continue;
6559 : 2141842 : stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6560 : 2141842 : if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
6561 : 10609 : && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
6562 : : /* Only the pattern root stmt computes the original scalar value. */
6563 : 8081 : continue;
6564 : 2133761 : bool mark_visited = true;
6565 : 2133761 : gimple *orig_stmt = orig_stmt_info->stmt;
6566 : 2133761 : ssa_op_iter op_iter;
6567 : 2133761 : def_operand_p def_p;
6568 : 4793194 : FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
6569 : : {
6570 : 525672 : if (vec_slp_has_scalar_use (bb_vinfo, DEF_FROM_PTR (def_p),
6571 : : scalar_use_map))
6572 : : {
6573 : 83361 : STMT_VINFO_LIVE_P (stmt_info) = true;
6574 : 83361 : if (vectorizable_live_operation (bb_vinfo, stmt_info, node,
6575 : : instance, i, false, cost_vec))
6576 : : /* ??? So we know we can vectorize the live stmt from one SLP
6577 : : node. If we cannot do so from all or none consistently
6578 : : we'd have to record which SLP node (and lane) we want to
6579 : : use for the live operation. So make sure we can
6580 : : code-generate from all nodes. */
6581 : : mark_visited = false;
6582 : : else
6583 : 0 : STMT_VINFO_LIVE_P (stmt_info) = false;
6584 : : }
6585 : :
6586 : : /* We have to verify whether we can insert the lane extract
6587 : : before all uses. The following is a conservative approximation.
6588 : : We cannot put this into vectorizable_live_operation because
6589 : : iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
6590 : : doesn't work.
6591 : : Note that while the fact that we emit code for loads at the
6592 : : first load should make this a non-problem, leaves we construct
6593 : : from scalars are vectorized after the last scalar def.
6594 : : ??? If we'd actually compute the insert location during
6595 : : analysis we could use sth less conservative than the last
6596 : : scalar stmt in the node for the dominance check. */
6597 : : /* ??? What remains is "live" uses in vector CTORs in the same
6598 : : SLP graph which is where those uses can end up code-generated
6599 : : right after their definition instead of close to their original
6600 : : use. But that would restrict us to code-generate lane-extracts
6601 : : from the latest stmt in a node. So we compensate for this
6602 : : during code-generation, simply not replacing uses for those
6603 : : hopefully rare cases. */
6604 : 525672 : imm_use_iterator use_iter;
6605 : 525672 : gimple *use_stmt;
6606 : 525672 : stmt_vec_info use_stmt_info;
6607 : :
6608 : 525672 : if (STMT_VINFO_LIVE_P (stmt_info))
6609 : 466642 : FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
6610 : 383281 : if (!is_gimple_debug (use_stmt)
6611 : 294543 : && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
6612 : 283583 : || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
6613 : 548257 : && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
6614 : : {
6615 : 12872 : if (dump_enabled_p ())
6616 : 192 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6617 : : "Cannot determine insertion place for "
6618 : : "lane extract\n");
6619 : 12872 : STMT_VINFO_LIVE_P (stmt_info) = false;
6620 : 12872 : mark_visited = true;
6621 : 83361 : }
6622 : : }
6623 : 2133761 : if (mark_visited)
6624 : 2061375 : svisited.add (stmt_info);
6625 : : }
6626 : :
     : : /* Recurse into internal-def children only; invariant children have
     : : no scalar stmts to mark. */
6627 : : slp_tree child;
6628 : 2399584 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6629 : 826668 : if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6630 : 236583 : vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance, cost_vec,
6631 : : scalar_use_map, svisited, visited);
6632 : : }
6633 : :
6634 : : /* Traverse all slp instances of BB_VINFO, and mark lanes of every node that
6635 : : are live outside of the basic-block vectorized region and that can be
6636 : : vectorized using vectorizable_live_operation with STMT_VINFO_LIVE_P. */
6637 : :
6638 : : static void
6639 : 291219 : vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo)
6640 : : {
6641 : 291219 : if (bb_vinfo->slp_instances.is_empty ())
6642 : 37081 : return;
6643 : :
6644 : 254138 : hash_set<stmt_vec_info> svisited;
6645 : 254138 : hash_set<slp_tree> visited;
6646 : 254138 : hash_map<tree, int> scalar_use_map;
6647 : 254138 : auto_vec<slp_tree> worklist;
6648 : :
     : : /* Pre-seed SCALAR_USE_MAP: remaining defs of BB reduction instances
     : : and, via the worklist below, SSA operands of external SLP nodes
     : : count as having a scalar use. */
6649 : 1403191 : for (slp_instance instance : bb_vinfo->slp_instances)
6650 : : {
6651 : 640777 : if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc)
6652 : 49868 : for (tree op : SLP_INSTANCE_REMAIN_DEFS (instance))
6653 : 15025 : if (TREE_CODE (op) == SSA_NAME)
6654 : 13392 : scalar_use_map.put (op, 1);
6655 : 640777 : if (!visited.add (SLP_INSTANCE_TREE (instance)))
6656 : 639364 : worklist.safe_push (SLP_INSTANCE_TREE (instance));
6657 : : }
6658 : :
6659 : 1436955 : do
6660 : : {
6661 : 1436955 : slp_tree node = worklist.pop ();
6662 : :
6663 : 1436955 : if (SLP_TREE_DEF_TYPE (node) == vect_external_def)
6664 : : {
6665 : 1386465 : for (tree op : SLP_TREE_SCALAR_OPS (node))
6666 : 607098 : if (TREE_CODE (op) == SSA_NAME)
6667 : 477927 : scalar_use_map.put (op, 1);
6668 : : }
6669 : : else
6670 : : {
6671 : 3453402 : for (slp_tree child : SLP_TREE_CHILDREN (node))
6672 : 826650 : if (child && !visited.add (child))
6673 : 797591 : worklist.safe_push (child);
6674 : : }
6675 : : }
6676 : 2873910 : while (!worklist.is_empty ());
6677 : :
     : : /* Reset the node-visited set; the marking walk below re-uses it. */
6678 : 254138 : visited.empty ();
6679 : :
6680 : 1403191 : for (slp_instance instance : bb_vinfo->slp_instances)
6681 : : {
6682 : 640777 : vect_location = instance->location ();
6683 : 640777 : vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
6684 : : instance, &instance->cost_vec,
6685 : : scalar_use_map, svisited, visited);
6686 : : }
6687 : 254138 : }
6688 : :
6689 : : /* Determine whether we can vectorize the reduction epilogue for INSTANCE. */
6690 : :
6691 : : static bool
6692 : 70628 : vectorizable_bb_reduc_epilogue (slp_instance instance,
6693 : : stmt_vector_for_cost *cost_vec)
6694 : : {
6695 : 70628 : gassign *stmt = as_a <gassign *> (instance->root_stmts[0]->stmt);
6696 : 70628 : enum tree_code reduc_code = gimple_assign_rhs_code (stmt);
     : : /* A MINUS_EXPR reduction is handled with a PLUS_EXPR epilogue,
     : : so query support for the latter. */
6697 : 70628 : if (reduc_code == MINUS_EXPR)
6698 : 0 : reduc_code = PLUS_EXPR;
6699 : 70628 : internal_fn reduc_fn;
6700 : 70628 : tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
6701 : 70628 : if (!vectype
6702 : 70534 : || !reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
6703 : 70534 : || reduc_fn == IFN_LAST
6704 : 70534 : || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH)
6705 : 101907 : || !useless_type_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
6706 : 31279 : TREE_TYPE (vectype)))
6707 : : {
6708 : 50660 : if (dump_enabled_p ())
6709 : 379 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6710 : : "not vectorized: basic block reduction epilogue "
6711 : : "operation unsupported.\n");
6712 : 50660 : return false;
6713 : : }
6714 : :
6715 : : /* There's no way to cost a horizontal vector reduction via REDUC_FN so
6716 : : cost log2 vector operations plus shuffles and one extraction. */
6717 : 19968 : unsigned steps = floor_log2 (vect_nunits_for_cost (vectype));
6718 : 19968 : record_stmt_cost (cost_vec, steps, vector_stmt, instance->root_stmts[0],
6719 : : vectype, 0, vect_body);
6720 : 19968 : record_stmt_cost (cost_vec, steps, vec_perm, instance->root_stmts[0],
6721 : : vectype, 0, vect_body);
6722 : 19968 : record_stmt_cost (cost_vec, 1, vec_to_scalar, instance->root_stmts[0],
6723 : : vectype, 0, vect_body);
6724 : :
6725 : : /* Since we replace all stmts of a possibly longer scalar reduction
6726 : : chain account for the extra scalar stmts for that. */
6727 : 19968 : record_stmt_cost (cost_vec, instance->remain_defs.length (), scalar_stmt,
6728 : 19968 : instance->root_stmts[0], 0, vect_body);
6729 : 19968 : return true;
6730 : : }
6731 : :
6732 : : /* Prune from ROOTS all stmts that are computed as part of lanes of NODE
6733 : : and recurse to children. */
6734 : :
6735 : : static void
6736 : 147702 : vect_slp_prune_covered_roots (slp_tree node, hash_set<stmt_vec_info> &roots,
6737 : : hash_set<slp_tree> &visited)
6738 : : {
     : : /* Only internal nodes carry scalar stmts; visit each node once. */
6739 : 147702 : if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
6740 : 147702 : || visited.add (node))
6741 : 66570 : return;
6742 : :
6743 : : stmt_vec_info stmt;
6744 : : unsigned i;
     : : /* Any root stmt also computed as a lane of this node is covered. */
6745 : 285990 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
6746 : 205591 : roots.remove (vect_orig_stmt (stmt));
6747 : :
6748 : : slp_tree child;
6749 : 181258 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6750 : 100126 : if (child)
6751 : 100126 : vect_slp_prune_covered_roots (child, roots, visited);
6752 : : }
6753 : :
6754 : : /* Analyze statements in SLP instances of VINFO. Return true if the
6755 : : operations are supported. */
6756 : :
6757 : : bool
6758 : 299882 : vect_slp_analyze_operations (vec_info *vinfo)
6759 : : {
6760 : 299882 : slp_instance instance;
6761 : 299882 : int i;
6762 : :
6763 : 299882 : DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
6764 : :
6765 : 299882 : hash_set<slp_tree> visited;
     : : /* Note I is only advanced when the instance is kept; on removal the
     : : next instance shifts into slot I. */
6766 : 1061696 : for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
6767 : : {
6768 : 761814 : auto_vec<slp_tree> visited_vec;
6769 : 761814 : stmt_vector_for_cost cost_vec;
6770 : 761814 : cost_vec.create (2);
6771 : 761814 : if (is_a <bb_vec_info> (vinfo))
6772 : 750904 : vect_location = instance->location ();
6773 : 761814 : if (!vect_slp_analyze_node_operations (vinfo,
6774 : : SLP_INSTANCE_TREE (instance),
6775 : : instance, visited, visited_vec,
6776 : : &cost_vec)
6777 : : /* CTOR instances require vectorized defs for the SLP tree root. */
6778 : 702915 : || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor
6779 : 5083 : && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
6780 : : != vect_internal_def
6781 : : /* Make sure we vectorized with the expected type. */
6782 : 5079 : || !useless_type_conversion_p
6783 : 5079 : (TREE_TYPE (TREE_TYPE (gimple_assign_rhs1
6784 : : (instance->root_stmts[0]->stmt))),
6785 : 5079 : TREE_TYPE (SLP_TREE_VECTYPE
6786 : : (SLP_INSTANCE_TREE (instance))))))
6787 : : /* Check we can vectorize the reduction. */
6788 : 1464723 : || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc
6789 : 70628 : && !vectorizable_bb_reduc_epilogue (instance, &cost_vec)))
6790 : : {
6791 : 109565 : slp_tree node = SLP_INSTANCE_TREE (instance);
6792 : 109565 : stmt_vec_info stmt_info;
6793 : 109565 : if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
6794 : 106824 : stmt_info = SLP_INSTANCE_ROOT_STMTS (instance)[0];
6795 : : else
6796 : 2741 : stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
6797 : 109565 : if (dump_enabled_p ())
6798 : 834 : dump_printf_loc (MSG_NOTE, vect_location,
6799 : : "removing SLP instance operations starting from: %G",
6800 : : stmt_info->stmt);
6801 : 109565 : vect_free_slp_instance (instance);
6802 : 109565 : vinfo->slp_instances.ordered_remove (i);
6803 : 109565 : cost_vec.release ();
     : : /* Un-cache nodes visited while analyzing this failed instance so
     : : another instance sharing them gets a fresh analysis. */
6804 : 1301434 : while (!visited_vec.is_empty ())
6805 : 320629 : visited.remove (visited_vec.pop ());
6806 : : }
6807 : : else
6808 : : {
6809 : 652249 : i++;
6810 : 652249 : if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
6811 : : {
6812 : 10065 : add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
6813 : 10065 : cost_vec.release ();
6814 : : }
6815 : : else
6816 : : /* For BB vectorization remember the SLP graph entry
6817 : : cost for later. */
6818 : 642184 : instance->cost_vec = cost_vec;
6819 : : }
6820 : 761814 : }
6821 : :
6822 : : /* Now look for SLP instances with a root that are covered by other
6823 : : instances and remove them. */
6824 : 299882 : hash_set<stmt_vec_info> roots;
6825 : 1252013 : for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6826 : 677294 : if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
6827 : 25045 : roots.add (SLP_INSTANCE_ROOT_STMTS (instance)[0]);
6828 : 299882 : if (!roots.is_empty ())
6829 : : {
6830 : 10075 : visited.empty ();
6831 : 57651 : for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
6832 : 47576 : vect_slp_prune_covered_roots (SLP_INSTANCE_TREE (instance), roots,
6833 : : visited);
6834 : 57651 : for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
6835 : 47576 : if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()
6836 : 25045 : && !roots.contains (SLP_INSTANCE_ROOT_STMTS (instance)[0]))
6837 : : {
6838 : 1407 : stmt_vec_info root = SLP_INSTANCE_ROOT_STMTS (instance)[0];
6839 : 1407 : if (dump_enabled_p ())
6840 : 20 : dump_printf_loc (MSG_NOTE, vect_location,
6841 : : "removing SLP instance operations starting "
6842 : : "from: %G", root->stmt);
6843 : 1407 : vect_free_slp_instance (instance);
6844 : 1407 : vinfo->slp_instances.ordered_remove (i);
6845 : : }
6846 : : else
6847 : 46169 : ++i;
6848 : : }
6849 : :
6850 : : /* Compute vectorizable live stmts. */
6851 : 299882 : if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
6852 : 291219 : vect_bb_slp_mark_live_stmts (bb_vinfo);
6853 : :
6854 : 599764 : return !vinfo->slp_instances.is_empty ();
6855 : 299882 : }
6856 : :
6857 : : /* Get the SLP instance leader from INSTANCE_LEADER thereby transitively
6858 : : closing the eventual chain. */
6859 : :
6860 : : static slp_instance
6861 : 683697 : get_ultimate_leader (slp_instance instance,
6862 : : hash_map<slp_instance, slp_instance> &instance_leader)
6863 : : {
     : : /* Follow the leader chain to its fixed point, remembering each
     : : map slot traversed. */
6864 : 683697 : auto_vec<slp_instance *, 8> chain;
6865 : 683697 : slp_instance *tem;
6866 : 734676 : while (*(tem = instance_leader.get (instance)) != instance)
6867 : : {
6868 : 50979 : chain.safe_push (tem);
6869 : 50979 : instance = *tem;
6870 : : }
     : : /* Path-compress: point every traversed slot at the ultimate leader
     : : so later queries terminate in one step. */
6871 : 734676 : while (!chain.is_empty ())
6872 : 50979 : *chain.pop () = instance;
6873 : 683697 : return instance;
6874 : 683697 : }
6875 : :
6876 : : namespace {
6877 : : /* Subroutine of vect_bb_partition_graph_r. Map KEY to INSTANCE in
6878 : : KEY_TO_INSTANCE, making INSTANCE the leader of any previous mapping
6879 : : for KEY. Return true if KEY was already in KEY_TO_INSTANCE.
6880 : :
6881 : : INSTANCE_LEADER is as for get_ultimate_leader. */
6882 : :
6883 : : template<typename T>
6884 : : bool
6885 : 3108995 : vect_map_to_instance (slp_instance instance, T key,
6886 : : hash_map<T, slp_instance> &key_to_instance,
6887 : : hash_map<slp_instance, slp_instance> &instance_leader)
6888 : : {
6889 : : bool existed_p;
6890 : 3108995 : slp_instance &key_instance = key_to_instance.get_or_insert (key, &existed_p);
6891 : 3108995 : if (!existed_p)
6892 : : ;
6893 : 127051 : else if (key_instance != instance)
6894 : : {
6895 : : /* If we're running into a previously marked key make us the
6896 : : leader of the current ultimate leader. This keeps the
6897 : : leader chain acyclic and works even when the current instance
6898 : : connects two previously independent graph parts. */
6899 : 42920 : slp_instance key_leader
6900 : 42920 : = get_ultimate_leader (key_instance, instance_leader);
6901 : 42920 : if (key_leader != instance)
6902 : 12475 : instance_leader.put (key_leader, instance);
6903 : : }
     : : /* In all cases KEY now maps to INSTANCE. */
6904 : 3108995 : key_instance = instance;
6905 : 3108995 : return existed_p;
6906 : : }
6907 : : }
6908 : :
6909 : : /* Worker of vect_bb_partition_graph, recurse on NODE. */
6910 : :
6911 : : static void
6912 : 877360 : vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
6913 : : slp_instance instance, slp_tree node,
6914 : : hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
6915 : : hash_map<slp_tree, slp_instance> &node_to_instance,
6916 : : hash_map<slp_instance, slp_instance> &instance_leader)
6917 : : {
6918 : 877360 : stmt_vec_info stmt_info;
6919 : 877360 : unsigned i;
6920 : :
     : : /* Claim each scalar stmt of NODE for INSTANCE, merging leaders
     : : when a stmt was already claimed by another instance. */
6921 : 3108995 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6922 : 2231635 : vect_map_to_instance (instance, stmt_info, stmt_to_instance,
6923 : : instance_leader);
6924 : :
     : : /* A node already mapped has had its subtree processed; stop here. */
6925 : 877360 : if (vect_map_to_instance (instance, node, node_to_instance,
6926 : : instance_leader))
6927 : 877360 : return;
6928 : :
6929 : : slp_tree child;
6930 : 1674542 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6931 : 826668 : if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
6932 : 236583 : vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
6933 : : node_to_instance, instance_leader);
6934 : : }
6935 : :
6936 : : /* Partition the SLP graph into pieces that can be costed independently. */
6937 : :
6938 : : static void
6939 : 254138 : vect_bb_partition_graph (bb_vec_info bb_vinfo)
6940 : : {
6941 : 254138 : DUMP_VECT_SCOPE ("vect_bb_partition_graph");
6942 : :
6943 : : /* First walk the SLP graph assigning each involved scalar stmt a
6944 : : corresponding SLP graph entry and upon visiting a previously
6945 : : marked stmt, make the stmts leader the current SLP graph entry. */
6946 : 254138 : hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
6947 : 254138 : hash_map<slp_tree, slp_instance> node_to_instance;
6948 : 254138 : hash_map<slp_instance, slp_instance> instance_leader;
6949 : 254138 : slp_instance instance;
6950 : 894915 : for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
6951 : : {
     : : /* Each instance starts as its own leader (union-find style). */
6952 : 640777 : instance_leader.put (instance, instance);
6953 : 640777 : vect_bb_partition_graph_r (bb_vinfo,
6954 : : instance, SLP_INSTANCE_TREE (instance),
6955 : : stmt_to_instance, node_to_instance,
6956 : : instance_leader);
6957 : : }
6958 : :
6959 : : /* Then collect entries to each independent subgraph. */
6960 : 1149053 : for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
6961 : : {
6962 : 640777 : slp_instance leader = get_ultimate_leader (instance, instance_leader);
6963 : 640777 : leader->subgraph_entries.safe_push (instance);
6964 : 640777 : if (dump_enabled_p ()
6965 : 640777 : && leader != instance)
6966 : 67 : dump_printf_loc (MSG_NOTE, vect_location,
6967 : : "instance %p is leader of %p\n",
6968 : : (void *) leader, (void *) instance);
6969 : : }
6970 : 254138 : }
6971 : :
6972 : : /* Compute the set of scalar stmts participating in internal and external
6973 : : nodes. */
6974 : :
6975 : : static void
6976 : 1453180 : vect_slp_gather_vectorized_scalar_stmts (vec_info *vinfo, slp_tree node,
6977 : : hash_set<slp_tree> &visited,
6978 : : hash_set<stmt_vec_info> &vstmts,
6979 : : hash_set<stmt_vec_info> &estmts)
6980 : : {
6981 : 1453180 : int i;
6982 : 1453180 : stmt_vec_info stmt_info;
6983 : 1453180 : slp_tree child;
6984 : :
6985 : 1453180 : if (visited.add (node))
6986 : 29470 : return;
6987 : :
     : : /* Internal nodes contribute their scalar stmts to VSTMTS and
     : : recurse; other (external/constant) nodes contribute defining
     : : stmts of their SSA operands to ESTMTS. */
6988 : 1423710 : if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
6989 : : {
6990 : 2954664 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
6991 : 2115943 : vstmts.add (stmt_info);
6992 : :
6993 : 2958174 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
6994 : 816730 : if (child)
6995 : 816730 : vect_slp_gather_vectorized_scalar_stmts (vinfo, child, visited,
6996 : : vstmts, estmts);
6997 : : }
6998 : : else
6999 : 3264050 : for (tree def : SLP_TREE_SCALAR_OPS (node))
7000 : : {
7001 : 1510091 : stmt_vec_info def_stmt = vinfo->lookup_def (def);
7002 : 1510091 : if (def_stmt)
7003 : 329298 : estmts.add (def_stmt);
7004 : : }
7005 : : }
7006 : :
7007 : :
7008 : : /* Compute the scalar cost of the SLP node NODE and its children
7009 : : and return it. Do not account defs that are marked in LIFE and
7010 : : update LIFE according to uses of NODE. */
7011 : :
7012 : : static void
7013 : 867621 : vect_bb_slp_scalar_cost (vec_info *vinfo,
7014 : : slp_tree node, vec<bool, va_heap> *life,
7015 : : stmt_vector_for_cost *cost_vec,
7016 : : hash_set<stmt_vec_info> &vectorized_scalar_stmts,
7017 : : hash_set<slp_tree> &visited)
7018 : : {
7019 : 867621 : unsigned i;
7020 : 867621 : stmt_vec_info stmt_info;
7021 : 867621 : slp_tree child;
7022 : :
7023 : 867621 : if (visited.add (node))
7024 : 28888 : return;
7025 : :
7026 : 2954700 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
7027 : : {
7028 : 2115967 : ssa_op_iter op_iter;
7029 : 2115967 : def_operand_p def_p;
7030 : :
7031 : 2115967 : if ((*life)[i])
7032 : 92317 : continue;
7033 : :
7034 : 2092386 : stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
7035 : 2092386 : gimple *orig_stmt = orig_stmt_info->stmt;
7036 : :
7037 : : /* If there is a non-vectorized use of the defs then the scalar
7038 : : stmt is kept live in which case we do not account it or any
7039 : : required defs in the SLP children in the scalar cost. This
7040 : : way we make the vectorization more costly when compared to
7041 : : the scalar cost. */
7042 : 2092386 : if (!STMT_VINFO_LIVE_P (stmt_info))
7043 : : {
7044 : 2025443 : auto_vec<gimple *, 8> worklist;
7045 : 2025443 : hash_set<gimple *> *worklist_visited = NULL;
7046 : 2025443 : worklist.quick_push (orig_stmt);
7047 : 2029717 : do
7048 : : {
7049 : 2029717 : gimple *work_stmt = worklist.pop ();
7050 : 4487414 : FOR_EACH_PHI_OR_STMT_DEF (def_p, work_stmt, op_iter, SSA_OP_DEF)
7051 : : {
7052 : 443256 : imm_use_iterator use_iter;
7053 : 443256 : gimple *use_stmt;
7054 : 1121110 : FOR_EACH_IMM_USE_STMT (use_stmt, use_iter,
7055 : : DEF_FROM_PTR (def_p))
7056 : 693130 : if (!is_gimple_debug (use_stmt))
7057 : : {
7058 : 500930 : stmt_vec_info use_stmt_info
7059 : 500930 : = vinfo->lookup_stmt (use_stmt);
7060 : 500930 : if (!use_stmt_info
7061 : 500930 : || !vectorized_scalar_stmts.contains (use_stmt_info))
7062 : : {
7063 : 19617 : if (use_stmt_info
7064 : 17457 : && STMT_VINFO_IN_PATTERN_P (use_stmt_info))
7065 : : {
7066 : : /* For stmts participating in patterns we have
7067 : : to check its uses recursively. */
7068 : 4341 : if (!worklist_visited)
7069 : 3433 : worklist_visited = new hash_set<gimple *> ();
7070 : 4341 : if (!worklist_visited->add (use_stmt))
7071 : 4341 : worklist.safe_push (use_stmt);
7072 : 4341 : continue;
7073 : : }
7074 : 15276 : (*life)[i] = true;
7075 : 15276 : goto next_lane;
7076 : : }
7077 : 443256 : }
7078 : : }
7079 : : }
7080 : 4028882 : while (!worklist.is_empty ());
7081 : 2010167 : next_lane:
7082 : 2025443 : if (worklist_visited)
7083 : 3433 : delete worklist_visited;
7084 : 2025443 : if ((*life)[i])
7085 : 15276 : continue;
7086 : 2025443 : }
7087 : :
7088 : : /* Count scalar stmts only once. */
7089 : 2077110 : if (gimple_visited_p (orig_stmt))
7090 : 24031 : continue;
7091 : 2053079 : gimple_set_visited (orig_stmt, true);
7092 : :
7093 : 2053079 : vect_cost_for_stmt kind;
7094 : 2053079 : if (STMT_VINFO_DATA_REF (orig_stmt_info))
7095 : : {
7096 : 1858888 : if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
7097 : : kind = scalar_load;
7098 : : else
7099 : 1586325 : kind = scalar_store;
7100 : : }
7101 : 194191 : else if (vect_nop_conversion_p (orig_stmt_info))
7102 : 20529 : continue;
7103 : : /* For single-argument PHIs assume coalescing which means zero cost
7104 : : for the scalar and the vector PHIs. This avoids artificially
7105 : : favoring the vector path (but may pessimize it in some cases). */
7106 : 173662 : else if (is_a <gphi *> (orig_stmt_info->stmt)
7107 : 173662 : && gimple_phi_num_args
7108 : 92574 : (as_a <gphi *> (orig_stmt_info->stmt)) == 1)
7109 : 8900 : continue;
7110 : : else
7111 : : kind = scalar_stmt;
7112 : 2023650 : record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
7113 : : SLP_TREE_VECTYPE (node), 0, vect_body);
7114 : : }
7115 : :
7116 : 1677466 : auto_vec<bool, 20> subtree_life;
7117 : 2373221 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
7118 : : {
7119 : 816748 : if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
7120 : : {
7121 : : /* Do not directly pass LIFE to the recursive call, copy it to
7122 : : confine changes in the callee to the current child/subtree. */
7123 : 231171 : if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
7124 : : {
7125 : 3765 : subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
7126 : 12853 : for (unsigned j = 0;
7127 : 25706 : j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
7128 : : {
7129 : 9088 : auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
7130 : 9088 : if (perm.first == i)
7131 : 4724 : subtree_life[perm.second] = (*life)[j];
7132 : : }
7133 : : }
7134 : : else
7135 : : {
7136 : 227406 : gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
7137 : 227406 : subtree_life.safe_splice (*life);
7138 : : }
7139 : 231171 : vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
7140 : : vectorized_scalar_stmts, visited);
7141 : 231171 : subtree_life.truncate (0);
7142 : : }
7143 : : }
7144 : : }
7145 : :
7146 : : /* Comparator for the loop-index sorted cost vectors. */
7147 : :
7148 : : static int
7149 : 16729647 : li_cost_vec_cmp (const void *a_, const void *b_)
7150 : : {
7151 : 16729647 : auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
7152 : 16729647 : auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
     : : /* Order solely by the loop index (pair::first); entries belonging
     : : to the same loop compare equal so qsort groups them together,
     : : which is all the caller relies on (relative order within a
     : : loop group is unspecified). */
7153 : 16729647 : if (a->first < b->first)
7154 : : return -1;
7155 : 16108008 : else if (a->first == b->first)
7156 : 15563585 : return 0;
7157 : : return 1;
7158 : : }
7159 : :
7160 : : /* Check if vectorization of the basic block is profitable for the
7161 : : subgraph denoted by SLP_INSTANCES. */
7162 : :
7163 : : static bool
7164 : 624124 : vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
7165 : : vec<slp_instance> slp_instances,
7166 : : loop_p orig_loop)
7167 : : {
7168 : 624124 : slp_instance instance;
7169 : 624124 : int i;
7170 : 624124 : unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
7171 : 624124 : unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
7172 : :
7173 : 624124 : if (dump_enabled_p ())
7174 : : {
7175 : 101 : dump_printf_loc (MSG_NOTE, vect_location, "Costing subgraph: \n");
7176 : 101 : hash_set<slp_tree> visited;
7177 : 406 : FOR_EACH_VEC_ELT (slp_instances, i, instance)
7178 : 103 : vect_print_slp_graph (MSG_NOTE, vect_location,
7179 : : SLP_INSTANCE_TREE (instance), visited);
7180 : 101 : }
7181 : :
7182 : : /* Compute the set of scalar stmts we know will go away 'locally' when
7183 : : vectorizing. This used to be tracked with just PURE_SLP_STMT but that's
7184 : : not accurate for nodes promoted extern late or for scalar stmts that
7185 : : are used both in extern defs and in vectorized defs. */
7186 : 624124 : hash_set<stmt_vec_info> vectorized_scalar_stmts;
7187 : 624124 : hash_set<stmt_vec_info> scalar_stmts_in_externs;
7188 : 624124 : hash_set<slp_tree> visited;
7189 : 1260574 : FOR_EACH_VEC_ELT (slp_instances, i, instance)
7190 : : {
7191 : 636450 : vect_slp_gather_vectorized_scalar_stmts (bb_vinfo,
7192 : : SLP_INSTANCE_TREE (instance),
7193 : : visited,
7194 : : vectorized_scalar_stmts,
7195 : : scalar_stmts_in_externs);
     : : /* Root stmts (e.g. the CONSTRUCTOR fed by the SLP tree) also go
     : : away when vectorizing, so count them as vectorized too. */
7196 : 727400 : for (stmt_vec_info rstmt : SLP_INSTANCE_ROOT_STMTS (instance))
7197 : 44712 : vectorized_scalar_stmts.add (rstmt);
7198 : : }
7199 : : /* Scalar stmts used as defs in external nodes need to be preseved, so
7200 : : remove them from vectorized_scalar_stmts. */
7201 : 920817 : for (stmt_vec_info stmt : scalar_stmts_in_externs)
7202 : 296693 : vectorized_scalar_stmts.remove (stmt);
7203 : :
7204 : : /* Calculate scalar cost and sum the cost for the vector stmts
7205 : : previously collected. */
7206 : 624124 : stmt_vector_for_cost scalar_costs = vNULL;
7207 : 624124 : stmt_vector_for_cost vector_costs = vNULL;
7208 : 624124 : visited.empty ();
7209 : 1260574 : FOR_EACH_VEC_ELT (slp_instances, i, instance)
7210 : : {
7211 : 636450 : auto_vec<bool, 20> life;
7212 : 636450 : life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
7213 : : true);
7214 : 636450 : if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
7215 : 46238 : record_stmt_cost (&scalar_costs,
7216 : 23119 : SLP_INSTANCE_ROOT_STMTS (instance).length (),
7217 : : scalar_stmt,
7218 : 23119 : SLP_INSTANCE_ROOT_STMTS (instance)[0], 0, vect_body);
7219 : 636450 : vect_bb_slp_scalar_cost (bb_vinfo,
7220 : : SLP_INSTANCE_TREE (instance),
7221 : : &life, &scalar_costs, vectorized_scalar_stmts,
7222 : : visited);
7223 : 636450 : vector_costs.safe_splice (instance->cost_vec);
7224 : 636450 : instance->cost_vec.release ();
7225 : 636450 : }
7226 : :
7227 : 624124 : if (dump_enabled_p ())
7228 : 101 : dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
7229 : :
7230 : : /* When costing non-loop vectorization we need to consider each covered
7231 : : loop independently and make sure vectorization is profitable. For
7232 : : now we assume a loop may be not entered or executed an arbitrary
7233 : : number of iterations (??? static information can provide more
7234 : : precise info here) which means we can simply cost each containing
7235 : : loops stmts separately. */
7236 : :
7237 : : /* First produce cost vectors sorted by loop index. */
7238 : 624124 : auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
7239 : 624124 : li_scalar_costs (scalar_costs.length ());
7240 : 624124 : auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
7241 : 624124 : li_vector_costs (vector_costs.length ());
7242 : 624124 : stmt_info_for_cost *cost;
7243 : 2670893 : FOR_EACH_VEC_ELT (scalar_costs, i, cost)
7244 : : {
7245 : 2046769 : unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
7246 : 2046769 : li_scalar_costs.quick_push (std::make_pair (l, cost));
7247 : : }
7248 : : /* Use a random used loop as fallback in case the first vector_costs
7249 : : entry does not have a stmt_info associated with it. */
     : : /* NOTE(review): this indexing assumes scalar_costs is non-empty;
     : : presumably each subgraph contributes at least one scalar stmt
     : : cost -- TODO confirm with callers. */
7250 : 624124 : unsigned l = li_scalar_costs[0].first;
7251 : 2334670 : FOR_EACH_VEC_ELT (vector_costs, i, cost)
7252 : : {
7253 : : /* We inherit from the previous COST, invariants, externals and
7254 : : extracts immediately follow the cost for the related stmt. */
7255 : 1710546 : if (cost->stmt_info)
7256 : 1031933 : l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
7257 : 1710546 : li_vector_costs.quick_push (std::make_pair (l, cost));
7258 : : }
7259 : 624124 : li_scalar_costs.qsort (li_cost_vec_cmp);
7260 : 624124 : li_vector_costs.qsort (li_cost_vec_cmp);
7261 : :
7262 : : /* Now cost the portions individually. */
7263 : : unsigned vi = 0;
7264 : : unsigned si = 0;
7265 : 1133102 : bool profitable = true;
7266 : 1133102 : while (si < li_scalar_costs.length ()
7267 : 1762355 : && vi < li_vector_costs.length ())
7268 : : {
7269 : 629253 : unsigned sl = li_scalar_costs[si].first;
7270 : 629253 : unsigned vl = li_vector_costs[vi].first;
7271 : 629253 : if (sl != vl)
7272 : : {
7273 : 593 : if (dump_enabled_p ())
7274 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
7275 : : "Scalar %d and vector %d loop part do not "
7276 : : "match up, skipping scalar part\n", sl, vl);
7277 : : /* Skip the scalar part, assuming zero cost on the vector side. */
7278 : 1268 : do
7279 : : {
7280 : 1268 : si++;
7281 : : }
7282 : 1268 : while (si < li_scalar_costs.length ()
7283 : 2260 : && li_scalar_costs[si].first == sl);
7284 : 593 : continue;
7285 : : }
7286 : :
     : : /* Accumulate all scalar-side costs for loop SL into a fresh
     : : target cost model instance. */
7287 : 628660 : class vector_costs *scalar_target_cost_data = init_cost (bb_vinfo, true);
7288 : 2033928 : do
7289 : : {
7290 : 2033928 : add_stmt_cost (scalar_target_cost_data, li_scalar_costs[si].second);
7291 : 2033928 : si++;
7292 : : }
7293 : 2033928 : while (si < li_scalar_costs.length ()
7294 : 4075088 : && li_scalar_costs[si].first == sl);
7295 : 628660 : unsigned dummy;
7296 : 628660 : finish_cost (scalar_target_cost_data, nullptr,
7297 : : &dummy, &scalar_cost, &dummy);
7298 : :
7299 : : /* Complete the target-specific vector cost calculation. */
7300 : 628660 : class vector_costs *vect_target_cost_data = init_cost (bb_vinfo, false);
7301 : 1688190 : do
7302 : : {
7303 : 1688190 : add_stmt_cost (vect_target_cost_data, li_vector_costs[vi].second);
7304 : 1688190 : vi++;
7305 : : }
7306 : 1688190 : while (vi < li_vector_costs.length ()
7307 : 3384529 : && li_vector_costs[vi].first == vl);
7308 : 628660 : finish_cost (vect_target_cost_data, scalar_target_cost_data,
7309 : : &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost);
7310 : 628660 : delete scalar_target_cost_data;
7311 : 628660 : delete vect_target_cost_data;
7312 : :
7313 : 628660 : vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
7314 : :
7315 : 628660 : if (dump_enabled_p ())
7316 : : {
7317 : 109 : dump_printf_loc (MSG_NOTE, vect_location,
7318 : : "Cost model analysis for part in loop %d:\n", sl);
7319 : 109 : dump_printf (MSG_NOTE, " Vector cost: %d\n",
7320 : : vec_inside_cost + vec_outside_cost);
7321 : 109 : dump_printf (MSG_NOTE, " Scalar cost: %d\n", scalar_cost);
7322 : : }
7323 : :
7324 : : /* Vectorization is profitable if its cost is more than the cost of scalar
7325 : : version. Note that we err on the vector side for equal cost because
7326 : : the cost estimate is otherwise quite pessimistic (constant uses are
7327 : : free on the scalar side but cost a load on the vector side for
7328 : : example). */
7329 : 628660 : if (vec_outside_cost + vec_inside_cost > scalar_cost)
7330 : : {
7331 : 120275 : profitable = false;
7332 : 120275 : break;
7333 : : }
7334 : : }
     : : /* Any vector cost entries left over correspond to a loop without a
     : : matching scalar part, i.e. pure extra vector cost -> unprofitable. */
7335 : 624124 : if (profitable && vi < li_vector_costs.length ())
7336 : : {
7337 : 940 : if (dump_enabled_p ())
7338 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
7339 : : "Excess vector cost for part in loop %d:\n",
7340 : 0 : li_vector_costs[vi].first);
7341 : : profitable = false;
7342 : : }
7343 : :
7344 : : /* Unset visited flag. This is delayed when the subgraph is profitable
7345 : : and we process the loop for remaining unvectorized if-converted code. */
7346 : 624124 : if (!orig_loop || !profitable)
7347 : 2670372 : FOR_EACH_VEC_ELT (scalar_costs, i, cost)
7348 : 2046331 : gimple_set_visited (cost->stmt_info->stmt, false);
7349 : :
7350 : 624124 : scalar_costs.release ();
7351 : 624124 : vector_costs.release ();
7352 : :
7353 : 624124 : return profitable;
7354 : 624124 : }
7355 : :
7356 : : /* qsort comparator for lane defs. */
7357 : :
7358 : : static int
7359 : 40 : vld_cmp (const void *a_, const void *b_)
7360 : : {
7361 : 40 : auto *a = (const std::pair<unsigned, tree> *)a_;
7362 : 40 : auto *b = (const std::pair<unsigned, tree> *)b_;
     : : /* Subtraction is safe here: first is a lane number bounded by the
     : : (small) number of vector lanes, so no wraparound -- presumably;
     : : TODO confirm lane counts stay well below INT_MAX. */
7363 : 40 : return a->first - b->first;
7364 : : }
7365 : :
7366 : : /* Return true if USE_STMT is a vector lane insert into VEC and set
7367 : : *THIS_LANE to the lane number that is set. */
7368 : :
7369 : : static bool
7370 : 203 : vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
7371 : : {
7372 : 203 : gassign *use_ass = dyn_cast <gassign *> (use_stmt);
     : : /* A lane insert is a BIT_INSERT_EXPR assignment. When VEC is given
     : : the insert must be into exactly that vector; when VEC is NULL_TREE
     : : the comma expression below captures rhs1 into the local VEC (and
     : : yields false, i.e. "no mismatch") so the later type and lane-size
     : : checks can use it. The lane number is rhs3 (a bit offset) divided
     : : by the element size; constant_multiple_p also rejects non-multiple
     : : offsets. */
7373 : 103 : if (!use_ass
7374 : 103 : || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
7375 : 22 : || (vec
7376 : 22 : ? gimple_assign_rhs1 (use_ass) != vec
7377 : 24 : : ((vec = gimple_assign_rhs1 (use_ass)), false))
7378 : 46 : || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
7379 : 46 : TREE_TYPE (gimple_assign_rhs2 (use_ass)))
7380 : 46 : || !constant_multiple_p
7381 : 46 : (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
7382 : 92 : tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
7383 : : this_lane))
7384 : 157 : return false;
7385 : : return true;
7386 : : }
7387 : :
7388 : : /* Find any vectorizable constructors and add them to the grouped_store
7389 : : array. */
7390 : :
7392 : : static void
7393 : 2285374 : vect_slp_check_for_roots (bb_vec_info bb_vinfo)
7394 : : {
     : : /* Scan every assignment in the region for three kinds of SLP roots:
     : : (1) vector CONSTRUCTORs of SSA names, (2) chains of BIT_INSERT_EXPR
     : : lane inserts building a full vector, and (3) associatable reduction
     : : chains (bb_reduc). Matches are pushed onto bb_vinfo->roots. */
7394 : 31795084 : for (unsigned i = 0; i < bb_vinfo->bbs.length (); ++i)
7395 : 27224336 : for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
7396 : 106251004 : !gsi_end_p (gsi); gsi_next (&gsi))
7397 : : {
7398 : 92638836 : gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
7399 : 92638836 : if (!assign)
7400 : 133024226 : continue;
7401 : :
7402 : 27020445 : tree rhs = gimple_assign_rhs1 (assign);
7403 : 27020445 : enum tree_code code = gimple_assign_rhs_code (assign);
7404 : 27020445 : use_operand_p use_p;
7405 : 27020445 : gimple *use_stmt;
7406 : 27020445 : if (code == CONSTRUCTOR)
7407 : : {
     : : /* Case (1): a complete vector CONSTRUCTOR whose elements are
     : : all SSA defs inside the region and not themselves vectors
     : : or a uniform splat. */
7408 : 1557347 : if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
7409 : 58634 : || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
7410 : 75366 : CONSTRUCTOR_NELTS (rhs))
7411 : 35640 : || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
7412 : 1592986 : || uniform_vector_p (rhs))
7413 : 1540215 : continue;
7414 : :
7415 : : unsigned j;
7416 : : tree val;
7417 : 83736 : FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
7418 : 66604 : if (TREE_CODE (val) != SSA_NAME
7419 : 66604 : || !bb_vinfo->lookup_def (val))
7420 : : break;
7421 : 39134 : if (j != CONSTRUCTOR_NELTS (rhs))
7422 : 2435 : continue;
7423 : :
7424 : 17132 : vec<stmt_vec_info> roots = vNULL;
7425 : 17132 : roots.safe_push (bb_vinfo->lookup_stmt (assign));
7426 : 17132 : vec<stmt_vec_info> stmts;
7427 : 17132 : stmts.create (CONSTRUCTOR_NELTS (rhs));
7428 : 97108 : FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
7429 : 62844 : stmts.quick_push
7430 : 62844 : (vect_stmt_to_vectorize (bb_vinfo->lookup_def (val)));
7431 : 17132 : bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
7432 : : stmts, roots));
7433 : : }
     : : /* Case (2): BIT_INSERT_EXPR into lane zero starts a possible
     : : whole-vector build via individual lane inserts. */
7434 : 25463098 : else if (code == BIT_INSERT_EXPR
7435 : 1347 : && VECTOR_TYPE_P (TREE_TYPE (rhs))
7436 : 1008 : && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
7437 : 1008 : && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
7438 : 1008 : && integer_zerop (gimple_assign_rhs3 (assign))
7439 : 284 : && useless_type_conversion_p
7440 : 284 : (TREE_TYPE (TREE_TYPE (rhs)),
7441 : 284 : TREE_TYPE (gimple_assign_rhs2 (assign)))
7442 : 25463606 : && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
7443 : : {
7444 : : /* We start to match on insert to lane zero but since the
7445 : : inserts need not be ordered we'd have to search both
7446 : : the def and the use chains. */
7447 : 183 : tree vectype = TREE_TYPE (rhs);
7448 : 183 : unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
7449 : 183 : auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
7450 : 183 : auto_sbitmap lanes (nlanes);
7451 : 183 : bitmap_clear (lanes);
7452 : 183 : bitmap_set_bit (lanes, 0);
7453 : 183 : tree def = gimple_assign_lhs (assign);
7454 : 183 : lane_defs.quick_push
7455 : 183 : (std::make_pair (0, gimple_assign_rhs2 (assign)));
7456 : 183 : unsigned lanes_found = 1;
7457 : : /* Start with the use chains, the last stmt will be the root. */
7458 : 183 : stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
7459 : 183 : vec<stmt_vec_info> roots = vNULL;
7460 : 183 : roots.safe_push (last);
7461 : 185 : do
7462 : : {
7463 : 185 : use_operand_p use_p;
7464 : 185 : gimple *use_stmt;
7465 : 185 : if (!single_imm_use (def, &use_p, &use_stmt))
7466 : : break;
7467 : 179 : unsigned this_lane;
7468 : 179 : if (!bb_vinfo->lookup_stmt (use_stmt)
7469 : 179 : || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
7470 : 201 : || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
7471 : : break;
7472 : 22 : if (bitmap_bit_p (lanes, this_lane))
7473 : : break;
7474 : 2 : lanes_found++;
7475 : 2 : bitmap_set_bit (lanes, this_lane);
7476 : 2 : gassign *use_ass = as_a <gassign *> (use_stmt);
7477 : 2 : lane_defs.quick_push (std::make_pair
7478 : 2 : (this_lane, gimple_assign_rhs2 (use_ass)));
7479 : 2 : last = bb_vinfo->lookup_stmt (use_ass);
7480 : 2 : roots.safe_push (last);
7481 : 2 : def = gimple_assign_lhs (use_ass);
7482 : : }
7483 : 2 : while (lanes_found < nlanes);
     : : /* Make the final insert (the full vector) the first root. */
7484 : 183 : if (roots.length () > 1)
7485 : 2 : std::swap(roots[0], roots[roots.length () - 1]);
7486 : 183 : if (lanes_found < nlanes)
7487 : : {
7488 : : /* Now search the def chain. */
7489 : 183 : def = gimple_assign_rhs1 (assign);
7490 : 185 : do
7491 : : {
7492 : 185 : if (TREE_CODE (def) != SSA_NAME
7493 : 185 : || !has_single_use (def))
7494 : : break;
7495 : 41 : gimple *def_stmt = SSA_NAME_DEF_STMT (def);
7496 : 41 : unsigned this_lane;
7497 : 41 : if (!bb_vinfo->lookup_stmt (def_stmt)
7498 : 24 : || !vect_slp_is_lane_insert (def_stmt,
7499 : : NULL_TREE, &this_lane)
7500 : 65 : || !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
7501 : : break;
7502 : 24 : if (bitmap_bit_p (lanes, this_lane))
7503 : : break;
7504 : 4 : lanes_found++;
7505 : 4 : bitmap_set_bit (lanes, this_lane);
7506 : 8 : lane_defs.quick_push (std::make_pair
7507 : 4 : (this_lane,
7508 : 4 : gimple_assign_rhs2 (def_stmt)));
7509 : 4 : roots.safe_push (bb_vinfo->lookup_stmt (def_stmt));
7510 : 4 : def = gimple_assign_rhs1 (def_stmt);
7511 : : }
7512 : 4 : while (lanes_found < nlanes);
7513 : : }
7514 : 183 : if (lanes_found == nlanes)
7515 : : {
7516 : : /* Sort lane_defs after the lane index and register the root. */
7517 : 2 : lane_defs.qsort (vld_cmp);
7518 : 2 : vec<stmt_vec_info> stmts;
7519 : 2 : stmts.create (nlanes);
7520 : 10 : for (unsigned i = 0; i < nlanes; ++i)
7521 : 8 : stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
7522 : 2 : bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
7523 : : stmts, roots));
7524 : : }
7525 : : else
7526 : 181 : roots.release ();
7527 : 183 : }
     : : /* Case (3): the end of an association chain of an associative op
     : : (or MINUS_EXPR, canonicalized to PLUS below) -> bb_reduc root. */
7528 : 25462915 : else if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
7529 : 24717445 : && (associative_tree_code (code) || code == MINUS_EXPR)
7530 : : /* ??? This pessimizes a two-element reduction. PR54400.
7531 : : ??? In-order reduction could be handled if we only
7532 : : traverse one operand chain in vect_slp_linearize_chain. */
7533 : 28593355 : && !needs_fold_left_reduction_p (TREE_TYPE (rhs), code)
7534 : : /* Ops with constants at the tail can be stripped here. */
7535 : 4733950 : && TREE_CODE (rhs) == SSA_NAME
7536 : 4680474 : && TREE_CODE (gimple_assign_rhs2 (assign)) == SSA_NAME
7537 : : /* Should be the chain end. */
7538 : 27424466 : && (!single_imm_use (gimple_assign_lhs (assign),
7539 : : &use_p, &use_stmt)
7540 : 1521741 : || !is_gimple_assign (use_stmt)
7541 : 1027215 : || (gimple_assign_rhs_code (use_stmt) != code
7542 : 740554 : && ((code != PLUS_EXPR && code != MINUS_EXPR)
7543 : 396954 : || (gimple_assign_rhs_code (use_stmt)
7544 : 396954 : != (code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR))))))
7545 : : {
7546 : : /* We start the match at the end of a possible association
7547 : : chain. */
7548 : 1603510 : auto_vec<chain_op_t> chain;
7549 : 1603510 : auto_vec<std::pair<tree_code, gimple *> > worklist;
7550 : 1603510 : auto_vec<gimple *> chain_stmts;
7551 : 1603510 : gimple *code_stmt = NULL, *alt_code_stmt = NULL;
7552 : 1603510 : if (code == MINUS_EXPR)
7553 : 248881 : code = PLUS_EXPR;
7554 : 1603510 : internal_fn reduc_fn;
7555 : 1850739 : if (!reduction_fn_for_scalar_code (code, &reduc_fn)
7556 : 1603510 : || reduc_fn == IFN_LAST)
7557 : 247229 : continue;
7558 : 1356281 : vect_slp_linearize_chain (bb_vinfo, worklist, chain, code, assign,
7559 : : /* ??? */
7560 : : code_stmt, alt_code_stmt, &chain_stmts);
7561 : 2712562 : if (chain.length () > 1)
7562 : : {
7563 : : /* Sort the chain according to def_type and operation. */
7564 : 1356281 : chain.sort (dt_sort_cmp, bb_vinfo);
7565 : : /* ??? Now we'd want to strip externals and constants
7566 : : but record those to be handled in the epilogue. */
7567 : : /* ??? For now do not allow mixing ops or externs/constants. */
7568 : 1356281 : bool invalid = false;
7569 : 1356281 : unsigned remain_cnt = 0;
7570 : 1356281 : unsigned last_idx = 0;
7571 : 4119431 : for (unsigned i = 0; i < chain.length (); ++i)
7572 : : {
7573 : 3037455 : if (chain[i].code != code)
7574 : : {
7575 : : invalid = true;
7576 : : break;
7577 : : }
7578 : 2763150 : if (chain[i].dt != vect_internal_def
7579 : : /* Avoid stmts where the def is not the LHS, like
7580 : : ASMs. */
7581 : 5331249 : || (gimple_get_lhs (bb_vinfo->lookup_def
7582 : 2568099 : (chain[i].op)->stmt)
7583 : 2568099 : != chain[i].op))
7584 : 197928 : remain_cnt++;
7585 : : else
7586 : : last_idx = i;
7587 : : }
7588 : : /* Make sure to have an even number of lanes as we later do
7589 : : all-or-nothing discovery, not trying to split further. */
7590 : 1356281 : if ((chain.length () - remain_cnt) & 1)
7591 : 166183 : remain_cnt++;
7592 : 1356281 : if (!invalid && chain.length () - remain_cnt > 1)
7593 : : {
7594 : 1024792 : vec<stmt_vec_info> stmts;
7595 : 1024792 : vec<tree> remain = vNULL;
7596 : 1024792 : stmts.create (chain.length ());
7597 : 1024792 : if (remain_cnt > 0)
7598 : 94086 : remain.create (remain_cnt);
7599 : 3311423 : for (unsigned i = 0; i < chain.length (); ++i)
7600 : : {
7601 : 2286631 : stmt_vec_info stmt_info;
7602 : 2286631 : if (chain[i].dt == vect_internal_def
7603 : 2255522 : && ((stmt_info = bb_vinfo->lookup_def (chain[i].op)),
7604 : 2255522 : gimple_get_lhs (stmt_info->stmt) == chain[i].op)
7605 : 4542069 : && (i != last_idx
7606 : 1024792 : || (stmts.length () & 1)))
7607 : 2181334 : stmts.quick_push (stmt_info);
7608 : : else
7609 : 105297 : remain.quick_push (chain[i].op);
7610 : : }
7611 : 1024792 : vec<stmt_vec_info> roots;
7612 : 1024792 : roots.create (chain_stmts.length ());
7613 : 4573262 : for (unsigned i = 0; i < chain_stmts.length (); ++i)
7614 : 1261839 : roots.quick_push (bb_vinfo->lookup_stmt (chain_stmts[i]));
7615 : 1024792 : bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_bb_reduc,
7616 : : stmts, roots, remain));
7617 : : }
7618 : : }
7619 : 1603510 : }
7620 : : }
7621 : 2285374 : }
7622 : :
7623 : : /* Walk the grouped store chains and replace entries with their
7624 : : pattern variant if any. */
7625 : :
7627 : : static void
7628 : 577715 : vect_fixup_store_groups_with_patterns (vec_info *vinfo)
7629 : : {
7630 : 577715 : stmt_vec_info first_element;
7631 : 577715 : unsigned i;
7632 : :
7633 : 1376497 : FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
7634 : : {
7635 : : /* We also have CTORs in this array. */
7636 : 798782 : if (!STMT_VINFO_GROUPED_ACCESS (first_element))
7637 : 0 : continue;
     : : /* If the group leader was replaced by a pattern stmt, promote
     : : the pattern stmt to group leader, copying the group meta data
     : : (size/gap/next) from the original. */
7638 : 798782 : if (STMT_VINFO_IN_PATTERN_P (first_element))
7639 : : {
7640 : 245 : stmt_vec_info orig = first_element;
7641 : 245 : first_element = STMT_VINFO_RELATED_STMT (first_element);
7642 : 245 : DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
7643 : 245 : DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
7644 : 245 : DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
7645 : 245 : DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
7646 : 245 : vinfo->grouped_stores[i] = first_element;
7647 : : }
     : : /* Walk the chain, substituting pattern stmts for any further
     : : elements and re-linking FIRST_ELEMENT into every node. */
7648 : 798782 : stmt_vec_info prev = first_element;
7649 : 2227912 : while (DR_GROUP_NEXT_ELEMENT (prev))
7650 : : {
7651 : 1429130 : stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
7652 : 1429130 : if (STMT_VINFO_IN_PATTERN_P (elt))
7653 : : {
7654 : 826 : stmt_vec_info orig = elt;
7655 : 826 : elt = STMT_VINFO_RELATED_STMT (elt);
7656 : 826 : DR_GROUP_NEXT_ELEMENT (prev) = elt;
7657 : 826 : DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
7658 : 826 : DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
7659 : : }
7660 : 1429130 : DR_GROUP_FIRST_ELEMENT (elt) = first_element;
7661 : 1429130 : prev = elt;
7662 : : }
7663 : : }
7664 : 577715 : }
7664 : :
7665 : : /* Check if the region described by BB_VINFO can be vectorized, returning
7666 : : true if so. When returning false, set FATAL to true if the same failure
7667 : : would prevent vectorization at other vector sizes, false if it is still
7668 : : worth trying other sizes. N_STMTS is the number of statements in the
7669 : : region. */
7670 : :
7672 : : static bool
7673 : 2285374 : vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
7674 : : vec<int> *dataref_groups)
7675 : : {
7676 : 2285374 : DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
7677 : :
7678 : 2285374 : slp_instance instance;
7679 : 2285374 : int i;
7680 : 2285374 : poly_uint64 min_vf = 2;
7681 : :
7682 : : /* The first group of checks is independent of the vector size. */
7683 : 2285374 : fatal = true;
7684 : :
7685 : : /* Analyze the data references. */
7686 : :
7687 : 2285374 : if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
7688 : : {
7689 : 0 : if (dump_enabled_p ())
7690 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7691 : : "not vectorized: unhandled data-ref in basic "
7692 : : "block.\n");
7693 : 0 : return false;
7694 : : }
7695 : :
7696 : 2285374 : if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
7697 : : {
7698 : 0 : if (dump_enabled_p ())
7699 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7700 : : "not vectorized: unhandled data access in "
7701 : : "basic block.\n");
7702 : 0 : return false;
7703 : : }
7704 : :
7705 : 2285374 : vect_slp_check_for_roots (bb_vinfo);
7706 : :
7707 : : /* If there are no grouped stores and no constructors in the region
7708 : : there is no need to continue with pattern recog as vect_analyze_slp
7709 : : will fail anyway. */
7710 : 2285374 : if (bb_vinfo->grouped_stores.is_empty ()
7711 : 1959152 : && bb_vinfo->roots.is_empty ())
7712 : : {
7713 : 1707659 : if (dump_enabled_p ())
7714 : 948 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7715 : : "not vectorized: no grouped stores in "
7716 : : "basic block.\n");
7717 : 1707659 : return false;
7718 : : }
7719 : :
7720 : : /* While the rest of the analysis below depends on it in some way. */
7721 : 577715 : fatal = false;
7722 : :
7723 : 577715 : vect_pattern_recog (bb_vinfo);
7724 : :
7725 : : /* Update store groups from pattern processing. */
7726 : 577715 : vect_fixup_store_groups_with_patterns (bb_vinfo);
7727 : :
7728 : : /* Check the SLP opportunities in the basic block, analyze and build SLP
7729 : : trees. */
7730 : 577715 : if (!vect_analyze_slp (bb_vinfo, n_stmts))
7731 : : {
7732 : 0 : if (dump_enabled_p ())
7733 : : {
7734 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7735 : : "Failed to SLP the basic block.\n");
7736 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7737 : : "not vectorized: failed to find SLP opportunities "
7738 : : "in basic block.\n");
7739 : : }
7740 : 0 : return false;
7741 : : }
7742 : :
7743 : : /* Optimize permutations. */
7744 : 577715 : vect_optimize_slp (bb_vinfo);
7745 : :
7746 : : /* Gather the loads reachable from the SLP graph entries. */
7747 : 577715 : vect_gather_slp_loads (bb_vinfo);
7748 : :
7749 : 577715 : vect_record_base_alignments (bb_vinfo);
7750 : :
7751 : : /* Analyze and verify the alignment of data references and the
7752 : : dependence in the SLP instances. */
     : : /* Note: no gsi-style increment here -- I is only advanced when the
     : : instance is kept; removal via ordered_remove keeps I in place. */
7752 : 1336997 : for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
7753 : : {
7754 : 759282 : vect_location = instance->location ();
7755 : 759282 : if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
7756 : 759282 : || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
7757 : : {
7758 : 8378 : slp_tree node = SLP_INSTANCE_TREE (instance);
7759 : 8378 : stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
7760 : 8378 : if (dump_enabled_p ())
7761 : 6 : dump_printf_loc (MSG_NOTE, vect_location,
7762 : : "removing SLP instance operations starting from: %G",
7763 : : stmt_info->stmt);
7764 : 8378 : vect_free_slp_instance (instance);
7765 : 8378 : BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
7766 : 8378 : continue;
7767 : 8378 : }
7768 : :
7769 : : /* Mark all the statements that we want to vectorize as pure SLP and
7770 : : relevant. */
7771 : 750904 : vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
7772 : 750904 : vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
7773 : 750904 : unsigned j;
7774 : 750904 : stmt_vec_info root;
7775 : : /* Likewise consider instance root stmts as vectorized. */
7776 : 1667876 : FOR_EACH_VEC_ELT (SLP_INSTANCE_ROOT_STMTS (instance), j, root)
7777 : 166068 : STMT_SLP_TYPE (root) = pure_slp;
7778 : :
7779 : 750904 : i++;
7780 : : }
7781 : 2322455 : if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
7782 : : return false;
7783 : :
7784 : 291219 : if (!vect_slp_analyze_operations (bb_vinfo))
7785 : : {
7786 : 37081 : if (dump_enabled_p ())
7787 : 95 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7788 : : "not vectorized: bad operation in basic block.\n");
7789 : 37081 : return false;
7790 : : }
7791 : :
7792 : 254138 : vect_bb_partition_graph (bb_vinfo);
7793 : :
7794 : 254138 : return true;
7795 : : }
7796 : :
7797 : : /* Subroutine of vect_slp_bb. Try to vectorize the statements for all
7798 : : basic blocks in BBS, returning true on success.
7799 : : The region has N_STMTS statements and has the datarefs given by DATAREFS. */
7800 : :
7801 : : static bool
7802 : 2010135 : vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
7803 : : vec<int> *dataref_groups, unsigned int n_stmts,
7804 : : loop_p orig_loop)
7805 : : {
7806 : 2010135 : bb_vec_info bb_vinfo;
7807 : 2010135 : auto_vector_modes vector_modes;
7808 : :
7809 : : /* Autodetect first vector size we try. */
7810 : 2010135 : machine_mode next_vector_mode = VOIDmode;
7811 : 2010135 : targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
7812 : 2010135 : unsigned int mode_i = 0;
7813 : :
7814 : 2010135 : vec_info_shared shared;
7815 : :
7816 : 2010135 : machine_mode autodetected_vector_mode = VOIDmode;
7817 : 2560613 : while (1)
7818 : : {
7819 : 2285374 : bool vectorized = false;
7820 : 2285374 : bool fatal = false;
7821 : 2285374 : bb_vinfo = new _bb_vec_info (bbs, &shared);
7822 : :
7823 : 2285374 : bool first_time_p = shared.datarefs.is_empty ();
7824 : 2285374 : BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
7825 : 2285374 : if (first_time_p)
7826 : 2033019 : bb_vinfo->shared->save_datarefs ();
7827 : : else
7828 : 252355 : bb_vinfo->shared->check_datarefs ();
7829 : 2285374 : bb_vinfo->vector_mode = next_vector_mode;
7830 : :
7831 : 2285374 : if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
7832 : : {
7833 : 254138 : if (dump_enabled_p ())
7834 : : {
7835 : 1384 : dump_printf_loc (MSG_NOTE, vect_location,
7836 : : "***** Analysis succeeded with vector mode"
7837 : 692 : " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
7838 : 692 : dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
7839 : : }
7840 : :
7841 : 254138 : bb_vinfo->shared->check_datarefs ();
7842 : :
7843 : 254138 : bool force_clear = false;
7844 : 254138 : auto_vec<slp_instance> profitable_subgraphs;
7845 : 1403191 : for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo))
7846 : : {
7847 : 640777 : if (instance->subgraph_entries.is_empty ())
7848 : 146165 : continue;
7849 : :
7850 : 628302 : dump_user_location_t saved_vect_location = vect_location;
7851 : 628302 : vect_location = instance->location ();
7852 : 628302 : if (!unlimited_cost_model (NULL)
7853 : 1252426 : && !vect_bb_vectorization_profitable_p
7854 : 624124 : (bb_vinfo, instance->subgraph_entries, orig_loop))
7855 : : {
7856 : 121215 : if (dump_enabled_p ())
7857 : 33 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7858 : : "not vectorized: vectorization is not "
7859 : : "profitable.\n");
7860 : 121215 : vect_location = saved_vect_location;
7861 : 121215 : continue;
7862 : : }
7863 : :
7864 : 507087 : vect_location = saved_vect_location;
7865 : 507087 : if (!dbg_cnt (vect_slp))
7866 : : {
7867 : 0 : force_clear = true;
7868 : 0 : continue;
7869 : : }
7870 : :
7871 : 507087 : profitable_subgraphs.safe_push (instance);
7872 : : }
7873 : :
7874 : : /* When we're vectorizing an if-converted loop body make sure
7875 : : we vectorized all if-converted code. */
7876 : 440467 : if ((!profitable_subgraphs.is_empty () || force_clear) && orig_loop)
7877 : : {
7878 : 94 : gcc_assert (bb_vinfo->bbs.length () == 1);
7879 : 188 : for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]);
7880 : 3115 : !gsi_end_p (gsi); gsi_next (&gsi))
7881 : : {
7882 : : /* The costing above left us with DCEable vectorized scalar
7883 : : stmts having the visited flag set on profitable
7884 : : subgraphs. Do the delayed clearing of the flag here. */
7885 : 3021 : if (gimple_visited_p (gsi_stmt (gsi)))
7886 : : {
7887 : 412 : gimple_set_visited (gsi_stmt (gsi), false);
7888 : 412 : continue;
7889 : : }
7890 : 2609 : if (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED)
7891 : 934 : continue;
7892 : :
7893 : 4116 : if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi)))
7894 : 1323 : if (gimple_assign_rhs_code (ass) == COND_EXPR)
7895 : : {
7896 : 145 : if (!profitable_subgraphs.is_empty ()
7897 : 52 : && dump_enabled_p ())
7898 : 0 : dump_printf_loc (MSG_NOTE, vect_location,
7899 : : "not profitable because of "
7900 : : "unprofitable if-converted scalar "
7901 : : "code\n");
7902 : 93 : profitable_subgraphs.truncate (0);
7903 : : }
7904 : : }
7905 : : }
7906 : :
7907 : : /* Finally schedule the profitable subgraphs. */
7908 : 1133821 : for (slp_instance instance : profitable_subgraphs)
7909 : : {
7910 : 507025 : if (!vectorized && dump_enabled_p ())
7911 : 661 : dump_printf_loc (MSG_NOTE, vect_location,
7912 : : "Basic block will be vectorized "
7913 : : "using SLP\n");
7914 : 507025 : vectorized = true;
7915 : :
7916 : : /* Dump before scheduling as store vectorization will remove
7917 : : the original stores and mess with the instance tree
7918 : : so querying its location will eventually ICE. */
7919 : 507025 : if (flag_checking)
7920 : 2035192 : for (slp_instance sub : instance->subgraph_entries)
7921 : 514117 : gcc_assert (SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub)));
7922 : 507025 : unsigned HOST_WIDE_INT bytes;
7923 : 507025 : if (dump_enabled_p ())
7924 : 3233 : for (slp_instance sub : instance->subgraph_entries)
7925 : : {
7926 : 857 : tree vtype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub));
7927 : 1714 : if (GET_MODE_SIZE (TYPE_MODE (vtype)).is_constant (&bytes))
7928 : 857 : dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
7929 : 857 : sub->location (),
7930 : : "basic block part vectorized using %wu "
7931 : : "byte vectors\n", bytes);
7932 : : else
7933 : : dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
7934 : : sub->location (),
7935 : : "basic block part vectorized using "
7936 : : "variable length vectors\n");
7937 : : }
7938 : :
7939 : 507025 : dump_user_location_t saved_vect_location = vect_location;
7940 : 507025 : vect_location = instance->location ();
7941 : :
7942 : 507025 : vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
7943 : :
7944 : 507025 : vect_location = saved_vect_location;
7945 : : }
7946 : 254138 : }
7947 : : else
7948 : : {
7949 : 2031236 : if (dump_enabled_p ())
7950 : 1254 : dump_printf_loc (MSG_NOTE, vect_location,
7951 : : "***** Analysis failed with vector mode %s\n",
7952 : 1254 : GET_MODE_NAME (bb_vinfo->vector_mode));
7953 : : }
7954 : :
7955 : 2285374 : if (mode_i == 0)
7956 : 2010135 : autodetected_vector_mode = bb_vinfo->vector_mode;
7957 : :
7958 : 2285374 : if (!fatal)
7959 : 3200010 : while (mode_i < vector_modes.length ()
7960 : 1712972 : && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
7961 : : {
7962 : 336921 : if (dump_enabled_p ())
7963 : 1536 : dump_printf_loc (MSG_NOTE, vect_location,
7964 : : "***** The result for vector mode %s would"
7965 : : " be the same\n",
7966 : 768 : GET_MODE_NAME (vector_modes[mode_i]));
7967 : 336921 : mode_i += 1;
7968 : : }
7969 : :
7970 : 2285374 : delete bb_vinfo;
7971 : :
7972 : 2285374 : if (mode_i < vector_modes.length ()
7973 : 2132597 : && VECTOR_MODE_P (autodetected_vector_mode)
7974 : 1981716 : && (related_vector_mode (vector_modes[mode_i],
7975 : : GET_MODE_INNER (autodetected_vector_mode))
7976 : 990858 : == autodetected_vector_mode)
7977 : 4417971 : && (related_vector_mode (autodetected_vector_mode,
7978 : 529057 : GET_MODE_INNER (vector_modes[mode_i]))
7979 : 1058114 : == vector_modes[mode_i]))
7980 : : {
7981 : 529057 : if (dump_enabled_p ())
7982 : 229 : dump_printf_loc (MSG_NOTE, vect_location,
7983 : : "***** Skipping vector mode %s, which would"
7984 : : " repeat the analysis for %s\n",
7985 : 229 : GET_MODE_NAME (vector_modes[mode_i]),
7986 : 229 : GET_MODE_NAME (autodetected_vector_mode));
7987 : 529057 : mode_i += 1;
7988 : : }
7989 : :
7990 : 2285374 : if (vectorized
7991 : 2099097 : || mode_i == vector_modes.length ()
7992 : 1946421 : || autodetected_vector_mode == VOIDmode
7993 : : /* If vect_slp_analyze_bb_1 signaled that analysis for all
7994 : : vector sizes will fail do not bother iterating. */
7995 : 3090056 : || fatal)
7996 : 4020270 : return vectorized;
7997 : :
7998 : : /* Try the next biggest vector size. */
7999 : 275239 : next_vector_mode = vector_modes[mode_i++];
8000 : 275239 : if (dump_enabled_p ())
8001 : 232 : dump_printf_loc (MSG_NOTE, vect_location,
8002 : : "***** Re-trying analysis with vector mode %s\n",
8003 : 232 : GET_MODE_NAME (next_vector_mode));
8004 : 275239 : }
8005 : 2010135 : }
8006 : :
8007 : :
8008 : : /* Main entry for the BB vectorizer. Analyze and transform BBS, returns
8009 : : true if anything in the basic-block was vectorized. */
8010 : :
8011 : : static bool
8012 : 2010135 : vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop)
8013 : : {
8014 : 2010135 : vec<data_reference_p> datarefs = vNULL;
8015 : 2010135 : auto_vec<int> dataref_groups;
8016 : 2010135 : int insns = 0;
8017 : 2010135 : int current_group = 0;
8018 : :
8019 : 23786634 : for (unsigned i = 0; i < bbs.length (); i++)
8020 : : {
8021 : 9883182 : basic_block bb = bbs[i];
8022 : 75363717 : for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
8023 : 65480535 : gsi_next (&gsi))
8024 : : {
8025 : 65480535 : gimple *stmt = gsi_stmt (gsi);
8026 : 65480535 : if (is_gimple_debug (stmt))
8027 : 38251678 : continue;
8028 : :
8029 : 27228857 : insns++;
8030 : :
8031 : 27228857 : if (gimple_location (stmt) != UNKNOWN_LOCATION)
8032 : 24871040 : vect_location = stmt;
8033 : :
8034 : 27228857 : if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
8035 : 27228857 : &dataref_groups, current_group))
8036 : 4971574 : ++current_group;
8037 : : }
8038 : : /* New BBs always start a new DR group. */
8039 : 9883182 : ++current_group;
8040 : : }
8041 : :
8042 : 2010135 : return vect_slp_region (bbs, datarefs, &dataref_groups, insns, orig_loop);
8043 : 2010135 : }
8044 : :
8045 : : /* Special entry for the BB vectorizer. Analyze and transform a single
8046 : : if-converted BB with ORIG_LOOPs body being the not if-converted
8047 : : representation. Returns true if anything in the basic-block was
8048 : : vectorized. */
8049 : :
8050 : : bool
8051 : 15165 : vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop)
8052 : : {
8053 : 15165 : auto_vec<basic_block> bbs;
8054 : 15165 : bbs.safe_push (bb);
8055 : 15165 : return vect_slp_bbs (bbs, orig_loop);
8056 : 15165 : }
8057 : :
8058 : : /* Main entry for the BB vectorizer. Analyze and transform BB, returns
8059 : : true if anything in the basic-block was vectorized. */
8060 : :
8061 : : bool
8062 : 864381 : vect_slp_function (function *fun)
8063 : : {
8064 : 864381 : bool r = false;
8065 : 864381 : int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
8066 : 864381 : auto_bitmap exit_bbs;
8067 : 864381 : bitmap_set_bit (exit_bbs, EXIT_BLOCK);
8068 : 864381 : edge entry = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (fun));
8069 : 864381 : unsigned n = rev_post_order_and_mark_dfs_back_seme (fun, entry, exit_bbs,
8070 : 864381 : true, rpo, NULL);
8071 : :
8072 : : /* For the moment split the function into pieces to avoid making
8073 : : the iteration on the vector mode moot. Split at points we know
8074 : : to not handle well which is CFG merges (SLP discovery doesn't
8075 : : handle non-loop-header PHIs) and loop exits. Since pattern
8076 : : recog requires reverse iteration to visit uses before defs
8077 : : simply chop RPO into pieces. */
8078 : 864381 : auto_vec<basic_block> bbs;
8079 : 10820407 : for (unsigned i = 0; i < n; i++)
8080 : : {
8081 : 9956026 : basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
8082 : 9956026 : bool split = false;
8083 : :
8084 : : /* Split when a BB is not dominated by the first block. */
8085 : 18632783 : if (!bbs.is_empty ()
8086 : 8676757 : && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
8087 : : {
8088 : 778795 : if (dump_enabled_p ())
8089 : 99 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8090 : : "splitting region at dominance boundary bb%d\n",
8091 : : bb->index);
8092 : : split = true;
8093 : : }
8094 : : /* Split when the loop determined by the first block
8095 : : is exited. This is because we eventually insert
8096 : : invariants at region begin. */
8097 : 17075193 : else if (!bbs.is_empty ()
8098 : 7897962 : && bbs[0]->loop_father != bb->loop_father
8099 : 1877381 : && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
8100 : : {
8101 : 5602 : if (dump_enabled_p ())
8102 : 6 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8103 : : "splitting region at loop %d exit at bb%d\n",
8104 : 3 : bbs[0]->loop_father->num, bb->index);
8105 : : split = true;
8106 : : }
8107 : 9171629 : else if (!bbs.is_empty ()
8108 : 7892360 : && bb->loop_father->header == bb
8109 : 403502 : && bb->loop_father->dont_vectorize)
8110 : : {
8111 : 19313 : if (dump_enabled_p ())
8112 : 62 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8113 : : "splitting region at dont-vectorize loop %d "
8114 : : "entry at bb%d\n",
8115 : : bb->loop_father->num, bb->index);
8116 : : split = true;
8117 : : }
8118 : :
8119 : 10759736 : if (split && !bbs.is_empty ())
8120 : : {
8121 : 803710 : r |= vect_slp_bbs (bbs, NULL);
8122 : 803710 : bbs.truncate (0);
8123 : : }
8124 : :
8125 : 9956026 : if (bbs.is_empty ())
8126 : : {
8127 : : /* We need to be able to insert at the head of the region which
8128 : : we cannot for region starting with a returns-twice call. */
8129 : 2082979 : if (gcall *first = safe_dyn_cast <gcall *> (first_stmt (bb)))
8130 : 386238 : if (gimple_call_flags (first) & ECF_RETURNS_TWICE)
8131 : : {
8132 : 284 : if (dump_enabled_p ())
8133 : 2 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8134 : : "skipping bb%d as start of region as it "
8135 : : "starts with returns-twice call\n",
8136 : : bb->index);
8137 : 88009 : continue;
8138 : : }
8139 : : /* If the loop this BB belongs to is marked as not to be vectorized
8140 : : honor that also for BB vectorization. */
8141 : 2082695 : if (bb->loop_father->dont_vectorize)
8142 : 87725 : continue;
8143 : : }
8144 : :
8145 : 9868017 : bbs.safe_push (bb);
8146 : :
8147 : : /* When we have a stmt ending this block and defining a
8148 : : value we have to insert on edges when inserting after it for
8149 : : a vector containing its definition. Avoid this for now. */
8150 : 19736034 : if (gimple *last = *gsi_last_bb (bb))
8151 : 7856095 : if (gimple_get_lhs (last)
8152 : 7856095 : && is_ctrl_altering_stmt (last))
8153 : : {
8154 : 326893 : if (dump_enabled_p ())
8155 : 2 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8156 : : "splitting region at control altering "
8157 : : "definition %G", last);
8158 : 326893 : r |= vect_slp_bbs (bbs, NULL);
8159 : 326893 : bbs.truncate (0);
8160 : : }
8161 : : }
8162 : :
8163 : 864381 : if (!bbs.is_empty ())
8164 : 864367 : r |= vect_slp_bbs (bbs, NULL);
8165 : :
8166 : 864381 : free (rpo);
8167 : :
8168 : 864381 : return r;
8169 : 864381 : }
8170 : :
8171 : : /* Build a variable-length vector in which the elements in ELTS are repeated
8172 : : to a fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
8173 : : RESULTS and add any new instructions to SEQ.
8174 : :
8175 : : The approach we use is:
8176 : :
8177 : : (1) Find a vector mode VM with integer elements of mode IM.
8178 : :
8179 : : (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
8180 : : ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
8181 : : from small vectors to IM.
8182 : :
8183 : : (3) Duplicate each ELTS'[I] into a vector of mode VM.
8184 : :
8185 : : (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
8186 : : correct byte contents.
8187 : :
8188 : : (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
8189 : :
8190 : : We try to find the largest IM for which this sequence works, in order
8191 : : to cut down on the number of interleaves. */
8192 : :
8193 : : void
8194 : 0 : duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
8195 : : const vec<tree> &elts, unsigned int nresults,
8196 : : vec<tree> &results)
8197 : : {
8198 : 0 : unsigned int nelts = elts.length ();
8199 : 0 : tree element_type = TREE_TYPE (vector_type);
8200 : :
8201 : : /* (1) Find a vector mode VM with integer elements of mode IM. */
8202 : 0 : unsigned int nvectors = 1;
8203 : 0 : tree new_vector_type;
8204 : 0 : tree permutes[2];
8205 : 0 : if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
8206 : : &nvectors, &new_vector_type,
8207 : : permutes))
8208 : 0 : gcc_unreachable ();
8209 : :
8210 : : /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
8211 : 0 : unsigned int partial_nelts = nelts / nvectors;
8212 : 0 : tree partial_vector_type = build_vector_type (element_type, partial_nelts);
8213 : :
8214 : 0 : tree_vector_builder partial_elts;
8215 : 0 : auto_vec<tree, 32> pieces (nvectors * 2);
8216 : 0 : pieces.quick_grow_cleared (nvectors * 2);
8217 : 0 : for (unsigned int i = 0; i < nvectors; ++i)
8218 : : {
8219 : : /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
8220 : : ELTS' has mode IM. */
8221 : 0 : partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
8222 : 0 : for (unsigned int j = 0; j < partial_nelts; ++j)
8223 : 0 : partial_elts.quick_push (elts[i * partial_nelts + j]);
8224 : 0 : tree t = gimple_build_vector (seq, &partial_elts);
8225 : 0 : t = gimple_build (seq, VIEW_CONVERT_EXPR,
8226 : 0 : TREE_TYPE (new_vector_type), t);
8227 : :
8228 : : /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
8229 : 0 : pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
8230 : : }
8231 : :
8232 : : /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
8233 : : correct byte contents.
8234 : :
8235 : : Conceptually, we need to repeat the following operation log2(nvectors)
8236 : : times, where hi_start = nvectors / 2:
8237 : :
8238 : : out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
8239 : : out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
8240 : :
8241 : : However, if each input repeats every N elements and the VF is
8242 : : a multiple of N * 2, the HI result is the same as the LO result.
8243 : : This will be true for the first N1 iterations of the outer loop,
8244 : : followed by N2 iterations for which both the LO and HI results
8245 : : are needed. I.e.:
8246 : :
8247 : : N1 + N2 = log2(nvectors)
8248 : :
8249 : : Each "N1 iteration" doubles the number of redundant vectors and the
8250 : : effect of the process as a whole is to have a sequence of nvectors/2**N1
8251 : : vectors that repeats 2**N1 times. Rather than generate these redundant
8252 : : vectors, we halve the number of vectors for each N1 iteration. */
8253 : : unsigned int in_start = 0;
8254 : : unsigned int out_start = nvectors;
8255 : : unsigned int new_nvectors = nvectors;
8256 : 0 : for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
8257 : : {
8258 : 0 : unsigned int hi_start = new_nvectors / 2;
8259 : 0 : unsigned int out_i = 0;
8260 : 0 : for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i)
8261 : : {
8262 : 0 : if ((in_i & 1) != 0
8263 : 0 : && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
8264 : : 2 * in_repeat))
8265 : 0 : continue;
8266 : :
8267 : 0 : tree output = make_ssa_name (new_vector_type);
8268 : 0 : tree input1 = pieces[in_start + (in_i / 2)];
8269 : 0 : tree input2 = pieces[in_start + (in_i / 2) + hi_start];
8270 : 0 : gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
8271 : : input1, input2,
8272 : : permutes[in_i & 1]);
8273 : 0 : gimple_seq_add_stmt (seq, stmt);
8274 : 0 : pieces[out_start + out_i] = output;
8275 : 0 : out_i += 1;
8276 : : }
8277 : 0 : std::swap (in_start, out_start);
8278 : 0 : new_nvectors = out_i;
8279 : : }
8280 : :
8281 : : /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
8282 : 0 : results.reserve (nresults);
8283 : 0 : for (unsigned int i = 0; i < nresults; ++i)
8284 : 0 : if (i < new_nvectors)
8285 : 0 : results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
8286 : 0 : pieces[in_start + i]));
8287 : : else
8288 : 0 : results.quick_push (results[i - new_nvectors]);
8289 : 0 : }
8290 : :
8291 : :
8292 : : /* For constant and loop invariant defs in OP_NODE this function creates
8293 : : vector defs that will be used in the vectorized stmts and stores them
8294 : : to SLP_TREE_VEC_DEFS of OP_NODE. */
8295 : :
8296 : : static void
8297 : 443217 : vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
8298 : : {
8299 : 443217 : unsigned HOST_WIDE_INT nunits;
8300 : 443217 : tree vec_cst;
8301 : 443217 : unsigned j, number_of_places_left_in_vector;
8302 : 443217 : tree vector_type;
8303 : 443217 : tree vop;
8304 : 443217 : int group_size = op_node->ops.length ();
8305 : 443217 : unsigned int vec_num, i;
8306 : 443217 : unsigned number_of_copies = 1;
8307 : 443217 : bool constant_p;
8308 : 443217 : gimple_seq ctor_seq = NULL;
8309 : 443217 : auto_vec<tree, 16> permute_results;
8310 : :
8311 : : /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
8312 : 443217 : vector_type = SLP_TREE_VECTYPE (op_node);
8313 : :
8314 : 443217 : unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
8315 : 443217 : SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
8316 : 443217 : auto_vec<tree> voprnds (number_of_vectors);
8317 : :
8318 : : /* NUMBER_OF_COPIES is the number of times we need to use the same values in
8319 : : created vectors. It is greater than 1 if unrolling is performed.
8320 : :
8321 : : For example, we have two scalar operands, s1 and s2 (e.g., group of
8322 : : strided accesses of size two), while NUNITS is four (i.e., four scalars
8323 : : of this type can be packed in a vector). The output vector will contain
8324 : : two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
8325 : : will be 2).
8326 : :
8327 : : If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
8328 : : containing the operands.
8329 : :
8330 : : For example, NUNITS is four as before, and the group size is 8
8331 : : (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
8332 : : {s5, s6, s7, s8}. */
8333 : :
8334 : : /* When using duplicate_and_interleave, we just need one element for
8335 : : each scalar statement. */
8336 : 443217 : if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
8337 : : nunits = group_size;
8338 : :
8339 : 443217 : number_of_copies = nunits * number_of_vectors / group_size;
8340 : :
8341 : 443217 : number_of_places_left_in_vector = nunits;
8342 : 443217 : constant_p = true;
8343 : 443217 : tree uniform_elt = NULL_TREE;
8344 : 443217 : tree_vector_builder elts (vector_type, nunits, 1);
8345 : 443217 : elts.quick_grow (nunits);
8346 : 443217 : stmt_vec_info insert_after = NULL;
8347 : 892623 : for (j = 0; j < number_of_copies; j++)
8348 : : {
8349 : 449406 : tree op;
8350 : 2131256 : for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
8351 : : {
8352 : : /* Create 'vect_ = {op0,op1,...,opn}'. */
8353 : 1232444 : tree orig_op = op;
8354 : 1232444 : if (number_of_places_left_in_vector == nunits)
8355 : : uniform_elt = op;
8356 : 696869 : else if (uniform_elt && operand_equal_p (uniform_elt, op))
8357 : 270492 : op = elts[number_of_places_left_in_vector];
8358 : : else
8359 : : uniform_elt = NULL_TREE;
8360 : 1232444 : number_of_places_left_in_vector--;
8361 : 1232444 : if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
8362 : : {
8363 : 300828 : if (CONSTANT_CLASS_P (op))
8364 : : {
8365 : 82890 : if (VECTOR_BOOLEAN_TYPE_P (vector_type))
8366 : : {
8367 : : /* Can't use VIEW_CONVERT_EXPR for booleans because
8368 : : of possibly different sizes of scalar value and
8369 : : vector element. */
8370 : 0 : if (integer_zerop (op))
8371 : 0 : op = build_int_cst (TREE_TYPE (vector_type), 0);
8372 : 0 : else if (integer_onep (op))
8373 : 0 : op = build_all_ones_cst (TREE_TYPE (vector_type));
8374 : : else
8375 : 0 : gcc_unreachable ();
8376 : : }
8377 : : else
8378 : 82890 : op = fold_unary (VIEW_CONVERT_EXPR,
8379 : : TREE_TYPE (vector_type), op);
8380 : 82890 : gcc_assert (op && CONSTANT_CLASS_P (op));
8381 : : }
8382 : : else
8383 : : {
8384 : 217938 : tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
8385 : 217938 : gimple *init_stmt;
8386 : 217938 : if (VECTOR_BOOLEAN_TYPE_P (vector_type))
8387 : : {
8388 : 0 : tree true_val
8389 : 0 : = build_all_ones_cst (TREE_TYPE (vector_type));
8390 : 0 : tree false_val
8391 : 0 : = build_zero_cst (TREE_TYPE (vector_type));
8392 : 0 : gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
8393 : 0 : init_stmt = gimple_build_assign (new_temp, COND_EXPR,
8394 : : op, true_val,
8395 : : false_val);
8396 : : }
8397 : : else
8398 : : {
8399 : 217938 : op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
8400 : : op);
8401 : 217938 : init_stmt
8402 : 217938 : = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
8403 : : op);
8404 : : }
8405 : 217938 : gimple_seq_add_stmt (&ctor_seq, init_stmt);
8406 : 217938 : op = new_temp;
8407 : : }
8408 : : }
8409 : 1232444 : elts[number_of_places_left_in_vector] = op;
8410 : 1232444 : if (!CONSTANT_CLASS_P (op))
8411 : 341297 : constant_p = false;
8412 : : /* For BB vectorization we have to compute an insert location
8413 : : when a def is inside the analyzed region since we cannot
8414 : : simply insert at the BB start in this case. */
8415 : 1232444 : stmt_vec_info opdef;
8416 : 1232444 : if (TREE_CODE (orig_op) == SSA_NAME
8417 : 218980 : && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
8418 : 205008 : && is_a <bb_vec_info> (vinfo)
8419 : 1431064 : && (opdef = vinfo->lookup_def (orig_op)))
8420 : : {
8421 : 154493 : if (!insert_after)
8422 : : insert_after = opdef;
8423 : : else
8424 : 81269 : insert_after = get_later_stmt (insert_after, opdef);
8425 : : }
8426 : :
8427 : 1232444 : if (number_of_places_left_in_vector == 0)
8428 : : {
8429 : 535575 : auto type_nunits = TYPE_VECTOR_SUBPARTS (vector_type);
8430 : 535575 : if (uniform_elt)
8431 : 469236 : vec_cst = gimple_build_vector_from_val (&ctor_seq, vector_type,
8432 : 234618 : elts[0]);
8433 : 300957 : else if (constant_p
8434 : 466065 : ? multiple_p (type_nunits, nunits)
8435 : 135849 : : known_eq (type_nunits, nunits))
8436 : 300957 : vec_cst = gimple_build_vector (&ctor_seq, &elts);
8437 : : else
8438 : : {
8439 : 0 : if (permute_results.is_empty ())
8440 : 0 : duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
8441 : : elts, number_of_vectors,
8442 : : permute_results);
8443 : 0 : vec_cst = permute_results[number_of_vectors - j - 1];
8444 : : }
8445 : 535575 : if (!gimple_seq_empty_p (ctor_seq))
8446 : : {
8447 : 157715 : if (insert_after)
8448 : : {
8449 : 73224 : gimple_stmt_iterator gsi;
8450 : 73224 : if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
8451 : : {
8452 : 8568 : gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
8453 : 8568 : gsi_insert_seq_before (&gsi, ctor_seq,
8454 : : GSI_CONTINUE_LINKING);
8455 : : }
8456 : 64656 : else if (!stmt_ends_bb_p (insert_after->stmt))
8457 : : {
8458 : 64656 : gsi = gsi_for_stmt (insert_after->stmt);
8459 : 64656 : gsi_insert_seq_after (&gsi, ctor_seq,
8460 : : GSI_CONTINUE_LINKING);
8461 : : }
8462 : : else
8463 : : {
8464 : : /* When we want to insert after a def where the
8465 : : defining stmt throws then insert on the fallthru
8466 : : edge. */
8467 : 0 : edge e = find_fallthru_edge
8468 : 0 : (gimple_bb (insert_after->stmt)->succs);
8469 : 0 : basic_block new_bb
8470 : 0 : = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
8471 : 0 : gcc_assert (!new_bb);
8472 : : }
8473 : : }
8474 : : else
8475 : 84491 : vinfo->insert_seq_on_entry (NULL, ctor_seq);
8476 : 157715 : ctor_seq = NULL;
8477 : : }
8478 : 535575 : voprnds.quick_push (vec_cst);
8479 : 535575 : insert_after = NULL;
8480 : 535575 : number_of_places_left_in_vector = nunits;
8481 : 535575 : constant_p = true;
8482 : 535575 : elts.new_vector (vector_type, nunits, 1);
8483 : 535575 : elts.quick_grow (nunits);
8484 : : }
8485 : : }
8486 : : }
8487 : :
8488 : : /* Since the vectors are created in the reverse order, we should invert
8489 : : them. */
8490 : 443217 : vec_num = voprnds.length ();
8491 : 978792 : for (j = vec_num; j != 0; j--)
8492 : : {
8493 : 535575 : vop = voprnds[j - 1];
8494 : 535575 : SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
8495 : : }
8496 : :
8497 : : /* In case that VF is greater than the unrolling factor needed for the SLP
8498 : : group of stmts, NUMBER_OF_VECTORS to be created is greater than
8499 : : NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
8500 : : to replicate the vectors. */
8501 : 443217 : while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
8502 : 443217 : for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
8503 : : i++)
8504 : 0 : SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
8505 : 443217 : }
8506 : :
8507 : : /* Get the Ith vectorized definition from SLP_NODE. */
8508 : :
8509 : : tree
8510 : 42531 : vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
8511 : : {
8512 : 42531 : return SLP_TREE_VEC_DEFS (slp_node)[i];
8513 : : }
8514 : :
8515 : : /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
8516 : :
8517 : : void
8518 : 600405 : vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
8519 : : {
8520 : 600405 : vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
8521 : 600405 : vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
8522 : 600405 : }
8523 : :
8524 : : /* Get N vectorized definitions for SLP_NODE. */
8525 : :
8526 : : void
8527 : 1038 : vect_get_slp_defs (vec_info *,
8528 : : slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
8529 : : {
8530 : 1038 : if (n == -1U)
8531 : 1038 : n = SLP_TREE_CHILDREN (slp_node).length ();
8532 : :
8533 : 3588 : for (unsigned i = 0; i < n; ++i)
8534 : : {
8535 : 2550 : slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
8536 : 2550 : vec<tree> vec_defs = vNULL;
8537 : 2550 : vect_get_slp_defs (child, &vec_defs);
8538 : 2550 : vec_oprnds->quick_push (vec_defs);
8539 : : }
8540 : 1038 : }
8541 : :
8542 : : /* A subroutine of vect_transform_slp_perm_load with two extra arguments:
8543 : : - PERM gives the permutation that the caller wants to use for NODE,
8544 : : which might be different from SLP_LOAD_PERMUTATION.
8545 : : - DUMP_P controls whether the function dumps information. */
8546 : :
8547 : : static bool
8548 : 103608 : vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node,
8549 : : load_permutation_t &perm,
8550 : : const vec<tree> &dr_chain,
8551 : : gimple_stmt_iterator *gsi, poly_uint64 vf,
8552 : : bool analyze_only, bool dump_p,
8553 : : unsigned *n_perms, unsigned int *n_loads,
8554 : : bool dce_chain)
8555 : : {
8556 : 103608 : stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
8557 : 103608 : int vec_index = 0;
8558 : 103608 : tree vectype = SLP_TREE_VECTYPE (node);
8559 : 103608 : unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
8560 : 103608 : unsigned int mask_element;
8561 : 103608 : unsigned dr_group_size;
8562 : 103608 : machine_mode mode;
8563 : :
8564 : 103608 : if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
8565 : : dr_group_size = 1;
8566 : : else
8567 : : {
8568 : 102224 : stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
8569 : 102224 : dr_group_size = DR_GROUP_SIZE (stmt_info);
8570 : : }
8571 : :
8572 : 103608 : mode = TYPE_MODE (vectype);
8573 : 103608 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8574 : 103608 : unsigned int nstmts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8575 : :
8576 : : /* Initialize the vect stmts of NODE to properly insert the generated
8577 : : stmts later. */
8578 : 103608 : if (! analyze_only)
8579 : 29467 : for (unsigned i = SLP_TREE_VEC_DEFS (node).length (); i < nstmts; i++)
8580 : 11627 : SLP_TREE_VEC_DEFS (node).quick_push (NULL_TREE);
8581 : :
8582 : : /* Generate permutation masks for every NODE. Number of masks for each NODE
8583 : : is equal to GROUP_SIZE.
8584 : : E.g., we have a group of three nodes with three loads from the same
8585 : : location in each node, and the vector size is 4. I.e., we have a
8586 : : a0b0c0a1b1c1... sequence and we need to create the following vectors:
8587 : : for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
8588 : : for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
8589 : : ...
8590 : :
8591 : : The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
8592 : : The last mask is illegal since we assume two operands for permute
8593 : : operation, and the mask element values can't be outside that range.
8594 : : Hence, the last mask must be converted into {2,5,5,5}.
8595 : : For the first two permutations we need the first and the second input
8596 : : vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
8597 : : we need the second and the third vectors: {b1,c1,a2,b2} and
8598 : : {c2,a3,b3,c3}. */
8599 : :
8600 : 103608 : int vect_stmts_counter = 0;
8601 : 103608 : unsigned int index = 0;
8602 : 103608 : int first_vec_index = -1;
8603 : 103608 : int second_vec_index = -1;
8604 : 103608 : bool noop_p = true;
8605 : 103608 : *n_perms = 0;
8606 : :
8607 : 103608 : vec_perm_builder mask;
8608 : 103608 : unsigned int nelts_to_build;
8609 : 103608 : unsigned int nvectors_per_build;
8610 : 103608 : unsigned int in_nlanes;
8611 : 103608 : bool repeating_p = (group_size == dr_group_size
8612 : 155474 : && multiple_p (nunits, group_size));
8613 : 103608 : if (repeating_p)
8614 : : {
8615 : : /* A single vector contains a whole number of copies of the node, so:
8616 : : (a) all permutes can use the same mask; and
8617 : : (b) the permutes only need a single vector input. */
8618 : 41851 : mask.new_vector (nunits, group_size, 3);
8619 : 41851 : nelts_to_build = mask.encoded_nelts ();
8620 : : /* It's possible to obtain zero nstmts during analyze_only, so make
8621 : : it at least one to ensure the later computation for n_perms
8622 : : proceed. */
8623 : 41851 : nvectors_per_build = nstmts > 0 ? nstmts : 1;
8624 : 41851 : in_nlanes = dr_group_size * 3;
8625 : : }
8626 : : else
8627 : : {
8628 : : /* We need to construct a separate mask for each vector statement. */
8629 : 61757 : unsigned HOST_WIDE_INT const_nunits, const_vf;
8630 : 61757 : if (!nunits.is_constant (&const_nunits)
8631 : 61757 : || !vf.is_constant (&const_vf))
8632 : : return false;
8633 : 61757 : mask.new_vector (const_nunits, const_nunits, 1);
8634 : 61757 : nelts_to_build = const_vf * group_size;
8635 : 61757 : nvectors_per_build = 1;
8636 : 61757 : in_nlanes = const_vf * dr_group_size;
8637 : : }
8638 : 103608 : auto_sbitmap used_in_lanes (in_nlanes);
8639 : 103608 : bitmap_clear (used_in_lanes);
8640 : 103608 : auto_bitmap used_defs;
8641 : :
8642 : 103608 : unsigned int count = mask.encoded_nelts ();
8643 : 103608 : mask.quick_grow (count);
8644 : 103608 : vec_perm_indices indices;
8645 : :
8646 : 591267 : for (unsigned int j = 0; j < nelts_to_build; j++)
8647 : : {
8648 : 492405 : unsigned int iter_num = j / group_size;
8649 : 492405 : unsigned int stmt_num = j % group_size;
8650 : 492405 : unsigned int i = (iter_num * dr_group_size + perm[stmt_num]);
8651 : 492405 : bitmap_set_bit (used_in_lanes, i);
8652 : 492405 : if (repeating_p)
8653 : : {
8654 : : first_vec_index = 0;
8655 : : mask_element = i;
8656 : : }
8657 : : else
8658 : : {
8659 : : /* Enforced before the loop when !repeating_p. */
8660 : 220995 : unsigned int const_nunits = nunits.to_constant ();
8661 : 220995 : vec_index = i / const_nunits;
8662 : 220995 : mask_element = i % const_nunits;
8663 : 220995 : if (vec_index == first_vec_index
8664 : 220995 : || first_vec_index == -1)
8665 : : {
8666 : : first_vec_index = vec_index;
8667 : : }
8668 : 52224 : else if (vec_index == second_vec_index
8669 : 52224 : || second_vec_index == -1)
8670 : : {
8671 : 51822 : second_vec_index = vec_index;
8672 : 51822 : mask_element += const_nunits;
8673 : : }
8674 : : else
8675 : : {
8676 : 402 : if (dump_p)
8677 : 79 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8678 : : "permutation requires at "
8679 : : "least three vectors %G",
8680 : : stmt_info->stmt);
8681 : 402 : gcc_assert (analyze_only);
8682 : : return false;
8683 : : }
8684 : :
8685 : 220593 : gcc_assert (mask_element < 2 * const_nunits);
8686 : : }
8687 : :
8688 : 492003 : if (mask_element != index)
8689 : 278404 : noop_p = false;
8690 : 492003 : mask[index++] = mask_element;
8691 : :
8692 : 492003 : if (index == count)
8693 : : {
8694 : 140454 : if (!noop_p)
8695 : : {
8696 : 134038 : indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
8697 : 92394 : if (!can_vec_perm_const_p (mode, mode, indices))
8698 : : {
8699 : 4344 : if (dump_p)
8700 : : {
8701 : 193 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8702 : : "unsupported vect permute { ");
8703 : 1279 : for (i = 0; i < count; ++i)
8704 : : {
8705 : 1086 : dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
8706 : 1086 : dump_printf (MSG_MISSED_OPTIMIZATION, " ");
8707 : : }
8708 : 193 : dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
8709 : : }
8710 : 4344 : gcc_assert (analyze_only);
8711 : : return false;
8712 : : }
8713 : :
8714 : 88050 : tree mask_vec = NULL_TREE;
8715 : 88050 : if (!analyze_only)
8716 : 10285 : mask_vec = vect_gen_perm_mask_checked (vectype, indices);
8717 : :
8718 : 88050 : if (second_vec_index == -1)
8719 : 48050 : second_vec_index = first_vec_index;
8720 : :
8721 : 176648 : for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
8722 : : {
8723 : 88598 : ++*n_perms;
8724 : 88598 : if (analyze_only)
8725 : 78206 : continue;
8726 : : /* Generate the permute statement if necessary. */
8727 : 10392 : tree first_vec = dr_chain[first_vec_index + ri];
8728 : 10392 : tree second_vec = dr_chain[second_vec_index + ri];
8729 : 10392 : gassign *stmt = as_a<gassign *> (stmt_info->stmt);
8730 : 10392 : tree perm_dest
8731 : 10392 : = vect_create_destination_var (gimple_assign_lhs (stmt),
8732 : : vectype);
8733 : 10392 : perm_dest = make_ssa_name (perm_dest);
8734 : 10392 : gimple *perm_stmt
8735 : 10392 : = gimple_build_assign (perm_dest, VEC_PERM_EXPR, first_vec,
8736 : : second_vec, mask_vec);
8737 : 10392 : vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
8738 : : gsi);
8739 : 10392 : if (dce_chain)
8740 : : {
8741 : 9630 : bitmap_set_bit (used_defs, first_vec_index + ri);
8742 : 9630 : bitmap_set_bit (used_defs, second_vec_index + ri);
8743 : : }
8744 : :
8745 : : /* Store the vector statement in NODE. */
8746 : 10392 : SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = perm_dest;
8747 : : }
8748 : : }
8749 : 48060 : else if (!analyze_only)
8750 : : {
8751 : 2470 : for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
8752 : : {
8753 : 1235 : tree first_vec = dr_chain[first_vec_index + ri];
8754 : : /* If mask was NULL_TREE generate the requested
8755 : : identity transform. */
8756 : 1235 : if (dce_chain)
8757 : 1235 : bitmap_set_bit (used_defs, first_vec_index + ri);
8758 : :
8759 : : /* Store the vector statement in NODE. */
8760 : 1235 : SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = first_vec;
8761 : : }
8762 : : }
8763 : :
8764 : : index = 0;
8765 : : first_vec_index = -1;
8766 : : second_vec_index = -1;
8767 : : noop_p = true;
8768 : : }
8769 : : }
8770 : :
8771 : 98862 : if (n_loads)
8772 : : {
8773 : 1274 : if (repeating_p)
8774 : 1141 : *n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
8775 : : else
8776 : : {
8777 : : /* Enforced above when !repeating_p. */
8778 : 133 : unsigned int const_nunits = nunits.to_constant ();
8779 : 133 : *n_loads = 0;
8780 : 133 : bool load_seen = false;
8781 : 3835 : for (unsigned i = 0; i < in_nlanes; ++i)
8782 : : {
8783 : 3702 : if (i % const_nunits == 0)
8784 : : {
8785 : 241 : if (load_seen)
8786 : 106 : *n_loads += 1;
8787 : : load_seen = false;
8788 : : }
8789 : 3702 : if (bitmap_bit_p (used_in_lanes, i))
8790 : 530 : load_seen = true;
8791 : : }
8792 : 133 : if (load_seen)
8793 : 131 : *n_loads += 1;
8794 : : }
8795 : : }
8796 : :
8797 : 98862 : if (dce_chain)
8798 : 95170 : for (unsigned i = 0; i < dr_chain.length (); ++i)
8799 : 39403 : if (!bitmap_bit_p (used_defs, i))
8800 : : {
8801 : 24660 : tree def = dr_chain[i];
8802 : 24698 : do
8803 : : {
8804 : 24698 : gimple *stmt = SSA_NAME_DEF_STMT (def);
8805 : 24698 : if (is_gimple_assign (stmt)
8806 : 24698 : && (gimple_assign_rhs_code (stmt) == VIEW_CONVERT_EXPR
8807 : 24698 : || gimple_assign_rhs_code (stmt) == CONSTRUCTOR))
8808 : 171 : def = single_ssa_tree_operand (stmt, SSA_OP_USE);
8809 : : else
8810 : : def = NULL;
8811 : 24698 : gimple_stmt_iterator rgsi = gsi_for_stmt (stmt);
8812 : 24698 : gsi_remove (&rgsi, true);
8813 : 24698 : release_defs (stmt);
8814 : : }
8815 : 24698 : while (def);
8816 : : }
8817 : :
8818 : : return true;
8819 : 103608 : }
8820 : :
8821 : : /* Generate vector permute statements from a list of loads in DR_CHAIN.
8822 : : If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
8823 : : permute statements for the SLP node NODE. Store the number of vector
8824 : : permute instructions in *N_PERMS and the number of vector load
8825 : : instructions in *N_LOADS. If DCE_CHAIN is true, remove all definitions
8826 : : that were not needed. */
8827 : :
8828 : : bool
8829 : 60332 : vect_transform_slp_perm_load (vec_info *vinfo,
8830 : : slp_tree node, const vec<tree> &dr_chain,
8831 : : gimple_stmt_iterator *gsi, poly_uint64 vf,
8832 : : bool analyze_only, unsigned *n_perms,
8833 : : unsigned int *n_loads, bool dce_chain)
8834 : : {
 : : /* Thin forwarding wrapper: the permutation applied is the load
 : : permutation recorded on NODE itself, and dumping follows the
 : : global dump_enabled_p () setting. All other parameters are
 : : passed through to the worker unchanged. */
8835 : 60332 : return vect_transform_slp_perm_load_1 (vinfo, node,
8836 : 60332 : SLP_TREE_LOAD_PERMUTATION (node),
8837 : : dr_chain, gsi, vf, analyze_only,
8838 : : dump_enabled_p (), n_perms, n_loads,
8839 : 60332 : dce_chain);
8840 : : }
8841 : :
8842 : : /* Produce the next vector result for SLP permutation NODE by adding a vector
8843 : : statement at GSI. If MASK_VEC is nonnull, add:
8844 : :
8845 : : <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC>
8846 : :
8847 : : otherwise add:
8848 : :
8849 : : <new SSA name> = FIRST_DEF. */
8850 : :
8851 : : static void
8852 : 2021 : vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
8853 : : slp_tree node, tree first_def, tree second_def,
8854 : : tree mask_vec, poly_uint64 identity_offset)
8855 : : {
8856 : 2021 : tree vectype = SLP_TREE_VECTYPE (node);
8857 : :
8858 : : /* ??? We SLP match existing vector element extracts but
8859 : : allow punning which we need to re-instantiate at uses
8860 : : but have no good way of explicitly representing. */
 : : /* If FIRST_DEF has the right size but a punned type, reinstate the
 : : expected VECTYPE via a VIEW_CONVERT_EXPR emitted before the
 : : permute. */
8861 : 2021 : if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)), TYPE_SIZE (vectype))
8862 : 2021 : && !types_compatible_p (TREE_TYPE (first_def), vectype))
8863 : : {
8864 : 40 : gassign *conv_stmt
8865 : 40 : = gimple_build_assign (make_ssa_name (vectype),
8866 : : build1 (VIEW_CONVERT_EXPR, vectype, first_def));
8867 : 40 : vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
8868 : 40 : first_def = gimple_assign_lhs (conv_stmt);
8869 : : }
8870 : 2021 : gassign *perm_stmt;
8871 : 2021 : tree perm_dest = make_ssa_name (vectype);
 : : /* Case 1: a real two-input permute under MASK_VEC. */
8872 : 2021 : if (mask_vec)
8873 : : {
 : : /* NOTE(review): this guard checks the size of FIRST_DEF's type but
 : : the compatibility of SECOND_DEF's type — presumably both inputs
 : : always have equal size here so the asymmetry is harmless; verify
 : : against the callers. */
8874 : 1467 : if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)),
8875 : 1467 : TYPE_SIZE (vectype))
8876 : 1467 : && !types_compatible_p (TREE_TYPE (second_def), vectype))
8877 : : {
8878 : 9 : gassign *conv_stmt
8879 : 9 : = gimple_build_assign (make_ssa_name (vectype),
8880 : : build1 (VIEW_CONVERT_EXPR,
8881 : : vectype, second_def))
8882 : 9 : vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
8883 : 9 : second_def = gimple_assign_lhs (conv_stmt);
8884 : : }
8885 : 1467 : perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
8886 : : first_def, second_def,
8887 : : mask_vec);
8888 : : }
 : : /* Case 2: identity permute but with mismatched types — an offsetted
 : : extract (BIT_FIELD_REF lowpart at IDENTITY_OFFSET) or a two-input
 : : concat (CONSTRUCTOR). */
8889 : 554 : else if (!types_compatible_p (TREE_TYPE (first_def), vectype))
8890 : : {
8891 : : /* For identity permutes we still need to handle the case
8892 : : of offsetted extracts or concats. */
8893 : 221 : unsigned HOST_WIDE_INT c;
8894 : 221 : auto first_def_nunits
8895 : 221 : = TYPE_VECTOR_SUBPARTS (TREE_TYPE (first_def));
8896 : 221 : if (known_le (TYPE_VECTOR_SUBPARTS (vectype), first_def_nunits))
8897 : : {
8898 : 217 : unsigned HOST_WIDE_INT elsz
8899 : 217 : = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (first_def))));
8900 : 434 : tree lowpart = build3 (BIT_FIELD_REF, vectype, first_def,
8901 : 217 : TYPE_SIZE (vectype),
8902 : 217 : bitsize_int (identity_offset * elsz));
8903 : 217 : perm_stmt = gimple_build_assign (perm_dest, lowpart);
8904 : : }
8905 : 4 : else if (constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype),
8906 : 4 : first_def_nunits, &c) && c == 2)
8907 : : {
8908 : 4 : tree ctor = build_constructor_va (vectype, 2, NULL_TREE, first_def,
8909 : : NULL_TREE, second_def);
8910 : 4 : perm_stmt = gimple_build_assign (perm_dest, ctor);
8911 : : }
8912 : : else
8913 : 0 : gcc_unreachable ();
8914 : : }
8915 : : else
8916 : : {
8917 : : /* We need a copy here in case the def was external. */
8918 : 333 : perm_stmt = gimple_build_assign (perm_dest, first_def);
8919 : : }
8920 : 2021 : vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
8921 : : /* Store the vector statement in NODE. */
8922 : 2021 : node->push_vec_def (perm_stmt);
8923 : 2021 : }
8924 : :
8925 : : /* Subroutine of vectorizable_slp_permutation. Check whether the target
8926 : : can perform permutation PERM on the (1 or 2) input nodes in CHILDREN.
8927 : : If GSI is nonnull, emit the permutation there.
8928 : :
8929 : : When GSI is null, the only purpose of NODE is to give properties
8930 : : of the result, such as the vector type and number of SLP lanes.
8931 : : The node does not need to be a VEC_PERM_EXPR.
8932 : :
8933 : : If the target supports the operation, return the number of individual
8934 : : VEC_PERM_EXPRs needed, otherwise return -1. Print information to the
8935 : : dump file if DUMP_P is true. */
8936 : :
8937 : : static int
8938 : 311952 : vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
8939 : : slp_tree node, lane_permutation_t &perm,
8940 : : vec<slp_tree> &children, bool dump_p)
8941 : : {
8942 : 311952 : tree vectype = SLP_TREE_VECTYPE (node);
8943 : :
8944 : : /* ??? We currently only support all same vector input types
8945 : : while the SLP IL should really do a concat + select and thus accept
8946 : : arbitrary mismatches. */
8947 : 311952 : slp_tree child;
8948 : 311952 : unsigned i;
8949 : 311952 : poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8950 : 311952 : bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node));
8951 : 311952 : tree op_vectype = NULL_TREE;
 : : /* Pick the first child vector type as the common operand type;
 : : fall back to the node's own type if no child has one. */
8952 : 311960 : FOR_EACH_VEC_ELT (children, i, child)
8953 : 311952 : if (SLP_TREE_VECTYPE (child))
8954 : : {
8955 : : op_vectype = SLP_TREE_VECTYPE (child);
8956 : : break;
8957 : : }
8958 : 311952 : if (!op_vectype)
8959 : 8 : op_vectype = vectype;
 : : /* Verify all children agree on OP_VECTYPE (updating not-yet-decided
 : : external/constant children) and share the node's element type;
 : : a child with a different lane count forces the general
 : : !repeating_p path below. */
8960 : 672075 : FOR_EACH_VEC_ELT (children, i, child)
8961 : : {
8962 : 360123 : if ((SLP_TREE_DEF_TYPE (child) != vect_internal_def
8963 : 10715 : && !vect_maybe_update_slp_op_vectype (child, op_vectype))
8964 : 360123 : || !types_compatible_p (SLP_TREE_VECTYPE (child), op_vectype)
8965 : 720246 : || !types_compatible_p (TREE_TYPE (vectype), TREE_TYPE (op_vectype)))
8966 : : {
8967 : 0 : if (dump_p)
8968 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8969 : : "Unsupported vector types in lane permutation\n");
8970 : 0 : return -1;
8971 : : }
8972 : 360123 : if (SLP_TREE_LANES (child) != SLP_TREE_LANES (node))
8973 : 6123 : repeating_p = false;
8974 : : }
8975 : :
8976 : 623904 : gcc_assert (perm.length () == SLP_TREE_LANES (node));
8977 : 311952 : if (dump_p)
8978 : : {
8979 : 878 : dump_printf_loc (MSG_NOTE, vect_location,
8980 : : "vectorizing permutation");
8981 : 11076 : for (unsigned i = 0; i < perm.length (); ++i)
8982 : 4660 : dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
8983 : 878 : if (repeating_p)
8984 : 658 : dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
8985 : 878 : dump_printf (MSG_NOTE, "\n");
8986 : : }
8987 : :
8988 : : /* REPEATING_P is true if every output vector is guaranteed to use the
8989 : : same permute vector. We can handle that case for both variable-length
8990 : : and constant-length vectors, but we only handle other cases for
8991 : : constant-length vectors.
8992 : :
8993 : : Set:
8994 : :
8995 : : - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
8996 : : mask vector that we want to build.
8997 : :
8998 : : - NCOPIES to the number of copies of PERM that we need in order
8999 : : to build the necessary permute mask vectors.
9000 : :
9001 : : - NOUTPUTS_PER_MASK to the number of output vectors we want to create
9002 : : for each permute mask vector. This is only relevant when GSI is
9003 : : nonnull. */
9004 : 311952 : uint64_t npatterns;
9005 : 311952 : unsigned nelts_per_pattern;
9006 : 311952 : uint64_t ncopies;
9007 : 311952 : unsigned noutputs_per_mask;
9008 : 311952 : if (repeating_p)
9009 : : {
9010 : : /* We need a single permute mask vector that has the form:
9011 : :
9012 : : { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }
9013 : :
9014 : : In other words, the original n-element permute in PERM is
9015 : : "unrolled" to fill a full vector. The stepped vector encoding
9016 : : that we use for permutes requires 3n elements. */
9017 : 286709 : npatterns = SLP_TREE_LANES (node);
9018 : 286709 : nelts_per_pattern = ncopies = 3;
9019 : 286709 : noutputs_per_mask = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
9020 : : }
9021 : : else
9022 : : {
9023 : : /* Calculate every element of every permute mask vector explicitly,
9024 : : instead of relying on the pattern described above. */
9025 : 25243 : if (!nunits.is_constant (&npatterns)
9026 : 25243 : || !TYPE_VECTOR_SUBPARTS (op_vectype).is_constant ())
9027 : : return -1;
9028 : 25243 : nelts_per_pattern = ncopies = 1;
9029 : 25243 : if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
9030 : 2627 : if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
9031 : : return -1;
9032 : : noutputs_per_mask = 1;
9033 : : }
9034 : 311952 : unsigned olanes = ncopies * SLP_TREE_LANES (node);
9035 : 311952 : gcc_assert (repeating_p || multiple_p (olanes, nunits));
9036 : :
9037 : : /* Compute the { { SLP operand, vector index}, lane } permutation sequence
9038 : : from the { SLP operand, scalar lane } permutation as recorded in the
9039 : : SLP node as intermediate step. This part should already work
9040 : : with SLP children with arbitrary number of lanes. */
9041 : 311952 : auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
9042 : 311952 : auto_vec<unsigned> active_lane;
9043 : 311952 : vperm.create (olanes);
9044 : 311952 : active_lane.safe_grow_cleared (children.length (), true);
9045 : 1200161 : for (unsigned i = 0; i < ncopies; ++i)
9046 : : {
9047 : 5601970 : for (unsigned pi = 0; pi < perm.length (); ++pi)
9048 : : {
9049 : 1912776 : std::pair<unsigned, unsigned> p = perm[pi];
9050 : 1912776 : tree vtype = SLP_TREE_VECTYPE (children[p.first]);
9051 : 1912776 : if (repeating_p)
9052 : 1777182 : vperm.quick_push ({{p.first, 0}, p.second + active_lane[p.first]});
9053 : : else
9054 : : {
9055 : : /* We checked above that the vectors are constant-length. */
9056 : 135594 : unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
9057 : 135594 : unsigned vi = (active_lane[p.first] + p.second) / vnunits;
9058 : 135594 : unsigned vl = (active_lane[p.first] + p.second) % vnunits;
9059 : 135594 : vperm.quick_push ({{p.first, vi}, vl});
9060 : : }
9061 : : }
9062 : : /* Advance to the next group. */
9063 : 3836098 : for (unsigned j = 0; j < children.length (); ++j)
9064 : 1029840 : active_lane[j] += SLP_TREE_LANES (children[j]);
9065 : : }
9066 : :
 : : /* NOTE(review): this re-prints the "vectorizing permutation" header
 : : already emitted by the dump block above — presumably leftover
 : : duplication; the new information here is only the "as vops..."
 : : decomposition. Consider dropping the repeated part. */
9067 : 311952 : if (dump_p)
9068 : : {
9069 : 878 : dump_printf_loc (MSG_NOTE, vect_location,
9070 : : "vectorizing permutation");
9071 : 11076 : for (unsigned i = 0; i < perm.length (); ++i)
9072 : 4660 : dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
9073 : 878 : if (repeating_p)
9074 : 658 : dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
9075 : 878 : dump_printf (MSG_NOTE, "\n");
9076 : 878 : dump_printf_loc (MSG_NOTE, vect_location, "as");
9077 : 18212 : for (unsigned i = 0; i < vperm.length (); ++i)
9078 : : {
9079 : 8228 : if (i != 0
9080 : 8228 : && (repeating_p
9081 : 4694 : ? multiple_p (i, npatterns)
9082 : 6912 : : multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype))))
9083 : 2060 : dump_printf (MSG_NOTE, ",");
9084 : 16456 : dump_printf (MSG_NOTE, " vops%u[%u][%u]",
9085 : 8228 : vperm[i].first.first, vperm[i].first.second,
9086 : 8228 : vperm[i].second);
9087 : : }
9088 : 878 : dump_printf (MSG_NOTE, "\n");
9089 : : }
9090 : :
9091 : : /* We can only handle two-vector permutes, everything else should
9092 : : be lowered on the SLP level. The following is closely inspired
9093 : : by vect_transform_slp_perm_load and is supposed to eventually
9094 : : replace it.
9095 : : ??? As intermediate step do code-gen in the SLP tree representation
9096 : : somehow? */
9097 : 311952 : std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
9098 : 311952 : std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
9099 : 311952 : unsigned int index = 0;
9100 : 311952 : poly_uint64 mask_element;
9101 : 311952 : vec_perm_builder mask;
9102 : 311952 : mask.new_vector (nunits, npatterns, nelts_per_pattern);
9103 : 311952 : unsigned int count = mask.encoded_nelts ();
9104 : 311952 : mask.quick_grow (count);
9105 : 311952 : vec_perm_indices indices;
9106 : 311952 : unsigned nperms = 0;
 : : /* Walk the flattened permutation, assigning each referenced
 : : { operand, vector } pair to the first or second permute input;
 : : a third distinct input makes the permute unhandleable. */
9107 : 4417354 : for (unsigned i = 0; i < vperm.length (); ++i)
9108 : : {
9109 : 1908049 : mask_element = vperm[i].second;
9110 : 1908049 : if (first_vec.first == -1U
9111 : 1908049 : || first_vec == vperm[i].first)
9112 : 1751497 : first_vec = vperm[i].first;
9113 : 156552 : else if (second_vec.first == -1U
9114 : 156552 : || second_vec == vperm[i].first)
9115 : : {
9116 : 155893 : second_vec = vperm[i].first;
9117 : 155893 : mask_element += nunits;
9118 : : }
9119 : : else
9120 : : {
9121 : 659 : if (dump_p)
9122 : 0 : dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9123 : : "permutation requires at "
9124 : : "least three vectors\n");
9125 : 659 : gcc_assert (!gsi);
9126 : : return -1;
9127 : : }
9128 : :
9129 : 1907390 : mask[index++] = mask_element;
9130 : :
 : : /* A full mask has been assembled: validate it against the target
 : : and, during transform (GSI nonnull), emit the permute(s). */
9131 : 1907390 : if (index == count)
9132 : : {
9133 : 406398 : indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
9134 : : TYPE_VECTOR_SUBPARTS (op_vectype));
9135 : 350150 : bool identity_p = (indices.series_p (0, 1, mask[0], 1)
9136 : 613576 : && constant_multiple_p (mask[0], nunits));
9137 : 350150 : machine_mode vmode = TYPE_MODE (vectype);
9138 : 350150 : machine_mode op_vmode = TYPE_MODE (op_vectype);
9139 : 350150 : unsigned HOST_WIDE_INT c;
9140 : 350150 : if ((!identity_p
9141 : 319408 : && !can_vec_perm_const_p (vmode, op_vmode, indices))
9142 : 350150 : || (identity_p
9143 : 30742 : && !known_le (nunits,
9144 : : TYPE_VECTOR_SUBPARTS (op_vectype))
9145 : 8 : && (!constant_multiple_p (nunits,
9146 : 8 : TYPE_VECTOR_SUBPARTS (op_vectype),
9147 : 8 : &c) || c != 2)))
9148 : : {
9149 : 10665 : if (dump_p)
9150 : : {
9151 : 109 : dump_printf_loc (MSG_MISSED_OPTIMIZATION,
9152 : : vect_location,
9153 : : "unsupported vect permute { ");
9154 : 1027 : for (i = 0; i < count; ++i)
9155 : : {
9156 : 918 : dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
9157 : 918 : dump_printf (MSG_MISSED_OPTIMIZATION, " ");
9158 : : }
9159 : 109 : dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
9160 : : }
9161 : 10665 : gcc_assert (!gsi);
9162 : 11324 : return -1;
9163 : : }
9164 : :
 : : /* Identity permutes are free; only real permutes count towards
 : : the returned number of VEC_PERM_EXPRs. */
9165 : 339485 : if (!identity_p)
9166 : 308743 : nperms++;
9167 : 339485 : if (gsi)
9168 : : {
9169 : 1993 : if (second_vec.first == -1U)
9170 : 957 : second_vec = first_vec;
9171 : :
9172 : 1993 : slp_tree
9173 : 1993 : first_node = children[first_vec.first],
9174 : 1993 : second_node = children[second_vec.first];
9175 : :
9176 : 1993 : tree mask_vec = NULL_TREE;
9177 : 1993 : if (!identity_p)
9178 : 1439 : mask_vec = vect_gen_perm_mask_checked (vectype, indices);
9179 : :
9180 : 4014 : for (unsigned int vi = 0; vi < noutputs_per_mask; ++vi)
9181 : : {
9182 : 2021 : tree first_def
9183 : 2021 : = vect_get_slp_vect_def (first_node,
9184 : : first_vec.second + vi);
9185 : 2021 : tree second_def
9186 : 2021 : = vect_get_slp_vect_def (second_node,
9187 : : second_vec.second + vi);
9188 : 2021 : vect_add_slp_permutation (vinfo, gsi, node, first_def,
9189 : 2021 : second_def, mask_vec, mask[0]);
9190 : : }
9191 : : }
9192 : :
 : : /* Reset state for assembling the next mask. */
9193 : : index = 0;
9194 : : first_vec = std::make_pair (-1U, -1U);
9195 : : second_vec = std::make_pair (-1U, -1U);
9196 : : }
9197 : : }
9198 : :
9199 : 300628 : return nperms;
9200 : 311952 : }
9201 : :
9202 : : /* Vectorize the SLP permutations in NODE as specified
9203 : : in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
9204 : : child number and lane number.
9205 : : Interleaving of two two-lane two-child SLP subtrees (not supported):
9206 : : [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
9207 : : A blend of two four-lane two-child SLP subtrees:
9208 : : [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
9209 : : Highpart of a four-lane one-child SLP subtree (not supported):
9210 : : [ { 0, 2 }, { 0, 3 } ]
9211 : : Where currently only a subset is supported by code generating below. */
9212 : :
9213 : : static bool
9214 : 30426 : vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
9215 : : slp_tree node, stmt_vector_for_cost *cost_vec)
9216 : : {
9217 : 30426 : tree vectype = SLP_TREE_VECTYPE (node);
9218 : 30426 : lane_permutation_t &perm = SLP_TREE_LANE_PERMUTATION (node);
 : : /* The worker both validates and (when GSI is nonnull) emits the
 : : permutes; it returns the number of VEC_PERM_EXPRs needed or -1
 : : when the target cannot perform the permutation. */
9219 : 30426 : int nperms = vectorizable_slp_permutation_1 (vinfo, gsi, node, perm,
9220 : 30426 : SLP_TREE_CHILDREN (node),
9221 : : dump_enabled_p ());
9222 : 30426 : if (nperms < 0)
9223 : : return false;
9224 : :
 : : /* Costs are only recorded during analysis (GSI null); during the
 : : transform phase the statements were already generated above. */
9225 : 29432 : if (!gsi)
9226 : 27875 : record_stmt_cost (cost_vec, nperms, vec_perm, node, vectype, 0, vect_body);
9227 : :
9228 : : return true;
9229 : : }
9230 : :
9231 : : /* Vectorize SLP NODE. */
9232 : :
9233 : : static void
9234 : 1105687 : vect_schedule_slp_node (vec_info *vinfo,
9235 : : slp_tree node, slp_instance instance)
9236 : : {
9237 : 1105687 : gimple_stmt_iterator si;
9238 : 1105687 : int i;
9239 : 1105687 : slp_tree child;
9240 : :
9241 : : /* Vectorize externals and constants. */
9242 : 1105687 : if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
9243 : 1105687 : || SLP_TREE_DEF_TYPE (node) == vect_external_def)
9244 : : {
9245 : : /* ??? vectorizable_shift can end up using a scalar operand which is
9246 : : currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
9247 : : node in this case. */
9248 : 444499 : if (!SLP_TREE_VECTYPE (node))
9249 : 444499 : return;
9250 : :
9251 : : /* There are two reasons vector defs might already exist. The first
9252 : : is that we are vectorizing an existing vector def. The second is
9253 : : when performing BB vectorization shared constant/external nodes
9254 : : are not split apart during partitioning so during the code-gen
9255 : : DFS walk we can end up visiting them twice. */
9256 : 444008 : if (! SLP_TREE_VEC_DEFS (node).exists ())
9257 : 443217 : vect_create_constant_vectors (vinfo, node);
9258 : 444008 : return;
9259 : : }
9260 : :
9261 : 661188 : gcc_assert (SLP_TREE_VEC_DEFS (node).is_empty ());
9262 : :
9263 : 661188 : stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
9264 : :
9265 : 661188 : gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
9266 : 661188 : SLP_TREE_VEC_DEFS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
9267 : :
9268 : 661188 : if (dump_enabled_p ())
9269 : 9347 : dump_printf_loc (MSG_NOTE, vect_location,
9270 : : "------>vectorizing SLP node starting from: %G",
9271 : : stmt_info->stmt);
9272 : :
 : : /* Choose the insertion point SI for the vectorized statements:
 : : data refs anchor at the first (load) or last (store) scalar
 : : statement, PHIs use no iterator, and everything else goes after
 : : the latest vectorized child def. */
9273 : 661188 : if (STMT_VINFO_DATA_REF (stmt_info)
9274 : 617352 : && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
9275 : : {
9276 : : /* Vectorized loads go before the first scalar load to make it
9277 : : ready early, vectorized stores go before the last scalar
9278 : : stmt which is where all uses are ready. */
9279 : 617222 : stmt_vec_info last_stmt_info = NULL;
9280 : 617222 : if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
9281 : 106554 : last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
9282 : : else /* DR_IS_WRITE */
9283 : 510668 : last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
9284 : 617222 : si = gsi_for_stmt (last_stmt_info->stmt);
9285 : : }
9286 : 43966 : else if ((STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
9287 : : || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
9288 : : || STMT_VINFO_TYPE (stmt_info) == phi_info_type)
9289 : 20976 : && SLP_TREE_CODE (node) != VEC_PERM_EXPR)
9290 : : {
9291 : : /* For PHI node vectorization we do not use the insertion iterator. */
9292 : 20956 : si = gsi_none ();
9293 : : }
9294 : : else
9295 : : {
9296 : : /* Emit other stmts after the children vectorized defs which is
9297 : : earliest possible. */
9298 : 23010 : gimple *last_stmt = NULL;
9299 : 23010 : if (auto loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
9300 : 28 : if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
9301 : 6694 : || LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
9302 : : {
9303 : : /* But avoid scheduling internal defs outside of the loop when
9304 : : we might have only implicitly tracked loop mask/len defs. */
 : : /* Note this inner SI deliberately shadows the function-level
 : : iterator; it is only used to fetch the first statement after
 : : the loop header's labels. */
9305 : 28 : gimple_stmt_iterator si
9306 : 28 : = gsi_after_labels (LOOP_VINFO_LOOP (loop_vinfo)->header);
9307 : 28 : last_stmt = *si;
9308 : : }
9309 : : bool seen_vector_def = false;
9310 : 64194 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9311 : 41184 : if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
9312 : : {
9313 : : /* For fold-left reductions we are retaining the scalar
9314 : : reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
9315 : : set so the representation isn't perfect. Resort to the
9316 : : last scalar def here. */
9317 : 28632 : if (SLP_TREE_VEC_DEFS (child).is_empty ())
9318 : : {
9319 : 49 : gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
9320 : : == cycle_phi_info_type);
9321 : 49 : gphi *phi = as_a <gphi *>
9322 : 49 : (vect_find_last_scalar_stmt_in_slp (child)->stmt);
9323 : 49 : if (!last_stmt
9324 : 49 : || vect_stmt_dominates_stmt_p (last_stmt, phi))
9325 : : last_stmt = phi;
9326 : : }
9327 : : /* We are emitting all vectorized stmts in the same place and
9328 : : the last one is the last.
9329 : : ??? Unless we have a load permutation applied and that
9330 : : figures to re-use an earlier generated load. */
9331 : : unsigned j;
9332 : : tree vdef;
9333 : 71702 : FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
9334 : : {
9335 : 43070 : gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
9336 : 43070 : if (!last_stmt
9337 : 43070 : || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
9338 : : last_stmt = vstmt;
9339 : : }
9340 : : }
9341 : 12552 : else if (!SLP_TREE_VECTYPE (child))
9342 : : {
9343 : : /* For externals we use unvectorized at all scalar defs. */
9344 : : unsigned j;
9345 : : tree def;
9346 : 3102 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
9347 : 2611 : if (TREE_CODE (def) == SSA_NAME
9348 : 2611 : && !SSA_NAME_IS_DEFAULT_DEF (def))
9349 : : {
9350 : 36 : gimple *stmt = SSA_NAME_DEF_STMT (def);
9351 : 36 : if (!last_stmt
9352 : 36 : || vect_stmt_dominates_stmt_p (last_stmt, stmt))
9353 : : last_stmt = stmt;
9354 : : }
9355 : : }
9356 : : else
9357 : : {
9358 : : /* For externals we have to look at all defs since their
9359 : : insertion place is decided per vector. But beware
9360 : : of pre-existing vectors where we need to make sure
9361 : : we do not insert before the region boundary. */
9362 : 12061 : if (SLP_TREE_SCALAR_OPS (child).is_empty ()
9363 : 625 : && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
9364 : : seen_vector_def = true;
9365 : : else
9366 : : {
9367 : : unsigned j;
9368 : : tree vdef;
9369 : 70874 : FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
9370 : 17744 : if (TREE_CODE (vdef) == SSA_NAME
9371 : 17744 : && !SSA_NAME_IS_DEFAULT_DEF (vdef))
9372 : : {
9373 : 10478 : gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
9374 : 10478 : if (!last_stmt
9375 : 10478 : || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
9376 : : last_stmt = vstmt;
9377 : : }
9378 : : }
9379 : : }
9380 : : /* This can happen when all children are pre-existing vectors or
9381 : : constants. */
9382 : 23010 : if (!last_stmt)
9383 : 118 : last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
9384 : 118 : if (!last_stmt)
9385 : : {
9386 : 0 : gcc_assert (seen_vector_def);
9387 : 0 : si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
9388 : : }
9389 : 23010 : else if (is_ctrl_altering_stmt (last_stmt))
9390 : : {
9391 : : /* We split regions to vectorize at control altering stmts
9392 : : with a definition so this must be an external which
9393 : : we can insert at the start of the region. */
9394 : 4 : si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
9395 : : }
9396 : 23006 : else if (is_a <bb_vec_info> (vinfo)
9397 : 16312 : && gimple_bb (last_stmt) != gimple_bb (stmt_info->stmt)
9398 : 24939 : && gimple_could_trap_p (stmt_info->stmt))
9399 : : {
9400 : : /* We've constrained possibly trapping operations to all come
9401 : : from the same basic-block, if vectorized defs would allow earlier
9402 : : scheduling still force vectorized stmts to the original block.
9403 : : This is only necessary for BB vectorization since for loop vect
9404 : : all operations are in a single BB and scalar stmt based
9405 : : placement doesn't play well with epilogue vectorization. */
9406 : 53 : gcc_assert (dominated_by_p (CDI_DOMINATORS,
9407 : : gimple_bb (stmt_info->stmt),
9408 : : gimple_bb (last_stmt)));
9409 : 53 : si = gsi_after_labels (gimple_bb (stmt_info->stmt));
9410 : : }
9411 : 22953 : else if (is_a <gphi *> (last_stmt))
9412 : 572 : si = gsi_after_labels (gimple_bb (last_stmt));
9413 : : else
9414 : : {
9415 : 22381 : si = gsi_for_stmt (last_stmt);
9416 : 22381 : gsi_next (&si);
9417 : : }
9418 : : }
9419 : :
9420 : : /* Handle purely internal nodes. */
9421 : 661188 : if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
9422 : : {
9423 : : /* ??? the transform kind is stored to STMT_VINFO_TYPE which might
9424 : : be shared with different SLP nodes (but usually it's the same
9425 : : operation apart from the case the stmt is only there for denoting
9426 : : the actual scalar lane defs ...). So do not call vect_transform_stmt
9427 : : but open-code it here (partly). */
9428 : 1557 : bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
9429 : 1557 : gcc_assert (done);
9430 : : stmt_vec_info slp_stmt_info;
9431 : : unsigned int i;
 : : /* Also emit code for any scalar lanes of the permute that are
 : : live outside the vectorized region. */
9432 : 667677 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
9433 : 4946 : if (STMT_VINFO_LIVE_P (slp_stmt_info))
9434 : : {
9435 : 256 : done = vectorizable_live_operation (vinfo, slp_stmt_info, node,
9436 : : instance, i, true, NULL);
9437 : 256 : gcc_assert (done);
9438 : : }
9439 : : }
9440 : : else
9441 : 659631 : vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
9442 : : }
9443 : :
9444 : : /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
9445 : : For loop vectorization this is done in vectorizable_call, but for SLP
9446 : : it needs to be deferred until end of vect_schedule_slp, because multiple
9447 : : SLP instances may refer to the same scalar stmt. */
9448 : :
9449 : : static void
9450 : 25498 : vect_remove_slp_scalar_calls (vec_info *vinfo,
9451 : : slp_tree node, hash_set<slp_tree> &visited)
9452 : : {
9453 : 25498 : gimple *new_stmt;
9454 : 25498 : gimple_stmt_iterator gsi;
9455 : 25498 : int i;
9456 : 25498 : slp_tree child;
9457 : 25498 : tree lhs;
9458 : 25498 : stmt_vec_info stmt_info;
9459 : :
9460 : 25498 : if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
9461 : 5702 : return;
9462 : :
9463 : 21441 : if (visited.add (node))
9464 : : return;
9465 : :
9466 : 39568 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9467 : 19772 : vect_remove_slp_scalar_calls (vinfo, child, visited);
9468 : :
9469 : 105079 : FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
9470 : : {
9471 : 66482 : gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
9472 : 87 : if (!stmt || gimple_bb (stmt) == NULL)
9473 : 66395 : continue;
9474 : 87 : if (is_pattern_stmt_p (stmt_info)
9475 : 87 : || !PURE_SLP_STMT (stmt_info))
9476 : 2 : continue;
9477 : 85 : lhs = gimple_call_lhs (stmt);
9478 : 85 : if (lhs)
9479 : 73 : new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
9480 : : else
9481 : : {
9482 : 12 : new_stmt = gimple_build_nop ();
9483 : 12 : unlink_stmt_vdef (stmt_info->stmt);
9484 : : }
9485 : 85 : gsi = gsi_for_stmt (stmt);
9486 : 85 : vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
9487 : 85 : if (lhs)
9488 : 73 : SSA_NAME_DEF_STMT (lhs) = new_stmt;
9489 : : }
9490 : : }
9491 : :
9492 : : static void
9493 : 5726 : vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
9494 : : {
9495 : 5726 : hash_set<slp_tree> visited;
9496 : 5726 : vect_remove_slp_scalar_calls (vinfo, node, visited);
9497 : 5726 : }
9498 : :
9499 : : /* Vectorize the instance root. */
9500 : :
9501 : : void
9502 : 8494 : vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
9503 : : {
9504 : 8494 : gassign *rstmt = NULL;
9505 : :
9506 : 8494 : if (instance->kind == slp_inst_kind_ctor)
9507 : : {
9508 : 4380 : if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
9509 : : {
9510 : 4347 : tree vect_lhs = SLP_TREE_VEC_DEFS (node)[0];
9511 : 4347 : tree root_lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
9512 : 4347 : if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
9513 : 4347 : TREE_TYPE (vect_lhs)))
9514 : 0 : vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
9515 : : vect_lhs);
9516 : 4347 : rstmt = gimple_build_assign (root_lhs, vect_lhs);
9517 : : }
9518 : 33 : else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
9519 : : {
9520 : 33 : int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
9521 : 33 : tree child_def;
9522 : 33 : int j;
9523 : 33 : vec<constructor_elt, va_gc> *v;
9524 : 33 : vec_alloc (v, nelts);
9525 : :
9526 : : /* A CTOR can handle V16HI composition from VNx8HI so we
9527 : : do not need to convert vector elements if the types
9528 : : do not match. */
9529 : 135 : FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (node), j, child_def)
9530 : 102 : CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, child_def);
9531 : 33 : tree lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
9532 : 33 : tree rtype
9533 : 33 : = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmts[0]->stmt));
9534 : 33 : tree r_constructor = build_constructor (rtype, v);
9535 : 33 : rstmt = gimple_build_assign (lhs, r_constructor);
9536 : : }
9537 : : }
9538 : 4114 : else if (instance->kind == slp_inst_kind_bb_reduc)
9539 : : {
9540 : : /* Largely inspired by reduction chain epilogue handling in
9541 : : vect_create_epilog_for_reduction. */
9542 : 4114 : vec<tree> vec_defs = vNULL;
9543 : 4114 : vect_get_slp_defs (node, &vec_defs);
9544 : 4114 : enum tree_code reduc_code
9545 : 4114 : = gimple_assign_rhs_code (instance->root_stmts[0]->stmt);
9546 : : /* ??? We actually have to reflect signs somewhere. */
9547 : 4114 : if (reduc_code == MINUS_EXPR)
9548 : 0 : reduc_code = PLUS_EXPR;
9549 : 4114 : gimple_seq epilogue = NULL;
9550 : : /* We may end up with more than one vector result, reduce them
9551 : : to one vector. */
9552 : 4114 : tree vec_def = vec_defs[0];
9553 : 4114 : tree vectype = TREE_TYPE (vec_def);
9554 : 4114 : tree compute_vectype = vectype;
9555 : 4114 : bool pun_for_overflow_p = (ANY_INTEGRAL_TYPE_P (vectype)
9556 : 4047 : && TYPE_OVERFLOW_UNDEFINED (vectype)
9557 : 7031 : && operation_can_overflow (reduc_code));
9558 : 2810 : if (pun_for_overflow_p)
9559 : : {
9560 : 2810 : compute_vectype = unsigned_type_for (vectype);
9561 : 2810 : vec_def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
9562 : : compute_vectype, vec_def);
9563 : : }
9564 : 13014 : for (unsigned i = 1; i < vec_defs.length (); ++i)
9565 : : {
9566 : 2393 : tree def = vec_defs[i];
9567 : 2393 : if (pun_for_overflow_p)
9568 : 2262 : def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
9569 : : compute_vectype, def);
9570 : 2393 : vec_def = gimple_build (&epilogue, reduc_code, compute_vectype,
9571 : : vec_def, def);
9572 : : }
9573 : 4114 : vec_defs.release ();
9574 : : /* ??? Support other schemes than direct internal fn. */
9575 : 4114 : internal_fn reduc_fn;
9576 : 4114 : if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
9577 : 4114 : || reduc_fn == IFN_LAST)
9578 : 0 : gcc_unreachable ();
9579 : 4114 : tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
9580 : 4114 : TREE_TYPE (compute_vectype), vec_def);
9581 : 4114 : if (!SLP_INSTANCE_REMAIN_DEFS (instance).is_empty ())
9582 : : {
9583 : 2682 : tree rem_def = NULL_TREE;
9584 : 12142 : for (auto def : SLP_INSTANCE_REMAIN_DEFS (instance))
9585 : : {
9586 : 9460 : def = gimple_convert (&epilogue, TREE_TYPE (scalar_def), def);
9587 : 9460 : if (!rem_def)
9588 : : rem_def = def;
9589 : : else
9590 : 6778 : rem_def = gimple_build (&epilogue, reduc_code,
9591 : 6778 : TREE_TYPE (scalar_def),
9592 : : rem_def, def);
9593 : : }
9594 : 2682 : scalar_def = gimple_build (&epilogue, reduc_code,
9595 : 2682 : TREE_TYPE (scalar_def),
9596 : : scalar_def, rem_def);
9597 : : }
9598 : 4114 : scalar_def = gimple_convert (&epilogue,
9599 : 4114 : TREE_TYPE (vectype), scalar_def);
9600 : 4114 : gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
9601 : 4114 : gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT);
9602 : 4114 : gimple_assign_set_rhs_from_tree (&rgsi, scalar_def);
9603 : 4114 : update_stmt (gsi_stmt (rgsi));
9604 : 4114 : return;
9605 : : }
9606 : : else
9607 : 0 : gcc_unreachable ();
9608 : :
9609 : 4380 : gcc_assert (rstmt);
9610 : :
9611 : 4380 : gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
9612 : 4380 : gsi_replace (&rgsi, rstmt, true);
9613 : : }
9614 : :
9615 : : struct slp_scc_info
9616 : : {
9617 : : bool on_stack;
9618 : : int dfs;
9619 : : int lowlink;
9620 : : };
9621 : :
9622 : : /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */
9623 : :
9624 : : static void
9625 : 1105687 : vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
9626 : : hash_map<slp_tree, slp_scc_info> &scc_info,
9627 : : int &maxdfs, vec<slp_tree> &stack)
9628 : : {
9629 : 1105687 : bool existed_p;
9630 : 1105687 : slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
9631 : 1105687 : gcc_assert (!existed_p);
9632 : 1105687 : info->dfs = maxdfs;
9633 : 1105687 : info->lowlink = maxdfs;
9634 : 1105687 : maxdfs++;
9635 : :
9636 : : /* Leaf. */
9637 : 1105687 : if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
9638 : : {
9639 : 444499 : info->on_stack = false;
9640 : 444499 : vect_schedule_slp_node (vinfo, node, instance);
9641 : 890341 : return;
9642 : : }
9643 : :
9644 : 661188 : info->on_stack = true;
9645 : 661188 : stack.safe_push (node);
9646 : :
9647 : 661188 : unsigned i;
9648 : 661188 : slp_tree child;
9649 : : /* DFS recurse. */
9650 : 1262574 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
9651 : : {
9652 : 601386 : if (!child)
9653 : 1075 : continue;
9654 : 600311 : slp_scc_info *child_info = scc_info.get (child);
9655 : 600311 : if (!child_info)
9656 : : {
9657 : 585866 : vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
9658 : : /* Recursion might have re-allocated the node. */
9659 : 585866 : info = scc_info.get (node);
9660 : 585866 : child_info = scc_info.get (child);
9661 : 585866 : info->lowlink = MIN (info->lowlink, child_info->lowlink);
9662 : : }
9663 : 14445 : else if (child_info->on_stack)
9664 : 1226 : info->lowlink = MIN (info->lowlink, child_info->dfs);
9665 : : }
9666 : 661188 : if (info->lowlink != info->dfs)
9667 : : return;
9668 : :
9669 : 659845 : auto_vec<slp_tree, 4> phis_to_fixup;
9670 : :
9671 : : /* Singleton. */
9672 : 659845 : if (stack.last () == node)
9673 : : {
9674 : 658813 : stack.pop ();
9675 : 658813 : info->on_stack = false;
9676 : 658813 : vect_schedule_slp_node (vinfo, node, instance);
9677 : 658813 : if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
9678 : 658813 : && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
9679 : 19653 : phis_to_fixup.quick_push (node);
9680 : : }
9681 : : else
9682 : : {
9683 : : /* SCC. */
9684 : 1032 : int last_idx = stack.length () - 1;
9685 : 2375 : while (stack[last_idx] != node)
9686 : 1343 : last_idx--;
9687 : : /* We can break the cycle at PHIs who have at least one child
9688 : : code generated. Then we could re-start the DFS walk until
9689 : : all nodes in the SCC are covered (we might have new entries
9690 : : for only back-reachable nodes). But it's simpler to just
9691 : : iterate and schedule those that are ready. */
9692 : 1032 : unsigned todo = stack.length () - last_idx;
9693 : 1138 : do
9694 : : {
9695 : 5019 : for (int idx = stack.length () - 1; idx >= last_idx; --idx)
9696 : : {
9697 : 2743 : slp_tree entry = stack[idx];
9698 : 2743 : if (!entry)
9699 : 221 : continue;
9700 : 2522 : bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
9701 : 2522 : && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
9702 : 2522 : bool ready = !phi;
9703 : 5594 : FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
9704 : 4513 : if (!child)
9705 : : {
9706 : 681 : gcc_assert (phi);
9707 : : ready = true;
9708 : : break;
9709 : : }
9710 : 3832 : else if (scc_info.get (child)->on_stack)
9711 : : {
9712 : 1177 : if (!phi)
9713 : : {
9714 : : ready = false;
9715 : : break;
9716 : : }
9717 : : }
9718 : : else
9719 : : {
9720 : 2655 : if (phi)
9721 : : {
9722 : : ready = true;
9723 : : break;
9724 : : }
9725 : : }
9726 : 1841 : if (ready)
9727 : : {
9728 : 2375 : vect_schedule_slp_node (vinfo, entry, instance);
9729 : 2375 : scc_info.get (entry)->on_stack = false;
9730 : 2375 : stack[idx] = NULL;
9731 : 2375 : todo--;
9732 : 2375 : if (phi)
9733 : 1335 : phis_to_fixup.safe_push (entry);
9734 : : }
9735 : : }
9736 : : }
9737 : 1138 : while (todo != 0);
9738 : :
9739 : : /* Pop the SCC. */
9740 : 1032 : stack.truncate (last_idx);
9741 : : }
9742 : :
9743 : : /* Now fixup the backedge def of the vectorized PHIs in this SCC. */
9744 : : slp_tree phi_node;
9745 : 1340678 : FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
9746 : : {
9747 : 20988 : gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
9748 : 20988 : edge_iterator ei;
9749 : 20988 : edge e;
9750 : 70531 : FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
9751 : : {
9752 : 49543 : unsigned dest_idx = e->dest_idx;
9753 : 49543 : child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
9754 : 49543 : if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
9755 : 15246 : continue;
9756 : 34297 : unsigned n = SLP_TREE_VEC_DEFS (phi_node).length ();
9757 : : /* Simply fill all args. */
9758 : 34297 : if (STMT_VINFO_DEF_TYPE (SLP_TREE_REPRESENTATIVE (phi_node))
9759 : : != vect_first_order_recurrence)
9760 : 71684 : for (unsigned i = 0; i < n; ++i)
9761 : : {
9762 : 37399 : tree phidef = SLP_TREE_VEC_DEFS (phi_node)[i];
9763 : 37399 : gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
9764 : 37399 : add_phi_arg (phi, vect_get_slp_vect_def (child, i),
9765 : : e, gimple_phi_arg_location (phi, dest_idx));
9766 : : }
9767 : : else
9768 : : {
9769 : : /* Unless it is a first order recurrence which needs
9770 : : args filled in for both the PHI node and the permutes. */
9771 : 12 : gimple *perm
9772 : 12 : = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[0]);
9773 : 12 : gimple *rphi = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (perm));
9774 : 12 : add_phi_arg (as_a <gphi *> (rphi),
9775 : : vect_get_slp_vect_def (child, n - 1),
9776 : : e, gimple_phi_arg_location (phi, dest_idx));
9777 : 30 : for (unsigned i = 0; i < n; ++i)
9778 : : {
9779 : 18 : gimple *perm
9780 : 18 : = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[i]);
9781 : 18 : if (i > 0)
9782 : 6 : gimple_assign_set_rhs1 (perm,
9783 : : vect_get_slp_vect_def (child, i - 1));
9784 : 18 : gimple_assign_set_rhs2 (perm,
9785 : : vect_get_slp_vect_def (child, i));
9786 : 18 : update_stmt (perm);
9787 : : }
9788 : : }
9789 : : }
9790 : : }
9791 : 659845 : }
9792 : :
9793 : : /* Generate vector code for SLP_INSTANCES in the loop/basic block. */
9794 : :
9795 : : void
9796 : 512091 : vect_schedule_slp (vec_info *vinfo, const vec<slp_instance> &slp_instances)
9797 : : {
9798 : 512091 : slp_instance instance;
9799 : 512091 : unsigned int i;
9800 : :
9801 : 512091 : hash_map<slp_tree, slp_scc_info> scc_info;
9802 : 512091 : int maxdfs = 0;
9803 : 1031934 : FOR_EACH_VEC_ELT (slp_instances, i, instance)
9804 : : {
9805 : 519843 : slp_tree node = SLP_INSTANCE_TREE (instance);
9806 : 519843 : if (dump_enabled_p ())
9807 : : {
9808 : 2391 : dump_printf_loc (MSG_NOTE, vect_location,
9809 : : "Vectorizing SLP tree:\n");
9810 : : /* ??? Dump all? */
9811 : 2391 : if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
9812 : 69 : dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
9813 : 69 : SLP_INSTANCE_ROOT_STMTS (instance)[0]->stmt);
9814 : 2391 : vect_print_slp_graph (MSG_NOTE, vect_location,
9815 : : SLP_INSTANCE_TREE (instance));
9816 : : }
9817 : : /* Schedule the tree of INSTANCE, scheduling SCCs in a way to
9818 : : have a PHI be the node breaking the cycle. */
9819 : 519843 : auto_vec<slp_tree> stack;
9820 : 519843 : if (!scc_info.get (node))
9821 : 519821 : vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
9822 : :
9823 : 519843 : if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
9824 : 8494 : vectorize_slp_instance_root_stmt (node, instance);
9825 : :
9826 : 519843 : if (dump_enabled_p ())
9827 : 2391 : dump_printf_loc (MSG_NOTE, vect_location,
9828 : : "vectorizing stmts using SLP.\n");
9829 : 519843 : }
9830 : :
9831 : 1544025 : FOR_EACH_VEC_ELT (slp_instances, i, instance)
9832 : : {
9833 : 519843 : slp_tree root = SLP_INSTANCE_TREE (instance);
9834 : 519843 : stmt_vec_info store_info;
9835 : 519843 : unsigned int j;
9836 : :
9837 : : /* Remove scalar call stmts. Do not do this for basic-block
9838 : : vectorization as not all uses may be vectorized.
9839 : : ??? Why should this be necessary? DCE should be able to
9840 : : remove the stmts itself.
9841 : : ??? For BB vectorization we can as well remove scalar
9842 : : stmts starting from the SLP tree root if they have no
9843 : : uses. */
9844 : 519843 : if (is_a <loop_vec_info> (vinfo))
9845 : 5726 : vect_remove_slp_scalar_calls (vinfo, root);
9846 : :
9847 : : /* Remove vectorized stores original scalar stmts. */
9848 : 2410753 : for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
9849 : : {
9850 : 1380242 : if (!STMT_VINFO_DATA_REF (store_info)
9851 : 1379023 : || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
9852 : : break;
9853 : :
9854 : 1371067 : store_info = vect_orig_stmt (store_info);
9855 : : /* Free the attached stmt_vec_info and remove the stmt. */
9856 : 1371067 : vinfo->remove_stmt (store_info);
9857 : :
9858 : : /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it
9859 : : to not crash in vect_free_slp_tree later. */
9860 : 1371067 : if (SLP_TREE_REPRESENTATIVE (root) == store_info)
9861 : 510618 : SLP_TREE_REPRESENTATIVE (root) = NULL;
9862 : : }
9863 : : }
9864 : 512091 : }
|