LCOV - code coverage report
Current view: top level - gcc - tree-vect-slp.cc (source / functions) Coverage Total Hit
Test: gcc.info Lines: 92.4 % 5947 5495
Test Date: 2026-06-20 15:32:29 Functions: 95.1 % 182 173
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /* SLP - Basic Block Vectorization
       2              :    Copyright (C) 2007-2026 Free Software Foundation, Inc.
       3              :    Contributed by Dorit Naishlos <dorit@il.ibm.com>
       4              :    and Ira Rosen <irar@il.ibm.com>
       5              : 
       6              : This file is part of GCC.
       7              : 
       8              : GCC is free software; you can redistribute it and/or modify it under
       9              : the terms of the GNU General Public License as published by the Free
      10              : Software Foundation; either version 3, or (at your option) any later
      11              : version.
      12              : 
      13              : GCC is distributed in the hope that it will be useful, but WITHOUT ANY
      14              : WARRANTY; without even the implied warranty of MERCHANTABILITY or
      15              : FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
      16              : for more details.
      17              : 
      18              : You should have received a copy of the GNU General Public License
      19              : along with GCC; see the file COPYING3.  If not see
      20              : <http://www.gnu.org/licenses/>.  */
      21              : 
      22              : #include "config.h"
      23              : #define INCLUDE_ALGORITHM
      24              : #include "system.h"
      25              : #include "coretypes.h"
      26              : #include "backend.h"
      27              : #include "target.h"
      28              : #include "rtl.h"
      29              : #include "tree.h"
      30              : #include "gimple.h"
      31              : #include "tree-pass.h"
      32              : #include "ssa.h"
      33              : #include "optabs-tree.h"
      34              : #include "insn-config.h"
      35              : #include "recog.h"            /* FIXME: for insn_data */
      36              : #include "fold-const.h"
      37              : #include "stor-layout.h"
      38              : #include "gimple-iterator.h"
      39              : #include "cfgloop.h"
      40              : #include "tree-vectorizer.h"
      41              : #include "langhooks.h"
      42              : #include "gimple-walk.h"
      43              : #include "dbgcnt.h"
      44              : #include "tree-vector-builder.h"
      45              : #include "vec-perm-indices.h"
      46              : #include "gimple-fold.h"
      47              : #include "internal-fn.h"
      48              : #include "dump-context.h"
      49              : #include "cfganal.h"
      50              : #include "tree-eh.h"
      51              : #include "tree-cfg.h"
      52              : #include "alloc-pool.h"
      53              : #include "sreal.h"
      54              : #include "predict.h"
      55              : 
      56              : #define REDUC_GROUP_FIRST_ELEMENT(S) \
      57              :   (gcc_checking_assert (!(S)->dr_aux.dr), (S)->first_element)
      58              : /* REDUC_GROUP_FIRST_ELEMENT (S), defined just above, is a comma
                           expression: it first runs a checking assertion that
                           (S)->dr_aux.dr is null and then yields (S)->first_element.

                           The declarations that follow are forward declarations of
                           file-local (static) functions.  */
      59              : static bool vect_transform_slp_perm_load_1 (vec_info *, slp_tree,
      60              :                                             load_permutation_t &,
      61              :                                             const vec<tree> &,
      62              :                                             gimple_stmt_iterator *,
      63              :                                             poly_uint64, bool, bool,
      64              :                                             unsigned *,
      65              :                                             unsigned * = nullptr,
      66              :                                             bool = false);
      67              : static int vectorizable_slp_permutation_1 (vec_info *, gimple_stmt_iterator *,
      68              :                                            slp_tree, lane_permutation_t &,
      69              :                                            vec<slp_tree> &, bool);
      70              : static void vect_print_slp_tree (dump_flags_t, dump_location_t, slp_tree);
      71              : static bool vect_slp_can_convert_to_external (const vec<stmt_vec_info> &);
      72              : /* slp_tree_pool is the allocator used by _slp_tree::operator new and
                           _slp_tree::operator delete; vect_slp_init creates it and
                           vect_slp_fini deletes it.  slp_first_node is the head of the
                           doubly-linked list (prev_node / next_node) that the _slp_tree
                           constructor and destructor keep up to date.  */
      73              : static object_allocator<_slp_tree> *slp_tree_pool;
      74              : static slp_tree slp_first_node;
      75              : 
      76              : void /* Set up SLP node allocation: create the object_allocator
                                named "SLP nodes" and store it in slp_tree_pool, which
                                _slp_tree::operator new allocates from.  */
      77      1113429 : vect_slp_init (void)
      78              : {
      79      1113429 :   slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
      80      1113429 : }
      81              : 
      82              : void /* Tear down SLP node allocation: delete every node that is
                                still on the list headed by slp_first_node, then delete
                                the pool and reset slp_tree_pool to NULL.  */
      83      1113429 : vect_slp_fini (void)
      84              : {
                          /* Each delete runs ~_slp_tree, which unlinks the node and moves
                             slp_first_node on to the next node, so the loop makes
                             progress until the list is empty.  */
      85      1776573 :   while (slp_first_node)
      86       663144 :     delete slp_first_node;
      87      2226858 :   delete slp_tree_pool;
      88      1113429 :   slp_tree_pool = NULL;
      89      1113429 : }
      90              : 
      91              : void * /* Class-specific allocation for _slp_tree: asserts that the
                                  requested size N is exactly sizeof (_slp_tree) and returns
                                  the result of slp_tree_pool->allocate_raw ().  */
      92      7711454 : _slp_tree::operator new (size_t n)
      93              : {
      94      7711454 :   gcc_assert (n == sizeof (_slp_tree));
      95      7711454 :   return slp_tree_pool->allocate_raw ();
      96              : }
      97              : 
      98              : void /* Class-specific deallocation for _slp_tree: asserts that the
                                size N is exactly sizeof (_slp_tree) and hands NODE back to
                                slp_tree_pool via remove_raw.  */
      99      7711454 : _slp_tree::operator delete (void *node, size_t n)
     100              : {
     101      7711454 :   gcc_assert (n == sizeof (_slp_tree));
     102      7711454 :   slp_tree_pool->remove_raw (node);
     103      7711454 : }
     104              : 
     105              : 
     106              : /* Initialize a SLP node.  The new node is linked in at the head of the
                           list headed by slp_first_node, all of its vectors start out as
                           vNULL, its reference count starts at 1 and the remaining fields
                           are reset to the neutral values assigned below.  */
     107              : 
     108      7711454 : _slp_tree::_slp_tree ()
     109              : {
                          /* Push this node onto the front of the global node list.  */
     110      7711454 :   this->prev_node = NULL;
     111      7711454 :   if (slp_first_node)
     112      6750082 :     slp_first_node->prev_node = this;
     113      7711454 :   this->next_node = slp_first_node;
     114      7711454 :   slp_first_node = this;
                          /* No vector owns any storage yet.  */
     115      7711454 :   SLP_TREE_SCALAR_STMTS (this) = vNULL;
     116      7711454 :   SLP_TREE_SCALAR_OPS (this) = vNULL;
     117      7711454 :   SLP_TREE_LIVE_LANES (this) = vNULL;
     118      7711454 :   SLP_TREE_VEC_DEFS (this) = vNULL;
     119      7711454 :   SLP_TREE_CHILDREN (this) = vNULL;
     120      7711454 :   SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
     121      7711454 :   SLP_TREE_LANE_PERMUTATION (this) = vNULL;
     122      7711454 :   SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
     123      7711454 :   SLP_TREE_CODE (this) = ERROR_MARK;
     124      7711454 :   SLP_TREE_GS_SCALE (this) = 0;
     125      7711454 :   SLP_TREE_GS_BASE (this) = NULL_TREE;
     126      7711454 :   this->ldst_lanes = false;
     127      7711454 :   this->avoid_stlf_fail = false;
     128      7711454 :   SLP_TREE_VECTYPE (this) = NULL_TREE;
     129      7711454 :   SLP_TREE_REPRESENTATIVE (this) = NULL;
     130      7711454 :   this->cycle_info.id = -1;
     131      7711454 :   this->cycle_info.reduc_idx = -1;
                          /* The creator holds the first reference; vect_free_slp_tree
                             deletes the node once the count drops to zero.  */
     132      7711454 :   SLP_TREE_REF_COUNT (this) = 1;
     133      7711454 :   this->failed = NULL;
     134      7711454 :   this->max_nunits = 1;
     135      7711454 :   this->lanes = 0;
     136      7711454 :   SLP_TREE_TYPE (this) = undef_vec_info_type;
     137      7711454 :   this->data = NULL;
     138      7711454 : }
     139              : 
     140              : /* Tear down a SLP node.  Unlinks the node from the list headed by
                           slp_first_node, releases the vectors owned by the node, frees
                           FAILED with free () and deletes DATA.  Only the vector of
                           children is released here; the child nodes themselves are not
                           touched (vect_free_slp_tree takes care of those).  */
     141              : 
     142      7711454 : _slp_tree::~_slp_tree ()
     143              : {
                          /* Unlink from the doubly-linked node list; a node without a
                             predecessor is the list head, so the head moves on.  */
     144      7711454 :   if (this->prev_node)
     145      4663488 :     this->prev_node->next_node = this->next_node;
     146              :   else
     147      3047966 :     slp_first_node = this->next_node;
     148      7711454 :   if (this->next_node)
     149      5816335 :     this->next_node->prev_node = this->prev_node;
     150      7711454 :   SLP_TREE_CHILDREN (this).release ();
     151      7711454 :   SLP_TREE_SCALAR_STMTS (this).release ();
     152      7711454 :   SLP_TREE_SCALAR_OPS (this).release ();
     153      7711454 :   SLP_TREE_LIVE_LANES (this).release ();
     154      7711454 :   SLP_TREE_VEC_DEFS (this).release ();
     155      7711454 :   SLP_TREE_LOAD_PERMUTATION (this).release ();
     156      7711454 :   SLP_TREE_LANE_PERMUTATION (this).release ();
     157      7711454 :   if (this->failed)
     158      2037326 :     free (failed);
     159      7711454 :   if (this->data)
     160      1239221 :     delete this->data;
     161      7711454 : }
     162              : 
     163              : /* Push the single SSA definition in DEF to the vector of vector defs.
                           For a PHI this is the PHI result; for any other statement it is
                           the definition found by single_ssa_def_operand with
                           SSA_OP_ALL_DEFS.
                           NOTE(review): quick_push is used, so this presumably relies on
                           the caller having reserved space in vec_defs -- confirm.  */
     164              : 
     165              : void
     166       528123 : _slp_tree::push_vec_def (gimple *def)
     167              : {
     168       528123 :   if (gphi *phi = dyn_cast <gphi *> (def))
     169        58968 :     vec_defs.quick_push (gimple_phi_result (phi));
     170              :   else
     171              :     {
     172       469155 :       def_operand_p defop = single_ssa_def_operand (def, SSA_OP_ALL_DEFS);
     173       469155 :       vec_defs.quick_push (get_def_from_ptr (defop));
     174              :     }
     175       528123 : }
     176              : 
     177              : /* Recursively free the memory allocated for the SLP tree rooted at NODE.
                           This drops one reference to NODE.  Only when the reference count
                           reaches zero are the non-null children released recursively and
                           NODE itself deleted; otherwise nothing else happens.  */
     178              : 
     179              : void
     180     14607938 : vect_free_slp_tree (slp_tree node)
     181              : {
     182     14607938 :   int i;
     183     14607938 :   slp_tree child;
     184              : 
                          /* Other holders still reference NODE: just drop our reference.  */
     185     14607938 :   if (--SLP_TREE_REF_COUNT (node) != 0)
     186     14607938 :     return;
     187              : 
                          /* Entries in the children vector may be null and are skipped.  */
     188     10924675 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
     189      3876365 :     if (child)
     190      3519902 :       vect_free_slp_tree (child);
     191              : 
     192      7048310 :   delete node;
     193              : }
     194              : 
     195              : /* Return a location suitable for dumps related to the SLP instance:
                           the statement of the first root stmt when the instance has root
                           stmts, otherwise the statement of the first scalar stmt of the
                           root node.
                           NOTE(review): the fallback indexes element 0 of the root's
                           scalar stmts, so it presumably relies on that vector being
                           non-empty -- confirm.  */
     196              : 
     197              : dump_user_location_t
     198      3390688 : _slp_instance::location () const
     199              : {
     200      3390688 :   if (!root_stmts.is_empty ())
     201       316323 :     return root_stmts[0]->stmt;
     202              :   else
     203      3074365 :     return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
     204              : }
     205              : 
     206              : 
     207              : /* Free the memory allocated for the SLP instance.  This drops the
                           instance's reference to its SLP tree via vect_free_slp_tree,
                           releases the vectors held by the instance (loads, root stmts,
                           remaining defs, subgraph entries and the cost vector) and finally
                           frees INSTANCE itself with free ().  */
     208              : 
     209              : void
     210      1551364 : vect_free_slp_instance (slp_instance instance)
     211              : {
     212      1551364 :   vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
     213      1551364 :   SLP_INSTANCE_LOADS (instance).release ();
     214      1551364 :   SLP_INSTANCE_ROOT_STMTS (instance).release ();
     215      1551364 :   SLP_INSTANCE_REMAIN_DEFS (instance).release ();
     216      1551364 :   instance->subgraph_entries.release ();
     217      1551364 :   instance->cost_vec.release ();
     218      1551364 :   free (instance);
     219      1551364 : }
     220              : 
     221              : 
     222              : /* Create a SLP node with NOPS children with CODE, either VEC_PERM_EXPR
     223              :    for a permute node or else ERROR_MARK.  Any other CODE trips the
                           assertion.  The node is freshly allocated, has no scalar stmts,
                           has its children vector created with room for NOPS entries and is
                           marked vect_internal_def.  */
     224              : 
     225              : slp_tree
     226        95876 : vect_create_new_slp_node (unsigned nops, tree_code code)
     227              : {
     228        95876 :   gcc_assert (code == ERROR_MARK || code == VEC_PERM_EXPR);
     229        95876 :   slp_tree node = new _slp_tree;
                          /* The constructor already set this vector to vNULL.  */
     230        95876 :   SLP_TREE_SCALAR_STMTS (node) = vNULL;
     231        95876 :   SLP_TREE_CHILDREN (node).create (nops);
     232        95876 :   SLP_TREE_DEF_TYPE (node) = vect_internal_def;
     233        95876 :   SLP_TREE_CODE (node) = code;
     234        95876 :   return node;
     235              : }
     236              : 
     237              : /* Create a SLP node inplace at NODE for SCALAR_STMTS and NOPS children.
                           NODE takes over the SCALAR_STMTS vector, gets a children vector
                           with room for NOPS entries and is marked vect_internal_def.  The
                           first scalar stmt becomes the representative (so SCALAR_STMTS has
                           to be non-empty) and the number of lanes is the number of scalar
                           stmts.  Returns NODE.  */
     238              : 
     239              : static slp_tree
     240      3744535 : vect_create_new_slp_node (slp_tree node,
     241              :                           vec<stmt_vec_info> scalar_stmts, unsigned nops)
     242              : {
     243      3744535 :   SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
     244      3744535 :   SLP_TREE_CHILDREN (node).create (nops);
     245      3744535 :   SLP_TREE_DEF_TYPE (node) = vect_internal_def;
     246      3744535 :   SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
     247      3744535 :   SLP_TREE_LANES (node) = scalar_stmts.length ();
     248      3744535 :   return node;
     249              : }
     250              : 
     251              : /* Create an SLP node for SCALAR_STMTS and NOPS children.  This
                           allocates a new _slp_tree and initializes it with the in-place
                           overload taking a node, SCALAR_STMTS and NOPS.  */
     252              : 
     253              : static slp_tree
     254         8164 : vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
     255              : {
     256         8164 :   return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
     257              : }
     258              : 
     259              : /* Create a vect_external_def SLP node inplace at NODE for scalar
     260              :    operands OPS.  NODE takes over the OPS vector and its number of lanes
                           is set to the number of operands.  Returns NODE.  */
     261              : 
     262              : static slp_tree
     263      1823423 : vect_create_new_slp_node (slp_tree node, vec<tree> ops)
     264              : {
     265      1823423 :   SLP_TREE_SCALAR_OPS (node) = ops;
     266      1823423 :   SLP_TREE_DEF_TYPE (node) = vect_external_def;
     267            0 :   SLP_TREE_LANES (node) = ops.length ();
     268      1823423 :   return node;
     269              : }
     270              : 
     271              : /* Create a vect_external_def SLP node for scalar operands OPS.  This
                           allocates a new _slp_tree and initializes it with the in-place
                           overload taking a node and OPS.  */
     272              : 
     273              : static slp_tree
     274      1823423 : vect_create_new_slp_node (vec<tree> ops)
     275              : {
     276      1823423 :   return vect_create_new_slp_node (new _slp_tree, ops);
     277              : }
     278              : 
     279              : 
     280              : /* This structure is used in creation of an SLP tree.  Each instance
     281              :    corresponds to the same operand in a group of scalar stmts in an SLP
     282              :    node.  Instances are allocated by vect_create_oprnd_info and released
                           by vect_free_oprnd_info.  */
     283              : typedef struct _slp_oprnd_info
     284              : {
     285              :   /* Def-stmts for the operands.  Created with room for GROUP_SIZE
                             entries by vect_create_oprnd_info.  */
     286              :   vec<stmt_vec_info> def_stmts;
     287              :   /* Operands.  Created with room for GROUP_SIZE entries by
                             vect_create_oprnd_info.  */
     288              :   vec<tree> ops;
     289              :   /* Information about the first statement, its vector def-type, type, the
     290              :      operand itself in case it's constant, and an indication if it's a pattern
     291              :      stmt and gather/scatter info.  vect_create_oprnd_info starts these
                             out as NULL_TREE, vect_uninitialized_def, false and false
                             respectively; it does not assign first_gs_info.  */
     292              :   tree first_op_type;
     293              :   enum vect_def_type first_dt;
     294              :   bool any_pattern;
     295              :   bool first_gs_p;
     296              :   gather_scatter_info first_gs_info;
     297              : } *slp_oprnd_info;
     298              : 
     299              : 
     300              : /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
     301              :    operand.  Returns a vector of NOPS entries, each allocated with XNEW
                           and given empty def_stmts and ops vectors with room for
                           GROUP_SIZE elements.  first_gs_info is not assigned here.
                           vect_free_oprnd_info is the matching release function.  */
     302              : static vec<slp_oprnd_info>
     303      3322682 : vect_create_oprnd_info (int nops, int group_size)
     304              : {
     305      3322682 :   int i;
     306      3322682 :   slp_oprnd_info oprnd_info;
     307      3322682 :   vec<slp_oprnd_info> oprnds_info;
     308              : 
     309      3322682 :   oprnds_info.create (nops);
     310     11918375 :   for (i = 0; i < nops; i++)
     311              :     {
     312      5273011 :       oprnd_info = XNEW (struct _slp_oprnd_info);
     313      5273011 :       oprnd_info->def_stmts.create (group_size);
     314      5273011 :       oprnd_info->ops.create (group_size);
     315      5273011 :       oprnd_info->first_dt = vect_uninitialized_def;
     316      5273011 :       oprnd_info->first_op_type = NULL_TREE;
     317      5273011 :       oprnd_info->any_pattern = false;
     318      5273011 :       oprnd_info->first_gs_p = false;
                              /* Space for NOPS entries was reserved by create above.  */
     319      5273011 :       oprnds_info.quick_push (oprnd_info);
     320              :     }
     321              : 
     322      3322682 :   return oprnds_info;
     323              : }
     324              : 
     325              : 
     326              : /* Free operands info.  For every entry of OPRNDS_INFO this releases the
                           def_stmts and ops vectors and frees the entry with XDELETE; the
                           OPRNDS_INFO vector itself is released afterwards.  */
     327              : 
     328              : static void
     329      3322682 : vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
     330              : {
     331      3322682 :   int i;
     332      3322682 :   slp_oprnd_info oprnd_info;
     333              : 
     334      8595693 :   FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
     335              :     {
     336      5273011 :       oprnd_info->def_stmts.release ();
     337      5273011 :       oprnd_info->ops.release ();
     338      5273011 :       XDELETE (oprnd_info);
     339              :     }
     340              : 
     341      3322682 :   oprnds_info.release ();
     342      3322682 : }
     343              : 
     344              : /* Return the execution frequency of NODE (so that a higher value indicates
     345              :    a "more important" node when optimizing for speed).  The value is the
                           count of the basic block containing the statement that
                           vect_orig_stmt returns for NODE's representative, scaled by the
                           count of the function's entry block.  */
     346              : 
     347              : static sreal
     348      3465920 : vect_slp_node_weight (slp_tree node)
     349              : {
     350      3465920 :   stmt_vec_info stmt_info = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (node));
     351      3465920 :   basic_block bb = gimple_bb (stmt_info->stmt);
     352      3465920 :   return bb->count.to_sreal_scale (ENTRY_BLOCK_PTR_FOR_FN (cfun)->count);
     353              : }
     354              : 
     355              : /* Return true if STMTS contains a pattern statement, i.e. at least one
                           non-null entry for which is_pattern_stmt_p holds.  Null entries
                           are skipped; an empty STMTS yields false.  */
     356              : 
     357              : static bool
     358        20353 : vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
     359              : {
     360        20353 :   stmt_vec_info stmt_info;
     361        20353 :   unsigned int i;
     362        66691 :   FOR_EACH_VEC_ELT (stmts, i, stmt_info)
     363        48365 :     if (stmt_info && is_pattern_stmt_p (stmt_info))
     364              :       return true;
     365              :   return false;
     366              : }
     367              : 
     368              : /* Return true when all lanes in the external or constant NODE have
     369              :    the same value.  NODE must be a vect_constant_def or vect_external_def
                           node (asserted).  A node without scalar operands yields false;
                           otherwise every operand is compared against the first one with
                           operand_equal_p.  */
     370              : 
     371              : static bool
     372       578826 : vect_slp_tree_uniform_p (slp_tree node)
     373              : {
     374       578826 :   gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
     375              :               || SLP_TREE_DEF_TYPE (node) == vect_external_def);
     376              : 
     377              :   /* Pre-existing vectors.  */
     378      1018788 :   if (SLP_TREE_SCALAR_OPS (node).is_empty ())
     379              :     return false;
     380              : 
     381              :   unsigned i;
     382              :   tree op, first = NULL_TREE;
                          /* The first operand seen becomes the reference value.  */
     383      1326173 :   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
     384      1187309 :     if (!first)
     385              :       first = op;
     386       608483 :     else if (!operand_equal_p (first, op, 0))
     387              :       return false;
     388              : 
     389              :   return true;
     390              : }
     391              : 
     392              : /* Find the place of the data-ref in STMT_INFO in the interleaving chain
     393              :    that starts from FIRST_STMT_INFO.  Return -1 if the data-ref is not a part
     394              :    of the chain.  The returned place is not a plain element count: it is
                           the sum of DR_GROUP_GAP over the chain elements that follow
                           FIRST_STMT_INFO up to and including STMT_INFO, so it is 0 when
                           STMT_INFO is FIRST_STMT_INFO itself.  */
     395              : 
     396              : int
     397       701430 : vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
     398              :                                       stmt_vec_info first_stmt_info)
     399              : {
     400       701430 :   stmt_vec_info next_stmt_info = first_stmt_info;
     401       701430 :   int result = 0;
     402              : 
                          /* STMT_INFO belongs to a different group (or to none).  */
     403       701430 :   if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
     404              :     return -1;
     405              : 
     406      1755568 :   do
     407              :     {
     408      1755568 :       if (next_stmt_info == stmt_info)
     409              :         return result;
     410      1054138 :       next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
     411      1054138 :       if (next_stmt_info)
     412      1054138 :         result += DR_GROUP_GAP (next_stmt_info);
     413              :     }
     414      1054138 :   while (next_stmt_info);
     415              : 
     416              :   return -1;
     417              : }
     418              : 
     419              : /* Check whether it is possible to load COUNT elements of type ELT_TYPE
     420              :    using the method implemented by duplicate_and_interleave.  Return true
     421              :    if so, returning the number of intermediate vectors in *NVECTORS_OUT
     422              :    (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
     423              :    (if nonnull).  If PERMUTES is nonnull, additionally store in
                           PERMUTES[0] and PERMUTES[1] the two permute masks built with
                           vect_gen_perm_mask_checked from the selectors checked below.

                           The search starts with a single vector whose elements are
                           COUNT * the unit size of the base vector mode wide and, each
                           time that fails, halves the element size and doubles the number
                           of vectors.  It gives up when the element size cannot be halved
                           or COUNT is not a multiple of the number of vectors.  */
     424              : 
     425              : bool
     426            0 : can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
     427              :                                 tree elt_type, unsigned int *nvectors_out,
     428              :                                 tree *vector_type_out,
     429              :                                 tree *permutes)
     430              : {
     431            0 :   tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
     432            0 :   if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
     433            0 :     return false;
     434              : 
     435            0 :   machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
     436            0 :   poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
     437            0 :   unsigned int nvectors = 1;
     438            0 :   for (;;)
     439              :     {
     440            0 :       scalar_int_mode int_mode;
     441            0 :       poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
     442            0 :       if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
     443              :         {
     444              :           /* Get the natural vector type for this SLP group size.  */
     445            0 :           tree int_type = build_nonstandard_integer_type
     446            0 :             (GET_MODE_BITSIZE (int_mode), 1);
     447            0 :           tree vector_type
     448            0 :             = get_vectype_for_scalar_type (vinfo, int_type, count);
     449            0 :           poly_int64 half_nelts;
     450            0 :           if (vector_type
     451            0 :               && VECTOR_MODE_P (TYPE_MODE (vector_type))
     452            0 :               && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
     453              :                            GET_MODE_SIZE (base_vector_mode))
     454            0 :               && multiple_p (GET_MODE_NUNITS (TYPE_MODE (vector_type)),
     455              :                              2, &half_nelts))
     456              :             {
     457              :               /* Try fusing consecutive sequences of COUNT / NVECTORS elements
     458              :                  together into elements of type INT_TYPE and using the result
     459              :                  to build NVECTORS vectors.  */
     460            0 :               poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
     461            0 :               vec_perm_builder sel1 (nelts, 2, 3);
     462            0 :               vec_perm_builder sel2 (nelts, 2, 3);
     463              : 
                                      /* SEL1 alternates element I of the first input with
                                         element I of the second input, starting at element 0;
                                         SEL2 does the same starting at element HALF_NELTS.  */
     464            0 :               for (unsigned int i = 0; i < 3; ++i)
     465              :                 {
     466            0 :                   sel1.quick_push (i);
     467            0 :                   sel1.quick_push (i + nelts);
     468            0 :                   sel2.quick_push (half_nelts + i);
     469            0 :                   sel2.quick_push (half_nelts + i + nelts);
     470              :                 }
     471            0 :               vec_perm_indices indices1 (sel1, 2, nelts);
     472            0 :               vec_perm_indices indices2 (sel2, 2, nelts);
     473            0 :               machine_mode vmode = TYPE_MODE (vector_type);
                                      /* Both permutes have to be supported for VMODE.  */
     474            0 :               if (can_vec_perm_const_p (vmode, vmode, indices1)
     475            0 :                   && can_vec_perm_const_p (vmode, vmode, indices2))
     476              :                 {
     477            0 :                   if (nvectors_out)
     478            0 :                     *nvectors_out = nvectors;
     479            0 :                   if (vector_type_out)
     480            0 :                     *vector_type_out = vector_type;
     481            0 :                   if (permutes)
     482              :                     {
     483            0 :                       permutes[0] = vect_gen_perm_mask_checked (vector_type,
     484              :                                                                 indices1);
     485            0 :                       permutes[1] = vect_gen_perm_mask_checked (vector_type,
     486              :                                                                 indices2);
     487              :                     }
     488            0 :                   return true;
     489              :                 }
     490            0 :             }
     491              :         }
                              /* Retry with elements of half the size and twice as many
                                 vectors; ELT_BYTES is updated in place by multiple_p.  */
     492            0 :       if (!multiple_p (elt_bytes, 2, &elt_bytes))
     493              :         return false;
     494            0 :       nvectors *= 2;
     495              :       /* We need to be able to fuse COUNT / NVECTORS elements together.  */
     496            0 :       if (!multiple_p (count, nvectors))
     497              :         return false;
     498              :     }
     499              : }
     500              : 
     501              : /* Return true if DTA and DTB match.  They match when they are equal, or
                           when each of them is either vect_external_def or
                           vect_constant_def (those two are treated as interchangeable).  */
     502              : 
     503              : static bool
     504     16990139 : vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
     505              : {
     506     16990139 :   return (dta == dtb
     507       351389 :           || ((dta == vect_external_def || dta == vect_constant_def)
     508       217971 :               && (dtb == vect_external_def || dtb == vect_constant_def)));
     509              : }
     510              : 
     511              : #define GATHER_SCATTER_OFFSET (-3) /* Special operand-map index (see vect_get_operand_map) standing for the offset of a gather/scatter.  */
     512              : 
     513              : /* For most SLP statements, there is a one-to-one mapping between
     514              :    gimple arguments and child nodes.  If that is not true for STMT,
     515              :    return an array that contains:
     516              : 
     517              :    - the number of child nodes, followed by
     518              :    - for each child node, the index of the argument associated with that node.
     519              :      The special index -1 is the first operand of an embedded comparison and
     520              :      the special index -2 is the second operand of an embedded comparison.
     521              :      The special index -3 is the offset of a gather as analyzed by
     522              :      vect_check_gather_scatter.
     523              : 
     524              :    SWAP is as for vect_get_and_check_slp_defs.  */
     525              : 
     526              : static const int *
     527     24229913 : vect_get_operand_map (const gimple *stmt, bool gather_scatter_p,
     528              :                       unsigned char swap)
     529              : {
     530     24229913 :   static const int no_arg_map[] = { 0 };
     531     24229913 :   static const int arg0_map[] = { 1, 0 };
     532     24229913 :   static const int arg2_map[] = { 1, 2 };
     533     24229913 :   static const int arg2_arg3_map[] = { 2, 2, 3 };
     534     24229913 :   static const int arg2_arg4_map[] = { 2, 2, 4 };
     535     24229913 :   static const int arg2_arg5_arg6_map[] = { 3, 2, 5, 6 };
     536     24229913 :   static const int arg2_arg4_arg5_map[] = { 3, 2, 4, 5 };
     537     24229913 :   static const int arg3_arg2_map[] = { 2, 3, 2 };
     538     24229913 :   static const int op00_map[] = { 1, -1 };
     539     24229913 :   static const int op1_op0_map[] = { 2, 1, 0 };
     540     24229913 :   static const int off_map[] = { 1, GATHER_SCATTER_OFFSET };
     541     24229913 :   static const int off_op0_map[] = { 2, GATHER_SCATTER_OFFSET, 0 };
     542     24229913 :   static const int off_arg2_arg3_map[] = { 3, GATHER_SCATTER_OFFSET, 2, 3 };
     543     24229913 :   static const int off_arg3_arg2_map[] = { 3, GATHER_SCATTER_OFFSET, 3, 2 };
     544     24229913 :   static const int mask_call_maps[6][7] = {
     545              :         { 1, 1, },
     546              :         { 2, 1, 2, },
     547              :         { 3, 1, 2, 3, },
     548              :         { 4, 1, 2, 3, 4, },
     549              :         { 5, 1, 2, 3, 4, 5, },
     550              :         { 6, 1, 2, 3, 4, 5, 6 },
     551              :   };
     552              : 
     553     24229913 :   gcc_checking_assert (!swap
     554              :                        || !is_gimple_assign (stmt)
     555              :                        || TREE_CODE_CLASS
     556              :                             (gimple_assign_rhs_code (stmt)) == tcc_comparison
     557              :                        || commutative_tree_code
     558              :                             (gimple_assign_rhs_code (stmt)));
     559              : 
     560     24229913 :   if (auto assign = dyn_cast<const gassign *> (stmt))
     561              :     {
     562     22773587 :       tree_code code = gimple_assign_rhs_code (assign);
     563     22773587 :       if (code == COND_EXPR
     564     22773587 :           && COMPARISON_CLASS_P (gimple_assign_rhs1 (assign)))
     565            0 :         gcc_unreachable ();
     566     22773587 :       else if ((TREE_CODE_CLASS (code) == tcc_comparison
     567     21436105 :                 || commutative_tree_code (code))
     568     31701574 :                && swap)
     569              :         return op1_op0_map;
     570     22732778 :       else if (code == VIEW_CONVERT_EXPR)
     571              :         return op00_map;
     572     22724627 :       else if (gather_scatter_p)
     573        43351 :         return (TREE_CODE (gimple_assign_lhs (assign)) != SSA_NAME
     574        43351 :                 ? off_op0_map : off_map);
     575              :     }
     576      1456326 :   else if (auto call = dyn_cast<const gcall *> (stmt))
     577              :     {
     578       161292 :       if (gimple_call_internal_p (call))
     579        92253 :         switch (gimple_call_internal_fn (call))
     580              :           {
     581        15952 :           case IFN_MASK_LOAD:
     582        27210 :             return gather_scatter_p ? off_arg2_arg3_map : arg2_arg3_map;
     583              : 
     584              :           case IFN_GATHER_LOAD:
     585              :             return arg2_map;
     586              : 
     587            0 :           case IFN_MASK_GATHER_LOAD:
     588            0 :           case IFN_MASK_LEN_GATHER_LOAD:
     589            0 :             return arg2_arg5_arg6_map;
     590              : 
     591            0 :           case IFN_SCATTER_STORE:
     592            0 :             return arg2_arg4_map;
     593              : 
     594            0 :           case IFN_MASK_SCATTER_STORE:
     595            0 :           case IFN_MASK_LEN_SCATTER_STORE:
     596            0 :             return arg2_arg4_arg5_map;
     597              : 
     598         9538 :           case IFN_MASK_STORE:
     599        17654 :             return gather_scatter_p ? off_arg3_arg2_map : arg3_arg2_map;
     600              : 
     601          988 :           case IFN_MASK_CALL:
     602          988 :             {
     603          988 :               unsigned nargs = gimple_call_num_args (call);
     604          988 :               if (nargs >= 2 && nargs <= 7)
     605          988 :                 return mask_call_maps[nargs-2];
     606              :               else
     607              :                 return nullptr;
     608              :             }
     609              : 
     610          278 :           case IFN_CLZ:
     611          278 :           case IFN_CTZ:
     612          278 :             return arg0_map;
     613              : 
     614         6306 :           case IFN_GOMP_SIMD_LANE:
     615         6306 :             return no_arg_map;
     616              : 
     617              :           default:
     618              :             break;
     619              :           }
     620              :     }
     621              :   return nullptr;
     622              : }
     623              : 
     624              : static const int *
     625     24213986 : vect_get_operand_map (const stmt_vec_info stmt, unsigned char swap = 0)
     626              : { /* Forward the gimple statement of STMT, its STMT_VINFO_GATHER_SCATTER_P
                      :      flag and SWAP to the three-argument vect_get_operand_map.  */
     627            0 :   return vect_get_operand_map (stmt->stmt, STMT_VINFO_GATHER_SCATTER_P (stmt),
     628            0 :                                swap);
     629              : }
     630              : 
     631              : /* Return the SLP node child index for operand OP of STMT.  When STMT
                      :    has no operand map the child index is OP itself.  Otherwise the map
                      :    holds the operand count in element zero followed by the mapped
                      :    operands, and the result is the zero-based position of OP among
                      :    them; OP is required to be one of them.  */
     632              : 
     633              : int
     634      1371829 : vect_slp_child_index_for_operand (const stmt_vec_info stmt, int op)
     635              : {
     636      1371829 :   const int *opmap = vect_get_operand_map (stmt);
     637      1371829 :   if (!opmap)
     638              :     return op;
     639        21917 :   for (int i = 1; i < 1 + opmap[0]; ++i)
     640        21917 :     if (opmap[i] == op)
     641        12272 :       return i - 1;
     642            0 :   gcc_unreachable (); /* OP is not part of the operand map.  */
     643              : }
     644              : 
     645              : /* Helper class for mapping of GIMPLE operands to SLP children.  */
     646              : /* ???  Add vect_slp_child_index_for_operand here and amend opmaps
     647              :    with the full reverse mapping and the position of the first
     648              :    commutative operand index, eliding the swap_p argument from
     649              :    vect_get_operand_map.  Adjust all consumers.  */
     650              : 
     651              : struct slp_oprnds {
     652              :   slp_oprnds (stmt_vec_info);
     653              :   tree get_op_for_slp_child (stmt_vec_info, unsigned);
     654              :   const int *opmap; /* Operand map of the stmt, or NULL if it has none.
                      :                        Keep this declared before num_slp_children,
                      :                        whose initializer reads it.  */
     655              :   const unsigned int num_slp_children; /* Number of SLP children.  */
     656              : };
     657              : 
     658      4383610 : slp_oprnds::slp_oprnds (stmt_vec_info stmt_info)
     659      4383610 :   : opmap (vect_get_operand_map (stmt_info)),
     660      4383610 :     num_slp_children (opmap ? opmap[0] : gimple_num_args (stmt_info->stmt))
     661              : { /* With an operand map its leading element is the child count; without
                      :      one every GIMPLE argument of the stmt counts as a child.  */
     662      4383610 : }
     663              : 
     664              : /* For SLP child number N get the corresponding tree operand from the GIMPLE
     665              :    statement described by STMT_INFO.  N must be smaller than
                      :    num_slp_children.  A negative map entry OPNO selects operand
                      :    -1 - OPNO of the tree that is GIMPLE argument zero.  The
                      :    GATHER_SCATTER_OFFSET pseudo operand is not supported here yet.  */
     666              : 
     667              : tree
     668      4822197 : slp_oprnds::get_op_for_slp_child (stmt_vec_info stmt_info, unsigned n)
     669              : {
     670      4822197 :   gcc_assert (n < num_slp_children);
     671      4822197 :   int opno = opmap ? opmap[n + 1] : (int) n;
     672      4822197 :   if (opno == GATHER_SCATTER_OFFSET)
     673            0 :     gcc_unreachable (); // TODO: the offset pseudo operand is not handled yet.
     674      4822197 :   else if (opno < 0)
     675         1934 :     return TREE_OPERAND (gimple_arg (stmt_info->stmt, 0), -1 - opno);
     676              :   else
     677      4820263 :     return gimple_arg (stmt_info->stmt, opno);
     678              : }
     679              : 
     680              : /* Get the defs for the rhs of STMTS[STMT_NUM] (collect them in OPRNDS_INFO),
     681              :    check that they are of a valid type and that they match the defs of the
     682              :    first stmt of the SLP group (stored in OPRNDS_INFO).  This function tries
     683              :    to match stmts by swapping operands of STMTS[STMT_NUM] when possible.
     684              :    Non-zero SWAP indicates swap is required for cond_expr stmts.  Specifically,
     685              :    SWAP is 1 if STMT is cond and operands of comparison need to be swapped;
     686              :    SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
                      :    Operands I for which SKIP_ARGS[I] is set are recorded as NULL and are
                      :    not matched.  A NULL STMTS[STMT_NUM] records NULL for every operand.
     687              : 
     688              :    If there was a fatal error return -1; if the error could be corrected by
     689              :    swapping operands of the parent node of this one, return 1; if everything
     690              :    is ok return 0.  */
     691              : static int
     692     12688321 : vect_get_and_check_slp_defs (vec_info *vinfo, tree vectype, unsigned char swap,
     693              :                              bool *skip_args,
     694              :                              vec<stmt_vec_info> stmts, unsigned stmt_num,
     695              :                              vec<slp_oprnd_info> *oprnds_info)
     696              : {
     697     12688321 :   stmt_vec_info stmt_info = stmts[stmt_num];
     698     12688321 :   tree oprnd;
     699     12688321 :   unsigned int i, number_of_oprnds;
     700     12688321 :   enum vect_def_type dt = vect_uninitialized_def;
     701     12688321 :   slp_oprnd_info oprnd_info;
     702     12688321 :   gather_scatter_info gs_info;
     703     12688321 :   unsigned int gs_op = -1u; /* Index of a gather/scatter operand, if any.  */
     704     12688321 :   unsigned int commutative_op = -1U;
     705     12688321 :   bool first = stmt_num == 0;
     706              : 
     707     12688321 :   if (!stmt_info)
     708              :     {
     709            0 :       for (auto oi : *oprnds_info)
     710              :         {
     711            0 :           oi->def_stmts.quick_push (NULL);
     712            0 :           oi->ops.quick_push (NULL_TREE);
     713              :         }
     714              :       return 0;
     715              :     }
     716              : 
     717     12688321 :   if (!is_a<gcall *> (stmt_info->stmt)
     718              :       && !is_a<gassign *> (stmt_info->stmt)
     719              :       && !is_a<gphi *> (stmt_info->stmt))
     720              :     return -1;
     721              : 
     722     12688321 :   number_of_oprnds = gimple_num_args (stmt_info->stmt);
     723     12688321 :   const int *map = vect_get_operand_map (stmt_info, swap);
     724     12688321 :   if (map)
     725        76006 :     number_of_oprnds = *map++;
     726     12688321 :   if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
     727              :     {
     728        49339 :       if (gimple_call_internal_p (stmt))
     729              :         {
     730        32675 :           internal_fn ifn = gimple_call_internal_fn (stmt);
     731        32675 :           commutative_op = first_commutative_argument (ifn);
     732        32675 :           if (internal_gather_scatter_fn_p (ifn))
     733              :             {
     734            0 :               vect_describe_gather_scatter_call
     735            0 :                 (stmt_info,
     736            0 :                  first ? &(*oprnds_info)[0]->first_gs_info : &gs_info);
     737            0 :               if (first)
     738            0 :                 (*oprnds_info)[0]->first_gs_p = true;
     739              :               gs_op = 0;
     740              :             }
     741              :         }
     742              :     }
     743     12638982 :   else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
     744              :     {
     745     14752689 :       if (commutative_tree_code (gimple_assign_rhs_code (stmt)))
     746      8360245 :         commutative_op = 0;
     747              :     }
     748              : 
     749     12688321 :   bool swapped = (swap != 0); /* A non-zero SWAP already counts as swapped.  */
     750     12688321 :   bool backedge = false;
     751     12688321 :   enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
     752     35101252 :   for (i = 0; i < number_of_oprnds; i++)
     753              :     {
     754     22414129 :       oprnd_info = (*oprnds_info)[i];
     755     22414129 :       int opno = map ? map[i] : int (i);
     756     22414129 :       if (opno == GATHER_SCATTER_OFFSET)
     757              :         {
     758        22739 :           gcc_assert (STMT_VINFO_GATHER_SCATTER_P (stmt_info));
     759        22739 :           if (!is_a <loop_vec_info> (vinfo)
     760        22739 :               || !vect_check_gather_scatter (stmt_info, vectype,
     761              :                                              as_a <loop_vec_info> (vinfo),
     762              :                                              first ? &oprnd_info->first_gs_info
     763              :                                              : &gs_info))
     764         1198 :             return -1;
     765              : 
     766        22739 :           if (first)
     767              :             {
     768        22486 :               oprnd_info->first_gs_p = true;
     769        22486 :               oprnd = oprnd_info->first_gs_info.offset;
     770              :             }
     771              :           else
     772              :             {
     773          253 :               gs_op = i;
     774          253 :               oprnd = gs_info.offset;
     775              :             }
     776              :         }
     777     22391390 :       else if (opno < 0)
     778         2842 :         oprnd = TREE_OPERAND (gimple_arg (stmt_info->stmt, 0), -1 - opno);
     779              :       else
     780              :         {
     781     22388548 :           oprnd = gimple_arg (stmt_info->stmt, opno);
     782     22388548 :           if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
     783              :             {
     784      1218247 :               edge e = gimple_phi_arg_edge (stmt, opno);
     785      2436494 :               backedge = (is_a <bb_vec_info> (vinfo)
     786      1879802 :                           ? e->flags & EDGE_DFS_BACK
     787       661555 :                           : dominated_by_p (CDI_DOMINATORS, e->src,
     788       661555 :                                             gimple_bb (stmt_info->stmt)));
     789              :             }
     790              :         }
     791              : 
     792     22414129 :       stmt_vec_info def_stmt_info;
     793     22414129 :       if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
     794              :         {
     795          976 :           if (dump_enabled_p ())
     796            0 :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
     797              :                              "Build SLP failed: can't analyze def for %T\n",
     798              :                              oprnd);
     799              : 
     800          976 :           return -1;
     801              :         }
     802              : 
     803     22413153 :       if (skip_args[i])
     804              :         {
     805       527137 :           oprnd_info->def_stmts.quick_push (NULL);
     806       527137 :           oprnd_info->ops.quick_push (NULL_TREE);
     807       527137 :           oprnd_info->first_dt = vect_uninitialized_def;
     808       527137 :           continue;
     809              :         }
     810              : 
     811     21886016 :       oprnd_info->def_stmts.quick_push (def_stmt_info);
     812     21886016 :       oprnd_info->ops.quick_push (oprnd);
     813              : 
     814     21886016 :       if (def_stmt_info
     815     21886016 :           && is_pattern_stmt_p (def_stmt_info))
     816              :         {
     817       391615 :           if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
     818              :               != def_stmt_info)
     819       274343 :             oprnd_info->any_pattern = true;
     820              :           else
     821              :             /* If we promote this to external use the original stmt def.  */
     822       117272 :             oprnd_info->ops.last ()
     823       234544 :               = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
     824              :         }
     825              : 
     826              :       /* If there's an extern def on a backedge make sure we can
     827              :          code-generate at the region start.
     828              :          ???  This is another case that could be fixed by adjusting
     829              :          how we split the function but at the moment we'd have conflicting
     830              :          goals there.  */
     831     21886016 :       if (backedge
     832       167943 :           && dts[i] == vect_external_def
     833          243 :           && is_a <bb_vec_info> (vinfo)
     834          243 :           && TREE_CODE (oprnd) == SSA_NAME
     835          222 :           && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
     836     21886238 :           && !dominated_by_p (CDI_DOMINATORS, vinfo->bbs[0],
     837          222 :                               gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
     838              :         {
     839          222 :           if (dump_enabled_p ())
     840            0 :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
     841              :                              "Build SLP failed: extern def %T only defined "
     842              :                              "on backedge\n", oprnd);
     843          222 :           return -1;
     844              :         }
     845              : 
     846     21885794 :       if (first)
     847              :         {
     848      4784605 :           tree type = TREE_TYPE (oprnd);
     849      4784605 :           dt = dts[i];
     850              : 
     851              :           /* For the swapping logic below force vect_reduction_def
     852              :              for the reduction op in a SLP reduction group.  */
     853      4784605 :           if (!STMT_VINFO_DATA_REF (stmt_info)
     854      3615927 :               && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
     855         5336 :               && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
     856      4787233 :               && def_stmt_info)
     857         2628 :             dts[i] = dt = vect_reduction_def;
     858              : 
     859              :           /* Check the types of the definition.  */
     860      4784605 :           switch (dt)
     861              :             {
     862      4784605 :             case vect_external_def:
     863      4784605 :             case vect_constant_def:
     864      4784605 :             case vect_internal_def:
     865      4784605 :             case vect_reduction_def:
     866      4784605 :             case vect_double_reduction_def:
     867      4784605 :             case vect_induction_def:
     868      4784605 :             case vect_nested_cycle:
     869      4784605 :             case vect_first_order_recurrence:
     870      4784605 :               break;
     871              : 
     872            0 :             default:
     873              :               /* FORNOW: Not supported.  */
     874            0 :               if (dump_enabled_p ())
     875            0 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
     876              :                                  "Build SLP failed: illegal type of def %T\n",
     877              :                                  oprnd);
     878            0 :               return -1;
     879              :             }
     880              : 
     881      4784605 :           oprnd_info->first_dt = dt;
     882      4784605 :           oprnd_info->first_op_type = type;
     883              :         }
     884              :     }
     885     12687123 :   if (first)
     886              :     return 0;
     887              : 
     888              :   /* Now match the operand definition types to that of the first stmt.  */
     889     26214003 :   for (i = 0; i < number_of_oprnds;)
     890              :     {
     891     17116110 :       if (skip_args[i])
     892              :         {
     893        44120 :           ++i;
     894        44120 :           continue;
     895              :         }
     896              : 
     897     17071990 :       oprnd_info = (*oprnds_info)[i];
     898     17071990 :       dt = dts[i];
     899     17071990 :       stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
     900     17071990 :       oprnd = oprnd_info->ops[stmt_num];
     901     17071990 :       tree type = TREE_TYPE (oprnd);
     902              : 
     903     17071990 :       if (!types_compatible_p (oprnd_info->first_op_type, type))
     904              :         {
     905        87672 :           if (dump_enabled_p ())
     906          109 :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
     907              :                              "Build SLP failed: different operand types\n");
     908        87672 :           return 1;
     909              :         }
     910              : 
     911     16984318 :       if ((gs_op == i) != oprnd_info->first_gs_p)
     912              :         {
     913            0 :           if (dump_enabled_p ())
     914            0 :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
     915              :                              "Build SLP failed: mixed gather and non-gather\n");
     916            0 :           return 1;
     917              :         }
     918     16984318 :       else if (gs_op == i)
     919              :         {
     920          223 :           if (!operand_equal_p (oprnd_info->first_gs_info.base,
     921          223 :                                 gs_info.base))
     922              :             {
     923           16 :               if (dump_enabled_p ())
     924            6 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
     925              :                                  "Build SLP failed: different gather base\n");
     926           16 :               return 1;
     927              :             }
     928          207 :           if (oprnd_info->first_gs_info.scale != gs_info.scale)
     929              :             {
     930            8 :               if (dump_enabled_p ())
     931            2 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
     932              :                                  "Build SLP failed: different gather scale\n");
     933            8 :               return 1;
     934              :             }
     935              :         }
     936              : 
     937              :       /* Not first stmt of the group, check that the def-stmt/s match
     938              :          the def-stmt/s of the first stmt.  Allow different definition
     939              :          types for reduction chains: the first stmt must be a
     940              :          vect_reduction_def (a phi node), and the rest
     941              :          end in the reduction chain.  */
     942     16984294 :       if ((!vect_def_types_match (oprnd_info->first_dt, dt)
     943       294692 :            && !(oprnd_info->first_dt == vect_reduction_def
     944         4797 :                 && !STMT_VINFO_DATA_REF (stmt_info)
     945         4797 :                 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
     946         4771 :                 && def_stmt_info
     947         4769 :                 && !STMT_VINFO_DATA_REF (def_stmt_info)
     948         4769 :                 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
     949              :                     == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
     950     16694371 :           || (!STMT_VINFO_DATA_REF (stmt_info)
     951     15386649 :               && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
     952         9943 :               && ((!def_stmt_info
     953         9747 :                    || STMT_VINFO_DATA_REF (def_stmt_info)
     954        17960 :                    || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
     955              :                        != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
     956         9943 :                   != (oprnd_info->first_dt != vect_reduction_def))))
     957              :         {
     958              :           /* Try swapping operands if we got a mismatch.  For BB
     959              :              vectorization only in case it will clearly improve things.  */
     960       292358 :           if (i == commutative_op && !swapped
     961       289923 :               && (!is_a <bb_vec_info> (vinfo)
     962         4595 :                   || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
     963         4595 :                                              dts[i+1])
     964         1094 :                       && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
     965              :                           || vect_def_types_match
     966          156 :                                ((*oprnds_info)[i+1]->first_dt, dts[i])))))
     967              :             {
     968         2435 :               if (dump_enabled_p ())
     969          153 :                 dump_printf_loc (MSG_NOTE, vect_location,
     970              :                                  "trying swapped operands\n");
     971         2435 :               std::swap (dts[i], dts[i+1]);
     972         2435 :               std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
     973         2435 :                          (*oprnds_info)[i+1]->def_stmts[stmt_num]);
     974         2435 :               std::swap ((*oprnds_info)[i]->ops[stmt_num],
     975         2435 :                          (*oprnds_info)[i+1]->ops[stmt_num]);
     976              :               /* After swapping some operands we lost track whether an
     977              :                  operand has any pattern defs so be conservative here.  */
     978         2435 :               if ((*oprnds_info)[i]->any_pattern
     979         2435 :                   || (*oprnds_info)[i+1]->any_pattern)
     980           36 :                 (*oprnds_info)[i]->any_pattern
     981           18 :                   = (*oprnds_info)[i+1]->any_pattern = true;
     982         2435 :               swapped = true;
     983         2435 :               continue; /* Re-check operand I; it is not incremented here.  */
     984              :             }
     985              : 
     986       287488 :           if (is_a <bb_vec_info> (vinfo)
     987       271908 :               && !oprnd_info->any_pattern
     988       559160 :               && number_of_oprnds > 1)
     989              :             {
     990              :               /* Now for commutative ops we should see whether we can
     991              :                  make the other operand matching.  */
     992       103629 :               if (dump_enabled_p ())
     993          203 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
     994              :                                  "treating operand as external\n");
     995       103629 :               oprnd_info->first_dt = dt = vect_external_def;
     996              :             }
     997              :           else
     998              :             {
     999       183859 :               if (dump_enabled_p ())
    1000          411 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1001              :                                  "Build SLP failed: different types\n");
    1002       183859 :               return 1;
    1003              :             }
    1004              :         }
    1005              : 
    1006              :       /* Make sure to demote the overall operand to external.  */
    1007     16798000 :       if (dt == vect_external_def)
    1008       333490 :         oprnd_info->first_dt = vect_external_def;
    1009              :       /* For a SLP reduction chain we want to duplicate the reduction to
    1010              :          each of the chain members.  That gets us a sane SLP graph (still
    1011              :          the stmts are not 100% correct wrt the initial values).  */
    1012     16464510 :       else if ((dt == vect_internal_def
    1013     16464510 :                 || dt == vect_reduction_def)
    1014     15542617 :                && oprnd_info->first_dt == vect_reduction_def
    1015       101224 :                && !STMT_VINFO_DATA_REF (stmt_info)
    1016       101224 :                && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
    1017         4769 :                && !STMT_VINFO_DATA_REF (def_stmt_info)
    1018     16469279 :                && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
    1019              :                    == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
    1020              :         {
    1021         4769 :           oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
    1022         4769 :           oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
    1023              :         }
    1024              : 
    1025     16798000 :       ++i;
    1026              :     }
    1027              : 
    1028              :   /* Any operand swapping already happened above; only report it here.  */
    1029      9097893 :   if (swapped)
    1030              :     {
    1031        40861 :       if (dump_enabled_p ())
    1032          453 :         dump_printf_loc (MSG_NOTE, vect_location,
    1033              :                          "swapped operands to match def types in %G",
    1034              :                          stmt_info->stmt);
    1035              :     }
    1036              : 
    1037              :   return 0;
    1038              : }
    1039              : 
    1040              : /* Return true if call statements CALL1 and CALL2 are similar enough
    1041              :    to be combined into the same SLP group.  The calls must have the same
                      :    number of arguments and the same combined function, except that
                      :    with ALLOW_TWO_OPERATORS a mix of CFN_FMA and CFN_FMS is accepted.  */
    1042              : 
    1043              : bool
    1044        21185 : compatible_calls_p (gcall *call1, gcall *call2, bool allow_two_operators)
    1045              : {
    1046        21185 :   unsigned int nargs = gimple_call_num_args (call1);
    1047        21185 :   if (nargs != gimple_call_num_args (call2))
    1048              :     return false;
    1049              : 
    1050        19234 :   auto cfn1 = gimple_call_combined_fn (call1);
    1051        19234 :   auto cfn2 = gimple_call_combined_fn (call2);
    1052        19234 :   if (cfn1 != cfn2
    1053            2 :       && (!allow_two_operators
    1054            2 :           || !((cfn1 == CFN_FMA || cfn1 == CFN_FMS)
    1055            2 :                && (cfn2 == CFN_FMA || cfn2 == CFN_FMS))))
    1056              :     return false;
    1057              : 
    1058        19234 :   if (gimple_call_internal_p (call1))
    1059              :     { /* Internal functions: the result and all argument types must agree.  */
    1060         7031 :       if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
    1061         7031 :                                TREE_TYPE (gimple_call_lhs (call2))))
    1062              :         return false;
    1063        14476 :       for (unsigned int i = 0; i < nargs; ++i)
    1064         7445 :         if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
    1065         7445 :                                  TREE_TYPE (gimple_call_arg (call2, i))))
    1066              :           return false;
    1067              :     }
    1068              :   else
    1069              :     { /* Other calls: the called function and the function type must agree.  */
    1070        12203 :       if (!operand_equal_p (gimple_call_fn (call1),
    1071        12203 :                             gimple_call_fn (call2), 0))
    1072              :         return false;
    1073              : 
    1074        26688 :       if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
    1075              :         return false;
    1076              :     }
    1077              : 
    1078              :   /* Check that any unvectorized arguments are equal, that is the arguments
                      :      of CALL1 that its operand map does not list.
                      :      NOTE(review): the scan only matches map entries that appear in
                      :      increasing argument order; confirm this holds for every map that
                      :      can reach this point.  */
    1079        15927 :   if (const int *map = vect_get_operand_map (call1, false, false))
    1080              :     {
    1081           15 :       unsigned int nkept = *map++;
    1082           15 :       unsigned int mapi = 0;
    1083           57 :       for (unsigned int i = 0; i < nargs; ++i)
    1084           42 :         if (mapi < nkept && map[mapi] == int (i))
    1085           27 :           mapi += 1;
    1086           15 :         else if (!operand_equal_p (gimple_call_arg (call1, i),
    1087           15 :                                    gimple_call_arg (call2, i)))
    1088              :           return false;
    1089              :     }
    1090              : 
    1091              :   return true;
    1092              : }
    1093              : 
    1094              : /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
    1095              :    caller's attempt to find the vector type in STMT_INFO with the narrowest
    1096              :    element type.  Return true if VECTYPE is nonnull and if it is valid
    1097              :    for STMT_INFO.  When returning true, update MAX_NUNITS to reflect the
    1098              :    number of units in VECTYPE.  GROUP_SIZE and MAX_NUNITS are as for
    1099              :    vect_build_slp_tree.  When VINFO is a bb_vec_info, VECTYPE is rejected
                      :    if GROUP_SIZE is not a multiple of its number of vector subparts.
                      :    *MAX_NUNITS is left untouched when returning false.  */
    1100              : 
    1101              : static bool
    1102      5500535 : vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
    1103              :                         unsigned int group_size,
    1104              :                         tree vectype, poly_uint64 *max_nunits)
    1105              : {
    1106      5500535 :   if (!vectype)
    1107              :     {
    1108         3884 :       if (dump_enabled_p ())
    1109            7 :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1110              :                          "Build SLP failed: unsupported data-type in %G\n",
    1111              :                          stmt_info->stmt);
    1112              :       /* Fatal mismatch.  */
    1113         3884 :       return false;
    1114              :     }
    1115              : 
    1116              :   /* If populating the vector type requires unrolling then fail
    1117              :      before adjusting *max_nunits for basic-block vectorization.  */
    1118      5496651 :   if (is_a <bb_vec_info> (vinfo)
    1119      5496651 :       && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
    1120              :     {
    1121       141146 :       if (dump_enabled_p ())
    1122           36 :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1123              :                          "Build SLP failed: unrolling required "
    1124              :                          "in basic block SLP\n");
    1125              :       /* Fatal mismatch.  */
    1126       141146 :       return false;
    1127              :     }
    1128              : 
    1129              :   /* In case of multiple types we need to detect the smallest type.  */
    1130      5355505 :   vect_update_max_nunits (max_nunits, vectype);
    1131      5355505 :   return true;
    1132              : }
    1133              : 
    1134              : /* Verify if the scalar stmts STMTS are isomorphic, require data
    1135              :    permutation or are of unsupported types of operation.  Return
    1136              :    true if they are, otherwise return false and indicate in *MATCHES
    1137              :    which stmts are not isomorphic to the first one.  If MATCHES[0]
    1138              :    is false then this indicates the comparison could not be
    1139              :    carried out or the stmts will never be vectorized by SLP.
    1140              : 
    1141              :    Note COND_EXPR is possibly isomorphic to another one after swapping its
    1142              :    operands.  Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
    1143              :    the first stmt by swapping the two operands of comparison; set SWAP[i]
    1144              :    to 2 if stmt I is isomorphic to the first stmt by inverting the code
    1145              :    of comparison.  Take A1 >= B1 ? X1 : Y1 as an example, it can be swapped
    1146              :    to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1.  */
    1147              : 
    1148              : static bool
    1149      5766336 : vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
    1150              :                        vec<stmt_vec_info> stmts, unsigned int group_size,
    1151              :                        poly_uint64 *max_nunits, bool *matches,
    1152              :                        bool *two_operators, tree *node_vectype)
    1153              : {
    1154      5766336 :   unsigned int i;
    1155      5766336 :   stmt_vec_info first_stmt_info = stmts[0];
    1156      5766336 :   code_helper first_stmt_code = ERROR_MARK;
    1157      5766336 :   code_helper alt_stmt_code = ERROR_MARK;
    1158      5766336 :   code_helper first_cond_code = ERROR_MARK;
    1159      5766336 :   bool need_same_oprnds = false;
    1160      5766336 :   tree first_lhs = NULL_TREE;
    1161      5766336 :   tree first_op1 = NULL_TREE;
    1162      5766336 :   stmt_vec_info first_load = NULL, prev_first_load = NULL;
    1163      5766336 :   bool first_stmt_ldst_p = false, first_stmt_ldst_masklen_p = false;
    1164      5766336 :   bool first_stmt_phi_p = false;
    1165      5766336 :   int first_reduc_idx = -1;
    1166      5766336 :   bool maybe_soft_fail = false;
    1167      5766336 :   tree soft_fail_nunits_vectype = NULL_TREE;
    1168              : 
    1169      5766336 :   tree vectype, nunits_vectype;
    1170      5766336 :   if (!vect_get_vector_types_for_stmt (vinfo, first_stmt_info, &vectype,
    1171              :                                        &nunits_vectype, group_size))
    1172              :     {
    1173              :       /* Fatal mismatch.  */
    1174       207015 :       matches[0] = false;
    1175       207015 :       return false;
    1176              :     }
    1177      5559321 :   if (is_a <bb_vec_info> (vinfo)
    1178      5559321 :       && known_le (TYPE_VECTOR_SUBPARTS (vectype), 1U))
    1179              :     {
    1180       358738 :       if (dump_enabled_p ())
    1181          296 :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1182              :                          "Build SLP failed: not using single lane "
    1183              :                          "vector type %T\n", vectype);
    1184       358738 :       matches[0] = false;
    1185       358738 :       return false;
    1186              :     }
    1187              :   /* Record nunits required but continue analysis, producing matches[]
    1188              :      as if nunits was not an issue.  This allows splitting of groups
    1189              :      to happen.  */
    1190      5200583 :   if (nunits_vectype
    1191      5200583 :       && !vect_record_max_nunits (vinfo, first_stmt_info, group_size,
    1192              :                                   nunits_vectype, max_nunits))
    1193              :     {
    1194       141146 :       gcc_assert (is_a <bb_vec_info> (vinfo));
    1195       141146 :       maybe_soft_fail = true;
    1196       141146 :       soft_fail_nunits_vectype = nunits_vectype;
    1197              :     }
    1198              : 
    1199      5200583 :   gcc_assert (vectype || !gimple_get_lhs (first_stmt_info->stmt));
    1200      5200583 :   *node_vectype = vectype;
    1201              : 
    1202              :   /* For every stmt in NODE find its def stmt/s.  */
    1203      5200583 :   stmt_vec_info stmt_info;
    1204     22191460 :   FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    1205              :     {
    1206     17153412 :       bool ldst_p = false;
    1207     17153412 :       bool ldst_masklen_p = false;
    1208     17153412 :       bool phi_p = false;
    1209     17153412 :       code_helper rhs_code = ERROR_MARK;
    1210              : 
    1211     17153412 :       swap[i] = 0;
    1212     17153412 :       matches[i] = false;
    1213     17153412 :       if (!stmt_info)
    1214              :         {
    1215        40707 :           matches[i] = true;
    1216     17031584 :           continue;
    1217              :         }
    1218              : 
    1219     17112705 :       gimple *stmt = stmt_info->stmt;
    1220     17112705 :       if (dump_enabled_p ())
    1221       220005 :         dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
    1222              : 
    1223              :       /* Fail to vectorize statements that are marked as unvectorizable,
    1224              :          can throw internally or have volatile operands.  */
    1225     17112705 :       if (!STMT_VINFO_VECTORIZABLE (stmt_info)
    1226     16922701 :           || stmt_can_throw_internal (cfun, stmt)
    1227     33241732 :           || gimple_has_volatile_ops (stmt))
    1228              :         {
    1229       195503 :           if (dump_enabled_p ())
    1230          199 :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1231              :                              "Build SLP failed: unvectorizable statement %G",
    1232              :                              stmt);
    1233              :           /* ???  For BB vectorization we want to commute operands in a way
    1234              :              to shuffle all unvectorizable defs into one operand and have
    1235              :              the other still vectorized.  The following doesn't reliably
    1236              :              work for this though but it's the easiest we can do here.  */
    1237       195503 :           if (is_a <bb_vec_info> (vinfo) && i != 0)
    1238        64496 :             continue;
    1239              :           /* Fatal mismatch.  */
    1240       131007 :           matches[0] = false;
    1241       131007 :           return false;
    1242              :         }
    1243              : 
    1244     16917202 :       gcall *call_stmt = dyn_cast <gcall *> (stmt);
    1245     16917202 :       tree lhs = gimple_get_lhs (stmt);
    1246     16917202 :       if (lhs == NULL_TREE && !call_stmt)
    1247              :         {
    1248           36 :           if (dump_enabled_p ())
    1249            0 :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1250              :                              "Build SLP failed: not GIMPLE_ASSIGN nor "
    1251              :                              "GIMPLE_CALL %G", stmt);
    1252           36 :           if (is_a <bb_vec_info> (vinfo) && i != 0)
    1253           36 :             continue;
    1254              :           /* Fatal mismatch.  */
    1255            0 :           matches[0] = false;
    1256            0 :           return false;
    1257              :         }
    1258              : 
    1259     16917166 :       if (call_stmt)
    1260              :         {
    1261       102521 :           combined_fn cfn = gimple_call_combined_fn (call_stmt);
    1262       102521 :           if (cfn != CFN_LAST && cfn != CFN_MASK_CALL)
    1263        58793 :             rhs_code = cfn;
    1264              :           else
    1265              :             rhs_code = CALL_EXPR;
    1266              : 
    1267       102521 :           if (cfn == CFN_GATHER_LOAD
    1268       102521 :               || cfn == CFN_SCATTER_STORE)
    1269              :             ldst_p = true;
    1270              :           else if (cfn == CFN_MASK_LOAD
    1271              :                    || cfn == CFN_MASK_GATHER_LOAD
    1272              :                    || cfn == CFN_MASK_LEN_GATHER_LOAD
    1273              :                    || cfn == CFN_MASK_SCATTER_STORE
    1274              :                    || cfn == CFN_MASK_LEN_SCATTER_STORE)
    1275              :             {
    1276              :               ldst_p = true;
    1277              :               ldst_masklen_p = true;
    1278              :             }
    1279              :           else if (cfn == CFN_MASK_STORE)
    1280              :             {
    1281              :               ldst_p = true;
    1282              :               ldst_masklen_p = true;
    1283              :               rhs_code = CFN_MASK_STORE;
    1284              :             }
    1285              :           else if (cfn == CFN_GOMP_SIMD_LANE)
    1286              :             ;
    1287        90807 :           else if ((cfn != CFN_LAST
    1288              :                     && cfn != CFN_MASK_CALL
    1289        47079 :                     && internal_fn_p (cfn)
    1290        36947 :                     && !vectorizable_internal_fn_p (as_internal_fn (cfn)))
    1291        90732 :                    || gimple_call_tail_p (call_stmt)
    1292        90732 :                    || gimple_call_noreturn_p (call_stmt)
    1293       181539 :                    || gimple_call_chain (call_stmt))
    1294              :             {
    1295          424 :               if (dump_enabled_p ())
    1296           13 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1297              :                                  "Build SLP failed: unsupported call type %G",
    1298              :                                  (gimple *) call_stmt);
    1299          424 :               if (is_a <bb_vec_info> (vinfo) && i != 0)
    1300           64 :                 continue;
    1301              :               /* Fatal mismatch.  */
    1302          360 :               matches[0] = false;
    1303          360 :               return false;
    1304              :             }
    1305              :         }
    1306     16814645 :       else if (gimple_code (stmt) == GIMPLE_PHI)
    1307              :         {
    1308              :           rhs_code = ERROR_MARK;
    1309              :           phi_p = true;
    1310              :         }
    1311              :       else
    1312              :         {
    1313     16020971 :           rhs_code = gimple_assign_rhs_code (stmt);
    1314     16020971 :           ldst_p = STMT_VINFO_DATA_REF (stmt_info) != nullptr;
    1315              :         }
    1316              : 
    1317              :       /* Check the operation.  */
    1318     16916742 :       if (i == 0)
    1319              :         {
    1320      5069216 :           first_lhs = lhs;
    1321      5069216 :           first_stmt_code = rhs_code;
    1322      5069216 :           first_stmt_ldst_p = ldst_p;
    1323      5069216 :           first_stmt_ldst_masklen_p = ldst_masklen_p;
    1324      5069216 :           first_stmt_phi_p = phi_p;
    1325      5069216 :           first_reduc_idx = STMT_VINFO_REDUC_IDX (stmt_info);
    1326              : 
    1327              :           /* Shift arguments should be equal in all the packed stmts for a
    1328              :              vector shift with scalar shift operand.  */
    1329      5069216 :           if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
    1330      4935443 :               || rhs_code == LROTATE_EXPR
    1331     10004587 :               || rhs_code == RROTATE_EXPR)
    1332              :             {
    1333              :               /* First see if we have a vector/vector shift.  */
    1334       134228 :               if (!directly_supported_p (rhs_code, vectype, optab_vector))
    1335              :                 {
    1336              :                   /* No vector/vector shift, try for a vector/scalar shift.  */
    1337       122213 :                   if (!directly_supported_p (rhs_code, vectype, optab_scalar))
    1338              :                     {
    1339        11866 :                       if (dump_enabled_p ())
    1340          386 :                         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1341              :                                          "Build SLP failed: "
    1342              :                                          "op not supported by target.\n");
    1343        11866 :                       if (is_a <bb_vec_info> (vinfo) && i != 0)
    1344              :                         continue;
    1345              :                       /* Fatal mismatch.  */
    1346        11866 :                       matches[0] = false;
    1347        11866 :                       return false;
    1348              :                     }
    1349       110347 :                   need_same_oprnds = true;
    1350       110347 :                   first_op1 = gimple_assign_rhs2 (stmt);
    1351              :                 }
    1352              :             }
    1353      4934988 :           else if (rhs_code == WIDEN_LSHIFT_EXPR)
    1354              :             {
    1355            0 :               need_same_oprnds = true;
    1356            0 :               first_op1 = gimple_assign_rhs2 (stmt);
    1357              :             }
    1358      4934988 :           else if (!ldst_p
    1359      4934988 :                    && rhs_code == BIT_FIELD_REF)
    1360              :             {
    1361         5504 :               tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
    1362         5504 :               if (!is_a <bb_vec_info> (vinfo)
    1363         5378 :                   || TREE_CODE (vec) != SSA_NAME
    1364              :                   /* When the element types are not compatible we pun the
    1365              :                      source to the target vectype which requires equal size.  */
    1366        10870 :                   || ((!VECTOR_TYPE_P (TREE_TYPE (vec))
    1367         4649 :                        || !types_compatible_p (TREE_TYPE (vectype),
    1368         4649 :                                                TREE_TYPE (TREE_TYPE (vec))))
    1369         1007 :                       && !operand_equal_p (TYPE_SIZE (vectype),
    1370         1007 :                                            TYPE_SIZE (TREE_TYPE (vec)))))
    1371              :                 {
    1372          753 :                   if (dump_enabled_p ())
    1373            0 :                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1374              :                                      "Build SLP failed: "
    1375              :                                      "BIT_FIELD_REF not supported\n");
    1376              :                   /* Fatal mismatch.  */
    1377          753 :                   matches[0] = false;
    1378          753 :                   return false;
    1379              :                 }
    1380              :             }
    1381      4929484 :           else if (rhs_code == CFN_DIV_POW2)
    1382              :             {
    1383            0 :               need_same_oprnds = true;
    1384            0 :               first_op1 = gimple_call_arg (call_stmt, 1);
    1385              :             }
    1386      4929484 :           else if (rhs_code == CFN_GOMP_SIMD_LANE)
    1387              :             {
    1388         3153 :               need_same_oprnds = true;
    1389         3153 :               first_op1 = gimple_call_arg (call_stmt, 1);
    1390              :             }
    1391              :         }
    1392              :       else
    1393              :         {
    1394     11847526 :           int comm_arg;
    1395     11847904 :           if (first_reduc_idx != STMT_VINFO_REDUC_IDX (stmt_info)
    1396              :               /* For SLP reduction groups the index isn't necessarily
    1397              :                  uniform but only that of the first stmt matters.  */
    1398         2296 :               && !(first_reduc_idx != -1
    1399         2296 :                    && STMT_VINFO_REDUC_IDX (stmt_info) != -1
    1400         2296 :                    && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
    1401     11847526 :               && !(first_reduc_idx != -1
    1402         1049 :                    && STMT_VINFO_REDUC_IDX (stmt_info) != -1
    1403         1049 :                    && (comm_arg = first_commutative_argument
    1404         1049 :                                     (rhs_code, TREE_TYPE (lhs))) >= 0
    1405              :                    && (first_reduc_idx
    1406          815 :                        == 2 * comm_arg + 1 - STMT_VINFO_REDUC_IDX (stmt_info))))
    1407              :             {
    1408          378 :               if (dump_enabled_p ())
    1409              :                 {
    1410           12 :                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1411              :                                    "Build SLP failed: different reduc_idx "
    1412              :                                    "%d instead of %d in %G",
    1413              :                                    STMT_VINFO_REDUC_IDX (stmt_info),
    1414              :                                    first_reduc_idx, stmt);
    1415              :                 }
    1416              :               /* Mismatch.  */
    1417          378 :               continue;
    1418              :             }
    1419     11847148 :           if (!ldst_p
    1420      9300700 :               && first_stmt_code != rhs_code
    1421     13270958 :               && alt_stmt_code == ERROR_MARK)
    1422              :             alt_stmt_code = rhs_code;
    1423     13247114 :           if ((!ldst_p
    1424      9300700 :                && first_stmt_code != rhs_code
    1425      1423810 :                && (first_stmt_code != IMAGPART_EXPR
    1426          127 :                    || rhs_code != REALPART_EXPR)
    1427      1423790 :                && (first_stmt_code != REALPART_EXPR
    1428          531 :                    || rhs_code != IMAGPART_EXPR)
    1429              :                /* Handle mismatches in plus/minus by computing both
    1430              :                   and merging the results.  */
    1431      1423779 :                && !((((first_stmt_code == PLUS_EXPR
    1432      1311519 :                        || first_stmt_code == MINUS_EXPR)
    1433       139578 :                       && (alt_stmt_code == PLUS_EXPR
    1434       131005 :                           || alt_stmt_code == MINUS_EXPR))
    1435      1394774 :                      || ((first_stmt_code == CFN_FMA
    1436      1394772 :                           || first_stmt_code == CFN_FMS)
    1437            2 :                          && (alt_stmt_code == CFN_FMA
    1438            2 :                              || alt_stmt_code == CFN_FMS)))
    1439        29007 :                     && rhs_code == alt_stmt_code)
    1440      1434654 :                && !(first_stmt_code.is_tree_code ()
    1441      1317296 :                     && rhs_code.is_tree_code ()
    1442      1221624 :                     && (TREE_CODE_CLASS (tree_code (first_stmt_code))
    1443              :                         == tcc_comparison)
    1444       127274 :                     && (swap_tree_comparison (tree_code (first_stmt_code))
    1445       127274 :                         == tree_code (rhs_code))
    1446              :                     && (first_reduc_idx == -1
    1447            0 :                         || REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
    1448              :               || (ldst_p
    1449      5092896 :                   && (STMT_VINFO_GROUPED_ACCESS (stmt_info)
    1450      2546448 :                       != STMT_VINFO_GROUPED_ACCESS (first_stmt_info)))
    1451              :               || (ldst_p
    1452      2501940 :                   && (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
    1453      2501940 :                       != STMT_VINFO_GATHER_SCATTER_P (first_stmt_info)))
    1454     10447329 :               || first_stmt_ldst_p != ldst_p
    1455     10447190 :               || (ldst_p && first_stmt_ldst_masklen_p != ldst_masklen_p)
    1456     22294330 :               || first_stmt_phi_p != phi_p)
    1457              :             {
    1458      1399966 :               if (dump_enabled_p ())
    1459              :                 {
    1460         3130 :                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1461              :                                    "Build SLP failed: different operation "
    1462              :                                    "in stmt %G", stmt);
    1463         3130 :                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1464              :                                    "original stmt %G", first_stmt_info->stmt);
    1465              :                 }
    1466              :               /* Mismatch.  */
    1467      1399966 :               continue;
    1468              :             }
    1469              : 
    1470     10449252 :           if (!ldst_p
    1471      7945377 :               && first_stmt_code == BIT_FIELD_REF
    1472     10452435 :               && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
    1473         5253 :                   != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
    1474              :             {
    1475         2070 :               if (dump_enabled_p ())
    1476           40 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1477              :                                  "Build SLP failed: different BIT_FIELD_REF "
    1478              :                                  "arguments in %G", stmt);
    1479              :               /* Mismatch.  */
    1480         2070 :               continue;
    1481              :             }
    1482              : 
    1483     10445112 :           if (call_stmt
    1484        22165 :               && first_stmt_code != CFN_MASK_LOAD
    1485     10466713 :               && first_stmt_code != CFN_MASK_STORE)
    1486              :             {
    1487        21185 :               if (!is_a <gcall *> (stmts[0]->stmt)
    1488        21185 :                   || !compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
    1489              :                                           call_stmt, true))
    1490              :                 {
    1491         5258 :                   if (dump_enabled_p ())
    1492            0 :                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1493              :                                      "Build SLP failed: different calls in %G",
    1494              :                                      stmt);
    1495              :                   /* Mismatch.  */
    1496         5258 :                   continue;
    1497              :                 }
    1498              :             }
    1499              : 
    1500     10253672 :           if ((phi_p || gimple_could_trap_p (stmt_info->stmt))
    1501     11238315 :               && (gimple_bb (first_stmt_info->stmt)
    1502       984643 :                   != gimple_bb (stmt_info->stmt)))
    1503              :             {
    1504        27295 :               if (dump_enabled_p ())
    1505            8 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1506              :                                  "Build SLP failed: different BB for PHI "
    1507              :                                  "or possibly trapping operation in %G", stmt);
    1508              :               /* Mismatch.  */
    1509        27295 :               continue;
    1510              :             }
    1511              : 
    1512     10412559 :           if (need_same_oprnds)
    1513              :             {
    1514        52873 :               tree other_op1 = gimple_arg (stmt, 1);
    1515        52873 :               if (!operand_equal_p (first_op1, other_op1, 0))
    1516              :                 {
    1517         5503 :                   if (dump_enabled_p ())
    1518          123 :                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1519              :                                      "Build SLP failed: different shift "
    1520              :                                      "arguments in %G", stmt);
    1521              :                   /* Mismatch.  */
    1522         5503 :                   continue;
    1523              :                 }
    1524              :             }
    1525              : 
    1526     10407793 :           if (first_lhs
    1527     10407056 :               && lhs
    1528     10407056 :               && !types_compatible_p (TREE_TYPE (lhs), TREE_TYPE (first_lhs)))
    1529              :             {
    1530          737 :               if (dump_enabled_p ())
    1531            6 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1532              :                                  "Build SLP failed: different vector type "
    1533              :                                  "in %G", stmt);
    1534              :               /* Mismatch.  */
    1535          737 :               continue;
    1536              :             }
    1537              :         }
    1538              : 
    1539              :       /* Grouped store or load.  */
    1540     15462916 :       if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    1541              :         {
    1542      3864258 :           gcc_assert (ldst_p);
    1543      3864258 :           if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
    1544              :             {
    1545              :               /* Store.  */
    1546      3028510 :               gcc_assert (rhs_code == CFN_MASK_STORE
    1547              :                           || REFERENCE_CLASS_P (lhs)
    1548              :                           || DECL_P (lhs));
    1549              :             }
    1550              :           else
    1551              :             {
    1552              :               /* Load.  */
    1553       835748 :               first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
    1554       835748 :               if (prev_first_load)
    1555              :                 {
    1556              :                   /* Check that there are no loads from different interleaving
    1557              :                      chains in the same node.  */
    1558       380380 :                   if (prev_first_load != first_load)
    1559              :                     {
    1560        54189 :                       if (dump_enabled_p ())
    1561         1994 :                         dump_printf_loc (MSG_MISSED_OPTIMIZATION,
    1562              :                                          vect_location,
    1563              :                                          "Build SLP failed: different "
    1564              :                                          "interleaving chains in one node %G",
    1565              :                                          stmt);
    1566              :                       /* Mismatch.  */
    1567        54189 :                       continue;
    1568              :                     }
    1569              :                 }
    1570              :               else
    1571              :                 prev_first_load = first_load;
    1572              :            }
    1573              :         }
    1574              :       /* Non-grouped store or load.  */
    1575     11598658 :       else if (ldst_p)
    1576              :         {
    1577       887228 :           if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))
    1578       616527 :               && rhs_code != CFN_GATHER_LOAD
    1579              :               && rhs_code != CFN_MASK_GATHER_LOAD
    1580              :               && rhs_code != CFN_MASK_LEN_GATHER_LOAD
    1581              :               && rhs_code != CFN_SCATTER_STORE
    1582              :               && rhs_code != CFN_MASK_SCATTER_STORE
    1583              :               && rhs_code != CFN_MASK_LEN_SCATTER_STORE
    1584       616527 :               && !STMT_VINFO_GATHER_SCATTER_P (stmt_info)
    1585              :               /* Not grouped loads are handled as externals for BB
    1586              :                  vectorization.  For loop vectorization we can handle
    1587              :                  splats the same way we handle single element interleaving.
    1588              :                  Likewise we can handle a collection of invariant refs.  */
    1589      1484934 :               && (is_a <bb_vec_info> (vinfo)
    1590       597706 :                   || (stmt_info != first_stmt_info
    1591        68088 :                   && !(integer_zerop (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)))
    1592          241 :                       && integer_zerop (DR_STEP (STMT_VINFO_DATA_REF
    1593              :                                                          (first_stmt_info)))))))
    1594              :             {
    1595              :               /* Not grouped load.  */
    1596        67606 :               if (dump_enabled_p ())
    1597          145 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1598              :                                  "Build SLP failed: not grouped load %G", stmt);
    1599              : 
    1600        67606 :               if (i != 0)
    1601        67606 :                 continue;
    1602              :               /* Fatal mismatch.  */
    1603            0 :               matches[0] = false;
    1604            0 :               return false;
    1605              :             }
    1606              :         }
    1607              :       /* Not memory operation.  */
    1608              :       else
    1609              :         {
    1610     10711430 :           if (!phi_p
    1611     10042837 :               && rhs_code.is_tree_code ()
    1612      9994236 :               && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_binary
    1613      1517045 :               && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_unary
    1614       939313 :               && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_expression
    1615       877526 :               && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_comparison
    1616        64319 :               && rhs_code != VIEW_CONVERT_EXPR
    1617              :               && rhs_code != CALL_EXPR
    1618              :               && rhs_code != BIT_FIELD_REF
    1619     10711430 :               && rhs_code != SSA_NAME)
    1620              :             {
    1621        18549 :               if (dump_enabled_p ())
    1622            7 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1623              :                                  "Build SLP failed: operation unsupported %G",
    1624              :                                  stmt);
    1625        18549 :               if (is_a <bb_vec_info> (vinfo) && i != 0)
    1626            0 :                 continue;
    1627              :               /* Fatal mismatch.  */
    1628        18549 :               matches[0] = false;
    1629        18549 :               return false;
    1630              :             }
    1631              : 
    1632     10692881 :           if (rhs_code == COND_EXPR)
    1633              :             {
    1634        58935 :               tree cond_expr = gimple_assign_rhs1 (stmt);
    1635        58935 :               enum tree_code cond_code = TREE_CODE (cond_expr);
    1636        58935 :               enum tree_code swap_code = ERROR_MARK;
    1637        58935 :               enum tree_code invert_code = ERROR_MARK;
    1638              : 
    1639        58935 :               if (i == 0)
    1640        49846 :                 first_cond_code = TREE_CODE (cond_expr);
    1641         9089 :               else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
    1642              :                 {
    1643            0 :                   bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
    1644            0 :                   swap_code = swap_tree_comparison (cond_code);
    1645            0 :                   invert_code = invert_tree_comparison (cond_code, honor_nans);
    1646              :                 }
    1647              : 
    1648        58935 :               if (first_cond_code == cond_code)
    1649              :                 ;
    1650              :               /* Isomorphic can be achieved by swapping.  */
    1651            0 :               else if (first_cond_code == swap_code)
    1652            0 :                 swap[i] = 1;
    1653              :               /* Isomorphic can be achieved by inverting.  */
    1654            0 :               else if (first_cond_code == invert_code)
    1655            0 :                 swap[i] = 2;
    1656              :               else
    1657              :                 {
    1658            0 :                   if (dump_enabled_p ())
    1659            0 :                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1660              :                                      "Build SLP failed: different"
    1661              :                                      " operation %G", stmt);
    1662              :                   /* Mismatch.  */
    1663            0 :                   continue;
    1664              :                 }
    1665              :             }
    1666              : 
    1667     10692881 :           if (i != 0
    1668      7905765 :               && first_stmt_code != rhs_code
    1669        68487 :               && first_stmt_code.is_tree_code ()
    1670        68485 :               && rhs_code.is_tree_code ()
    1671        68485 :               && TREE_CODE_CLASS ((tree_code)first_stmt_code) == tcc_comparison
    1672     10732562 :               && (swap_tree_comparison ((tree_code)first_stmt_code)
    1673        39681 :                   == (tree_code)rhs_code))
    1674        39681 :             swap[i] = 1;
    1675              : 
    1676     10692881 :           if (i != 0
    1677      7905765 :               && first_reduc_idx != STMT_VINFO_REDUC_IDX (stmt_info)
    1678         1648 :               && first_reduc_idx != -1
    1679         1648 :               && STMT_VINFO_REDUC_IDX (stmt_info) != -1
    1680         1648 :               && rhs_code.is_tree_code ()
    1681         1640 :               && commutative_tree_code (tree_code (rhs_code))
    1682     10694519 :               && first_reduc_idx == 1 - STMT_VINFO_REDUC_IDX (stmt_info))
    1683         1638 :             swap[i] = 1;
    1684              :         }
    1685              : 
    1686     15322572 :       matches[i] = true;
    1687              :     }
    1688              : 
    1689     20369350 :   for (i = 0; i < group_size; ++i)
    1690     16028430 :     if (!matches[i])
    1691              :       return false;
    1692              : 
    1693              :   /* If we allowed a two-operation SLP node verify the target can cope
    1694              :      with the permute we are going to use.  */
    1695      4340920 :   if (alt_stmt_code != ERROR_MARK
    1696      4340920 :       && (!alt_stmt_code.is_tree_code ()
    1697        53539 :           || (TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_reference
    1698        53539 :               && TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_comparison)))
    1699              :     {
    1700        14449 :       *two_operators = true;
    1701              :     }
    1702              : 
    1703      4340920 :   if (maybe_soft_fail)
    1704              :     {
    1705       140738 :       unsigned HOST_WIDE_INT const_nunits;
    1706       140738 :       if (!TYPE_VECTOR_SUBPARTS
    1707       140738 :             (soft_fail_nunits_vectype).is_constant (&const_nunits)
    1708       140738 :           || const_nunits > group_size)
    1709            0 :         matches[0] = false;
    1710              :       else
    1711              :         {
    1712              :           /* With constant vector elements simulate a mismatch at the
    1713              :              point we need to split.  */
    1714       140738 :           unsigned tail = group_size & (const_nunits - 1);
    1715       140738 :           memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
    1716              :         }
    1717       140738 :       return false;
    1718              :     }
    1719              : 
    1720              :   return true;
    1721              : }
    1722              : 
    1723              : /* Hash traits for scalar_stmts_to_slp_tree_map_t, which is keyed on the
    1724              :    vector of scalar stmts an SLP node is built from.
    1725              :    Note we never remove apart from at destruction time so we do not
                      :    need a special value for deleted that differs from empty; both
                      :    are represented by a vector for which exists () is false.
                      :    hash () combines, in lane order, the gimple UID of each lane's stmt
                      :    and equal () requires the same length and identical stmt_vec_info
                      :    pointers in every lane.  */
    1726              : struct bst_traits
    1727              : {
    1728              :   typedef vec <stmt_vec_info> value_type;
    1729              :   typedef vec <stmt_vec_info> compare_type;
    1730              :   static inline hashval_t hash (value_type);
    1731              :   static inline bool equal (value_type existing, value_type candidate);
    1732    478395520 :   static inline bool is_empty (value_type x) { return !x.exists (); }
    1733    107128487 :   static inline bool is_deleted (value_type x) { return !x.exists (); }
    1734              :   static const bool empty_zero_p = true;
    1735            0 :   static inline void mark_empty (value_type &x) { x.release (); }
    1736              :   static inline void mark_deleted (value_type &x) { x.release (); }
    1737      9211504 :   static inline void remove (value_type &x) { x.release (); }
    1738              : };
    1739              : inline hashval_t
    1740     93332974 : bst_traits::hash (value_type x)
    1741              : {
    1742     93332974 :   inchash::hash h;
    1743    422945170 :   for (unsigned i = 0; i < x.length (); ++i)
    1744    329612196 :     h.add_int (x[i] ? gimple_uid (x[i]->stmt) : -1); /* NULL lanes hash as -1.  */
    1745     93332974 :   return h.end ();
    1746              : }
    1747              : inline bool
    1748     81629928 : bst_traits::equal (value_type existing, value_type candidate)
    1749              : {
    1750    244889784 :   if (existing.length () != candidate.length ())
    1751              :     return false;
    1752     83007214 :   for (unsigned i = 0; i < existing.length (); ++i)
    1753     78688398 :     if (existing[i] != candidate[i]) /* Pointer identity, lane by lane.  */
    1754              :       return false;
    1755              :   return true;
    1756              : }
    1757              : 
    1758              : typedef hash_map <vec <stmt_vec_info>, slp_tree,
    1759              :                   simple_hashmap_traits <bst_traits, slp_tree> >
    1760              :   scalar_stmts_to_slp_tree_map_t; /* Vector of scalar stmts -> SLP node.  */
    1761              : 
    1762              : /* Release BST_MAP: call vect_free_slp_tree on every non-NULL SLP node
                      :    stored as a value in the map and then delete the map object itself.
                      :    BST_MAP must not be used after this call.  */
    1763              : 
    1764              : static void
    1765      1790680 : release_scalar_stmts_to_slp_tree_map (scalar_stmts_to_slp_tree_map_t *bst_map)
    1766              : {
    1767              :   /* The map keeps a reference on SLP nodes built, release that.  */
    1768     11002184 :   for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
    1769     20213688 :        it != bst_map->end (); ++it)
    1770      9211504 :     if ((*it).second)
    1771      9211504 :       vect_free_slp_tree ((*it).second);
    1772      1790680 :   delete bst_map;
    1773      1790680 : }
    1774              : 
    1775              : /* One leaf operand of a linearized associatable chain as recorded by
                      :    vect_slp_linearize_chain: OP is the operand, DT its vect_def_type
                      :    and CODE the operation code with which it enters the chain.
                      :    ???  This was std::pair<std::pair<tree_code, vect_def_type>, tree>
    1776              :    but then vec::insert does memmove and that's not compatible with
    1777              :    std::pair.  */
    1778              : struct chain_op_t
    1779              : {
    1780      3708863 :   chain_op_t (tree_code code_, vect_def_type dt_, tree op_)
    1781      3708863 :       : code (code_), dt (dt_), op (op_) {}
    1782              :   tree_code code;
    1783              :   vect_def_type dt;
    1784              :   tree op;
    1785              : };
    1786              : 
    1787              : /* Comparator for sorting associatable chains.  OP1_ and OP2_ point to
                      :    chain_op_t entries which are ordered by their DT first and by their
                      :    CODE second; the operand itself does not take part in the
                      :    comparison.  Returns a negative, zero or positive value as *OP1_
                      :    sorts before, equal to or after *OP2_.  The third argument is
                      :    unused.  */
    1788              : 
    1789              : static int
    1790      8251107 : dt_sort_cmp (const void *op1_, const void *op2_, void *)
    1791              : {
    1792      8251107 :   auto *op1 = (const chain_op_t *) op1_;
    1793      8251107 :   auto *op2 = (const chain_op_t *) op2_;
    1794      8251107 :   if (op1->dt != op2->dt)
    1795       940598 :     return (int)op1->dt - (int)op2->dt;
    1796      7310509 :   return (int)op1->code - (int)op2->code;
    1797              : }
    1798              : 
    1799              : /* Linearize the associatable expression chain at START with the
    1800              :    associatable operation CODE (where PLUS_EXPR also allows MINUS_EXPR),
    1801              :    filling CHAIN with the result and using WORKLIST as intermediate storage.
    1802              :    CODE_STMT and ALT_CODE_STMT are filled with the first stmt using CODE
    1803              :    or MINUS_EXPR.  *CHAIN_STMTS if not NULL is filled with all computation
    1804              :    stmts, starting with START.  When ALLOW_ALT_CODE is false, do not
    1805              :    follow into MINUS_EXPR when building a PLUS chain (treat MINUS as leaf).
                      :    CODE_STMT and ALT_CODE_STMT are only written while they are still
                      :    NULL, so non-NULL values passed in by the caller are preserved.
                      :    Each CHAIN entry records a leaf operand, its vect_def_type and the
                      :    code with which it enters the chain; for a PLUS chain that is
                      :    MINUS_EXPR when the operand is subtracted an odd number of times
                      :    on the path from START and PLUS_EXPR otherwise.  An operand is
                      :    followed into rather than recorded as leaf only if it is a
                      :    vect_internal_def for which single_imm_use holds and whose
                      :    definition is an assignment with a matching code.  START and
                      :    every stmt followed into are converted with as_a <gassign *> and
                      :    only their operands 1 and 2 are looked at.  */
    1806              : 
    1807              : static void
    1808      1680052 : vect_slp_linearize_chain (vec_info *vinfo,
    1809              :                           vec<std::pair<tree_code, gimple *> > &worklist,
    1810              :                           vec<chain_op_t> &chain,
    1811              :                           enum tree_code code, gimple *start,
    1812              :                           gimple *&code_stmt, gimple *&alt_code_stmt,
    1813              :                           vec<gimple *> *chain_stmts,
    1814              :                           bool allow_alt_code = true)
    1815              : {
    1816              :   /* For each lane linearize the addition/subtraction (or other
    1817              :      uniform associatable operation) expression tree.  */
    1818      1680052 :   worklist.safe_push (std::make_pair (code, start));
    1819      3708863 :   while (!worklist.is_empty ())
    1820              :     {
    1821      2028811 :       auto entry = worklist.pop ();
    1822      2028811 :       gassign *stmt = as_a <gassign *> (entry.second);
    1823      2028811 :       enum tree_code in_code = entry.first; /* Code with which STMT's result enters the chain.  */
    1824      4057622 :       enum tree_code this_code = gimple_assign_rhs_code (stmt);
    1825              :       /* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE.  */
    1826      2028811 :       if (!code_stmt
    1827      2028811 :           && gimple_assign_rhs_code (stmt) == code)
    1828      1427004 :         code_stmt = stmt;
    1829       601807 :       else if (!alt_code_stmt
    1830       601807 :                && gimple_assign_rhs_code (stmt) == MINUS_EXPR)
    1831       305611 :         alt_code_stmt = stmt;
    1832      2028811 :       if (chain_stmts)
    1833      1948091 :         chain_stmts->safe_push (stmt);
    1834      6086433 :       for (unsigned opnum = 1; opnum <= 2; ++opnum)
    1835              :         {
    1836      4057622 :           tree op = gimple_op (stmt, opnum);
    1837      4057622 :           vect_def_type dt;
    1838      4057622 :           stmt_vec_info def_stmt_info;
    1839      4057622 :           bool res = vect_is_simple_use (op, vinfo, &dt, &def_stmt_info);
    1840      4057622 :           gcc_assert (res);
    1841      4057622 :           if (dt == vect_internal_def
    1842      4057622 :               && is_pattern_stmt_p (def_stmt_info))
    1843         9112 :             op = gimple_get_lhs (def_stmt_info->stmt); /* Use the pattern stmt's result instead.  */
    1844      4057622 :           gimple *use_stmt;
    1845      4057622 :           use_operand_p use_p;
    1846      4057622 :           if (dt == vect_internal_def
    1847      3767721 :               && single_imm_use (op, &use_p, &use_stmt)
    1848      2338256 :               && is_gimple_assign (def_stmt_info->stmt)
    1849      6214990 :               && (gimple_assign_rhs_code (def_stmt_info->stmt) == code
    1850      1808924 :                   || (allow_alt_code
    1851        56905 :                       && code == PLUS_EXPR
    1852        36106 :                       && (gimple_assign_rhs_code (def_stmt_info->stmt)
    1853              :                           == MINUS_EXPR))))
    1854              :             {
    1855       348759 :               tree_code op_def_code = this_code;
    1856       348759 :               if (op_def_code == MINUS_EXPR && opnum == 1)
    1857        50974 :                 op_def_code = PLUS_EXPR; /* The first operand of a subtraction is added.  */
    1858       348759 :               if (in_code == MINUS_EXPR) /* STMT itself is subtracted, flip the sign.  */
    1859          135 :                 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
    1860       348759 :               worklist.safe_push (std::make_pair (op_def_code,
    1861       348759 :                                                   def_stmt_info->stmt));
    1862              :             }
    1863              :           else
    1864              :             {
    1865      3708863 :               tree_code op_def_code = this_code; /* Same sign computation as above, for a leaf.  */
    1866      3708863 :               if (op_def_code == MINUS_EXPR && opnum == 1)
    1867       254754 :                 op_def_code = PLUS_EXPR;
    1868      3708863 :               if (in_code == MINUS_EXPR)
    1869         3769 :                 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
    1870      3708863 :               chain.safe_push (chain_op_t (op_def_code, dt, op));
    1871              :             }
    1872              :         }
    1873              :     }
    1874      1680052 : }
    1875              : 
    1876              : /* Distance from the node currently being discovered to the closest upthread
    1877              :    commutative operation whose operand-zero discovery may still be fixed by
    1878              :    retrying with swapped operands, or -1U if there is none.  This is
                      :    file-scope state and it is initialized to -1U.  */
    1879              : 
    1880              : static unsigned least_upthread_swappable_op_distance = -1U;
    1881              : 
    1882              : static slp_tree
    1883              : vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
    1884              :                        vec<stmt_vec_info> stmts, unsigned int group_size,
    1885              :                        poly_uint64 *max_nunits,
    1886              :                        bool *matches, unsigned *limit, unsigned *tree_size,
    1887              :                        scalar_stmts_to_slp_tree_map_t *bst_map); /* Defined further below.  */
    1888              : /* Build the SLP node for the scalar stmts STMTS with GROUP_SIZE lanes,
                      :    re-using a node already recorded for the same stmts in BST_MAP.
                      :    If BST_MAP has a successfully built node for STMTS, add a reference
                      :    to it, merge its max_nunits into *MAX_NUNITS, release STMTS and
                      :    return it.  If the recorded node failed, copy its per-lane failure
                      :    information to MATCHES and return NULL.
                      :    Otherwise, unless STMTS has a single lane, consume one unit of
                      :    *LIMIT; when the limit is already zero clear MATCHES and return
                      :    NULL.  Then enter a stub node for STMTS into BST_MAP and let
                      :    vect_build_slp_tree_2 fill it.  On success the node is returned
                      :    with an extra reference held for BST_MAP and *MAX_NUNITS updated;
                      :    on failure the stub stays in BST_MAP marked as failed with a copy
                      :    of MATCHES, and NULL is returned.
                      :    STMTS is released here only on the successful re-use path; this
                      :    function does not release it when it returns NULL.  LIMIT and
                      :    TREE_SIZE are also passed on to vect_build_slp_tree_2.  */
    1889              : static slp_tree
    1890      6246869 : vect_build_slp_tree (vec_info *vinfo,
    1891              :                      vec<stmt_vec_info> stmts, unsigned int group_size,
    1892              :                      poly_uint64 *max_nunits,
    1893              :                      bool *matches, unsigned *limit, unsigned *tree_size,
    1894              :                      scalar_stmts_to_slp_tree_map_t *bst_map)
    1895              : {
    1896      6246869 :   if (slp_tree *leader = bst_map->get (stmts))
    1897              :     {
    1898       475333 :       if (dump_enabled_p ())
    1899        17184 :         dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
    1900        17184 :                          !(*leader)->failed ? "" : "failed ",
    1901              :                          (void *) *leader);
    1902       475333 :       if (!(*leader)->failed)
    1903              :         {
    1904       430275 :           SLP_TREE_REF_COUNT (*leader)++;
    1905       430275 :           vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
    1906       430275 :           stmts.release (); /* Not needed, the existing node is re-used.  */
    1907       430275 :           return *leader;
    1908              :         }
    1909        45058 :       memcpy (matches, (*leader)->failed, sizeof (bool) * group_size); /* Replay the recorded failure.  */
    1910        45058 :       return NULL;
    1911              :     }
    1912              : 
    1913              :   /* Single-lane SLP doesn't have the chance of run-away, do not account
    1914              :      it to the limit.  */
    1915      5771536 :   if (stmts.length () > 1)
    1916              :     {
    1917      3185259 :       if (*limit == 0)
    1918              :         {
    1919         1239 :           if (dump_enabled_p ())
    1920           15 :             dump_printf_loc (MSG_NOTE, vect_location,
    1921              :                              "SLP discovery limit exceeded\n");
    1922         1239 :           memset (matches, 0, sizeof (bool) * group_size); /* Report every lane as mismatching.  */
    1923         1239 :           return NULL;
    1924              :         }
    1925      3184020 :       --*limit;
    1926              :     }
    1927              : 
    1928              :   /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
    1929              :      so we can pick up backedge destinations during discovery.  */
    1930      5770297 :   slp_tree res = new _slp_tree;
    1931      5770297 :   SLP_TREE_DEF_TYPE (res) = vect_internal_def;
    1932      5770297 :   SLP_TREE_SCALAR_STMTS (res) = stmts;
    1933      5770297 :   bst_map->put (stmts.copy (), res); /* The map key is a separate copy of STMTS.  */
    1934              : 
    1935      5770297 :   if (dump_enabled_p ())
    1936       146543 :     dump_printf_loc (MSG_NOTE, vect_location,
    1937              :                      "starting SLP discovery for node %p\n", (void *) res);
    1938              : 
    1939      5770297 :   poly_uint64 this_max_nunits = 1;
    1940      5770297 :   slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
    1941              :                                         &this_max_nunits,
    1942              :                                         matches, limit, tree_size, bst_map);
    1943      5770297 :   if (!res_)
    1944              :     {
    1945      2037326 :       if (dump_enabled_p ())
    1946         8484 :         dump_printf_loc (MSG_NOTE, vect_location,
    1947              :                          "SLP discovery for node %p failed\n", (void *) res);
    1948              :       /* Mark the node invalid so we can detect those when still in use
    1949              :          as backedge destinations.  */
    1950      2037326 :       SLP_TREE_SCALAR_STMTS (res) = vNULL;
    1951      2037326 :       SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
    1952      2037326 :       res->failed = XNEWVEC (bool, group_size);
    1953      2037326 :       if (flag_checking)
    1954              :         {
    1955              :           unsigned i; /* A failed build has to report at least one mismatching lane.  */
    1956      3621687 :           for (i = 0; i < group_size; ++i)
    1957      3621687 :             if (!matches[i])
    1958              :               break;
    1959      2037326 :           gcc_assert (i < group_size);
    1960              :         }
    1961      2037326 :       memcpy (res->failed, matches, sizeof (bool) * group_size);
    1962              :     }
    1963              :   else
    1964              :     {
    1965      3732971 :       if (dump_enabled_p ())
    1966       138059 :         dump_printf_loc (MSG_NOTE, vect_location,
    1967              :                          "SLP discovery for node %p succeeded\n",
    1968              :                          (void *) res);
    1969      3732971 :       gcc_assert (res_ == res);
    1970      3732971 :       res->max_nunits = this_max_nunits;
    1971      3732971 :       vect_update_max_nunits (max_nunits, this_max_nunits);
    1972              :       /* Keep a reference for the bst_map use.  */
    1973      3732971 :       SLP_TREE_REF_COUNT (res)++;
    1974              :     }
    1975              :   return res_;
    1976              : }
    1977              : 
    1978              : /* Helper for building an associated SLP node chain.
                      :    Turn PERM into a VEC_PERM_EXPR node with lane permutation LPERM
                      :    whose two children are newly allocated internal-def nodes that
                      :    both have OP0 and OP1 as children and use OPER1 respectively OPER2
                      :    as representative.  All three nodes get VECTYPE and the lane count
                      :    of OP1; PERM also gets OPER1 as representative.
                      :    The first child uses OP0 and OP1 without touching their reference
                      :    counts while the second child increments both.  PERM's children
                      :    are added with quick_push, so PERM presumably already has room
                      :    for two children -- verify against callers.  */
    1979              : 
    1980              : static void
    1981          158 : vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype,
    1982              :                                    slp_tree op0, slp_tree op1,
    1983              :                                    stmt_vec_info oper1, stmt_vec_info oper2,
    1984              :                                    vec<std::pair<unsigned, unsigned> > lperm)
    1985              : {
    1986          158 :   unsigned group_size = SLP_TREE_LANES (op1);
    1987              : 
    1988          158 :   slp_tree child1 = new _slp_tree;
    1989          158 :   SLP_TREE_DEF_TYPE (child1) = vect_internal_def;
    1990          158 :   SLP_TREE_VECTYPE (child1) = vectype;
    1991          158 :   SLP_TREE_LANES (child1) = group_size;
    1992          158 :   SLP_TREE_CHILDREN (child1).create (2);
    1993          158 :   SLP_TREE_CHILDREN (child1).quick_push (op0);
    1994          158 :   SLP_TREE_CHILDREN (child1).quick_push (op1);
    1995          158 :   SLP_TREE_REPRESENTATIVE (child1) = oper1;
    1996              : 
    1997          158 :   slp_tree child2 = new _slp_tree;
    1998          158 :   SLP_TREE_DEF_TYPE (child2) = vect_internal_def;
    1999          158 :   SLP_TREE_VECTYPE (child2) = vectype;
    2000          158 :   SLP_TREE_LANES (child2) = group_size;
    2001          158 :   SLP_TREE_CHILDREN (child2).create (2);
    2002          158 :   SLP_TREE_CHILDREN (child2).quick_push (op0);
    2003          158 :   SLP_TREE_REF_COUNT (op0)++; /* OP0 and OP1 are now shared by both children.  */
    2004          158 :   SLP_TREE_CHILDREN (child2).quick_push (op1);
    2005          158 :   SLP_TREE_REF_COUNT (op1)++;
    2006          158 :   SLP_TREE_REPRESENTATIVE (child2) = oper2;
    2007              : 
    2008          158 :   SLP_TREE_DEF_TYPE (perm) = vect_internal_def;
    2009          158 :   SLP_TREE_CODE (perm) = VEC_PERM_EXPR;
    2010          158 :   SLP_TREE_VECTYPE (perm) = vectype;
    2011          158 :   SLP_TREE_LANES (perm) = group_size;
    2012              :   /* ???  We should set this NULL but that's not expected.  */
    2013          158 :   SLP_TREE_REPRESENTATIVE (perm) = oper1;
    2014          158 :   SLP_TREE_LANE_PERMUTATION (perm) = lperm;
    2015          158 :   SLP_TREE_CHILDREN (perm).quick_push (child1);
    2016          158 :   SLP_TREE_CHILDREN (perm).quick_push (child2);
    2017          158 : }
    2018              : 
    2019              : /* Recursively build an SLP tree starting from NODE.
    2020              :    Fail (and return NULL) if def-stmts are not isomorphic, require
    2021              :    data permutation or are of unsupported types of operation.
    2022              :    Otherwise, return the filled-in NODE.
    2023              :    On failure MATCHES records for each lane whether it matched, with
    2024              :    at least one entry being false.  */
    2025              : 
    2026              : static slp_tree
    2027      5770297 : vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
    2028              :                        vec<stmt_vec_info> stmts, unsigned int group_size,
    2029              :                        poly_uint64 *max_nunits,
    2030              :                        bool *matches, unsigned *limit, unsigned *tree_size,
    2031              :                        scalar_stmts_to_slp_tree_map_t *bst_map)
    2032              : {
    2033      5770297 :   unsigned nops, i, this_tree_size = 0;
    2034      5770297 :   poly_uint64 this_max_nunits = *max_nunits;
    2035              : 
    2036      5770297 :   matches[0] = false;
    2037              : 
    2038      5770297 :   stmt_vec_info stmt_info = stmts[0];
    2039      5770297 :   if (!is_a<gcall *> (stmt_info->stmt)
    2040              :       && !is_a<gassign *> (stmt_info->stmt)
    2041              :       && !is_a<gphi *> (stmt_info->stmt))
    2042              :     return NULL;
    2043              : 
    2044      5770226 :   nops = gimple_num_args (stmt_info->stmt);
    2045      5770226 :   if (const int *map = vect_get_operand_map (stmt_info))
    2046        35130 :     nops = map[0];
    2047              : 
    2048              :   /* If the SLP node is a PHI (induction or reduction), terminate
    2049              :      the recursion.  */
    2050      5770226 :   bool *skip_args = XALLOCAVEC (bool, nops);
    2051      5770226 :   memset (skip_args, 0, sizeof (bool) * nops);
    2052      5770226 :   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
    2053      2782864 :     if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
    2054              :       {
    2055       299972 :         tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
    2056       299972 :         tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
    2057              :                                                     group_size);
    2058       299972 :         if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
    2059              :                                      max_nunits))
    2060              :           return NULL;
    2061              : 
    2062       296088 :         vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
    2063       296088 :         if (def_type == vect_induction_def)
    2064              :           {
    2065              :             /* Induction PHIs are not cycles but walk the initial
    2066              :                value.  Only for inner loops through, for outer loops
    2067              :                we need to pick up the value from the actual PHIs
    2068              :                to more easily support peeling and epilogue vectorization.  */
    2069       189878 :             class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
    2070       189878 :             if (!nested_in_vect_loop_p (loop, stmt_info))
    2071       189054 :               skip_args[loop_preheader_edge (loop)->dest_idx] = true;
    2072              :             else
    2073              :               loop = loop->inner;
    2074       189878 :             skip_args[loop_latch_edge (loop)->dest_idx] = true;
    2075              :           }
    2076       106210 :         else if (def_type == vect_reduction_def
    2077              :                  || def_type == vect_double_reduction_def
    2078              :                  || def_type == vect_nested_cycle
    2079       106210 :                  || def_type == vect_first_order_recurrence)
    2080              :           {
    2081              :             /* Else def types have to match.  */
    2082              :             stmt_vec_info other_info;
    2083              :             bool all_same = true;
    2084       240628 :             FOR_EACH_VEC_ELT (stmts, i, other_info)
    2085              :               {
    2086       135732 :                 if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
    2087      1782918 :                   return NULL;
    2088       135726 :                 if (other_info != stmt_info)
    2089        26231 :                   all_same = false;
    2090              :               }
    2091       104896 :             class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
    2092              :             /* Reduction initial values are not explicitly represented.  */
    2093       104896 :             if (def_type != vect_first_order_recurrence
    2094       104896 :                 && gimple_bb (stmt_info->stmt) == loop->header)
    2095       101731 :               skip_args[loop_preheader_edge (loop)->dest_idx] = true;
    2096              :             /* Reduction chain backedge defs are filled manually.
    2097              :                ???  Need a better way to identify a SLP reduction chain PHI.
    2098              :                Or a better overall way to SLP match those.  */
    2099       104896 :             if (stmts.length () > 1
    2100       104896 :                 && all_same && def_type == vect_reduction_def)
    2101         2354 :               skip_args[loop_latch_edge (loop)->dest_idx] = true;
    2102              :           }
    2103         1308 :         else if (def_type != vect_internal_def)
    2104              :           return NULL;
    2105              :       }
    2106              : 
    2107              : 
    2108      5766336 :   bool two_operators = false;
    2109      5766336 :   unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
    2110      5766336 :   tree vectype = NULL_TREE;
    2111      5766336 :   if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
    2112              :                               &this_max_nunits, matches, &two_operators,
    2113              :                               &vectype))
    2114              :     return NULL;
    2115              : 
    2116              :   /* If the SLP node is a load, terminate the recursion unless masked.  */
    2117      4200182 :   if (STMT_VINFO_DATA_REF (stmt_info)
    2118      2034754 :       && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
    2119              :     {
    2120       900598 :       if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
    2121              :         gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
    2122              :       else
    2123              :         {
    2124       882030 :           *max_nunits = this_max_nunits;
    2125       882030 :           (*tree_size)++;
    2126       882030 :           node = vect_create_new_slp_node (node, stmts, 0);
    2127       882030 :           SLP_TREE_VECTYPE (node) = vectype;
    2128              :           /* And compute the load permutation.  Whether it is actually
    2129              :              a permutation depends on the unrolling factor which is
    2130              :              decided later.  */
    2131       882030 :           vec<unsigned> load_permutation;
    2132       882030 :           int j;
    2133       882030 :           stmt_vec_info load_info;
    2134       882030 :           load_permutation.create (group_size);
    2135       882030 :           stmt_vec_info first_stmt_info
    2136       882030 :             = STMT_VINFO_GROUPED_ACCESS (stmt_info)
    2137       882030 :               ? DR_GROUP_FIRST_ELEMENT (stmt_info) : stmt_info;
    2138       882030 :           bool any_permute = false;
    2139      2125659 :           FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
    2140              :             {
    2141      1243629 :               int load_place;
    2142      1243629 :               if (! load_info)
    2143              :                 {
    2144        40227 :                   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    2145              :                     load_place = j;
    2146              :                   else
    2147              :                     load_place = 0;
    2148              :                 }
    2149      1203402 :               else if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    2150       701430 :                 load_place = vect_get_place_in_interleaving_chain
    2151       701430 :                     (load_info, first_stmt_info);
    2152              :               else
    2153              :                 /* Recognize the splat case as { 0, 0, ... } but make
    2154              :                    sure to use the appropriate refs for collections
    2155              :                    of invariant refs.  */
    2156       501972 :                 load_place = (load_info == stmt_info) ? 0 : j;
    2157       741898 :               gcc_assert (load_place != -1);
    2158      1243629 :               any_permute |= load_place != j;
    2159      1243629 :               load_permutation.quick_push (load_place);
    2160              :             }
    2161              : 
    2162       882030 :           if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    2163              :             {
    2164         3416 :               gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD));
    2165         3416 :               bool has_gaps = false;
    2166         3416 :               if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    2167          219 :                 for (stmt_vec_info si = DR_GROUP_NEXT_ELEMENT (first_stmt_info);
    2168         1596 :                      si; si = DR_GROUP_NEXT_ELEMENT (si))
    2169         1377 :                   if (DR_GROUP_GAP (si) != 1)
    2170          200 :                     has_gaps = true;
    2171              :               /* We cannot handle permuted masked loads directly, see
    2172              :                  PR114375.  We cannot handle strided masked loads or masked
    2173              :                  loads with gaps unless the mask is uniform.  */
    2174         3416 :               if ((STMT_VINFO_GROUPED_ACCESS (stmt_info)
    2175          219 :                    && (DR_GROUP_GAP (first_stmt_info) != 0
    2176          159 :                        || (has_gaps
    2177           65 :                            && STMT_VINFO_SLP_VECT_ONLY (first_stmt_info))))
    2178         6737 :                   || STMT_VINFO_STRIDED_P (stmt_info))
    2179              :                 {
    2180          108 :                   load_permutation.release ();
    2181          108 :                   matches[0] = false;
    2182       878776 :                   return NULL;
    2183              :                 }
    2184              : 
    2185              :               /* For permuted masked loads do an unpermuted masked load of
    2186              :                  the whole group followed by a SLP permute node.  */
    2187         3308 :               if (any_permute
    2188         3308 :                   || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
    2189           86 :                       && DR_GROUP_SIZE (first_stmt_info) != group_size))
    2190              :                 {
    2191              :                   /* Discover the whole unpermuted load.  */
    2192           54 :                   vec<stmt_vec_info> stmts2;
    2193           54 :                   unsigned dr_group_size = STMT_VINFO_GROUPED_ACCESS (stmt_info)
    2194           98 :                       ? DR_GROUP_SIZE (first_stmt_info) : 1;
    2195           54 :                   stmts2.create (dr_group_size);
    2196           54 :                   stmts2.quick_grow_cleared (dr_group_size);
    2197           54 :                   unsigned i = 0;
    2198           54 :                   for (stmt_vec_info si = first_stmt_info;
    2199          854 :                        si; si = DR_GROUP_NEXT_ELEMENT (si))
    2200              :                     {
    2201          800 :                       if (si != first_stmt_info)
    2202         3146 :                         for (unsigned k = 1; k < DR_GROUP_GAP (si); ++k)
    2203         2400 :                           stmts2[i++] = NULL;
    2204          800 :                       stmts2[i++] = si;
    2205              :                     }
    2206           54 :                   bool *matches2 = XALLOCAVEC (bool, dr_group_size);
    2207           54 :                   slp_tree unperm_load
    2208           54 :                     = vect_build_slp_tree (vinfo, stmts2, dr_group_size,
    2209              :                                            &this_max_nunits, matches2, limit,
    2210           54 :                                            &this_tree_size, bst_map);
    2211              :                   /* When we are able to do the full masked load emit that
    2212              :                      followed by 'node' being the desired final permutation.  */
    2213           54 :                   if (unperm_load)
    2214              :                     {
    2215           16 :                       gcc_assert
    2216              :                         (!SLP_TREE_LOAD_PERMUTATION (unperm_load).exists ());
    2217           16 :                       lane_permutation_t lperm;
    2218           16 :                       lperm.create (group_size);
    2219           56 :                       for (unsigned j = 0; j < load_permutation.length (); ++j)
    2220           40 :                         lperm.quick_push
    2221           40 :                           (std::make_pair (0, load_permutation[j]));
    2222           16 :                       SLP_TREE_CODE (node) = VEC_PERM_EXPR;
    2223           16 :                       SLP_TREE_CHILDREN (node).safe_push (unperm_load);
    2224           16 :                       SLP_TREE_LANE_PERMUTATION (node) = lperm;
    2225           16 :                       load_permutation.release ();
    2226           16 :                       return node;
    2227              :                     }
    2228           38 :                   stmts2.release ();
    2229           38 :                   load_permutation.release ();
    2230           38 :                   matches[0] = false;
    2231           38 :                   return NULL;
    2232              :                 }
    2233         3254 :               load_permutation.release ();
    2234              :             }
    2235              :           else
    2236              :             {
    2237       878614 :               if (!any_permute
    2238       766274 :                   && STMT_VINFO_GROUPED_ACCESS (stmt_info)
    2239      1168840 :                   && group_size == DR_GROUP_SIZE (first_stmt_info))
    2240       126494 :                 load_permutation.release ();
    2241       878614 :               SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
    2242       878614 :               return node;
    2243              :             }
    2244              :         }
    2245              :     }
    2246      3299584 :   else if (gimple_assign_single_p (stmt_info->stmt)
    2247      2272402 :            && !gimple_vuse (stmt_info->stmt)
    2248      3307205 :            && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
    2249              :     {
    2250              :       /* vect_build_slp_tree_2 determined all BIT_FIELD_REFs reference
    2251              :          the same SSA name vector of a compatible type to vectype.  */
    2252         2210 :       vec<std::pair<unsigned, unsigned> > lperm = vNULL;
    2253         2210 :       tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
    2254         2210 :       stmt_vec_info estmt_info;
    2255         6972 :       FOR_EACH_VEC_ELT (stmts, i, estmt_info)
    2256              :         {
    2257         4909 :           gassign *estmt = as_a <gassign *> (estmt_info->stmt);
    2258         4909 :           tree bfref = gimple_assign_rhs1 (estmt);
    2259         4909 :           HOST_WIDE_INT lane;
    2260         4909 :           if (!known_eq (bit_field_size (bfref),
    2261              :                          tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
    2262         9671 :               || !constant_multiple_p (bit_field_offset (bfref),
    2263         4762 :                                        bit_field_size (bfref), &lane))
    2264              :             {
    2265          147 :               lperm.release ();
    2266          147 :               matches[0] = false;
    2267          147 :               return NULL;
    2268              :             }
    2269         4762 :           lperm.safe_push (std::make_pair (0, (unsigned)lane));
    2270              :         }
    2271         2063 :       slp_tree vnode = vect_create_new_slp_node (vNULL);
    2272         2063 :       if (operand_equal_p (TYPE_SIZE (vectype), TYPE_SIZE (TREE_TYPE (vec))))
    2273              :         /* ???  We record vectype here but we hide eventually necessary
    2274              :            punning and instead rely on code generation to materialize
    2275              :            VIEW_CONVERT_EXPRs as necessary.  We instead should make
    2276              :            this explicit somehow.  */
    2277          625 :         SLP_TREE_VECTYPE (vnode) = vectype;
    2278              :       else
    2279              :         {
    2280              :           /* For different size but compatible elements we can still
    2281              :              use VEC_PERM_EXPR without punning.  */
    2282         1438 :           gcc_assert (VECTOR_TYPE_P (TREE_TYPE (vec))
    2283              :                       && types_compatible_p (TREE_TYPE (vectype),
    2284              :                                              TREE_TYPE (TREE_TYPE (vec))));
    2285         1438 :           SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
    2286              :         }
    2287         2063 :       auto nunits = TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (vnode));
    2288         2063 :       unsigned HOST_WIDE_INT const_nunits;
    2289         2063 :       if (nunits.is_constant (&const_nunits))
    2290         2063 :         SLP_TREE_LANES (vnode) = const_nunits;
    2291         2063 :       SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
    2292              :       /* We are always building a permutation node even if it is an identity
    2293              :          permute to shield the rest of the vectorizer from the odd node
    2294              :          representing an actual vector without any scalar ops.
    2295              :          ???  We could hide it completely with making the permute node
    2296              :          external?  */
    2297         2063 :       node = vect_create_new_slp_node (node, stmts, 1);
    2298         2063 :       SLP_TREE_CODE (node) = VEC_PERM_EXPR;
    2299         2063 :       SLP_TREE_LANE_PERMUTATION (node) = lperm;
    2300         2063 :       SLP_TREE_VECTYPE (node) = vectype;
    2301         2063 :       SLP_TREE_CHILDREN (node).quick_push (vnode);
    2302         2063 :       return node;
    2303              :     }
    2304              :   /* When discovery reaches an associatable operation see whether we can
    2305              :      improve that to match up lanes in a way superior to the operand
    2306              :      swapping code which at most looks at two defs.
    2307              :      ???  For BB vectorization we cannot do the brute-force search
    2308              :      for matching as we can succeed by means of builds from scalars
    2309              :      and have no good way to "cost" one build against another.  */
    2310      3297374 :   else if (is_a <loop_vec_info> (vinfo)
    2311              :            /* Do not bother for single-lane SLP.  */
    2312      1965858 :            && group_size > 1
    2313              :            /* ???  We don't handle !vect_internal_def defs below.  */
    2314       111996 :            && STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
    2315              :            /* ???  Do not associate a reduction, this will wreck REDUC_IDX
    2316              :               mapping as long as that exists on the stmt_info level.  */
    2317        86482 :            && STMT_VINFO_REDUC_IDX (stmt_info) == -1
    2318        77886 :            && is_gimple_assign (stmt_info->stmt)
    2319        77570 :            && (associative_tree_code (gimple_assign_rhs_code (stmt_info->stmt))
    2320        50947 :                || gimple_assign_rhs_code (stmt_info->stmt) == MINUS_EXPR)
    2321      3325881 :            && ((FLOAT_TYPE_P (vectype) && flag_associative_math)
    2322        16252 :                || (INTEGRAL_TYPE_P (TREE_TYPE (vectype))
    2323        13721 :                    && TYPE_OVERFLOW_WRAPS (TREE_TYPE (vectype)))))
    2324              :     {
    2325              :       /* See if we have a chain of (mixed) adds or subtracts or other
    2326              :          associatable ops.  */
    2327        21541 :       enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
    2328        21541 :       if (code == MINUS_EXPR)
    2329          799 :         code = PLUS_EXPR;
    2330        21541 :       stmt_vec_info other_op_stmt_info = NULL;
    2331        21541 :       stmt_vec_info op_stmt_info = NULL;
    2332        21541 :       unsigned chain_len = 0;
    2333        21541 :       auto_vec<chain_op_t> chain;
    2334        21541 :       auto_vec<std::pair<tree_code, gimple *> > worklist;
    2335        21541 :       auto_vec<vec<chain_op_t> > chains (group_size);
    2336        21541 :       auto_vec<slp_tree, 4> children;
    2337        21541 :       bool hard_fail = true;
    2338        22568 :       for (unsigned lane = 0; lane < group_size; ++lane)
    2339              :         {
    2340        22232 :           if (!stmts[lane])
    2341              :             {
    2342              :               /* ???  Below we require lane zero is present.  */
    2343            0 :               if (lane == 0)
    2344              :                 {
    2345              :                   hard_fail = false;
    2346        21205 :                   break;
    2347              :                 }
    2348            0 :               chains.quick_push (vNULL);
    2349            0 :               continue;
    2350              :             }
    2351              :           /* For each lane linearize the addition/subtraction (or other
    2352              :              uniform associatable operation) expression tree.  */
    2353        22232 :           gimple *op_stmt = NULL, *other_op_stmt = NULL;
    2354        22232 :           vect_slp_linearize_chain (vinfo, worklist, chain, code,
    2355        22232 :                                     stmts[lane]->stmt, op_stmt, other_op_stmt,
    2356              :                                     NULL);
    2357        22232 :           if (!op_stmt_info && op_stmt)
    2358        20939 :             op_stmt_info = vinfo->lookup_stmt (op_stmt);
    2359        22232 :           if (!other_op_stmt_info && other_op_stmt)
    2360          835 :             other_op_stmt_info = vinfo->lookup_stmt (other_op_stmt);
    2361        22232 :           if (chain.length () == 2)
    2362              :             {
    2363              :               /* In a chain of just two elements resort to the regular
    2364              :                  operand swapping scheme.  Likewise, if we run into a
    2365              :                  length mismatch, process regularly as well, as we did not
    2366              :                  process the other lanes we cannot report a good hint what
    2367              :                  lanes to try swapping in the parent.  */
    2368              :               hard_fail = false;
    2369              :               break;
    2370              :             }
    2371         1030 :           else if (chain_len == 0)
    2372          376 :             chain_len = chain.length ();
    2373         1308 :           else if (chain.length () != chain_len)
    2374              :             {
    2375              :               /* ???  Here we could slip in magic to compensate with
    2376              :                  neutral operands.  */
    2377            3 :               matches[lane] = false;
    2378            3 :               if (lane != group_size - 1)
    2379            3 :                 matches[0] = false;
    2380              :               break;
    2381              :             }
    2382         1027 :           chains.quick_push (chain.copy ());
    2383         1027 :           chain.truncate (0);
    2384              :         }
    2385        43082 :       if (chains.length () == group_size)
    2386              :         {
    2387              :           /* We cannot yet use SLP_TREE_CODE to communicate the operation.  */
    2388          336 :           if (!op_stmt_info)
    2389              :             {
    2390            3 :               hard_fail = false;
    2391            3 :               goto out;
    2392              :             }
    2393              :           /* Now we have a set of chains with the same length.  */
    2394              :           /* 1. pre-sort according to def_type and operation.  */
    2395         1248 :           for (unsigned lane = 0; lane < group_size; ++lane)
    2396         1830 :             chains[lane].stablesort (dt_sort_cmp, vinfo);
    2397          333 :           if (dump_enabled_p ())
    2398              :             {
    2399          157 :               dump_printf_loc (MSG_NOTE, vect_location,
    2400              :                                "pre-sorted chains of %s\n",
    2401              :                                get_tree_code_name (code));
    2402          685 :               for (unsigned lane = 0; lane < group_size; ++lane)
    2403              :                 {
    2404          528 :                   if (!stmts[lane])
    2405            0 :                     dump_printf (MSG_NOTE, "--");
    2406              :                   else
    2407         2422 :                     for (unsigned opnum = 0; opnum < chain_len; ++opnum)
    2408         3788 :                       dump_printf (MSG_NOTE, "%s %T ",
    2409         1894 :                                    get_tree_code_name (chains[lane][opnum].code),
    2410         1894 :                                    chains[lane][opnum].op);
    2411          528 :                   dump_printf (MSG_NOTE, "\n");
    2412              :                 }
    2413              :             }
    2414              :           /* 2. try to build children nodes, associating as necessary.  */
    2415              :           /* 2a. prepare and perform early checks to avoid eating into
    2416              :              discovery limit unnecessarily.  */
    2417          333 :           vect_def_type *dts = XALLOCAVEC (vect_def_type, chain_len);
    2418         1407 :           for (unsigned n = 0; n < chain_len; ++n)
    2419              :             {
    2420         1074 :               vect_def_type dt = chains[0][n].dt;
    2421         1074 :               unsigned lane;
    2422         4177 :               for (lane = 0; lane < group_size; ++lane)
    2423         6206 :                 if (stmts[lane] && chains[lane][n].dt != dt)
    2424              :                   {
    2425            0 :                     if (dt == vect_constant_def
    2426            0 :                         && chains[lane][n].dt == vect_external_def)
    2427              :                       dt = vect_external_def;
    2428            0 :                     else if (dt == vect_external_def
    2429            0 :                              && chains[lane][n].dt == vect_constant_def)
    2430              :                       ;
    2431              :                     else
    2432              :                       break;
    2433              :                   }
    2434         1074 :               if (lane != group_size)
    2435              :                 {
    2436            0 :                   if (dump_enabled_p ())
    2437            0 :                     dump_printf_loc (MSG_NOTE, vect_location,
    2438              :                                      "giving up on chain due to mismatched "
    2439              :                                      "def types\n");
    2440            0 :                   matches[lane] = false;
    2441            0 :                   if (lane != group_size - 1)
    2442            0 :                     matches[0] = false;
    2443            0 :                   goto out;
    2444              :                 }
    2445         1074 :               dts[n] = dt;
    2446         1074 :               if (dt == vect_constant_def
    2447         1074 :                   || dt == vect_external_def)
    2448              :                 {
    2449              :                   /* Check whether we can build the invariant.  If we can't
    2450              :                      we never will be able to.  */
    2451           93 :                   tree type = TREE_TYPE (chains[0][n].op);
    2452         1074 :                   if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
    2453              :                       && (TREE_CODE (type) == BOOLEAN_TYPE
    2454              :                           || !can_duplicate_and_interleave_p (vinfo, group_size,
    2455              :                                                               type)))
    2456              :                     {
    2457              :                       matches[0] = false;
    2458              :                       goto out;
    2459              :                     }
    2460              :                 }
    2461          981 :               else if (dt != vect_internal_def)
    2462              :                 {
    2463              :                   /* Not sure, we might need something special.
    2464              :                      gcc.dg/vect/pr96854.c,
    2465              :                      gfortran.dg/vect/fast-math-pr37021.f90
    2466              :                      and gfortran.dg/vect/pr61171.f trigger.  */
    2467              :                   /* Soft-fail for now.  */
    2468            0 :                   hard_fail = false;
    2469            0 :                   goto out;
    2470              :                 }
    2471              :             }
    2472              :           /* 2b. do the actual build.  */
    2473         1349 :           for (unsigned n = 0; n < chain_len; ++n)
    2474              :             {
    2475         1036 :               vect_def_type dt = dts[n];
    2476         1036 :               unsigned lane;
    2477         1036 :               if (dt == vect_constant_def
    2478         1036 :                   || dt == vect_external_def)
    2479              :                 {
    2480           93 :                   vec<tree> ops;
    2481           93 :                   ops.create (group_size);
    2482          461 :                   for (lane = 0; lane < group_size; ++lane)
    2483          275 :                     if (stmts[lane])
    2484          275 :                       ops.quick_push (chains[lane][n].op);
    2485              :                     else
    2486            0 :                       ops.quick_push (NULL_TREE);
    2487           93 :                   slp_tree child = vect_create_new_slp_node (ops);
    2488           93 :                   SLP_TREE_DEF_TYPE (child) = dt;
    2489           93 :                   children.safe_push (child);
    2490              :                 }
    2491              :               else
    2492              :                 {
    2493          943 :                   vec<stmt_vec_info> op_stmts;
    2494          943 :                   op_stmts.create (group_size);
    2495          943 :                   slp_tree child = NULL;
    2496              :                   /* Brute-force our way.  We have to consider a lane
    2497              :                      failing after fixing an earlier fail up in the
    2498              :                      SLP discovery recursion.  So track the current
    2499              :                      permute per lane.  */
    2500          943 :                   unsigned *perms = XALLOCAVEC (unsigned, group_size);
    2501          943 :                   memset (perms, 0, sizeof (unsigned) * group_size);
    2502         1037 :                   do
    2503              :                     {
    2504         1037 :                       op_stmts.truncate (0);
    2505         5080 :                       for (lane = 0; lane < group_size; ++lane)
    2506         3006 :                         if (stmts[lane])
    2507         3006 :                           op_stmts.quick_push
    2508         3006 :                             (vinfo->lookup_def (chains[lane][n].op));
    2509              :                         else
    2510            0 :                           op_stmts.quick_push (NULL);
    2511         1037 :                       child = vect_build_slp_tree (vinfo, op_stmts,
    2512              :                                                    group_size, &this_max_nunits,
    2513              :                                                    matches, limit,
    2514              :                                                    &this_tree_size, bst_map);
    2515              :                       /* ???  We're likely getting too many fatal mismatches
    2516              :                          here so maybe we want to ignore them (but then we
    2517              :                          have no idea which lanes fatally mismatched).  */
    2518         1037 :                       if (child || !matches[0])
    2519              :                         break;
    2520              :                       /* Swap another lane we have not yet matched up into
    2521              :                          lanes that did not match.  If we run out of
    2522              :                          permute possibilities for a lane terminate the
    2523              :                          search.  */
    2524          287 :                       bool term = false;
    2525          287 :                       for (lane = 1; lane < group_size; ++lane)
    2526          193 :                         if (!matches[lane])
    2527              :                           {
    2528          165 :                             if (n + perms[lane] + 1 == chain_len)
    2529              :                               {
    2530              :                                 term = true;
    2531              :                                 break;
    2532              :                               }
    2533          146 :                             if (dump_enabled_p ())
    2534          113 :                               dump_printf_loc (MSG_NOTE, vect_location,
    2535              :                                                "swapping operand %d and %d "
    2536              :                                                "of lane %d\n",
    2537              :                                                n, n + perms[lane] + 1, lane);
    2538          292 :                             std::swap (chains[lane][n],
    2539          146 :                                        chains[lane][n + perms[lane] + 1]);
    2540          146 :                             perms[lane]++;
    2541              :                           }
    2542          113 :                       if (term)
    2543              :                         break;
    2544              :                     }
    2545              :                   while (1);
    2546          943 :                   if (!child)
    2547              :                     {
    2548           20 :                       if (dump_enabled_p ())
    2549           18 :                         dump_printf_loc (MSG_NOTE, vect_location,
    2550              :                                          "failed to match up op %d\n", n);
    2551           20 :                       op_stmts.release ();
    2552           20 :                       if (lane != group_size - 1)
    2553           10 :                         matches[0] = false;
    2554              :                       else
    2555           10 :                         matches[lane] = false;
    2556           20 :                       goto out;
    2557              :                     }
    2558          923 :                   if (dump_enabled_p ())
    2559              :                     {
    2560          421 :                       dump_printf_loc (MSG_NOTE, vect_location,
    2561              :                                        "matched up op %d to\n", n);
    2562          421 :                       vect_print_slp_tree (MSG_NOTE, vect_location, child);
    2563              :                     }
    2564          923 :                   children.safe_push (child);
    2565              :                 }
    2566              :             }
    2567              :           /* 3. build SLP nodes to combine the chain.  */
    2568         1153 :           for (unsigned lane = 0; lane < group_size; ++lane)
    2569         1692 :             if (stmts[lane] && chains[lane][0].code != code)
    2570              :               {
    2571              :                 /* See if there's any alternate all-PLUS entry.  */
    2572              :                 unsigned n;
    2573            6 :                 for (n = 1; n < chain_len; ++n)
    2574              :                   {
    2575           30 :                     for (lane = 0; lane < group_size; ++lane)
    2576           48 :                       if (stmts[lane] && chains[lane][n].code != code)
    2577              :                         break;
    2578            6 :                     if (lane == group_size)
    2579              :                       break;
    2580              :                   }
    2581            6 :                 if (n != chain_len)
    2582              :                   {
    2583              :                     /* Swap that in at first position.  */
    2584            6 :                     std::swap (children[0], children[n]);
    2585           30 :                     for (lane = 0; lane < group_size; ++lane)
    2586           24 :                       if (stmts[lane])
    2587           24 :                         std::swap (chains[lane][0], chains[lane][n]);
    2588              :                   }
    2589              :                 else
    2590              :                   {
    2591              :                     /* ???  When this triggers and we end up with two
    2592              :                        vect_constant/external_def up-front things break (ICE)
    2593              :                        spectacularly finding an insertion place for the
    2594              :                        all-constant op.  We should have a fully
    2595              :                        vect_internal_def operand though(?) so we can swap
    2596              :                        that into first place and then prepend the all-zero
    2597              :                        constant.  */
    2598            0 :                     if (dump_enabled_p ())
    2599            0 :                       dump_printf_loc (MSG_NOTE, vect_location,
    2600              :                                        "inserting constant zero to compensate "
    2601              :                                        "for (partially) negated first "
    2602              :                                        "operand\n");
    2603            0 :                     chain_len++;
    2604            0 :                     for (lane = 0; lane < group_size; ++lane)
    2605            0 :                       if (stmts[lane])
    2606            0 :                         chains[lane].safe_insert
    2607            0 :                           (0, chain_op_t (code, vect_constant_def, NULL_TREE));
    2608            0 :                     vec<tree> zero_ops;
    2609            0 :                     zero_ops.create (group_size);
    2610            0 :                     zero_ops.quick_push (build_zero_cst (TREE_TYPE (vectype)));
    2611            0 :                     for (lane = 1; lane < group_size; ++lane)
    2612            0 :                       if (stmts[lane])
    2613            0 :                         zero_ops.quick_push (zero_ops[0]);
    2614              :                       else
    2615            0 :                         zero_ops.quick_push (NULL_TREE);
    2616            0 :                     slp_tree zero = vect_create_new_slp_node (zero_ops);
    2617            0 :                     SLP_TREE_DEF_TYPE (zero) = vect_constant_def;
    2618            0 :                     children.safe_insert (0, zero);
    2619              :                   }
    2620              :                 break;
    2621              :               }
    2622         1011 :           for (unsigned i = 1; i < children.length (); ++i)
    2623              :             {
    2624          698 :               slp_tree op0 = children[i - 1];
    2625          698 :               slp_tree op1 = children[i];
    2626          698 :               bool this_two_op = false;
    2627         2560 :               for (unsigned lane = 0; lane < group_size; ++lane)
    2628         4040 :                 if (stmts[lane] && chains[lane][i].code != chains[0][i].code)
    2629              :                   {
    2630              :                     this_two_op = true;
    2631              :                     break;
    2632              :                   }
    2633          698 :               slp_tree child;
    2634          698 :               if (i == children.length () - 1)
    2635          313 :                 child = vect_create_new_slp_node (node, stmts, 2);
    2636              :               else
    2637          385 :                 child = vect_create_new_slp_node (2, ERROR_MARK);
    2638          698 :               if (this_two_op)
    2639              :                 {
    2640          158 :                   vec<std::pair<unsigned, unsigned> > lperm;
    2641          158 :                   lperm.create (group_size);
    2642          570 :                   for (unsigned lane = 0; lane < group_size; ++lane)
    2643          824 :                     lperm.quick_push (std::make_pair
    2644          412 :                       (chains[lane][i].code != chains[0][i].code, lane));
    2645          316 :                   vect_slp_build_two_operator_nodes (child, vectype, op0, op1,
    2646          158 :                                                      (chains[0][i].code == code
    2647              :                                                       ? op_stmt_info
    2648              :                                                       : other_op_stmt_info),
    2649          158 :                                                      (chains[0][i].code == code
    2650              :                                                       ? other_op_stmt_info
    2651              :                                                       : op_stmt_info),
    2652              :                                                      lperm);
    2653              :                 }
    2654              :               else
    2655              :                 {
    2656          540 :                   SLP_TREE_DEF_TYPE (child) = vect_internal_def;
    2657          540 :                   SLP_TREE_VECTYPE (child) = vectype;
    2658          540 :                   SLP_TREE_LANES (child) = group_size;
    2659          540 :                   SLP_TREE_CHILDREN (child).quick_push (op0);
    2660          540 :                   SLP_TREE_CHILDREN (child).quick_push (op1);
    2661          540 :                   SLP_TREE_REPRESENTATIVE (child)
    2662         1080 :                     = (chains[0][i].code == code
    2663          540 :                        ? op_stmt_info : other_op_stmt_info);
    2664              :                 }
    2665          698 :               children[i] = child;
    2666              :             }
    2667          313 :           *tree_size += this_tree_size + 1;
    2668          313 :           *max_nunits = this_max_nunits;
    2669         1513 :           while (!chains.is_empty ())
    2670          864 :             chains.pop ().release ();
    2671              :           return node;
    2672              :         }
    2673        21205 : out:
    2674        21228 :       if (dump_enabled_p ())
    2675         2815 :         dump_printf_loc (MSG_NOTE, vect_location,
    2676              :                          "failed to line up SLP graph by re-associating "
    2677              :                          "operations in lanes%s\n",
    2678              :                          !hard_fail ? " trying regular discovery" : "");
    2679        21233 :       while (!children.is_empty ())
    2680            5 :         vect_free_slp_tree (children.pop ());
    2681        21391 :       while (!chains.is_empty ())
    2682          163 :         chains.pop ().release ();
    2683              :       /* Hard-fail, otherwise we might run into quadratic processing of the
    2684              :          chains starting one stmt into the chain again.  */
    2685        21228 :       if (hard_fail)
    2686              :         return NULL;
    2687              :       /* Fall thru to normal processing.  */
    2688        21541 :     }
    2689              : 
    2690              :   /* Get at the operands, verifying they are compatible.  */
    2691      3318860 :   vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
    2692      3318860 :   slp_oprnd_info oprnd_info;
    2693     16005983 :   FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    2694              :     {
    2695     25376642 :       int res = vect_get_and_check_slp_defs (vinfo, vectype,
    2696     12688321 :                                              swap[i], skip_args,
    2697              :                                              stmts, i, &oprnds_info);
    2698     12688321 :       if (res != 0)
    2699       544308 :         matches[(res == -1) ? 0 : i] = false;
    2700     12688321 :       if (!matches[0])
    2701              :         break;
    2702              :     }
    2703     15694349 :   for (i = 0; i < group_size; ++i)
    2704     12588292 :     if (!matches[i])
    2705              :       {
    2706       212803 :         vect_free_oprnd_info (oprnds_info);
    2707       212803 :         return NULL;
    2708              :       }
    2709      9318171 :   swap = NULL;
    2710              : 
    2711      9318171 :   bool has_two_operators_perm = false;
    2712     18636342 :   auto_vec<unsigned> two_op_perm_indices[2];
    2713      3106057 :   vec<stmt_vec_info> two_op_scalar_stmts[2] = {vNULL, vNULL};
    2714              : 
    2715      3120298 :   if (two_operators && oprnds_info.length () == 2 && group_size > 2)
    2716              :     {
    2717         3822 :       unsigned idx = 0;
    2718         3822 :       hash_map<gimple *, unsigned> seen;
    2719         3822 :       vec<slp_oprnd_info> new_oprnds_info
    2720         3822 :         = vect_create_oprnd_info (1, group_size);
    2721         3822 :       bool success = true;
    2722              : 
    2723         3822 :       enum tree_code code = ERROR_MARK;
    2724         3822 :       if (oprnds_info[0]->def_stmts[0]
    2725         3822 :           && is_a<gassign *> (oprnds_info[0]->def_stmts[0]->stmt))
    2726         3764 :         code = gimple_assign_rhs_code (oprnds_info[0]->def_stmts[0]->stmt);
    2727         3822 :       basic_block bb = nullptr;
    2728              : 
    2729         7424 :       for (unsigned j = 0; j < group_size; ++j)
    2730              :         {
    2731        17398 :           FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    2732              :             {
    2733        13796 :               stmt_vec_info stmt_info = oprnd_info->def_stmts[j];
    2734        13796 :               if (!stmt_info
    2735        13635 :                   || !is_a<gassign *> (stmt_info->stmt)
    2736        13632 :                   || gimple_assign_rhs_code (stmt_info->stmt) != code
    2737        24233 :                   || skip_args[i])
    2738              :                 {
    2739              :                   success = false;
    2740         3363 :                   break;
    2741              :                 }
    2742              :               /* Avoid mixing lanes with defs in different basic-blocks.  */
    2743        10437 :               if (!bb)
    2744         3940 :                 bb = gimple_bb (vect_orig_stmt (stmt_info)->stmt);
    2745         8261 :               else if (gimple_bb (vect_orig_stmt (stmt_info)->stmt) != bb)
    2746              :                 {
    2747              :                   success = false;
    2748              :                   break;
    2749              :                 }
    2750              : 
    2751        10433 :               bool exists;
    2752        10433 :               unsigned &stmt_idx
    2753        10433 :                 = seen.get_or_insert (stmt_info->stmt, &exists);
    2754              : 
    2755        10433 :               if (!exists)
    2756              :                 {
    2757         9092 :                   new_oprnds_info[0]->def_stmts.safe_push (stmt_info);
    2758         9092 :                   new_oprnds_info[0]->ops.safe_push (oprnd_info->ops[j]);
    2759         9092 :                   stmt_idx = idx;
    2760         9092 :                   idx++;
    2761              :                 }
    2762              : 
    2763        10433 :               two_op_perm_indices[i].safe_push (stmt_idx);
    2764              :             }
    2765              : 
    2766         6965 :           if (!success)
    2767              :             break;
    2768              :         }
    2769              : 
    2770         3822 :       if (success && idx == group_size)
    2771              :         {
    2772           94 :           if (dump_enabled_p ())
    2773              :             {
    2774            0 :               dump_printf_loc (MSG_NOTE, vect_location,
    2775              :                                "Replace two_operators operands:\n");
    2776              : 
    2777            0 :               FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    2778              :                 {
    2779            0 :                   dump_printf_loc (MSG_NOTE, vect_location,
    2780              :                                    "Operand %u:\n", i);
    2781            0 :                   for (unsigned j = 0; j < group_size; j++)
    2782            0 :                     dump_printf_loc (MSG_NOTE, vect_location, "\tstmt %u %G",
    2783            0 :                                      j, oprnd_info->def_stmts[j]->stmt);
    2784              :                 }
    2785              : 
    2786            0 :               dump_printf_loc (MSG_NOTE, vect_location,
    2787              :                                "With a single operand:\n");
    2788            0 :               for (unsigned j = 0; j < group_size; j++)
    2789            0 :                 dump_printf_loc (MSG_NOTE, vect_location, "\tstmt %u %G",
    2790            0 :                                  j, new_oprnds_info[0]->def_stmts[j]->stmt);
    2791              :             }
    2792              : 
    2793           94 :           two_op_scalar_stmts[0].safe_splice (oprnds_info[0]->def_stmts);
    2794           94 :           two_op_scalar_stmts[1].safe_splice (oprnds_info[1]->def_stmts);
    2795              : 
    2796           94 :           new_oprnds_info[0]->first_op_type = oprnds_info[0]->first_op_type;
    2797           94 :           new_oprnds_info[0]->first_dt = oprnds_info[0]->first_dt;
    2798           94 :           new_oprnds_info[0]->any_pattern = oprnds_info[0]->any_pattern;
    2799           94 :           new_oprnds_info[0]->first_gs_p = oprnds_info[0]->first_gs_p;
    2800           94 :           new_oprnds_info[0]->first_gs_info = oprnds_info[0]->first_gs_info;
    2801              : 
    2802           94 :           vect_free_oprnd_info (oprnds_info);
    2803           94 :           oprnds_info = new_oprnds_info;
    2804           94 :           nops = 1;
    2805           94 :           has_two_operators_perm = true;
    2806              :         }
    2807              :       else
    2808         3728 :         vect_free_oprnd_info (new_oprnds_info);
    2809         3822 :     }
    2810              : 
    2811      6212114 :   auto_vec<slp_tree, 4> children;
    2812              : 
    2813      3106057 :   stmt_info = stmts[0];
    2814              : 
    2815      3106057 :   int reduc_idx = -1;
    2816      3106057 :   int gs_scale = 0;
    2817      3106057 :   tree gs_base = NULL_TREE;
    2818              : 
    2819              :   /* Create SLP_TREE nodes for the definition node/s.  */
    2820      7929154 :   FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    2821              :     {
    2822      4950456 :       slp_tree child = nullptr;
    2823      4950456 :       unsigned int j;
    2824      4950456 :       unsigned old_swap_distance;
    2825      4950456 :       bool can_swap;
    2826      4950456 :       bool can_swap_nonmatching;
    2827      4950456 :       bool *stmt_can_swap;
    2828              : 
    2829              :       /* We're skipping certain operands from processing, for example
    2830              :          outer loop reduction initial defs.  */
    2831      4950456 :       if (skip_args[i])
    2832              :         {
    2833       483017 :           children.safe_push (NULL);
    2834      5306114 :           continue;
    2835              :         }
    2836              : 
    2837      4467439 :       if (oprnd_info->first_dt == vect_uninitialized_def)
    2838              :         {
    2839              :           /* COND_EXPRs may end up with one operand too many if the
    2840              :              condition is an SSA name.  */
    2841            0 :           gcc_assert (i == 3 && nops == 4);
    2842            0 :           continue;
    2843              :         }
    2844              : 
    2845      4467439 :       if (oprnd_info->first_gs_p)
    2846              :         {
    2847        22438 :           gs_scale = oprnd_info->first_gs_info.scale;
    2848        22438 :           gs_base = oprnd_info->first_gs_info.base;
    2849              :         }
    2850              : 
    2851      4467439 :       if (is_a <bb_vec_info> (vinfo)
    2852      1551776 :           && oprnd_info->first_dt == vect_internal_def
    2853      5267604 :           && !oprnd_info->any_pattern)
    2854              :         {
    2855              :           /* For BB vectorization, if all defs are the same do not
    2856              :              bother to continue the build along the single-lane
    2857              :              graph but use a splat of the scalar value.  */
    2858       757195 :           stmt_vec_info first_def = oprnd_info->def_stmts[0];
    2859       813292 :           for (j = 1; j < group_size; ++j)
    2860       773209 :             if (oprnd_info->def_stmts[j] != first_def)
    2861              :               break;
    2862       757195 :           if (j == group_size
    2863              :               /* But avoid doing this for loads where we may be
    2864              :                  able to CSE things, unless the stmt is not
    2865              :                  vectorizable.  */
    2866       757195 :               && (!STMT_VINFO_VECTORIZABLE (first_def)
    2867        48786 :                   || !gimple_vuse (first_def->stmt)))
    2868              :             {
    2869        31199 :               if (dump_enabled_p ())
    2870          107 :                 dump_printf_loc (MSG_NOTE, vect_location,
    2871              :                                  "Using a splat of the uniform operand %G",
    2872              :                                  first_def->stmt);
    2873        31199 :               oprnd_info->first_dt = vect_external_def;
    2874              :             }
    2875              :         }
    2876              : 
    2877      4467439 :       if (oprnd_info->first_dt == vect_external_def
    2878      4467439 :           || oprnd_info->first_dt == vect_constant_def)
    2879              :         {
    2880      1466279 :           if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ())
    2881              :             {
    2882              :               tree op0;
    2883              :               tree uniform_val = op0 = oprnd_info->ops[0];
    2884              :               for (j = 1; j < oprnd_info->ops.length (); ++j)
    2885              :                 if (oprnd_info->ops[j]
    2886              :                     && !operand_equal_p (uniform_val, oprnd_info->ops[j]))
    2887              :                   {
    2888              :                     uniform_val = NULL_TREE;
    2889              :                     break;
    2890              :                   }
    2891              :               if (!uniform_val
    2892              :                   && !can_duplicate_and_interleave_p (vinfo,
    2893              :                                                       oprnd_info->ops.length (),
    2894              :                                                       TREE_TYPE (op0)))
    2895              :                 {
    2896              :                   matches[j] = false;
    2897              :                   if (dump_enabled_p ())
    2898              :                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    2899              :                                      "Build SLP failed: invalid type of def "
    2900              :                                      "for variable-length SLP %T\n", op0);
    2901              :                   goto fail;
    2902              :                 }
    2903              :             }
    2904      1466279 :           slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
    2905      1466279 :           SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
    2906      1466279 :           oprnd_info->ops = vNULL;
    2907      1466279 :           children.safe_push (invnode);
    2908      1466279 :           continue;
    2909      1466279 :         }
    2910              : 
    2911              :       /* See which SLP operand a reduction chain continues on.  We want
    2912              :          to chain even PHIs but not backedges.  */
    2913      3001160 :       if (STMT_VINFO_REDUC_DEF (oprnd_info->def_stmts[0])
    2914      3001160 :           || STMT_VINFO_REDUC_IDX (oprnd_info->def_stmts[0]) != -1)
    2915              :         {
    2916       233366 :           if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
    2917              :             {
    2918          776 :               if (oprnd_info->first_dt == vect_double_reduction_def)
    2919          388 :                 reduc_idx = i;
    2920              :             }
    2921       232590 :           else if (is_a <gphi *> (stmt_info->stmt)
    2922       232590 :                    && gimple_phi_num_args
    2923        99770 :                         (as_a <gphi *> (stmt_info->stmt)) != 1)
    2924              :             ;
    2925       133213 :           else if (STMT_VINFO_REDUC_IDX (stmt_info) == -1
    2926          393 :                    && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
    2927              :             ;
    2928       133213 :           else if (reduc_idx == -1)
    2929       124804 :             reduc_idx = i;
    2930              :           else
    2931              :             /* For .COND_* reduction operations the else value can be the
    2932              :                same as one of the operation operands.  The other def
    2933              :                stmts have been moved, so we can't check easily.  Check
    2934              :                it's a call at least.  */
    2935         8409 :             gcc_assert (is_a <gcall *> (stmt_info->stmt));
    2936              :         }
    2937              : 
    2938              :       /* When we have a masked load with uniform mask discover this
    2939              :          as a single-lane mask with a splat permute.  This way we can
    2940              :          recognize this as a masked load-lane by stripping the splat.  */
    2941      3001160 :       if (is_a <gcall *> (STMT_VINFO_STMT (stmt_info))
    2942        57555 :           && gimple_call_internal_p (STMT_VINFO_STMT (stmt_info),
    2943              :                                      IFN_MASK_LOAD)
    2944         6075 :           && STMT_VINFO_GROUPED_ACCESS (stmt_info)
    2945      3001237 :           && ! STMT_VINFO_SLP_VECT_ONLY (DR_GROUP_FIRST_ELEMENT (stmt_info)))
    2946              :         {
    2947           35 :           vec<stmt_vec_info> def_stmts2;
    2948           35 :           def_stmts2.create (1);
    2949           35 :           def_stmts2.quick_push (oprnd_info->def_stmts[0]);
    2950           35 :           child = vect_build_slp_tree (vinfo, def_stmts2, 1,
    2951              :                                        &this_max_nunits,
    2952              :                                        matches, limit,
    2953              :                                        &this_tree_size, bst_map);
    2954           35 :           if (child)
    2955              :             {
    2956           35 :               slp_tree pnode = vect_create_new_slp_node (1, VEC_PERM_EXPR);
    2957           35 :               SLP_TREE_VECTYPE (pnode) = SLP_TREE_VECTYPE (child);
    2958           35 :               SLP_TREE_LANES (pnode) = group_size;
    2959           35 :               SLP_TREE_SCALAR_STMTS (pnode).create (group_size);
    2960           35 :               SLP_TREE_LANE_PERMUTATION (pnode).create (group_size);
    2961          210 :               for (unsigned k = 0; k < group_size; ++k)
    2962              :                 {
    2963          175 :                   SLP_TREE_SCALAR_STMTS (pnode)
    2964          175 :                     .quick_push (oprnd_info->def_stmts[0]);
    2965          175 :                   SLP_TREE_LANE_PERMUTATION (pnode)
    2966          175 :                     .quick_push (std::make_pair (0u, 0u));
    2967              :                 }
    2968           35 :               SLP_TREE_CHILDREN (pnode).quick_push (child);
    2969           35 :               pnode->max_nunits = child->max_nunits;
    2970           35 :               children.safe_push (pnode);
    2971           35 :               oprnd_info->def_stmts = vNULL;
    2972           35 :               continue;
    2973           35 :             }
    2974              :           else
    2975            0 :             def_stmts2.release ();
    2976              :         }
    2977              : 
    2978      6002250 :       can_swap = (i == 0
    2979      2224575 :                   && (nops == 2 || nops == 3)
    2980      1429788 :                   && oprnds_info.length () > 1
    2981      1429788 :                   && oprnds_info[1]->first_dt == vect_internal_def
    2982       583916 :                   && (is_gimple_assign (stmt_info->stmt)
    2983        49541 :                       || is_gimple_call (stmt_info->stmt))
    2984              :                   /* Swapping operands for reductions breaks assumptions
    2985              :                      later on.  */
    2986      3540738 :                   && STMT_VINFO_REDUC_IDX (stmt_info) == -1);
    2987      3001125 :       can_swap_nonmatching = can_swap;
    2988      3001125 :       stmt_can_swap = NULL;
    2989      3001125 :       if (can_swap)
    2990              :         {
    2991       484258 :           stmt_can_swap = XALLOCAVEC (bool, group_size);
    2992      8103630 :           for (j = 0; j < group_size; ++j)
    2993              :             {
    2994      7619372 :               stmt_can_swap[j] = false;
    2995      7619372 :               if (!stmts[j])
    2996              :                 /* NULL lanes are gaps and have no stmt to swap.  */
    2997            0 :                 stmt_can_swap[j] = true;
    2998      7619372 :               else if (gassign *stmt = dyn_cast <gassign *> (stmts[j]->stmt))
    2999              :                 {
    3000      7613668 :                   tree_code code = gimple_assign_rhs_code (stmt);
    3001     15227336 :                   stmt_can_swap[j] = (commutative_tree_code (code)
    3002      7613668 :                                       || commutative_ternary_tree_code (code));
    3003              :                 }
    3004         5704 :               else if (gcall *call = dyn_cast <gcall *> (stmts[j]->stmt))
    3005              :                 {
    3006         5704 :                   internal_fn fn = (gimple_call_internal_p (call)
    3007         5704 :                                     ? gimple_call_internal_fn (call) : IFN_LAST);
    3008        11408 :                   stmt_can_swap[j] = ((commutative_binary_fn_p (fn)
    3009         5388 :                                        || commutative_ternary_fn_p (fn))
    3010         5740 :                                       && first_commutative_argument (fn) == 0);
    3011              :                 }
    3012              : 
    3013      7619372 :               if (j != 0 && !stmt_can_swap[j])
    3014      7619372 :                 can_swap_nonmatching = false;
    3015              :             }
    3016              :         }
    3017              : 
    3018      3001125 :       old_swap_distance = least_upthread_swappable_op_distance;
    3019      3001125 :       if (can_swap_nonmatching)
    3020       451435 :         least_upthread_swappable_op_distance = 1;
    3021      2549690 :       else if (least_upthread_swappable_op_distance != -1U)
    3022       302365 :         least_upthread_swappable_op_distance++;
    3023      3001125 :       child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
    3024              :                                    group_size, &this_max_nunits,
    3025              :                                    matches, limit,
    3026              :                                    &this_tree_size, bst_map);
    3027      3001125 :       least_upthread_swappable_op_distance = old_swap_distance;
    3028      3001125 :       if (child != NULL)
    3029              :         {
    3030      2513915 :           oprnd_info->def_stmts = vNULL;
    3031      2513915 :           children.safe_push (child);
    3032      2513915 :           continue;
    3033              :         }
    3034              : 
    3035              :       /* If the SLP build for operand zero failed and operand zero
    3036              :          and one can be commuted try that for the scalar stmts
    3037              :          that failed the match.  */
    3038       487210 :       if (/* A first scalar stmt mismatch signals a fatal mismatch.  */
    3039       487210 :           matches[0]
    3040       258013 :           && can_swap)
    3041              :         {
    3042              :           /* See whether we can swap the matching or the non-matching
    3043              :              stmt operands.  */
    3044              :           bool swap_not_matching = true;
    3045        65186 :           do
    3046              :             {
    3047      7103690 :               for (j = 0; j < group_size; ++j)
    3048              :                 {
    3049      7053440 :                   if (matches[j] != !swap_not_matching)
    3050        86583 :                     continue;
    3051              :                   /* Verify if we can swap operands of this stmt.  */
    3052      6966857 :                   if (!stmt_can_swap[j])
    3053              :                     {
    3054        14936 :                       if (!swap_not_matching)
    3055         6951 :                         goto fail;
    3056              :                       swap_not_matching = false;
    3057              :                       break;
    3058              :                     }
    3059              :                 }
    3060              :             }
    3061        58235 :           while (j != group_size);
    3062              : 
    3063              :           /* Swap mismatched definition stmts.  */
    3064        50250 :           if (dump_enabled_p ())
    3065          392 :             dump_printf_loc (MSG_NOTE, vect_location,
    3066              :                              "Re-trying with swapped operands of stmts ");
    3067      7079848 :           for (j = 0; j < group_size; ++j)
    3068      7029598 :             if (matches[j] == !swap_not_matching)
    3069              :               {
    3070     13903446 :                 std::swap (oprnds_info[0]->def_stmts[j],
    3071      6951723 :                            oprnds_info[1]->def_stmts[j]);
    3072     13903446 :                 std::swap (oprnds_info[0]->ops[j],
    3073      6951723 :                            oprnds_info[1]->ops[j]);
    3074      6951723 :                 if (dump_enabled_p ())
    3075         1079 :                   dump_printf (MSG_NOTE, "%d ", j);
    3076              :               }
    3077        50250 :           if (dump_enabled_p ())
    3078          392 :             dump_printf (MSG_NOTE, "\n");
    3079              :           /* After swapping some operands we lost track whether an
    3080              :              operand has any pattern defs so be conservative here.  */
    3081        98065 :           if (oprnds_info[0]->any_pattern || oprnds_info[1]->any_pattern)
    3082         2888 :             oprnds_info[0]->any_pattern = oprnds_info[1]->any_pattern = true;
    3083              :           /* And try again with scratch 'matches' ... */
    3084        50250 :           bool *tem = XALLOCAVEC (bool, group_size);
    3085        50250 :           if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
    3086              :                                             group_size, &this_max_nunits,
    3087              :                                             tem, limit,
    3088              :                                             &this_tree_size, bst_map)) != NULL)
    3089              :             {
    3090         6313 :               oprnd_info->def_stmts = vNULL;
    3091         6313 :               children.safe_push (child);
    3092         6313 :               continue;
    3093              :             }
    3094              :         }
    3095       430009 : fail:
    3096              : 
    3097              :       /* If the SLP build failed and we analyze a basic-block
    3098              :          simply treat nodes we fail to build as externally defined
    3099              :          (and thus build vectors from the scalar defs).
    3100              :          The cost model will reject outright expensive cases.
    3101              :          ???  This doesn't treat cases where permutation ultimately
    3102              :          fails (or we don't try permutation below).  Ideally we'd
    3103              :          even compute a permutation that will end up with the maximum
    3104              :          SLP tree size...  */
    3105       480897 :       if (is_a <bb_vec_info> (vinfo)
    3106              :           /* ???  Rejecting patterns this way doesn't work.  We'd have to
    3107              :              do extra work to cancel the pattern so the uses see the
    3108              :              scalar version.  */
    3109              :           /* Skip building vector operands from scalars while operand
    3110              :              discovery may still be fixed by retrying with swapped operands.  */
    3111       400181 :           && (least_upthread_swappable_op_distance != 1
    3112              :               /* A first scalar stmt mismatch signals a fatal mismatch
    3113              :                  that the parent commutative retry cannot recover.  */
    3114        26339 :               || !matches[0])
    3115       382276 :           && !is_pattern_stmt_p (stmt_info)
    3116       840810 :           && !oprnd_info->any_pattern)
    3117              :         {
    3118              :           /* But if there's a leading vector sized set of matching stmts
    3119              :              fail here so we can split the group.  This matches the condition
    3120              :              vect_analyze_slp_instance uses.  */
    3121              :           /* ???  We might want to split here and combine the results to support
    3122              :              multiple vector sizes better.  */
    3123       557711 :           for (j = 0; j < group_size; ++j)
    3124       557711 :             if (!matches[j])
    3125              :               break;
    3126       359672 :           if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype))
    3127       359643 :               && vect_slp_can_convert_to_external (oprnd_info->def_stmts))
    3128              :             {
    3129       353538 :               if (dump_enabled_p ())
    3130          616 :                 dump_printf_loc (MSG_NOTE, vect_location,
    3131              :                                  "Building vector operands from scalars\n");
    3132       353538 :               this_tree_size++;
    3133       353538 :               child = vect_create_new_slp_node (oprnd_info->ops);
    3134       353538 :               children.safe_push (child);
    3135       353538 :               oprnd_info->ops = vNULL;
    3136       353538 :               continue;
    3137              :             }
    3138              :         }
    3139              : 
    3140       127359 :       gcc_assert (child == NULL);
    3141       145969 :       FOR_EACH_VEC_ELT (children, j, child)
    3142        18610 :         if (child)
    3143        18610 :           vect_free_slp_tree (child);
    3144       127359 :       vect_free_oprnd_info (oprnds_info);
    3145       127359 :       return NULL;
    3146              :     }
    3147              : 
    3148      2978698 :   vect_free_oprnd_info (oprnds_info);
    3149              : 
    3150              :   /* If we have all children of a child built up from uniform scalars
    3151              :      or does more than one possibly expensive vector construction then
    3152              :      just throw that away, causing it built up from scalars.
    3153              :      The exception is the SLP node for the vector store.  */
    3154      2978698 :   if (is_a <bb_vec_info> (vinfo)
    3155      1077928 :       && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
    3156              :       /* ???  Rejecting patterns this way doesn't work.  We'd have to
    3157              :          do extra work to cancel the pattern so the uses see the
    3158              :          scalar version.  */
    3159      3395756 :       && !is_pattern_stmt_p (stmt_info))
    3160              :     {
    3161              :       slp_tree child;
    3162              :       unsigned j;
    3163              :       bool all_uniform_p = true;
    3164              :       unsigned n_vector_builds = 0;
    3165      1183549 :       FOR_EACH_VEC_ELT (children, j, child)
    3166              :         {
    3167       791995 :           if (!child)
    3168              :             ;
    3169       791995 :           else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
    3170              :             all_uniform_p = false;
    3171       576567 :           else if (!vect_slp_tree_uniform_p (child))
    3172              :             {
    3173       438512 :               all_uniform_p = false;
    3174       438512 :               if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
    3175       404210 :                 n_vector_builds++;
    3176              :             }
    3177              :         }
    3178       391554 :       if (all_uniform_p
    3179       391554 :           || n_vector_builds > 1
    3180       661297 :           || (n_vector_builds == children.length ()
    3181        29744 :               && is_a <gphi *> (stmt_info->stmt)))
    3182              :         {
    3183              :           /* Roll back.  */
    3184       126733 :           matches[0] = false;
    3185       402701 :           FOR_EACH_VEC_ELT (children, j, child)
    3186       275968 :             if (child)
    3187       275968 :               vect_free_slp_tree (child);
    3188              : 
    3189       126733 :           if (dump_enabled_p ())
    3190          205 :             dump_printf_loc (MSG_NOTE, vect_location,
    3191              :                              "Building parent vector operands from "
    3192              :                              "scalars instead\n");
    3193       126733 :           return NULL;
    3194              :         }
    3195              :     }
    3196              : 
    3197      2851965 :   *tree_size += this_tree_size + 1;
    3198      2851965 :   *max_nunits = this_max_nunits;
    3199              : 
    3200      2851965 :   if (two_operators)
    3201              :     {
    3202              :       /* ???  We'd likely want to either cache in bst_map sth like
    3203              :          { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
    3204              :          the true { a+b, a+b, a+b, a+b } ... but there we don't have
    3205              :          explicit stmts to put in so the keying on 'stmts' doesn't
    3206              :          work (but we have the same issue with nodes that use 'ops').  */
    3207              : 
    3208         6689 :       if (has_two_operators_perm)
    3209              :         {
    3210           40 :           slp_tree child = children[0];
    3211           40 :           children.truncate (0);
    3212          120 :           for (i = 0; i < 2; i++)
    3213              :             {
    3214           80 :               slp_tree pnode
    3215           80 :                 = vect_create_new_slp_node (two_op_scalar_stmts[i], 2);
    3216           80 :               SLP_TREE_CODE (pnode) = VEC_PERM_EXPR;
    3217           80 :               SLP_TREE_VECTYPE (pnode) = vectype;
    3218           80 :               SLP_TREE_CHILDREN (pnode).quick_push (child);
    3219           80 :               SLP_TREE_CHILDREN (pnode).quick_push (child);
    3220           80 :               lane_permutation_t& perm = SLP_TREE_LANE_PERMUTATION (pnode);
    3221           80 :               children.safe_push (pnode);
    3222              : 
    3223          656 :               for (unsigned j = 0; j < stmts.length (); j++)
    3224          576 :                 perm.safe_push (std::make_pair (0, two_op_perm_indices[i][j]));
    3225              :             }
    3226              : 
    3227           40 :           SLP_TREE_REF_COUNT (child) += 4;
    3228              :         }
    3229              : 
    3230         6689 :       slp_tree one = new _slp_tree;
    3231         6689 :       slp_tree two = new _slp_tree;
    3232         6689 :       SLP_TREE_DEF_TYPE (one) = vect_internal_def;
    3233         6689 :       SLP_TREE_DEF_TYPE (two) = vect_internal_def;
    3234         6689 :       SLP_TREE_VECTYPE (one) = vectype;
    3235         6689 :       SLP_TREE_VECTYPE (two) = vectype;
    3236         6689 :       SLP_TREE_CHILDREN (one).safe_splice (children);
    3237         6689 :       SLP_TREE_CHILDREN (two).safe_splice (children);
    3238         6689 :       slp_tree child;
    3239        26758 :       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
    3240        13380 :         SLP_TREE_REF_COUNT (child)++;
    3241              : 
    3242              :       /* Here we record the original defs since this
    3243              :          node represents the final lane configuration.  */
    3244         6689 :       node = vect_create_new_slp_node (node, stmts, 2);
    3245         6689 :       SLP_TREE_VECTYPE (node) = vectype;
    3246         6689 :       SLP_TREE_CODE (node) = VEC_PERM_EXPR;
    3247         6689 :       SLP_TREE_CHILDREN (node).quick_push (one);
    3248         6689 :       SLP_TREE_CHILDREN (node).quick_push (two);
    3249         6689 :       enum tree_code code0 = ERROR_MARK;
    3250         6689 :       enum tree_code ocode = ERROR_MARK;
    3251         6689 :       if (gassign *stmt = dyn_cast <gassign *> (stmts[0]->stmt))
    3252         6687 :         code0 = gimple_assign_rhs_code (stmt);
    3253         6689 :       stmt_vec_info ostmt_info;
    3254         6689 :       unsigned j = 0;
    3255        24440 :       FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
    3256              :         {
    3257        17751 :           int op = 0;
    3258        17751 :           if (gassign *ostmt = dyn_cast <gassign *> (ostmt_info->stmt))
    3259              :             {
    3260        17747 :               if (gimple_assign_rhs_code (ostmt) != code0)
    3261              :                 {
    3262         8894 :                   ocode = gimple_assign_rhs_code (ostmt);
    3263              :                   op = 1;
    3264              :                   j = i;
    3265              :                 }
    3266              :             }
    3267              :           else
    3268              :             {
    3269            8 :               if (gimple_call_combined_fn (stmts[0]->stmt)
    3270            4 :                   != gimple_call_combined_fn (ostmt_info->stmt))
    3271              :                 {
    3272            2 :                   op = 1;
    3273            2 :                   j = i;
    3274              :                 }
    3275              :             }
    3276        17751 :           SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (op, i));
    3277              :         }
    3278         6689 :       SLP_TREE_CODE (one) = code0;
    3279         6689 :       SLP_TREE_CODE (two) = ocode;
    3280         6689 :       SLP_TREE_LANES (one) = stmts.length ();
    3281         6689 :       SLP_TREE_LANES (two) = stmts.length ();
    3282         6689 :       SLP_TREE_REPRESENTATIVE (one) = stmts[0];
    3283         6689 :       SLP_TREE_REPRESENTATIVE (two) = stmts[j];
    3284              : 
    3285         6689 :       return node;
    3286              :     }
    3287              : 
    3288      2845276 :   node = vect_create_new_slp_node (node, stmts, nops);
    3289      2845276 :   SLP_TREE_VECTYPE (node) = vectype;
    3290      2845276 :   SLP_TREE_CHILDREN (node).splice (children);
    3291      2845276 :   SLP_TREE_GS_SCALE (node) = gs_scale;
    3292      2845276 :   SLP_TREE_GS_BASE (node) = gs_base;
    3293      2845276 :   if (reduc_idx != -1)
    3294              :     {
    3295       116410 :       gcc_assert (STMT_VINFO_REDUC_IDX (stmt_info) != -1
    3296              :                   || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
    3297              :                   || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def);
    3298       116410 :       SLP_TREE_REDUC_IDX (node) = reduc_idx;
    3299       116410 :       node->cycle_info.id = SLP_TREE_CHILDREN (node)[reduc_idx]->cycle_info.id;
    3300              :     }
    3301              :   /* When reaching the reduction PHI, create a vect_reduc_info.  */
    3302      2728866 :   else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
    3303      2728866 :             || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
    3304      2728866 :            && is_a <gphi *> (STMT_VINFO_STMT (stmt_info)))
    3305              :     {
    3306       101731 :       loop_vec_info loop_vinfo = as_a <loop_vec_info> (vinfo);
    3307       101731 :       gcc_assert (STMT_VINFO_REDUC_IDX (stmt_info) == -1);
    3308       101731 :       node->cycle_info.id = loop_vinfo->reduc_infos.length ();
    3309       101731 :       vect_reduc_info reduc_info = new vect_reduc_info_s ();
    3310       101731 :       loop_vinfo->reduc_infos.safe_push (reduc_info);
    3311       101731 :       stmt_vec_info reduc_phi = stmt_info;
    3312              :       /* ???  For double reductions vect_is_simple_reduction stores the
    3313              :          reduction type and code on the inner loop header PHI.  */
    3314       101731 :       if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
    3315              :         {
    3316          388 :           use_operand_p use_p;
    3317          388 :           gimple *use_stmt;
    3318          388 :           bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
    3319              :                                      &use_p, &use_stmt);
    3320          388 :           gcc_assert (res);
    3321          388 :           reduc_phi = loop_vinfo->lookup_stmt (use_stmt);
    3322              :         }
    3323       101731 :       VECT_REDUC_INFO_DEF_TYPE (reduc_info) = STMT_VINFO_DEF_TYPE (stmt_info);
    3324       101731 :       VECT_REDUC_INFO_TYPE (reduc_info) = STMT_VINFO_REDUC_TYPE (reduc_phi);
    3325       101731 :       VECT_REDUC_INFO_CODE (reduc_info) = STMT_VINFO_REDUC_CODE (reduc_phi);
    3326       101731 :       VECT_REDUC_INFO_FN (reduc_info) = IFN_LAST;
    3327              :     }
    3328              :   return node;
    3329      9318171 : }
    3330              : 
    3331              : /* Dump the single SLP tree NODE, not the nodes it refers to, using the
                      :    flags in DUMP_KIND at location LOC.  Printed are a header line
                      :    (def type, address, max_nunits, reference count, vector type,
                      :    avoid-stlf-fail marker and cycle info), the operation template of
                      :    internal defs, one line per scalar stmt or else the list of scalar
                      :    operands, the load and lane permutations if they exist and the
                      :    addresses of the children.  A scalar stmt is prefixed with "[l*]"
                      :    when its lane is in SLP_TREE_LIVE_LANES and with "[l] " when only
                      :    STMT_VINFO_LIVE_P holds; a lane without stmt prints as "---".
                      :    NOTE(review): the permutation entries and the children addresses
                      :    are printed with DUMP_KIND while everything else uses METADATA;
                      :    confirm that this mix is intended.  */
    3332              : 
    3333              : static void
    3334       445871 : vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
    3335              :                      slp_tree node)
    3336              : {
    3337       445871 :   unsigned i, j;
    3338       445871 :   slp_tree child;
    3339       445871 :   stmt_vec_info stmt_info;
    3340       445871 :   tree op;
    3341              :   /* Build the dump metadata once, then emit the header line.  */
    3342       445871 :   dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
    3343       445871 :   dump_user_location_t user_loc = loc.get_user_location ();
    3344       445871 :   dump_printf_loc (metadata, user_loc,
    3345              :                    "node%s %p (max_nunits=" HOST_WIDE_INT_PRINT_UNSIGNED
    3346              :                    ", refcnt=%u)",
    3347       445871 :                    SLP_TREE_DEF_TYPE (node) == vect_external_def
    3348              :                    ? " (external)"
    3349              :                    : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
    3350       430148 :                       ? " (constant)"
    3351              :                       : ""), (void *) node,
    3352       445871 :                    estimated_poly_value (node->max_nunits),
    3353              :                                          SLP_TREE_REF_COUNT (node));
    3354       445871 :   if (SLP_TREE_VECTYPE (node))
    3355       378124 :     dump_printf (metadata, " %T", SLP_TREE_VECTYPE (node));
    3356       445871 :   dump_printf (metadata, "%s",
    3357       445871 :                node->avoid_stlf_fail ? " (avoid-stlf-fail)" : "");
    3358       445871 :   if (node->cycle_info.id != -1 || node->cycle_info.reduc_idx != -1)
    3359        23994 :     dump_printf (metadata, " cycle %d, link %d", node->cycle_info.id,
    3360              :                  node->cycle_info.reduc_idx);
    3361       445871 :   dump_printf (metadata, "\n");
    3362       445871 :   if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
    3363              :     {
    3364       363073 :       if (SLP_TREE_PERMUTE_P (node))
    3365        13801 :         dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
    3366              :       else
    3367       349272 :         dump_printf_loc (metadata, user_loc, "op template: %G",
    3368       349272 :                          SLP_TREE_REPRESENTATIVE (node)->stmt);
    3369              :     }
    3370       445871 :   if (SLP_TREE_SCALAR_STMTS (node).exists ())
    3371       869239 :     FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    3372       514321 :       if (stmt_info)
    3373       508956 :         dump_printf_loc (metadata, user_loc, "\t%sstmt %u %G",
    3374       508956 :                          SLP_TREE_LIVE_LANES (node).contains (i)
    3375       505284 :                          ? "[l*]" : (STMT_VINFO_LIVE_P (stmt_info)
    3376       505284 :                                      ? "[l] " : ""),
    3377              :                          i, stmt_info->stmt);
    3378              :       else
    3379         5365 :         dump_printf_loc (metadata, user_loc, "\tstmt %u ---\n", i);
    3380              :   else
    3381              :     {
    3382        90953 :       dump_printf_loc (metadata, user_loc, "\t{ ");
    3383       199800 :       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
    3384       108847 :         dump_printf (metadata, "%T%s ", op,
    3385       108847 :                      i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
    3386        90953 :       dump_printf (metadata, "}\n");
    3387              :     }
    3388       445871 :   if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
    3389              :     {
    3390        64912 :       dump_printf_loc (metadata, user_loc, "\tload permutation {");
    3391       148299 :       FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
    3392        83387 :         dump_printf (dump_kind, " %u", j);
    3393        64912 :       dump_printf (dump_kind, " }\n");
    3394              :     }
    3395       445871 :   if (SLP_TREE_LANE_PERMUTATION (node).exists ())
    3396              :     {
    3397        13809 :       dump_printf_loc (metadata, user_loc, "\tlane permutation {");
    3398        51913 :       for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
    3399        38104 :         dump_printf (dump_kind, " %u[%u]",
    3400        38104 :                      SLP_TREE_LANE_PERMUTATION (node)[i].first,
    3401        38104 :                      SLP_TREE_LANE_PERMUTATION (node)[i].second);
    3402        13809 :       dump_printf (dump_kind, " }%s\n",
    3403        13809 :                    node->ldst_lanes ? " (load-lanes)" : "");
    3404              :     }
    3405       445871 :   if (SLP_TREE_CHILDREN (node).is_empty ())
    3406       169958 :     return;
    3407       275913 :   dump_printf_loc (metadata, user_loc, "\tchildren");
    3408       727946 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    3409       452033 :     dump_printf (dump_kind, " %p", (void *)child);
    3410       275913 :   dump_printf (dump_kind, "%s\n",
    3411       275913 :                node->ldst_lanes && !SLP_TREE_LANE_PERMUTATION (node).exists ()
    3412              :                ? " (store-lanes)" : "");
    3413              : }
    3414              : /* Debug helper: dump the single SLP node NODE through
                      :    vect_print_slp_tree with MSG_NOTE and UNKNOWN_LOCATION while the
                      :    default-constructed debug_dump_context CTX is live.  */
    3415              : DEBUG_FUNCTION void
    3416            0 : debug (slp_tree node)
    3417              : {
    3418            0 :   debug_dump_context ctx;
    3419            0 :   vect_print_slp_tree (MSG_NOTE,
    3420            0 :                        dump_location_t::from_location_t (UNKNOWN_LOCATION),
    3421              :                        node);
    3422            0 : }
    3423              : 
    3424              : /* Recursive helper for the dot producers below.  Write to F the dot
                      :    vertex of NODE, labelled with what vect_print_slp_tree dumps for
                      :    it, and one edge per child, then recurse into the non-null
                      :    children.  VISITED holds the nodes handled so far; visited.add is
                      :    relied on to report a node it already contains, which ends the
                      :    recursion for shared nodes.  The label presumably reaches F through
                      :    the debug_dump_context the callers construct on F -- confirm.  */
    3425              : 
    3426              : static void
    3427            0 : dot_slp_tree (FILE *f, slp_tree node, hash_set<slp_tree> &visited)
    3428              : {
    3429            0 :   if (visited.add (node))
    3430              :     return;
    3431              :   /* The vertex, with the textual dump of NODE as its label.  */
    3432            0 :   fprintf (f, "\"%p\" [label=\"", (void *)node);
    3433            0 :   vect_print_slp_tree (MSG_NOTE,
    3434            0 :                        dump_location_t::from_location_t (UNKNOWN_LOCATION),
    3435              :                        node);
    3436            0 :   fprintf (f, "\"];\n");
    3437              : 
    3438              :   /* One edge per child slot; null children are not skipped here.  */
    3439            0 :   for (slp_tree child : SLP_TREE_CHILDREN (node))
    3440            0 :     fprintf (f, "\"%p\" -> \"%p\";", (void *)node, (void *)child);
    3441              :   /* Recurse, this time skipping null children.  */
    3442            0 :   for (slp_tree child : SLP_TREE_CHILDREN (node))
    3443            0 :     if (child)
    3444            0 :       dot_slp_tree (f, child, visited);
    3445              : }
    3446              : /* Write the SLP graph reachable from NODE to the file FNAME in dot
                      :    format.  FNAME is opened with mode "w".  The walk runs inside its
                      :    own scope so that CTX, the debug_dump_context constructed on F, is
                      :    gone again before the closing brace is written and F is closed.
                      :    NOTE(review): the fopen result is used without a check, so F is
                      :    null in the fprintf below when FNAME cannot be opened.  */
    3447              : DEBUG_FUNCTION void
    3448            0 : dot_slp_tree (const char *fname, slp_tree node)
    3449              : {
    3450            0 :   FILE *f = fopen (fname, "w");
    3451            0 :   fprintf (f, "digraph {\n");
    3452            0 :   fflush (f);
    3453            0 :     {
    3454            0 :       debug_dump_context ctx (f);
    3455            0 :       hash_set<slp_tree> visited;
    3456            0 :       dot_slp_tree (f, node, visited);
    3457            0 :     }
    3458            0 :   fflush (f);
    3459            0 :   fprintf (f, "}\n");
    3460            0 :   fclose (f);
    3461            0 : }
    3462              : /* Write the SLP graphs of all of SLP_INSTANCES to the file FNAME in
                      :    dot format.  FNAME is opened with mode "w".  All instances share
                      :    one visited set, so a node reachable from several instances is
                      :    handed to the recursive helper's body only once.
                      :    NOTE(review): the fopen result is used without a check, so F is
                      :    null in the fprintf below when FNAME cannot be opened.  */
    3463              : DEBUG_FUNCTION void
    3464            0 : dot_slp_tree (const char *fname, const vec<slp_instance> &slp_instances)
    3465              : {
    3466            0 :   FILE *f = fopen (fname, "w");
    3467            0 :   fprintf (f, "digraph {\n");
    3468            0 :   fflush (f);
    3469            0 :     {
    3470            0 :       debug_dump_context ctx (f);
    3471            0 :       hash_set<slp_tree> visited;
    3472            0 :       for (auto inst : slp_instances)
    3473            0 :         dot_slp_tree (f, SLP_INSTANCE_TREE (inst), visited);
    3474            0 :     }
    3475            0 :   fflush (f);
    3476            0 :   fprintf (f, "}\n");
    3477            0 :   fclose (f);
    3478            0 : }
    3479              : 
    3480              : /* Dump the SLP graph reachable from NODE using flags specified in
                      :    DUMP_KIND at location LOC: print NODE itself with
                      :    vect_print_slp_tree, then recurse into its non-null children.
                      :    VISITED collects the nodes handled so far; visited.add is relied on
                      :    to report a node it already contains, so a node reachable over
                      :    several paths is printed once.  */
    3481              : 
    3482              : static void
    3483       484806 : vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
    3484              :                       slp_tree node, hash_set<slp_tree> &visited)
    3485              : {
    3486       484806 :   unsigned i;
    3487       484806 :   slp_tree child;
    3488              : 
    3489       484806 :   if (visited.add (node))
    3490       484806 :     return;
    3491              :   /* First visit of NODE: print it before descending.  */
    3492       445397 :   vect_print_slp_tree (dump_kind, loc, node);
    3493              : 
    3494      1342313 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    3495       451519 :     if (child)
    3496       408685 :       vect_print_slp_graph (dump_kind, loc, child, visited);
    3497              : }
    3498              : /* Dump the SLP graph reachable from ENTRY using the flags in DUMP_KIND
                      :    at location LOC, starting from an empty visited set.  */
    3499              : static void
    3500        46744 : vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
    3501              :                       slp_tree entry)
    3502              : {
    3503        46744 :   hash_set<slp_tree> visited;
    3504        46744 :   vect_print_slp_graph (dump_kind, loc, entry, visited);
    3505        46744 : }
    3506              : /* Debug helper: dump the SLP graph rooted at SLP_INSTANCE_TREE
                      :    (INSTANCE) through vect_print_slp_graph with MSG_NOTE and
                      :    UNKNOWN_LOCATION while the default-constructed debug_dump_context
                      :    CTX is live.  */
    3507              : DEBUG_FUNCTION void
    3508            0 : debug (slp_instance instance)
    3509              : {
    3510            0 :   debug_dump_context ctx;
    3511            0 :   vect_print_slp_graph (MSG_NOTE,
    3512            0 :                         dump_location_t::from_location_t (UNKNOWN_LOCATION),
    3513              :                         SLP_INSTANCE_TREE (instance));
    3514            0 : }
    3515              : 
    3516              : 
    3517              : /* Compute the set of scalar stmts participating in external nodes.
                      :    Walk the SLP graph rooted at NODE and, for every node that is not
                      :    vect_internal_def, add to ESTMTS the stmt_vec_info that
                      :    VINFO->lookup_def returns for each of its scalar operands;
                      :    operands for which the lookup yields nothing are skipped.
                      :    VISITED keeps shared nodes from being walked twice.  */
    3518              : 
    3519              : static void
    3520      1553024 : vect_slp_gather_extern_scalar_stmts (vec_info *vinfo, slp_tree node,
    3521              :                                      hash_set<slp_tree> &visited,
    3522              :                                      hash_set<stmt_vec_info> &estmts)
    3523              : {
    3524      1553024 :   if (visited.add (node))
    3525              :     return;
    3526              :   /* Internal nodes only forward to their children; any other def type
                      :      contributes its scalar operands.  */
    3527      1509135 :   if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
    3528              :     {
    3529              :       slp_tree child;
    3530              :       int i;
    3531      1739136 :       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    3532       872475 :         if (child)
    3533       872475 :           vect_slp_gather_extern_scalar_stmts (vinfo, child, visited, estmts);
    3534              :     }
    3535              :   else
    3536      3617994 :     for (tree def : SLP_TREE_SCALAR_OPS (node))
    3537              :       {
    3538      1691838 :         stmt_vec_info def_stmt = vinfo->lookup_def (def);
    3539      1691838 :         if (def_stmt)
    3540       333070 :           estmts.add (def_stmt);
    3541              :       }
    3542              : }
    3543              : 
    3544              : /* Mark the original scalar stmt coverage of the vector SLP graph of VINFO
    3545              :    with STMT_SLP_TYPE == pure_slp.  Stmts whose defs are operands of
                      :    non-internal SLP nodes are collected first and are never marked
                      :    here.  The root stmts of each instance are marked directly; from
                      :    the scalar stmts of each instance's root node a worklist then
                      :    follows the operands slp_oprnds reports and marks every def that
                      :    VINFO->lookup_def finds.  */
    3546              : 
    3547              : static void
    3548       234430 : vect_bb_slp_mark_stmts_vectorized (bb_vec_info vinfo)
    3549              : {
    3550              :   /* Gather the scalar stmt leaves of the SLP graph to stop the below DFS
    3551              :      walk on.  */
    3552       234430 :   hash_set<stmt_vec_info> scalar_stmts_in_externs;
    3553       234430 :   hash_set<slp_tree> visited;
    3554      1383839 :   for (auto instance : BB_VINFO_SLP_INSTANCES (vinfo))
    3555       680549 :     vect_slp_gather_extern_scalar_stmts (vinfo, SLP_INSTANCE_TREE (instance),
    3556              :                                          visited, scalar_stmts_in_externs);
    3557              : 
    3558              :   /* DFS walk scalar stmts to compute the vectorized coverage indicated
    3559              :      by STMT_SLP_TYPE (stmt) == pure_slp on the original scalar (non-pattern)
    3560              :      stmts.  */
    3561      1383839 :   for (auto instance : BB_VINFO_SLP_INSTANCES (vinfo))
    3562              :     {
    3563       789986 :       for (auto stmt : SLP_INSTANCE_ROOT_STMTS (instance))
    3564        51905 :         if (!scalar_stmts_in_externs.contains (stmt))
    3565        51377 :           STMT_SLP_TYPE (stmt) = pure_slp;  /* Marked only, not walked.  */
    3566       680549 :       auto_vec<stmt_vec_info> worklist;
    3567      3845941 :       for (auto stmt : SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance)))
    3568              :         {
    3569      1804294 :           stmt = vect_orig_stmt (stmt);
    3570      1804294 :           if (!scalar_stmts_in_externs.contains (stmt)
    3571      1804294 :               && STMT_SLP_TYPE (stmt) != pure_slp)
    3572              :             {
    3573      1794686 :               STMT_SLP_TYPE (stmt) = pure_slp;
    3574      1794686 :               worklist.safe_push (stmt);
    3575              :             }
    3576              :         }
    3577      3586899 :       while (!worklist.is_empty ())
    3578              :         {
    3579      2228675 :           stmt_vec_info stmt = worklist.pop ();
    3580              : 
    3581              :           /* Now walk relevant parts of the SSA use-def graph.  A def is
                      :              marked before it is pushed, so the pure_slp test below also
                      :              keeps a stmt from being queued twice.  */
    3582      2228675 :           slp_oprnds child_ops (stmt);
    3583      4690227 :           for (unsigned i = 0; i < child_ops.num_slp_children; ++i)
    3584              :             {
    3585      2461552 :               tree op = child_ops.get_op_for_slp_child (stmt, i);
    3586      2461552 :               stmt_vec_info def = vinfo->lookup_def (op);
    3587      2461552 :               if (def
    3588       849887 :                   && !scalar_stmts_in_externs.contains (def)
    3589      2979498 :                   && STMT_SLP_TYPE (def) != pure_slp)
    3590              :                 {
    3591       433989 :                   STMT_SLP_TYPE (def) = pure_slp;
    3592       433989 :                   worklist.safe_push (def);
    3593              :                 }
    3594              :             }
    3595              :         }
    3596       680549 :     }
    3597       234430 : }
    3598              : 
    3599              : /* Mark the scalar stmts of the SLP tree rooted at NODE as relevant by
                      :    setting STMT_VINFO_RELEVANT to vect_used_in_scope.  Only nodes whose
                      :    def type is vect_internal_def are walked and NULL entries in the
                      :    scalar stmt and children vectors are skipped.  VISITED collects the
                      :    nodes already processed so that a node reachable from several
                      :    parents is handled once.  A stmt is expected to be either not yet
                      :    relevant or already vect_used_in_scope; anything else asserts.
                      :    The single-argument overload below starts the walk with an empty
                      :    VISITED set.  */
    3600              : 
    3601              : static void
    3602      2403231 : vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
    3603              : {
    3604      2403231 :   int i;
    3605      2403231 :   stmt_vec_info stmt_info;
    3606      2403231 :   slp_tree child;
    3607              : 
    3608      2403231 :   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    3609              :     return;
    3610              : 
    3611      1432140 :   if (visited.add (node))
    3612              :     return;
    3613              : 
    3614      4298133 :   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    3615      3002514 :     if (stmt_info)
    3616              :       {
    3617      3002514 :         gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
    3618              :                     || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
    3619      3002514 :         STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
    3620              :       }
    3621              : 
    3622      2918644 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    3623      1623025 :     if (child)
    3624      1623025 :       vect_mark_slp_stmts_relevant (child, visited);
    3625              : }
    3626              : 
    3627              : static void
    3628       780206 : vect_mark_slp_stmts_relevant (slp_tree node)
    3629              : {
    3630       780206 :   hash_set<slp_tree> visited;
    3631       780206 :   vect_mark_slp_stmts_relevant (node, visited);
    3632       780206 : }
    3633              : 
    3634              : 
    3635              : /* Gather the load nodes of the SLP graph rooted at NODE into LOADS.
                      :    A node is pushed when its def type is vect_internal_def, it is not
                      :    a permute node (SLP_TREE_PERMUTE_P is false) and its representative
                      :    stmt has a data reference that is a read.  A node is pushed before
                      :    its children are walked.  NODE may be NULL, in which case nothing
                      :    is done; VISITED makes sure each node is considered only once.  */
    3636              : 
    3637              : static void
    3638     10506174 : vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
    3639              :                        hash_set<slp_tree> &visited)
    3640              : {
    3641     10506174 :   if (!node || visited.add (node))
    3642      1734858 :     return;
    3643              : 
    3644      8771316 :   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    3645              :     return;
    3646              : 
    3647      6509904 :   if (!SLP_TREE_PERMUTE_P (node))
    3648              :     {
    3649      6304258 :       stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
    3650      6304258 :       if (STMT_VINFO_DATA_REF (stmt_info)
    3651      2752610 :           && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
    3652      1552607 :         loads.safe_push (node);
    3653              :     }
    3654              : 
    3655              :   unsigned i;
    3656              :   slp_tree child;
    3657     14794840 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    3658      8284936 :     vect_gather_slp_loads (loads, child, visited);
    3659              : }
    3660              : 
    3661              : 
    3662              : /* Find the last scalar stmt among the lanes of SLP NODE.  Each lane's
                      :    stmt is first replaced by its vect_orig_stmt and the later of two
                      :    candidates is chosen with get_later_stmt.  NULL lanes are skipped;
                      :    NULL is returned when NODE has no non-NULL scalar stmt.  */
    3663              : 
    3664              : stmt_vec_info
    3665      2726617 : vect_find_last_scalar_stmt_in_slp (slp_tree node)
    3666              : {
    3667      2726617 :   stmt_vec_info last = NULL;
    3668      2726617 :   stmt_vec_info stmt_vinfo;
    3669              : 
    3670      9941361 :   for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
    3671      7214744 :     if (stmt_vinfo)
    3672              :       {
    3673      7214744 :         stmt_vinfo = vect_orig_stmt (stmt_vinfo);
    3674      7214744 :         last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
    3675              :       }
    3676              : 
    3677      2726617 :   return last;
    3678              : }
    3679              : 
    3680              : /* Find the first scalar stmt among the lanes of SLP NODE.  Each lane's
                      :    stmt is first replaced by its vect_orig_stmt; the current candidate
                      :    is replaced whenever get_later_stmt reports the candidate as the
                      :    later of the two.  NULL lanes are skipped; NULL is returned when
                      :    NODE has no non-NULL scalar stmt.  */
    3681              : 
    3682              : stmt_vec_info
    3683       527585 : vect_find_first_scalar_stmt_in_slp (slp_tree node)
    3684              : {
    3685       527585 :   stmt_vec_info first = NULL;
    3686       527585 :   stmt_vec_info stmt_vinfo;
    3687              : 
    3688      1790377 :   for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
    3689      1262792 :     if (stmt_vinfo)
    3690              :       {
    3691      1260098 :         stmt_vinfo = vect_orig_stmt (stmt_vinfo);
    3692      1260098 :         if (!first
    3693      1260098 :             || get_later_stmt (stmt_vinfo, first) == first)
    3694              :           first = stmt_vinfo;
    3695              :       }
    3696              : 
    3697       527585 :   return first;
    3698              : }
    3699              : 
    3700              : /* Splits a group of stores, currently beginning at FIRST_VINFO, into
    3701              :    two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
    3702              :    (also containing the first GROUP1_SIZE stmts, since stores are
    3703              :    consecutive), the second containing the remainder.
    3704              :    Return the first stmt in the second group.
                      :    FIRST_VINFO must be the first element of its group and both
                      :    resulting groups must be non-empty, i.e.
                      :    0 < GROUP1_SIZE < DR_GROUP_SIZE (FIRST_VINFO).  Every element
                      :    following FIRST_VINFO is asserted to have a DR_GROUP_GAP of 1 on
                      :    entry.  */
    3705              : 
    3706              : static stmt_vec_info
    3707       156668 : vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
    3708              : {
    3709       156668 :   gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
    3710       156668 :   gcc_assert (group1_size > 0);
    3711       156668 :   int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
    3712       156668 :   gcc_assert (group2_size > 0);
    3713       156668 :   DR_GROUP_SIZE (first_vinfo) = group1_size;
    3714              : 
    3715       156668 :   stmt_vec_info stmt_info = first_vinfo;
    3716       526293 :   for (unsigned i = group1_size; i > 1; i--)
    3717              :     {
    3718       369625 :       stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
    3719       369625 :       gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
    3720              :     }
    3721              :   /* STMT_INFO is now the last element of the first group.  */
    3722       156668 :   stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
    3723       156668 :   DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
    3724              : 
    3725       156668 :   DR_GROUP_SIZE (group2) = group2_size;
    3726       438742 :   for (stmt_info = group2; stmt_info;
    3727       282074 :        stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
    3728              :     {
    3729       282074 :       DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
    3730       282074 :       gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
    3731              :     }
    3732              : 
    3733              :   /* For the second group, the DR_GROUP_GAP is that before the original group,
    3734              :      plus skipping over the first vector.  */
    3735       156668 :   DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
    3736              : 
    3737              :   /* DR_GROUP_GAP of the first group now has to skip over the second group too.  */
    3738       156668 :   DR_GROUP_GAP (first_vinfo) += group2_size;
    3739              : 
    3740       156668 :   if (dump_enabled_p ())
    3741           70 :     dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
    3742              :                      group1_size, group2_size);
    3743              : 
    3744       156668 :   return group2;
    3745              : }
    3746              : 
    3747              : /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
    3748              :    statements and a vector of NUNITS elements.  The result is
                      :    common_multiple (NUNITS, GROUP_SIZE) divided exactly by
                      :    GROUP_SIZE.  */
    3749              : 
    3750              : static poly_uint64
    3751      4156729 : calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
    3752              : {
    3753      4156729 :   return exact_div (common_multiple (nunits, group_size), group_size);
    3754              : }
    3755              : 
    3756              : /* Helper that checks to see if a node is a load node.  ROOT qualifies
                      :    when it is not a permute node, its def type is vect_internal_def
                      :    and its representative stmt is a grouped access whose data
                      :    reference is a read.  */
    3757              : 
    3758              : static inline bool
    3759          102 : vect_is_slp_load_node  (slp_tree root)
    3760              : {
    3761          102 :   return (!SLP_TREE_PERMUTE_P (root)
    3762          102 :           && SLP_TREE_DEF_TYPE (root) == vect_internal_def
    3763           96 :           && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
    3764          166 :           && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root))));
    3765              : }
    3766              : 
    3767              : 
    3768              : /* Helper function of optimize_load_redistribution that performs the operation
    3769              :    recursively.
                      : 
                      :    Returns the node ROOT should be replaced with, or NULL if there is
                      :    none.  LOAD_MAP caches that answer per node and is consulted first.
                      :    NULL and non-internal nodes are never replaced.  For a permute node
                      :    every lane of the lane permutation must select from a child that is
                      :    a load node without children; the selected scalar stmts are then
                      :    handed to vect_build_slp_tree and the result (possibly NULL) is
                      :    recorded in LOAD_MAP and returned.  In all other cases NULL is
                      :    recorded for ROOT, the children are processed recursively and each
                      :    child for which a replacement was found is swapped for it in place,
                      :    with the old child passed to vect_free_slp_tree.  */
    3770              : 
    3771              : static slp_tree
    3772        18862 : optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
    3773              :                                 vec_info *vinfo, unsigned int group_size,
    3774              :                                 hash_map<slp_tree, slp_tree> *load_map,
    3775              :                                 slp_tree root)
    3776              : {
    3777        18862 :   if (slp_tree *leader = load_map->get (root))
    3778         3517 :     return *leader;
    3779              : 
    3780        15345 :   slp_tree node;
    3781        15345 :   unsigned i;
    3782              : 
    3783              :   /* For now, we don't know anything about externals so do not do anything.  */
    3784        15345 :   if (!root || SLP_TREE_DEF_TYPE (root) != vect_internal_def)
    3785              :     return NULL;
    3786        11386 :   else if (SLP_TREE_PERMUTE_P (root))
    3787              :     {
    3788              :       /* First convert this node into a load node and add it to the leaves
    3789              :          list and flatten the permute from a lane to a load one.  If it's
    3790              :          unneeded it will be elided later.  */
    3791           70 :       vec<stmt_vec_info> stmts;
    3792           70 :       stmts.create (SLP_TREE_LANES (root));
    3793           70 :       lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
    3794          134 :       for (unsigned j = 0; j < lane_perm.length (); j++)
    3795              :         {
    3796          102 :           std::pair<unsigned, unsigned> perm = lane_perm[j];
    3797          102 :           node = SLP_TREE_CHILDREN (root)[perm.first];
    3798              : 
    3799          102 :           if (!vect_is_slp_load_node (node)
    3800          102 :               || SLP_TREE_CHILDREN (node).exists ())
    3801              :             {
    3802           38 :               stmts.release ();
    3803           38 :               goto next;
    3804              :             }
    3805              : 
    3806           64 :           stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
    3807              :         }
    3808              : 
    3809           32 :       if (dump_enabled_p ())
    3810            0 :         dump_printf_loc (MSG_NOTE, vect_location,
    3811              :                          "converting stmts on permute node %p\n",
    3812              :                          (void *) root);
    3813              : 
    3814           32 :       bool *matches = XALLOCAVEC (bool, group_size);
    3815           32 :       poly_uint64 max_nunits = 1;
    3816           32 :       unsigned tree_size = 0, limit = 1;
    3817           32 :       node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
    3818              :                                   matches, &limit, &tree_size, bst_map);
    3819           32 :       if (!node)
    3820            0 :         stmts.release ();
    3821              : 
    3822           32 :       load_map->put (root, node);
    3823           32 :       return node;
    3824              :     }
    3825              : 
    3826        11316 : next:
    3827        11354 :   load_map->put (root, NULL);
    3828              : 
    3829        26495 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
    3830              :     {
    3831        15141 :       slp_tree value
    3832        15141 :         = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
    3833              :                                           node);
    3834        15141 :       if (value)
    3835              :         {
    3836           32 :           SLP_TREE_REF_COUNT (value)++;
    3837           32 :           SLP_TREE_CHILDREN (root)[i] = value;
    3838              :           /* ???  We know the original leafs of the replaced nodes will
    3839              :              be referenced by bst_map, only the permutes created by
    3840              :              pattern matching are not.  */
    3841           32 :           if (SLP_TREE_REF_COUNT (node) == 1)
    3842           32 :             load_map->remove (node);
    3843           32 :           vect_free_slp_tree (node);
    3844              :         }
    3845              :     }
    3846              : 
    3847              :   return NULL;
    3848              : }
    3849              : 
    3850              : /* Temporary workaround for loads not being CSEd during SLP build.  This
    3851              :    function will traverse the SLP tree rooted in ROOT and find
    3852              :    VEC_PERM nodes that blend vectors from multiple nodes that all read from the
    3853              :    same DR such that the final operation is equal to a permuted load.  Such
    3854              :    NODES are then directly converted into LOADS themselves.  The nodes are
    3855              :    CSEd using BST_MAP.  Only the descendants of ROOT are candidates for
                      :    replacement, ROOT itself is left as is.  LOAD_MAP holds the
                      :    per-node results computed by optimize_load_redistribution_1.  */
    3856              : 
    3857              : static void
    3858         2838 : optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
    3859              :                               vec_info *vinfo, unsigned int group_size,
    3860              :                               hash_map<slp_tree, slp_tree> *load_map,
    3861              :                               slp_tree root)
    3862              : {
    3863         2838 :   slp_tree node;
    3864         2838 :   unsigned i;
    3865              : 
    3866         6559 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
    3867              :     {
    3868         3721 :       slp_tree value
    3869         3721 :         = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
    3870              :                                           node);
    3871         3721 :       if (value)
    3872              :         {
    3873            0 :           SLP_TREE_REF_COUNT (value)++;
    3874            0 :           SLP_TREE_CHILDREN (root)[i] = value;
    3875              :           /* ???  We know the original leafs of the replaced nodes will
    3876              :              be referenced by bst_map, only the permutes created by
    3877              :              pattern matching are not.  */
    3878            0 :           if (SLP_TREE_REF_COUNT (node) == 1)
    3879            0 :             load_map->remove (node);
    3880            0 :           vect_free_slp_tree (node);
    3881              :         }
    3882              :     }
    3883         2838 : }
    3884              : 
    3885              : /* Helper function of vect_match_slp_patterns.
    3886              : 
    3887              :    Attempts to match patterns against the slp tree rooted in REF_NODE using
    3888              :    VINFO.  Patterns are matched in post-order traversal.
    3889              : 
    3890              :    Every matcher in slp_patterns is tried on each node and is handed
    3891              :    REF_NODE, so the value in REF_NODE may be updated by a match.  A
                      :    matcher that returns a pattern has it built with VINFO and then
                      :    deleted.  VISITED ensures a node is processed only once.
                      : 
                      :    Returns true if at least one pattern matched for *REF_NODE or any
                      :    of its descendants, false otherwise (including for a NULL or an
                      :    already visited node).  */
    3892              : 
    3893              : static bool
    3894      6079507 : vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
    3895              :                            slp_tree_to_load_perm_map_t *perm_cache,
    3896              :                            slp_compat_nodes_map_t *compat_cache,
    3897              :                            hash_set<slp_tree> *visited)
    3898              : {
    3899      6079507 :   unsigned i;
    3900      6079507 :   slp_tree node = *ref_node;
    3901      6079507 :   bool found_p = false;
    3902      6079507 :   if (!node || visited->add (node))
    3903       870089 :     return false;
    3904              : 
    3905              :   slp_tree child;
    3906      9740484 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    3907      4531066 :     found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
    3908              :                                           vinfo, perm_cache, compat_cache,
    3909              :                                           visited);
    3910              : 
    3911     15628254 :   for (unsigned x = 0; x < num__slp_patterns; x++)
    3912              :     {
    3913     10418836 :       vect_pattern *pattern
    3914     10418836 :         = slp_patterns[x] (perm_cache, compat_cache, ref_node);
    3915     10418836 :       if (pattern)
    3916              :         {
    3917         1107 :           pattern->build (vinfo);
    3918         1107 :           delete pattern;
    3919         1107 :           found_p = true;
    3920              :         }
    3921              :     }
    3922              : 
    3923              :   return found_p;
    3924              : }
    3925              : 
    3926              : /* Applies pattern matching to the SLP tree of INSTANCE using
    3927              :    vec_info VINFO.
    3928              : 
    3929              :    Patterns are tried in order and multiple patterns may match.  Returns
    3930              :    true if any pattern matched.  The matchers are given a reference to
                      :    SLP_INSTANCE_TREE (INSTANCE); VISITED, PERM_CACHE and COMPAT_CACHE
                      :    are passed through to vect_match_slp_patterns_2.  */
    3931              : 
    3932              : static bool
    3933      1548441 : vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
    3934              :                          hash_set<slp_tree> *visited,
    3935              :                          slp_tree_to_load_perm_map_t *perm_cache,
    3936              :                          slp_compat_nodes_map_t *compat_cache)
    3937              : {
    3938      1548441 :   DUMP_VECT_SCOPE ("vect_match_slp_patterns");
    3939      1548441 :   slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
    3940              : 
    3941      1548441 :   if (dump_enabled_p ())
    3942        30574 :     dump_printf_loc (MSG_NOTE, vect_location,
    3943              :                      "Analyzing SLP tree %p for patterns\n",
    3944        30574 :                      (void *) SLP_INSTANCE_TREE (instance));
    3945              : 
    3946      1548441 :   return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, compat_cache,
    3947      1548441 :                                     visited);
    3948              : }
    3949              : 
    3950              : /* STMT_INFO is a store group of size GROUP_SIZE that we are considering
    3951              :    vectorizing with VECTYPE that might be NULL.  MASKED_P indicates whether
    3952              :    the stores are masked.  NEW_GROUP_SIZE is the size of one of the two
                      :    groups a split would create, the other one having
                      :    GROUP_SIZE - NEW_GROUP_SIZE elements.
    3953              :    Return true if we could use IFN_STORE_LANES instead and if that appears
    3954              :    to be the better approach.  When VECTYPE is NULL it is derived from the
                      :    scalar type of STMT_INFO's data reference; false is returned if
                      :    no vector type is found.  */
    3955              : 
    3956              : static bool
    3957         6115 : vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
    3958              :                                tree vectype, bool masked_p,
    3959              :                                unsigned int group_size,
    3960              :                                unsigned int new_group_size)
    3961              : {
    3962         6115 :   if (!vectype)
    3963              :     {
    3964         6115 :       tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
    3965         6115 :       vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
    3966              :     }
    3967         6115 :   if (!vectype)
    3968              :     return false;
    3969              :   /* Allow the split if one of the two new groups would operate on full
    3970              :      vectors *within* rather than across one scalar loop iteration.
    3971              :      This is purely a heuristic, but it should work well for group
    3972              :      sizes of 3 and 4, where the possible splits are:
    3973              : 
    3974              :        3->2+1:  OK if the vector has exactly two elements
    3975              :        4->2+2:  Likewise
    3976              :        4->3+1:  Less clear-cut.  */
    3977         6115 :   if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
    3978         3446 :       || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
    3979         2692 :     return false;
    3980         3423 :   return vect_store_lanes_supported (vectype, group_size, masked_p) != IFN_LAST;
    3981              : }
    3982              : 
    3983              : /* Analyze an SLP instance starting from a group of grouped stores.  Call
    3984              :    vect_build_slp_tree to build a tree of packed stmts if possible.
    3985              :    Return FALSE if it's impossible to SLP any stmt in the loop.
                      :    This is a forward declaration only; the definition is not part
                      :    of this section of the file.  */
    3986              : 
    3987              : static bool
    3988              : vect_analyze_slp_instance (vec_info *vinfo,
    3989              :                            scalar_stmts_to_slp_tree_map_t *bst_map,
    3990              :                            stmt_vec_info stmt_info, slp_instance_kind kind,
    3991              :                            unsigned max_tree_size, unsigned *limit,
    3992              :                            bool force_single_lane);
    3993              : 
    3994              : /* Build an interleaving scheme for the store sources RHS_NODES from
    3995              :    SCALAR_STMTS.  */
    3996              : 
    3997              : static slp_tree
    3998         8041 : vect_build_slp_store_interleaving (vec<slp_tree> &rhs_nodes,
    3999              :                                    vec<stmt_vec_info> &scalar_stmts,
    4000              :                                    poly_uint64 max_nunits)
    4001              : {
    4002         8041 :   unsigned int group_size = scalar_stmts.length ();
    4003        16082 :   slp_tree node = vect_create_new_slp_node (scalar_stmts,
    4004         8041 :                                             SLP_TREE_CHILDREN
    4005              :                                               (rhs_nodes[0]).length ());
    4006         8041 :   SLP_TREE_VECTYPE (node) = SLP_TREE_VECTYPE (rhs_nodes[0]);
    4007         8041 :   node->max_nunits = max_nunits;
    4008         8041 :   for (unsigned l = 0;
    4009        16109 :        l < SLP_TREE_CHILDREN (rhs_nodes[0]).length (); ++l)
    4010              :     {
    4011              :       /* And a permute merging all RHS SLP trees.  */
    4012         8068 :       slp_tree perm = vect_create_new_slp_node (rhs_nodes.length (),
    4013         8068 :                                                 VEC_PERM_EXPR);
    4014         8068 :       SLP_TREE_CHILDREN (node).quick_push (perm);
    4015         8068 :       SLP_TREE_LANE_PERMUTATION (perm).create (group_size);
    4016         8068 :       SLP_TREE_VECTYPE (perm) = SLP_TREE_VECTYPE (node);
    4017         8068 :       perm->max_nunits = max_nunits;
    4018         8068 :       SLP_TREE_LANES (perm) = group_size;
    4019              :       /* ???  We should set this NULL but that's not expected.  */
    4020         8068 :       SLP_TREE_REPRESENTATIVE (perm)
    4021         8068 :         = SLP_TREE_REPRESENTATIVE (SLP_TREE_CHILDREN (rhs_nodes[0])[l]);
    4022        31405 :       for (unsigned j = 0; j < rhs_nodes.length (); ++j)
    4023              :         {
    4024        23337 :           SLP_TREE_CHILDREN (perm)
    4025        23337 :             .quick_push (SLP_TREE_CHILDREN (rhs_nodes[j])[l]);
    4026        23337 :           SLP_TREE_CHILDREN (rhs_nodes[j])[l]->refcnt++;
    4027        23337 :           for (unsigned k = 0;
    4028        48990 :                k < SLP_TREE_SCALAR_STMTS (rhs_nodes[j]).length (); ++k)
    4029              :             {
    4030              :               /* ???  We should populate SLP_TREE_SCALAR_STMTS
    4031              :                  or SLP_TREE_SCALAR_OPS but then we might have
    4032              :                  a mix of both in our children.  */
    4033        25653 :               SLP_TREE_LANE_PERMUTATION (perm)
    4034        25653 :                 .quick_push (std::make_pair (j, k));
    4035              :             }
    4036              :         }
    4037              : 
    4038              :       /* Now we have a single permute node but we cannot code-generate
    4039              :          the case with more than two inputs.
    4040              :          Perform pairwise reduction, reducing the two inputs
    4041              :          with the least number of lanes to one and then repeat until
    4042              :          we end up with two inputs.  That scheme makes sure we end
    4043              :          up with permutes satisfying the restriction of requiring at
    4044              :          most two vector inputs to produce a single vector output
    4045              :          when the number of lanes is even.  */
    4046        15269 :       while (SLP_TREE_CHILDREN (perm).length () > 2)
    4047              :         {
    4048              :           /* When we have three equal sized groups left the pairwise
    4049              :              reduction does not result in a scheme that avoids using
    4050              :              three vectors.  Instead merge the first two groups
    4051              :              to the final size with do-not-care elements (chosen
    4052              :              from the first group) and then merge with the third.
    4053              :                   { A0, B0,  x, A1, B1,  x, ... }
    4054              :                -> { A0, B0, C0, A1, B1, C1, ... }
    4055              :              This handles group size of three (and at least
    4056              :              power-of-two multiples of that).  */
    4057         7201 :           if (SLP_TREE_CHILDREN (perm).length () == 3
    4058         3291 :               && (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[0])
    4059         3291 :                   == SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[1]))
    4060         7201 :               && (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[0])
    4061         2459 :                   == SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[2])))
    4062              :             {
    4063         2153 :               int ai = 0;
    4064         2153 :               int bi = 1;
    4065         2153 :               slp_tree a = SLP_TREE_CHILDREN (perm)[ai];
    4066         2153 :               slp_tree b = SLP_TREE_CHILDREN (perm)[bi];
    4067         2153 :               unsigned n = SLP_TREE_LANES (perm);
    4068              : 
    4069         2153 :               slp_tree permab = vect_create_new_slp_node (2, VEC_PERM_EXPR);
    4070         2153 :               SLP_TREE_LANES (permab) = n;
    4071         2153 :               SLP_TREE_LANE_PERMUTATION (permab).create (n);
    4072         2153 :               SLP_TREE_VECTYPE (permab) = SLP_TREE_VECTYPE (perm);
    4073         2153 :               permab->max_nunits = max_nunits;
    4074              :               /* ???  Should be NULL but that's not expected.  */
    4075         2153 :               SLP_TREE_REPRESENTATIVE (permab) = SLP_TREE_REPRESENTATIVE (perm);
    4076         2153 :               SLP_TREE_CHILDREN (permab).quick_push (a);
    4077         4320 :               for (unsigned k = 0; k < SLP_TREE_LANES (a); ++k)
    4078         2167 :                 SLP_TREE_LANE_PERMUTATION (permab)
    4079         2167 :                   .quick_push (std::make_pair (0, k));
    4080         2153 :               SLP_TREE_CHILDREN (permab).quick_push (b);
    4081         4320 :               for (unsigned k = 0; k < SLP_TREE_LANES (b); ++k)
    4082         2167 :                 SLP_TREE_LANE_PERMUTATION (permab)
    4083         2167 :                   .quick_push (std::make_pair (1, k));
    4084              :               /* Push the do-not-care lanes.  */
    4085         4320 :               for (unsigned k = 0; k < SLP_TREE_LANES (a); ++k)
    4086         2167 :                 SLP_TREE_LANE_PERMUTATION (permab)
    4087         2167 :                   .quick_push (std::make_pair (0, k));
    4088              : 
    4089              :               /* Put the merged node into 'perm', in place of a.  */
    4090         2153 :               SLP_TREE_CHILDREN (perm)[ai] = permab;
    4091              :               /* Adjust the references to b in the permutation
    4092              :                  of perm and to the later children which we'll
    4093              :                  remove.  */
    4094         8654 :               for (unsigned k = 0; k < SLP_TREE_LANES (perm); ++k)
    4095              :                 {
    4096         6501 :                   std::pair<unsigned, unsigned> &p
    4097         6501 :                     = SLP_TREE_LANE_PERMUTATION (perm)[k];
    4098         6501 :                   if (p.first == (unsigned) bi)
    4099              :                     {
    4100         2167 :                       p.first = ai;
    4101         2167 :                       p.second += SLP_TREE_LANES (a);
    4102              :                     }
    4103         4334 :                   else if (p.first > (unsigned) bi)
    4104         2167 :                     p.first--;
    4105              :                 }
    4106         2153 :               SLP_TREE_CHILDREN (perm).ordered_remove (bi);
    4107         2153 :               break;
    4108              :             }
    4109              : 
    4110              :           /* Pick the two nodes with the least number of lanes,
    4111              :              prefer the earliest candidate and maintain ai < bi.  */
    4112              :           int ai = -1;
    4113              :           int bi = -1;
    4114        45840 :           for (unsigned ci = 0; ci < SLP_TREE_CHILDREN (perm).length (); ++ci)
    4115              :             {
    4116        40792 :               if (ai == -1)
    4117         5048 :                 ai = ci;
    4118        35744 :               else if (bi == -1)
    4119         5048 :                 bi = ci;
    4120        30696 :               else if ((SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ci])
    4121        30696 :                         < SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ai]))
    4122        30696 :                        || (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ci])
    4123        25256 :                            < SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[bi])))
    4124              :                 {
    4125        11768 :                   if (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ai])
    4126         5884 :                       <= SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[bi]))
    4127         2727 :                     bi = ci;
    4128              :                   else
    4129              :                     {
    4130         3157 :                       ai = bi;
    4131         3157 :                       bi = ci;
    4132              :                     }
    4133              :                 }
    4134              :             }
    4135              : 
    4136              :           /* Produce a merge of nodes ai and bi.  */
    4137         5048 :           slp_tree a = SLP_TREE_CHILDREN (perm)[ai];
    4138         5048 :           slp_tree b = SLP_TREE_CHILDREN (perm)[bi];
    4139         5048 :           unsigned n = SLP_TREE_LANES (a) + SLP_TREE_LANES (b);
    4140         5048 :           slp_tree permab = vect_create_new_slp_node (2, VEC_PERM_EXPR);
    4141         5048 :           SLP_TREE_LANES (permab) = n;
    4142         5048 :           SLP_TREE_LANE_PERMUTATION (permab).create (n);
    4143         5048 :           SLP_TREE_VECTYPE (permab) = SLP_TREE_VECTYPE (perm);
    4144         5048 :           permab->max_nunits = max_nunits;
    4145              :           /* ???  Should be NULL but that's not expected.  */
    4146         5048 :           SLP_TREE_REPRESENTATIVE (permab) = SLP_TREE_REPRESENTATIVE (perm);
    4147         5048 :           SLP_TREE_CHILDREN (permab).quick_push (a);
    4148        13340 :           for (unsigned k = 0; k < SLP_TREE_LANES (a); ++k)
    4149         8292 :             SLP_TREE_LANE_PERMUTATION (permab)
    4150         8292 :               .quick_push (std::make_pair (0, k));
    4151         5048 :           SLP_TREE_CHILDREN (permab).quick_push (b);
    4152        12664 :           for (unsigned k = 0; k < SLP_TREE_LANES (b); ++k)
    4153         7616 :             SLP_TREE_LANE_PERMUTATION (permab)
    4154         7616 :               .quick_push (std::make_pair (1, k));
    4155              : 
    4156              :           /* Put the merged node into 'perm', in place of a.  */
    4157         5048 :           SLP_TREE_CHILDREN (perm)[ai] = permab;
    4158              :           /* Adjust the references to b in the permutation
    4159              :              of perm and to the later children which we'll
    4160              :              remove.  */
    4161        73221 :           for (unsigned k = 0; k < SLP_TREE_LANES (perm); ++k)
    4162              :             {
    4163        68173 :               std::pair<unsigned, unsigned> &p
    4164        68173 :                 = SLP_TREE_LANE_PERMUTATION (perm)[k];
    4165        68173 :               if (p.first == (unsigned) bi)
    4166              :                 {
    4167         7616 :                   p.first = ai;
    4168         7616 :                   p.second += SLP_TREE_LANES (a);
    4169              :                 }
    4170        60557 :               else if (p.first > (unsigned) bi)
    4171        25450 :                 p.first--;
    4172              :             }
    4173         5048 :           SLP_TREE_CHILDREN (perm).ordered_remove (bi);
    4174              :         }
    4175              :     }
    4176              : 
    4177         8041 :   return node;
    4178              : }
    4179              : 
    4180              : /* Run SLP discovery on SCALAR_STMTS, a group of KIND, and on success push
    4181              :    a new instance to VINFO->slp_instances and return true; return false if
    4182              :    *LIMIT is zero or discovery fails.  SCALAR_STMTS is owned by this function,
    4183              :    REMAIN and ROOT_STMT_INFOS ownership goes back to the caller upon failure.  */
    4184              : 
    4185              : static bool
    4186      1899689 : vect_build_slp_instance (vec_info *vinfo,
    4187              :                          slp_instance_kind kind,
    4188              :                          vec<stmt_vec_info> &scalar_stmts,
    4189              :                          vec<stmt_vec_info> &root_stmt_infos,
    4190              :                          vec<tree> &remain,
    4191              :                          unsigned max_tree_size, unsigned *limit,
    4192              :                          scalar_stmts_to_slp_tree_map_t *bst_map,
    4193              :                          bool force_single_lane)
    4194              : {
    4195              :   /* No discovery budget left in *LIMIT: drop SCALAR_STMTS and fail.  */
    4196      1899689 :   if (*limit == 0)
    4197              :     {
    4198        22260 :       scalar_stmts.release ();
    4199        22260 :       return false;
    4200              :     }
    4201              : 
    4202      1877429 :   if (kind == slp_inst_kind_ctor)
    4203              :     {
    4204        13097 :       if (dump_enabled_p ())
    4205           86 :         dump_printf_loc (MSG_NOTE, vect_location,
    4206              :                          "Analyzing vectorizable constructor: %G\n",
    4207           43 :                          root_stmt_infos[0]->stmt);
    4208              :     }
    4209      1864332 :   else if (kind == slp_inst_kind_gcond)
    4210              :     {
    4211       277494 :       if (dump_enabled_p ())
    4212         5720 :         dump_printf_loc (MSG_NOTE, vect_location,
    4213              :                          "Analyzing vectorizable control flow: %G",
    4214         2860 :                          root_stmt_infos[0]->stmt);
    4215              :     }
    4216              : 
    4217      1877429 :   if (dump_enabled_p ())
    4218              :     {
    4219        25587 :       dump_printf_loc (MSG_NOTE, vect_location,
    4220              :                        "Starting SLP discovery for\n");
    4221        54634 :       for (unsigned i = 0; i < scalar_stmts.length (); ++i)
    4222        58094 :         dump_printf_loc (MSG_NOTE, vect_location,
    4223        29047 :                          "  %G", scalar_stmts[i]->stmt);
    4224              :     }
    4225              : 
    4226              :   /* Build the tree for the SLP instance.  */
    4227      1877429 :   unsigned int group_size = scalar_stmts.length ();
    4228      1877429 :   bool *matches = XALLOCAVEC (bool, group_size);
    4229      1877429 :   poly_uint64 max_nunits = 1;
    4230      1877429 :   unsigned tree_size = 0;
    4231              : 
    4232      1877429 :   slp_tree node = NULL;
    4233      1877429 :   if (group_size > 1 && force_single_lane)
    4234              :     {  /* Skip discovery; NODE stays NULL and we fail below.  */
    4235            0 :       matches[0] = true;
    4236            0 :       matches[1] = false;
    4237              :     }
    4238              :   else
    4239      1877429 :     node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
    4240              :                                 &max_nunits, matches, limit,
    4241              :                                 &tree_size, bst_map);
    4242      1877429 :   if (node != NULL)
    4243              :     {
    4244              :       /* Calculate the unrolling factor based on the smallest type.  */
    4245       762507 :       poly_uint64 unrolling_factor
    4246       762507 :         = calculate_unrolling_factor (max_nunits, group_size);
    4247              : 
    4248       762507 :       if (maybe_ne (unrolling_factor, 1U)
    4249       762507 :           && is_a <bb_vec_info> (vinfo))
    4250              :         {
    4251            0 :           unsigned HOST_WIDE_INT const_max_nunits;
    4252            0 :           if (!max_nunits.is_constant (&const_max_nunits)
    4253            0 :               || const_max_nunits > group_size)
    4254              :             {
    4255            0 :               if (dump_enabled_p ())
    4256            0 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    4257              :                                  "Build SLP failed: store group "
    4258              :                                  "size not a multiple of the vector size "
    4259              :                                  "in basic block SLP\n");
    4260            0 :               vect_free_slp_tree (node);
    4261            0 :               return false;
    4262              :             }
    4263              :           /* Fatal mismatch at the last multiple of const_max_nunits.  */
    4264            0 :           if (dump_enabled_p ())
    4265            0 :             dump_printf_loc (MSG_NOTE, vect_location,
    4266              :                              "SLP discovery succeeded but node needs "
    4267              :                              "splitting\n");
    4268            0 :           memset (matches, true, group_size);
    4269            0 :           matches[group_size / const_max_nunits * const_max_nunits] = false;
    4270            0 :           vect_free_slp_tree (node);
    4271              :         }
    4272              :       else
    4273              :         {
    4274              :           /* Create a new SLP instance.  */
    4275       762507 :           slp_instance new_instance = XNEW (class _slp_instance);
    4276       762507 :           SLP_INSTANCE_TREE (new_instance) = node;
    4277       762507 :           SLP_INSTANCE_LOADS (new_instance) = vNULL;
    4278       762507 :           SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
    4279       762507 :           SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
    4280       762507 :           SLP_INSTANCE_KIND (new_instance) = kind;
    4281       762507 :           new_instance->reduc_phis = NULL;
    4282       762507 :           new_instance->cost_vec = vNULL;
    4283       762507 :           new_instance->subgraph_entries = vNULL;
    4284              : 
    4285       762507 :           if (dump_enabled_p ())
    4286        22531 :             dump_printf_loc (MSG_NOTE, vect_location,
    4287              :                              "SLP size %u vs. limit %u.\n",
    4288              :                              tree_size, max_tree_size);
    4289              : 
    4290       762507 :           vinfo->slp_instances.safe_push (new_instance);
    4291              : 
    4292              :           /* ???  We've replaced the old SLP_INSTANCE_GROUP_SIZE with
    4293              :              the number of scalar stmts in the root in a few places.
    4294              :              Verify that assumption holds.  */
    4295      1525014 :           gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
    4296              :                         .length () == group_size);
    4297              : 
    4298       762507 :           if (dump_enabled_p ())
    4299              :             {
    4300        22531 :               if (kind == slp_inst_kind_reduc_group)
    4301         1455 :                 dump_printf_loc (MSG_NOTE, vect_location,
    4302              :                                  "SLP discovery of size %d reduction group "
    4303              :                                  "succeeded\n", group_size);
    4304        22531 :               dump_printf_loc (MSG_NOTE, vect_location,
    4305              :                                "Final SLP tree for instance %p:\n",
    4306              :                                (void *) new_instance);
    4307        22531 :               vect_print_slp_graph (MSG_NOTE, vect_location,
    4308              :                                     SLP_INSTANCE_TREE (new_instance));
    4309              :             }
    4310              : 
    4311       762507 :           return true;
    4312              :         }
    4313              :     }
    4314              :   /* Failed to SLP.  */
    4315              : 
    4316              :   /* While we arrive here even with slp_inst_kind_store we should only
    4317              :      do so for group_size == 1.  The code to split store groups is only in
    4318              :      vect_analyze_slp_instance now.  */
    4319      1114922 :   gcc_assert (kind != slp_inst_kind_store || group_size == 1);
    4320              : 
    4321              :   /* Free SCALAR_STMTS, which this function owns.  */
    4322      1114922 :   scalar_stmts.release ();
    4323              : 
    4324              :   /* Note the failure when dumping is enabled.  */
    4325      1114922 :   if (dump_enabled_p ())
    4326         3056 :     dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
    4327              :   return false;
    4328              : }
    4329              : 
    4330              : /* Analyze an SLP instance starting from the start of a reduction chain.
    4331              :    Call vect_build_slp_tree to build a tree of packed stmts if possible.
    4332              :    Return FALSE if SLP build fails.  */
    4333              : 
    4334              : static bool
    4335        73013 : vect_analyze_slp_reduc_chain (loop_vec_info vinfo,
    4336              :                               scalar_stmts_to_slp_tree_map_t *bst_map,
    4337              :                               stmt_vec_info scalar_stmt,
    4338              :                               unsigned max_tree_size, unsigned *limit)
    4339              : {
    4340        73013 :   vec<stmt_vec_info> scalar_stmts = vNULL;
    4341              : 
    4342        73013 :   bool fail = false;
    4343              :   /* ???  We could leave operation code checking to SLP discovery.  */
    4344        73013 :   code_helper code = STMT_VINFO_REDUC_CODE (STMT_VINFO_REDUC_DEF
    4345              :                                               (vect_orig_stmt (scalar_stmt)));
    4346        73013 :   bool first = true;
    4347        73013 :   stmt_vec_info next_stmt = scalar_stmt;
    4348        82978 :   do
    4349              :     {
    4350        82978 :       stmt_vec_info stmt = next_stmt;
    4351        82978 :       gimple_match_op op;
    4352        82978 :       if (!gimple_extract_op (STMT_VINFO_STMT (stmt), &op))
    4353            0 :         gcc_unreachable ();
    4354       165956 :       tree reduc_def = gimple_arg (STMT_VINFO_STMT (stmt),
    4355        82978 :                                    STMT_VINFO_REDUC_IDX (stmt));
    4356        82978 :       next_stmt = vect_stmt_to_vectorize (vinfo->lookup_def (reduc_def));
    4357        82978 :       gcc_assert (is_a <gphi *> (STMT_VINFO_STMT (next_stmt))
    4358              :                   || STMT_VINFO_REDUC_IDX (next_stmt) != -1);
    4359        88881 :       if (!gimple_extract_op (STMT_VINFO_STMT (vect_orig_stmt (stmt)), &op))
    4360            0 :         gcc_unreachable ();
    4361        82978 :       if (CONVERT_EXPR_CODE_P (op.code)
    4362         4917 :           && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0]))
    4363        87883 :           && (first
    4364         2440 :               || is_a <gphi *> (STMT_VINFO_STMT (next_stmt))))
    4365              :         ;
    4366        78077 :       else if (code != op.code)
    4367              :         {
    4368         2610 :           fail = true;
    4369         2610 :           break;
    4370              :         }
    4371              :       else
    4372        75467 :         scalar_stmts.safe_push (stmt);
    4373        80368 :       first = false;
    4374              :     }
    4375        80368 :   while (!is_a <gphi *> (STMT_VINFO_STMT (next_stmt)));
    4376        73013 :   if (fail)
    4377         2610 :     return false;
    4378              : 
    4379              :   /* Remember a stmt with the actual reduction operation.  */
    4380        70403 :   stmt_vec_info reduc_scalar_stmt = scalar_stmts[0];
    4381              : 
    4382              :   /* When the SSA def chain through reduc-idx does not form a natural
    4383              :      reduction chain try to linearize an associative operation manually.  */
    4384        70403 :   if (scalar_stmts.length () == 1
    4385        67733 :       && code.is_tree_code ()
    4386        61661 :       && associative_tree_code ((tree_code)code)
    4387              :       /* We may not associate if a fold-left reduction is required.  */
    4388       130646 :       && !needs_fold_left_reduction_p (TREE_TYPE (gimple_get_lhs
    4389              :                                                     (reduc_scalar_stmt->stmt)),
    4390              :                                        code))
    4391              :     {
    4392        57026 :       auto_vec<chain_op_t> chain;
    4393        57026 :       auto_vec<std::pair<tree_code, gimple *> > worklist;
    4394        57026 :       gimple *op_stmt = NULL, *other_op_stmt = NULL;
    4395        57026 :       if (is_a <gassign *> (scalar_stmts[0]->stmt)
    4396              :           /* We cannot linearize an operation that vect_slp_linearize_chain
    4397              :              would not put on its worklist.  */
    4398        57026 :           && gimple_assign_rhs_code (scalar_stmts[0]->stmt) == (tree_code)code)
    4399              :         {
    4400        56379 :           vect_slp_linearize_chain (vinfo, worklist, chain, (tree_code)code,
    4401        56379 :                                     scalar_stmts[0]->stmt, op_stmt,
    4402              :                                     other_op_stmt,
    4403              :                                     NULL);
    4404              : 
    4405        56379 :           scalar_stmts.truncate (0);
    4406        56379 :           stmt_vec_info tail = NULL;
    4407       282092 :           for (auto el : chain)
    4408              :             {
    4409       113312 :               if (el.dt == vect_external_def
    4410       113312 :                   || el.dt == vect_constant_def
    4411       113312 :                   || el.code != (tree_code) code)
    4412              :                 {
    4413          357 :                   scalar_stmts.release ();
    4414          357 :                   return false;
    4415              :                 }
    4416       112955 :               stmt_vec_info stmt = vinfo->lookup_def (el.op);
    4417       112955 :               if (STMT_VINFO_REDUC_IDX (stmt) != -1
    4418       110671 :                   || STMT_VINFO_REDUC_DEF (stmt))
    4419              :                 {
    4420        56255 :                   gcc_assert (tail == NULL);
    4421        56255 :                   tail = stmt;
    4422        56255 :                   continue;
    4423              :                 }
    4424        56700 :               scalar_stmts.safe_push (stmt);
    4425              :             }
    4426        56022 :           gcc_assert (tail);
    4427              :         }
    4428              : 
    4429              :       /* When this linearization didn't produce a chain see if stripping
    4430              :          a wrapping sign conversion produces one.  */
    4431        56669 :       if (scalar_stmts.length () == 1
    4432        56669 :           && (code == PLUS_EXPR || code == MULT_EXPR || code == BIT_IOR_EXPR
    4433              :               || code == BIT_AND_EXPR || code == BIT_XOR_EXPR))
    4434              :         {
    4435        54882 :           gimple *stmt = scalar_stmts[0]->stmt;
    4436        54882 :           if (!is_gimple_assign (stmt)
    4437        53716 :               || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt))
    4438         4579 :               || TREE_CODE (gimple_assign_rhs1 (stmt)) != SSA_NAME
    4439        59461 :               || !tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
    4440         4579 :                                          TREE_TYPE (gimple_assign_rhs1 (stmt))))
    4441              :             {
    4442        53127 :               scalar_stmts.release ();
    4443        53127 :               return false;
    4444              :             }
    4445         1755 :           stmt = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (stmt));
    4446         1755 :           if (!is_gimple_assign (stmt)
    4447         1755 :               || gimple_assign_rhs_code (stmt) != (tree_code)code)
    4448              :             {
    4449         1736 :               scalar_stmts.release ();
    4450         1736 :               return false;
    4451              :             }
    4452           19 :           chain.truncate (0);
    4453           19 :           vect_slp_linearize_chain (vinfo, worklist, chain, (tree_code)code,
    4454              :                                     stmt, op_stmt, other_op_stmt, NULL);
    4455              : 
    4456           19 :           scalar_stmts.truncate (0);
    4457           19 :           stmt_vec_info tail = NULL;
    4458           93 :           for (auto el : chain)
    4459              :             {
    4460           44 :               if (el.dt == vect_external_def
    4461           44 :                   || el.dt == vect_constant_def
    4462           44 :                   || el.code != (tree_code) code)
    4463              :                 {
    4464            8 :                   scalar_stmts.release ();
    4465            8 :                   return false;
    4466              :                 }
    4467           36 :               stmt_vec_info stmt = vinfo->lookup_def (el.op);
    4468           36 :               if (STMT_VINFO_REDUC_IDX (stmt) != -1
    4469           36 :                   || STMT_VINFO_REDUC_DEF (stmt))
    4470              :                 {
    4471            0 :                   gcc_assert (tail == NULL);
    4472            0 :                   tail = stmt;
    4473            0 :                   continue;
    4474              :                 }
    4475           36 :               scalar_stmts.safe_push (stmt);
    4476              :             }
    4477              :           /* Unlike the above this does not include the reduction SSA
    4478              :              cycle.  */
    4479           11 :           gcc_assert (!tail);
    4480              :         }
    4481              : 
    4482         1798 :       if (scalar_stmts.length () < 2)
    4483              :         {
    4484         1673 :           scalar_stmts.release ();
    4485         1673 :           return false;
    4486              :         }
    4487              : 
    4488          125 :       if (dump_enabled_p ())
    4489              :         {
    4490           34 :           dump_printf_loc (MSG_NOTE, vect_location,
    4491              :                            "Starting SLP discovery of reduction chain for\n");
    4492          140 :           for (unsigned i = 0; i < scalar_stmts.length (); ++i)
    4493          212 :             dump_printf_loc (MSG_NOTE, vect_location,
    4494          106 :                              "  %G", scalar_stmts[i]->stmt);
    4495              :         }
    4496              : 
    4497          125 :       unsigned int group_size = scalar_stmts.length ();
    4498          125 :       bool *matches = XALLOCAVEC (bool, group_size);
    4499          125 :       poly_uint64 max_nunits = 1;
    4500          125 :       unsigned tree_size = 0;
    4501          125 :       slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
    4502              :                                            &max_nunits, matches, limit,
    4503          125 :                                            &tree_size, bst_map);
    4504          125 :       if (!node)
    4505              :         {
    4506           47 :           scalar_stmts.release ();
    4507           47 :           return false;
    4508              :         }
    4509              : 
    4510           78 :       unsigned cycle_id = vinfo->reduc_infos.length ();
    4511           78 :       vect_reduc_info reduc_info = new vect_reduc_info_s ();
    4512           78 :       vinfo->reduc_infos.safe_push (reduc_info);
    4513           78 :       VECT_REDUC_INFO_DEF_TYPE (reduc_info) = STMT_VINFO_DEF_TYPE (next_stmt);
    4514           78 :       VECT_REDUC_INFO_TYPE (reduc_info) = STMT_VINFO_REDUC_TYPE (next_stmt);
    4515           78 :       VECT_REDUC_INFO_CODE (reduc_info) = STMT_VINFO_REDUC_CODE (next_stmt);
    4516           78 :       VECT_REDUC_INFO_FN (reduc_info) = IFN_LAST;
    4517           78 :       reduc_info->is_reduc_chain = true;
    4518              : 
    4519              :       /* Build the node for the PHI and possibly the conversions.  */
    4520           78 :       slp_tree phis = vect_create_new_slp_node (2, ERROR_MARK);
    4521           78 :       SLP_TREE_REPRESENTATIVE (phis) = next_stmt;
    4522           78 :       phis->cycle_info.id = cycle_id;
    4523           78 :       SLP_TREE_LANES (phis) = group_size;
    4524           78 :       if (reduc_scalar_stmt == scalar_stmt)
    4525           74 :         SLP_TREE_VECTYPE (phis) = SLP_TREE_VECTYPE (node);
    4526              :       else
    4527            4 :         SLP_TREE_VECTYPE (phis)
    4528            4 :           = signed_or_unsigned_type_for (TYPE_UNSIGNED
    4529              :                                            (TREE_TYPE (gimple_get_lhs
    4530              :                                                          (scalar_stmt->stmt))),
    4531              :                                          SLP_TREE_VECTYPE (node));
    4532              :       /* ???  vect_cse_slp_nodes cannot cope with cycles without any
    4533              :          SLP_TREE_SCALAR_STMTS.  */
    4534           78 :       SLP_TREE_SCALAR_STMTS (phis).create (group_size);
    4535          393 :       for (unsigned i = 0; i < group_size; ++i)
    4536          315 :         SLP_TREE_SCALAR_STMTS (phis).quick_push (next_stmt);
    4537              : 
    4538           78 :       slp_tree op_input = phis;
    4539           78 :       if (reduc_scalar_stmt != scalar_stmt)
    4540              :         {
    4541            4 :           slp_tree conv = vect_create_new_slp_node (1, ERROR_MARK);
    4542            4 :           SLP_TREE_REPRESENTATIVE (conv)
    4543            4 :             = vinfo->lookup_def (gimple_arg (reduc_scalar_stmt->stmt,
    4544            4 :                                              STMT_VINFO_REDUC_IDX
    4545              :                                                (reduc_scalar_stmt)));
    4546            4 :           SLP_TREE_CHILDREN (conv).quick_push (phis);
    4547            4 :           conv->cycle_info.id = cycle_id;
    4548            4 :           SLP_TREE_REDUC_IDX (conv) = 0;
    4549            4 :           SLP_TREE_LANES (conv) = group_size;
    4550            4 :           SLP_TREE_VECTYPE (conv) = SLP_TREE_VECTYPE (node);
    4551            4 :           SLP_TREE_SCALAR_STMTS (conv) = vNULL;
    4552            4 :           op_input = conv;
    4553              :         }
    4554              : 
    4555           78 :       slp_tree reduc = vect_create_new_slp_node (2, ERROR_MARK);
    4556           78 :       SLP_TREE_REPRESENTATIVE (reduc) = reduc_scalar_stmt;
    4557           78 :       SLP_TREE_CHILDREN (reduc).quick_push (op_input);
    4558           78 :       SLP_TREE_CHILDREN (reduc).quick_push (node);
    4559           78 :       reduc->cycle_info.id = cycle_id;
    4560           78 :       SLP_TREE_REDUC_IDX (reduc) = 0;
    4561           78 :       SLP_TREE_LANES (reduc) = group_size;
    4562           78 :       SLP_TREE_VECTYPE (reduc) = SLP_TREE_VECTYPE (node);
    4563              :       /* ???  For the reduction epilogue we need a live lane.  */
    4564           78 :       SLP_TREE_SCALAR_STMTS (reduc).create (group_size);
    4565           78 :       SLP_TREE_SCALAR_STMTS (reduc).quick_push (reduc_scalar_stmt);
    4566          315 :       for (unsigned i = 1; i < group_size; ++i)
    4567          237 :         SLP_TREE_SCALAR_STMTS (reduc).quick_push (NULL);
    4568              : 
    4569           78 :       if (reduc_scalar_stmt != scalar_stmt)
    4570              :         {
    4571            4 :           slp_tree conv = vect_create_new_slp_node (1, ERROR_MARK);
    4572            4 :           SLP_TREE_REPRESENTATIVE (conv) = scalar_stmt;
    4573            4 :           SLP_TREE_CHILDREN (conv).quick_push (reduc);
    4574            4 :           conv->cycle_info.id = cycle_id;
    4575            4 :           SLP_TREE_REDUC_IDX (conv) = 0;
    4576            4 :           SLP_TREE_LANES (conv) = group_size;
    4577            4 :           SLP_TREE_VECTYPE (conv) = SLP_TREE_VECTYPE (phis);
    4578              :           /* ???  For the reduction epilogue we need a live lane.  */
    4579            4 :           SLP_TREE_SCALAR_STMTS (conv).create (group_size);
    4580            4 :           SLP_TREE_SCALAR_STMTS (conv).quick_push (scalar_stmt);
    4581            8 :           for (unsigned i = 1; i < group_size; ++i)
    4582            4 :             SLP_TREE_SCALAR_STMTS (conv).quick_push (NULL);
    4583            4 :           reduc = conv;
    4584              :         }
    4585              : 
    4586           78 :       edge le = loop_latch_edge (LOOP_VINFO_LOOP (vinfo));
    4587           78 :       SLP_TREE_CHILDREN (phis).quick_push (NULL);
    4588           78 :       SLP_TREE_CHILDREN (phis).quick_push (NULL);
    4589           78 :       SLP_TREE_CHILDREN (phis)[le->dest_idx] = reduc;
    4590           78 :       SLP_TREE_REF_COUNT (reduc)++;
    4591              : 
    4592              :       /* Create a new SLP instance.  */
    4593           78 :       slp_instance new_instance = XNEW (class _slp_instance);
    4594           78 :       SLP_INSTANCE_TREE (new_instance) = reduc;
    4595           78 :       SLP_INSTANCE_LOADS (new_instance) = vNULL;
    4596           78 :       SLP_INSTANCE_ROOT_STMTS (new_instance) = vNULL;
    4597           78 :       SLP_INSTANCE_REMAIN_DEFS (new_instance) = vNULL;
    4598           78 :       SLP_INSTANCE_KIND (new_instance) = slp_inst_kind_reduc_chain;
    4599           78 :       new_instance->reduc_phis = NULL;
    4600           78 :       new_instance->cost_vec = vNULL;
    4601           78 :       new_instance->subgraph_entries = vNULL;
    4602              : 
    4603           78 :       vinfo->slp_instances.safe_push (new_instance);
    4604              : 
    4605           78 :       if (dump_enabled_p ())
    4606              :         {
    4607           24 :           dump_printf_loc (MSG_NOTE, vect_location,
    4608              :                            "Final SLP tree for instance %p:\n",
    4609              :                            (void *) new_instance);
    4610           24 :           vect_print_slp_graph (MSG_NOTE, vect_location,
    4611              :                                 SLP_INSTANCE_TREE (new_instance));
    4612              :         }
    4613              : 
    4614           78 :       return true;
    4615        57026 :     }
    4616              : 
    4617        13377 :   if (scalar_stmts.length () <= 1)
    4618              :     {
    4619        10707 :       scalar_stmts.release ();
    4620        10707 :       return false;
    4621              :     }
    4622              : 
    4623         2670 :   scalar_stmts.reverse ();
    4624         2670 :   stmt_vec_info reduc_phi_info = next_stmt;
    4625              : 
    4626              :   /* Build the tree for the SLP instance.  */
    4627         2670 :   vec<stmt_vec_info> root_stmt_infos = vNULL;
    4628         2670 :   vec<tree> remain = vNULL;
    4629              : 
    4630         2670 :   if (dump_enabled_p ())
    4631              :     {
    4632          193 :       dump_printf_loc (MSG_NOTE, vect_location,
    4633              :                        "Starting SLP discovery of reduction chain for\n");
    4634         1029 :       for (unsigned i = 0; i < scalar_stmts.length (); ++i)
    4635         1672 :         dump_printf_loc (MSG_NOTE, vect_location,
    4636          836 :                          "  %G", scalar_stmts[i]->stmt);
    4637              :     }
    4638              : 
    4639              :   /* Build the tree for the SLP instance.  */
    4640         2670 :   unsigned int group_size = scalar_stmts.length ();
    4641         2670 :   bool *matches = XALLOCAVEC (bool, group_size);
    4642         2670 :   poly_uint64 max_nunits = 1;
    4643         2670 :   unsigned tree_size = 0;
    4644              : 
    4645              :   /* ???  We need this only for SLP discovery.  */
    4646        10378 :   for (unsigned i = 0; i < scalar_stmts.length (); ++i)
    4647         7708 :     REDUC_GROUP_FIRST_ELEMENT (scalar_stmts[i]) = scalar_stmts[0];
    4648              : 
    4649         2670 :   slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
    4650              :                                        &max_nunits, matches, limit,
    4651         2670 :                                        &tree_size, bst_map);
    4652              : 
    4653        10378 :   for (unsigned i = 0; i < scalar_stmts.length (); ++i)
    4654         7708 :     REDUC_GROUP_FIRST_ELEMENT (scalar_stmts[i]) = NULL;
    4655              : 
    4656         2670 :   if (node != NULL)
    4657              :     {
    4658              :       /* Create a new SLP instance.  */
    4659         2329 :       slp_instance new_instance = XNEW (class _slp_instance);
    4660         2329 :       SLP_INSTANCE_TREE (new_instance) = node;
    4661         2329 :       SLP_INSTANCE_LOADS (new_instance) = vNULL;
    4662         2329 :       SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
    4663         2329 :       SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
    4664         2329 :       SLP_INSTANCE_KIND (new_instance) = slp_inst_kind_reduc_chain;
    4665         2329 :       new_instance->reduc_phis = NULL;
    4666         2329 :       new_instance->cost_vec = vNULL;
    4667         2329 :       new_instance->subgraph_entries = vNULL;
    4668              : 
    4669         2329 :       vect_reduc_info reduc_info = info_for_reduction (vinfo, node);
    4670         2329 :       reduc_info->is_reduc_chain = true;
    4671              : 
    4672         2329 :       if (dump_enabled_p ())
    4673          144 :         dump_printf_loc (MSG_NOTE, vect_location,
    4674              :                          "SLP size %u vs. limit %u.\n",
    4675              :                          tree_size, max_tree_size);
    4676              : 
    4677              :       /* Fixup SLP reduction chains.  If this is a reduction chain with
    4678              :          a conversion in front amend the SLP tree with a node for that.  */
    4679         2329 :       gimple *scalar_def = STMT_VINFO_REDUC_DEF (reduc_phi_info)->stmt;
    4680         2329 :       if (is_gimple_assign (scalar_def)
    4681         2329 :           && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (scalar_def)))
    4682              :         {
    4683           43 :           stmt_vec_info conv_info = vect_stmt_to_vectorize
    4684           43 :                                         (STMT_VINFO_REDUC_DEF (reduc_phi_info));
    4685           43 :           scalar_stmts = vNULL;
    4686           43 :           scalar_stmts.create (group_size);
    4687          135 :           for (unsigned i = 0; i < group_size; ++i)
    4688           92 :             scalar_stmts.quick_push (conv_info);
    4689           43 :           slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
    4690           43 :           SLP_TREE_VECTYPE (conv)
    4691           43 :             = get_vectype_for_scalar_type (vinfo,
    4692           43 :                                            TREE_TYPE
    4693              :                                              (gimple_assign_lhs (scalar_def)),
    4694              :                                            group_size);
    4695           43 :           SLP_TREE_REDUC_IDX (conv) = 0;
    4696           43 :           conv->cycle_info.id = node->cycle_info.id;
    4697           43 :           SLP_TREE_CHILDREN (conv).quick_push (node);
    4698           43 :           SLP_INSTANCE_TREE (new_instance) = conv;
    4699              :         }
    4700              :       /* Fill the backedge child of the PHI SLP node.  The
    4701              :          general matching code cannot find it because the
    4702              :          scalar code does not reflect how we vectorize the
    4703              :          reduction.  */
    4704         2329 :       use_operand_p use_p;
    4705         2329 :       imm_use_iterator imm_iter;
    4706         2329 :       class loop *loop = LOOP_VINFO_LOOP (vinfo);
    4707        11195 :       FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
    4708              :                              gimple_get_lhs (scalar_def))
    4709              :         /* There are exactly two non-debug uses, the reduction
    4710              :            PHI and the loop-closed PHI node.  */
    4711         6537 :         if (!is_gimple_debug (USE_STMT (use_p))
    4712         6537 :             && gimple_bb (USE_STMT (use_p)) == loop->header)
    4713              :           {
    4714         2329 :             auto_vec<stmt_vec_info, 64> phis (group_size);
    4715         2329 :             stmt_vec_info phi_info = vinfo->lookup_stmt (USE_STMT (use_p));
    4716         9180 :             for (unsigned i = 0; i < group_size; ++i)
    4717         6851 :               phis.quick_push (phi_info);
    4718         2329 :             slp_tree *phi_node = bst_map->get (phis);
    4719         2329 :             unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
    4720         4658 :             SLP_TREE_CHILDREN (*phi_node)[dest_idx]
    4721         2329 :               = SLP_INSTANCE_TREE (new_instance);
    4722         2329 :             SLP_INSTANCE_TREE (new_instance)->refcnt++;
    4723         2329 :           }
    4724              : 
    4725         2329 :       vinfo->slp_instances.safe_push (new_instance);
    4726              : 
    4727              :       /* ???  We've replaced the old SLP_INSTANCE_GROUP_SIZE with
    4728              :          the number of scalar stmts in the root in a few places.
    4729              :          Verify that assumption holds.  */
    4730         4658 :       gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
    4731              :                   .length () == group_size);
    4732              : 
    4733         2329 :       if (dump_enabled_p ())
    4734              :         {
    4735          144 :           dump_printf_loc (MSG_NOTE, vect_location,
    4736              :                            "Final SLP tree for instance %p:\n",
    4737              :                            (void *) new_instance);
    4738          144 :           vect_print_slp_graph (MSG_NOTE, vect_location,
    4739              :                                 SLP_INSTANCE_TREE (new_instance));
    4740              :         }
    4741              : 
    4742         2329 :       return true;
    4743              :     }
    4744              : 
    4745              :   /* Failed to SLP.  */
    4746          341 :   scalar_stmts.release ();
    4747          341 :   if (dump_enabled_p ())
    4748           49 :     dump_printf_loc (MSG_NOTE, vect_location,
    4749              :                      "SLP discovery of reduction chain failed\n");
    4750              :   return false;
    4751              : }
    4752              : 
    4753              : /* Analyze an SLP instance starting from SCALAR_STMTS which are a group
    4754              :    of KIND.  Return true if successful.  */
    4755              : 
    4756              : static bool
    4757        99087 : vect_analyze_slp_reduction (loop_vec_info vinfo,
    4758              :                             stmt_vec_info scalar_stmt,
    4759              :                             unsigned max_tree_size, unsigned *limit,
    4760              :                             scalar_stmts_to_slp_tree_map_t *bst_map,
    4761              :                             bool force_single_lane)
    4762              : {
    4763        99087 :   slp_instance_kind kind = slp_inst_kind_reduc_group;
    4764              : 
    4765              :   /* Try to gather a reduction chain.  Only attempt if there's budget left
    4766              :      since chain analysis may build multi-lane trees that consume limit.  */
    4767        99087 :   if (! force_single_lane
    4768        73298 :       && *limit != 0
    4769        73298 :       && STMT_VINFO_DEF_TYPE (scalar_stmt) == vect_reduction_def
    4770       172100 :       && vect_analyze_slp_reduc_chain (vinfo, bst_map, scalar_stmt,
    4771              :                                        max_tree_size, limit))
    4772              :     return true;
    4773              : 
    4774        96680 :   vec<stmt_vec_info> scalar_stmts;
    4775        96680 :   scalar_stmts.create (1);
    4776        96680 :   scalar_stmts.quick_push (scalar_stmt);
    4777              : 
    4778        96680 :   if (dump_enabled_p ())
    4779              :     {
    4780         3864 :       dump_printf_loc (MSG_NOTE, vect_location,
    4781              :                        "Starting SLP discovery for\n");
    4782         7728 :       for (unsigned i = 0; i < scalar_stmts.length (); ++i)
    4783         7728 :         dump_printf_loc (MSG_NOTE, vect_location,
    4784         3864 :                          "  %G", scalar_stmts[i]->stmt);
    4785              :     }
    4786              : 
    4787              :   /* Build the tree for the SLP instance.  */
    4788        96680 :   unsigned int group_size = scalar_stmts.length ();
    4789        96680 :   bool *matches = XALLOCAVEC (bool, group_size);
    4790        96680 :   poly_uint64 max_nunits = 1;
    4791        96680 :   unsigned tree_size = 0;
    4792              : 
    4793        96680 :   slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
    4794              :                                        &max_nunits, matches, limit,
    4795              :                                        &tree_size, bst_map);
    4796        96680 :   if (node != NULL)
    4797              :     {
    4798              :       /* Create a new SLP instance.  */
    4799        93681 :       slp_instance new_instance = XNEW (class _slp_instance);
    4800        93681 :       SLP_INSTANCE_TREE (new_instance) = node;
    4801        93681 :       SLP_INSTANCE_LOADS (new_instance) = vNULL;
    4802        93681 :       SLP_INSTANCE_ROOT_STMTS (new_instance) = vNULL;
    4803        93681 :       SLP_INSTANCE_REMAIN_DEFS (new_instance) = vNULL;
    4804        93681 :       SLP_INSTANCE_KIND (new_instance) = kind;
    4805        93681 :       new_instance->reduc_phis = NULL;
    4806        93681 :       new_instance->cost_vec = vNULL;
    4807        93681 :       new_instance->subgraph_entries = vNULL;
    4808              : 
    4809        93681 :       if (dump_enabled_p ())
    4810         3744 :         dump_printf_loc (MSG_NOTE, vect_location,
    4811              :                          "SLP size %u vs. limit %u.\n",
    4812              :                          tree_size, max_tree_size);
    4813              : 
    4814        93681 :       vinfo->slp_instances.safe_push (new_instance);
    4815              : 
    4816              :       /* ???  We've replaced the old SLP_INSTANCE_GROUP_SIZE with
    4817              :          the number of scalar stmts in the root in a few places.
    4818              :          Verify that assumption holds.  */
    4819       187362 :       gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
    4820              :                   .length () == group_size);
    4821              : 
    4822        93681 :       if (dump_enabled_p ())
    4823              :         {
    4824         3744 :           dump_printf_loc (MSG_NOTE, vect_location,
    4825              :                            "Final SLP tree for instance %p:\n",
    4826              :                            (void *) new_instance);
    4827         3744 :           vect_print_slp_graph (MSG_NOTE, vect_location,
    4828              :                                 SLP_INSTANCE_TREE (new_instance));
    4829              :         }
    4830              : 
    4831        93681 :       return true;
    4832              :     }
    4833              :   /* Failed to SLP.  */
    4834              : 
    4835              :   /* Free the allocated memory.  */
    4836         2999 :   scalar_stmts.release ();
    4837              : 
    4838              :   /* Failed to SLP.  */
    4839         2999 :   if (dump_enabled_p ())
    4840          120 :     dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
    4841              :   return false;
    4842              : }
    4843              : 
    4844              : /* Analyze a single SLP reduction group.  If successful add a SLP instance
    4845              :    for it and return true, otherwise return false and have *MATCHES
    4846              :    populated.  */
    4847              : 
    4848              : static bool
    4849        24148 : vect_analyze_slp_reduction_group (loop_vec_info loop_vinfo,
    4850              :                                   vec<stmt_vec_info> scalar_stmts,
    4851              :                                   scalar_stmts_to_slp_tree_map_t *bst_map,
    4852              :                                   unsigned max_tree_size, unsigned *limit,
    4853              :                                   bool *matches)
    4854              : {
    4855              :   /* Try to form a reduction group.  Size-1 groups are not suitable
    4856              :      for SLP reduction and should fall back to single-lane reduction.  */
    4857        45533 :   unsigned int group_size = scalar_stmts.length ();
    4858        24148 :   if (group_size <= 1)
    4859              :     return false;
    4860        17471 :   if (!matches)
    4861         4550 :     matches = XALLOCAVEC (bool, group_size);
    4862        17471 :   poly_uint64 max_nunits = 1;
    4863        17471 :   unsigned tree_size = 0;
    4864        17471 :   slp_tree node = vect_build_slp_tree (loop_vinfo, scalar_stmts,
    4865              :                                        group_size,
    4866              :                                        &max_nunits, matches, limit,
    4867              :                                        &tree_size, bst_map);
    4868        17471 :   if (!node)
    4869              :     return false;
    4870              : 
    4871              :   /* Create a new SLP instance.  */
    4872         2763 :   slp_instance new_instance = XNEW (class _slp_instance);
    4873         2763 :   SLP_INSTANCE_TREE (new_instance) = node;
    4874         2763 :   SLP_INSTANCE_LOADS (new_instance) = vNULL;
    4875         2763 :   SLP_INSTANCE_ROOT_STMTS (new_instance) = vNULL;
    4876         2763 :   SLP_INSTANCE_REMAIN_DEFS (new_instance) = vNULL;
    4877         2763 :   SLP_INSTANCE_KIND (new_instance) = slp_inst_kind_reduc_group;
    4878         2763 :   new_instance->reduc_phis = NULL;
    4879         2763 :   new_instance->cost_vec = vNULL;
    4880         2763 :   new_instance->subgraph_entries = vNULL;
    4881              : 
    4882         2763 :   if (dump_enabled_p ())
    4883          213 :     dump_printf_loc (MSG_NOTE, vect_location,
    4884              :                      "SLP size %u vs. limit %u.\n",
    4885              :                      tree_size, max_tree_size);
    4886              : 
    4887         2763 :   loop_vinfo->slp_instances.safe_push (new_instance);
    4888              : 
    4889              :   /* ???  We've replaced the old SLP_INSTANCE_GROUP_SIZE with
    4890              :      the number of scalar stmts in the root in a few places.
    4891              :      Verify that assumption holds.  */
    4892         5526 :   gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
    4893              :               .length () == group_size);
    4894              : 
    4895         2763 :   if (dump_enabled_p ())
    4896              :     {
    4897          213 :       dump_printf_loc (MSG_NOTE, vect_location,
    4898              :                        "SLP discovery of size %d reduction group "
    4899              :                        "succeeded\n", group_size);
    4900          213 :       dump_printf_loc (MSG_NOTE, vect_location,
    4901              :                        "Final SLP tree for instance %p:\n",
    4902              :                        (void *) new_instance);
    4903          213 :       vect_print_slp_graph (MSG_NOTE, vect_location,
    4904              :                             SLP_INSTANCE_TREE (new_instance));
    4905              :     }
    4906              : 
    4907              :   return true;
    4908              : }
    4909              : 
    4910              : /* Analyze reductions in LOOP_VINFO and populate SLP instances
    4911              :    accordingly.  Returns false if something fails.  */
    4912              : 
    4913              : static bool
    4914       491259 : vect_analyze_slp_reductions (loop_vec_info loop_vinfo,
    4915              :                              unsigned max_tree_size, unsigned *limit,
    4916              :                              scalar_stmts_to_slp_tree_map_t *bst_map,
    4917              :                              bool force_single_lane)
    4918              : {
    4919       560015 :   if (loop_vinfo->reductions.is_empty ())
    4920              :     return true;
    4921              : 
    4922              :   /* Collect reduction statements we can combine into
    4923              :      a SLP reduction.  */
    4924        73311 :   vec<stmt_vec_info> scalar_stmts;
    4925        73311 :   scalar_stmts.create (loop_vinfo->reductions.length ());
    4926       325418 :   for (auto next_info : loop_vinfo->reductions)
    4927              :     {
    4928       105485 :       next_info = vect_stmt_to_vectorize (next_info);
    4929       105485 :       if ((STMT_VINFO_RELEVANT_P (next_info)
    4930           14 :            || STMT_VINFO_LIVE_P (next_info))
    4931              :           /* ???  Make sure we didn't skip a conversion around a
    4932              :              reduction path.  In that case we'd have to reverse
    4933              :              engineer that conversion stmt following the chain using
    4934              :              reduc_idx and from the PHI using reduc_def.  */
    4935       105471 :           && (STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def
    4936       105471 :               || (STMT_VINFO_DEF_TYPE (next_info)
    4937              :                   == vect_double_reduction_def)))
    4938              :         {
    4939              :           /* Do not discover SLP reductions combining lane-reducing
    4940              :              ops, that will fail later.  */
    4941       105471 :           if (!force_single_lane
    4942       105471 :               && !lane_reducing_stmt_p (STMT_VINFO_STMT (next_info)))
    4943        78991 :             scalar_stmts.quick_push (next_info);
    4944              :           /* Do SLP discovery for single-lane reductions.  */
    4945        26480 :           else if (! vect_analyze_slp_reduction (loop_vinfo, next_info,
    4946              :                                                  max_tree_size, limit,
    4947              :                                                  bst_map,
    4948              :                                                  force_single_lane))
    4949              :             {
    4950            0 :               scalar_stmts.release ();
    4951            0 :               return false;
    4952              :             }
    4953              :         }
    4954              :     }
    4955              : 
    4956        73311 :   if (scalar_stmts.length () > 1)
    4957              :     {
    4958              :       /* Try to form a reduction group.  */
    4959         4644 :       unsigned int group_size = scalar_stmts.length ();
    4960         4644 :       bool *matches = XALLOCAVEC (bool, group_size);
    4961         4644 :       if (vect_analyze_slp_reduction_group (loop_vinfo, scalar_stmts, bst_map,
    4962              :                                             max_tree_size, limit, matches))
    4963         1581 :         return true;
    4964              : 
    4965              :       /* When analysis as a single SLP reduction group failed try to
    4966              :          form sub-groups by collecting matching lanes.  Do not recurse
    4967              :          that on failure (to limit compile-time costs), but recurse
    4968              :          for the initial non-matching parts.  Everything not covered
    4969              :          by a sub-group gets single-reduction treatment.  */
    4970         3518 :       vec<stmt_vec_info> cands = vNULL;
    4971        11365 :       while (matches[0])
    4972              :         {
    4973        11227 :           cands.truncate (0);
    4974        11227 :           cands.reserve (group_size, true);
    4975        88333 :           for (unsigned i = 0; i < group_size; ++i)
    4976        77106 :             if (matches[i])
    4977        19560 :               cands.quick_push (scalar_stmts[i]);
    4978              : 
    4979              :           /* Try to form a reduction group.  */
    4980        11227 :           if (vect_analyze_slp_reduction_group (loop_vinfo, cands, bst_map,
    4981              :                                                 max_tree_size, limit, NULL))
    4982         1207 :             cands = vNULL;
    4983              :           else
    4984              :             {
    4985              :               /* Do SLP discovery for single-lane reductions.  */
    4986        47153 :               for (auto stmt_info : cands)
    4987        17118 :                 if (! vect_analyze_slp_reduction (loop_vinfo,
    4988              :                                                   vect_stmt_to_vectorize
    4989              :                                                     (stmt_info),
    4990              :                                                   max_tree_size, limit,
    4991              :                                                   bst_map, force_single_lane))
    4992              :                   {
    4993           25 :                     scalar_stmts.release ();
    4994           25 :                     cands.release ();
    4995           25 :                     return false;
    4996              :                   }
    4997              :             }
    4998              :           /* Remove the handled stmts from scalar_stmts and try again,
    4999              :              possibly repeating the above with updated matches[].  */
    5000              :           unsigned j = 0;
    5001        88238 :           for (unsigned i = 0; i < group_size; ++i)
    5002        77036 :             if (!matches[i])
    5003              :               {
    5004        57516 :                 scalar_stmts[j] = scalar_stmts[i];
    5005        57516 :                 ++j;
    5006              :               }
    5007        11202 :           scalar_stmts.truncate (j);
    5008        11202 :           group_size = scalar_stmts.length ();
    5009        11202 :           if (group_size <= 1)
    5010              :             break;
    5011         8277 :           if (vect_analyze_slp_reduction_group (loop_vinfo, scalar_stmts,
    5012              :                                                 bst_map, max_tree_size, limit,
    5013              :                                                 matches))
    5014              :             return true;
    5015              :         }
    5016              :     }
    5017              :   /* Do SLP discovery for single-lane reductions.  */
    5018       267705 :   for (auto stmt_info : scalar_stmts)
    5019        55489 :     if (! vect_analyze_slp_reduction (loop_vinfo,
    5020              :                                       vect_stmt_to_vectorize (stmt_info),
    5021              :                                       max_tree_size, limit,
    5022              :                                       bst_map, force_single_lane))
    5023              :       {
    5024         2974 :         scalar_stmts.release ();
    5025         2974 :         return false;
    5026              :       }
    5027              : 
    5028        68756 :   scalar_stmts.release ();
    5029        68756 :   return true;
    5030              : }
    5031              : 
    5032              : /* Analyze an SLP instance starting from a group of grouped stores.  Call
    5033              :    vect_build_slp_tree to build a tree of packed stmts if possible.
    5034              :    Return FALSE if it's impossible to SLP any stmt in the group.  */
    5035              : 
    5036              : static bool
    5037      1097162 : vect_analyze_slp_instance (vec_info *vinfo,
    5038              :                            scalar_stmts_to_slp_tree_map_t *bst_map,
    5039              :                            stmt_vec_info stmt_info,
    5040              :                            slp_instance_kind kind,
    5041              :                            unsigned max_tree_size, unsigned *limit,
    5042              :                            bool force_single_lane)
    5043              : {
    5044      1097162 :   vec<stmt_vec_info> scalar_stmts;
    5045              : 
    5046      1097162 :   if (is_a <bb_vec_info> (vinfo))
    5047      1067754 :     vect_location = stmt_info->stmt;
    5048              : 
    5049      1097162 :   gcc_assert (kind == slp_inst_kind_store);
    5050              : 
    5051              :   /* Collect the stores and store them in scalar_stmts.  */
    5052      1097162 :   scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
    5053      1097162 :   stmt_vec_info next_info = stmt_info;
    5054      5454343 :   while (next_info)
    5055              :     {
    5056      3260019 :       scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
    5057      3260019 :       next_info = DR_GROUP_NEXT_ELEMENT (next_info);
    5058              :     }
    5059              : 
    5060      1097162 :   vec<stmt_vec_info> root_stmt_infos = vNULL;
    5061      1097162 :   vec<tree> remain = vNULL;
    5062              : 
    5063              :   /* Build the tree for the SLP instance.  */
    5064              : 
    5065              :   /* If there's no budget left bail out early.  */
    5066      1097162 :   if (*limit == 0)
    5067              :     return false;
    5068              : 
    5069      1097142 :   if (dump_enabled_p ())
    5070              :     {
    5071         4164 :       dump_printf_loc (MSG_NOTE, vect_location,
    5072              :                        "Starting SLP discovery for\n");
    5073        24166 :       for (unsigned i = 0; i < scalar_stmts.length (); ++i)
    5074        40004 :         dump_printf_loc (MSG_NOTE, vect_location,
    5075        20002 :                          "  %G", scalar_stmts[i]->stmt);
    5076              :     }
    5077              : 
    5078              :   /* Build the tree for the SLP instance.  */
    5079      1097142 :   unsigned int group_size = scalar_stmts.length ();
    5080      1097142 :   bool *matches = XALLOCAVEC (bool, group_size);
    5081      1097142 :   poly_uint64 max_nunits = 1;
    5082      1097142 :   unsigned tree_size = 0;
    5083      1097142 :   unsigned i;
    5084              : 
    5085      1097142 :   slp_tree node = NULL;
    5086      1097142 :   if (group_size > 1 && force_single_lane)
    5087              :     {
    5088         1730 :       matches[0] = true;
    5089         1730 :       matches[1] = false;
    5090              :     }
    5091              :   else
    5092      1095412 :     node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
    5093              :                                 &max_nunits, matches, limit,
    5094              :                                 &tree_size, bst_map);
    5095      1097142 :   if (node != NULL)
    5096              :     {
    5097              :       /* Calculate the unrolling factor based on the smallest type.  */
    5098       681965 :       poly_uint64 unrolling_factor
    5099       681965 :         = calculate_unrolling_factor (max_nunits, group_size);
    5100              : 
    5101       681965 :       if (maybe_ne (unrolling_factor, 1U)
    5102       681965 :           && is_a <bb_vec_info> (vinfo))
    5103              :         {
    5104            0 :           unsigned HOST_WIDE_INT const_max_nunits;
    5105            0 :           if (!max_nunits.is_constant (&const_max_nunits)
    5106            0 :               || const_max_nunits > group_size)
    5107              :             {
    5108            0 :               if (dump_enabled_p ())
    5109            0 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    5110              :                                  "Build SLP failed: store group "
    5111              :                                  "size not a multiple of the vector size "
    5112              :                                  "in basic block SLP\n");
    5113            0 :               vect_free_slp_tree (node);
    5114            0 :               return false;
    5115              :             }
    5116              :           /* Fatal mismatch.  */
    5117            0 :           if (dump_enabled_p ())
    5118            0 :             dump_printf_loc (MSG_NOTE, vect_location,
    5119              :                              "SLP discovery succeeded but node needs "
    5120              :                              "splitting\n");
    5121            0 :           memset (matches, true, group_size);
    5122            0 :           matches[group_size / const_max_nunits * const_max_nunits] = false;
    5123            0 :           vect_free_slp_tree (node);
    5124              :         }
    5125              :       else
    5126              :         {
    5127              :           /* Create a new SLP instance.  */
    5128       681965 :           slp_instance new_instance = XNEW (class _slp_instance);
    5129       681965 :           SLP_INSTANCE_TREE (new_instance) = node;
    5130       681965 :           SLP_INSTANCE_LOADS (new_instance) = vNULL;
    5131       681965 :           SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
    5132       681965 :           SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
    5133       681965 :           SLP_INSTANCE_KIND (new_instance) = kind;
    5134       681965 :           new_instance->reduc_phis = NULL;
    5135       681965 :           new_instance->cost_vec = vNULL;
    5136       681965 :           new_instance->subgraph_entries = vNULL;
    5137              : 
    5138       681965 :           if (dump_enabled_p ())
    5139         3171 :             dump_printf_loc (MSG_NOTE, vect_location,
    5140              :                              "SLP size %u vs. limit %u.\n",
    5141              :                              tree_size, max_tree_size);
    5142              : 
    5143       681965 :           vinfo->slp_instances.safe_push (new_instance);
    5144              : 
    5145              :           /* ???  We've replaced the old SLP_INSTANCE_GROUP_SIZE with
    5146              :              the number of scalar stmts in the root in a few places.
    5147              :              Verify that assumption holds.  */
    5148      1363930 :           gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
    5149              :                         .length () == group_size);
    5150              : 
    5151       681965 :           if (dump_enabled_p ())
    5152              :             {
    5153         3171 :               dump_printf_loc (MSG_NOTE, vect_location,
    5154              :                                "Final SLP tree for instance %p:\n",
    5155              :                                (void *) new_instance);
    5156         3171 :               vect_print_slp_graph (MSG_NOTE, vect_location,
    5157              :                                     SLP_INSTANCE_TREE (new_instance));
    5158              :             }
    5159              : 
    5160       681965 :           return true;
    5161              :         }
    5162              :     }
    5163              :   /* Failed to SLP.  */
    5164              : 
    5165              :   /* Try to break the group up into pieces.  */
    5166       415177 :   if (*limit > 0 && kind == slp_inst_kind_store)
    5167              :     {
    5168              :       /* ???  We could delay all the actual splitting of store-groups
    5169              :          until after SLP discovery of the original group completed.
    5170              :          Then we can recurse to vect_build_slp_instance directly.  */
    5171      1085187 :       for (i = 0; i < group_size; i++)
    5172      1085187 :         if (!matches[i])
    5173              :           break;
    5174              : 
    5175              :       /* For basic block SLP, try to break the group up into multiples of
    5176              :          a vector size.  */
    5177       415176 :       if (is_a <bb_vec_info> (vinfo)
    5178       415176 :           && (i > 1 && i < group_size))
    5179              :         {
    5180              :           /* Free the allocated memory.  */
    5181       154214 :           scalar_stmts.release ();
    5182              : 
    5183       154214 :           tree scalar_type
    5184       154214 :             = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
    5185       308428 :           tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
    5186       154214 :                                                       1 << floor_log2 (i));
    5187       154214 :           unsigned HOST_WIDE_INT const_nunits;
    5188       154214 :           if (vectype
    5189       154214 :               && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
    5190              :             {
    5191              :               /* Split into two groups at the first vector boundary.  */
    5192       154214 :               gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
    5193       154214 :               unsigned group1_size = i & ~(const_nunits - 1);
    5194              : 
    5195       154214 :               if (dump_enabled_p ())
    5196           66 :                 dump_printf_loc (MSG_NOTE, vect_location,
    5197              :                                  "Splitting SLP group at stmt %u\n", i);
    5198       154214 :               stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
    5199              :                                                                group1_size);
    5200       154214 :               bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
    5201              :                                                     kind, max_tree_size,
    5202              :                                                     limit, false);
    5203              :               /* Split the rest at the failure point and possibly
    5204              :                  re-analyze the remaining matching part if it has
    5205              :                  at least two lanes.  */
    5206       154214 :               if (group1_size < i
    5207         5376 :                   && (i + 1 < group_size
    5208         2950 :                       || i - group1_size > 1))
    5209              :                 {
    5210         2454 :                   stmt_vec_info rest2 = rest;
    5211         2454 :                   rest = vect_split_slp_store_group (rest, i - group1_size);
    5212         2454 :                   if (i - group1_size > 1)
    5213           57 :                     res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
    5214              :                                                       kind, max_tree_size,
    5215              :                                                       limit, false);
    5216              :                 }
    5217              :               /* Re-analyze the non-matching tail if it has at least
    5218              :                  two lanes.  */
    5219       154214 :               if (i + 1 < group_size)
    5220        22014 :                 res |= vect_analyze_slp_instance (vinfo, bst_map,
    5221              :                                                   rest, kind, max_tree_size,
    5222              :                                                   limit, false);
    5223       154214 :               return res;
    5224              :             }
    5225              :         }
    5226              : 
    5227              :       /* For loop vectorization split the RHS into arbitrary pieces of
    5228              :          size >= 1.  */
    5229       260962 :       else if (is_a <loop_vec_info> (vinfo)
    5230       260962 :                && (group_size != 1 && i < group_size))
    5231              :         {
    5232         8302 :           gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
    5233           28 :           bool masked_p = call
    5234           28 :               && gimple_call_internal_p (call)
    5235           28 :               && internal_fn_mask_index (gimple_call_internal_fn (call)) != -1;
    5236              :           /* There are targets that cannot do even/odd interleaving schemes
    5237              :              so they absolutely need to use load/store-lanes.  For now
    5238              :              force single-lane SLP for them - they would be happy with
    5239              :              uniform power-of-two lanes (but depending on element size),
    5240              :              but even if we can use 'i' as indicator we would need to
    5241              :              backtrack when later lanes fail to discover with the same
    5242              :              granularity.  We cannot turn any of strided or scatter store
    5243              :              into store-lanes.  */
    5244              :           /* ???  If this is not in sync with what get_load_store_type
    5245              :              later decides the SLP representation is not good for other
    5246              :              store vectorization methods.  */
    5247         8302 :           bool want_store_lanes
    5248         8302 :             = (! STMT_VINFO_GATHER_SCATTER_P (stmt_info)
    5249         8302 :                && ! STMT_VINFO_STRIDED_P (stmt_info)
    5250         6202 :                && ! STMT_VINFO_SLP_VECT_ONLY (stmt_info)
    5251         6198 :                && compare_step_with_zero (vinfo, stmt_info) > 0
    5252        14417 :                && vect_slp_prefer_store_lanes_p (vinfo, stmt_info, NULL_TREE,
    5253        16604 :                                                  masked_p, group_size, i));
    5254         8302 :           if (want_store_lanes || force_single_lane)
    5255              :             i = 1;
    5256              : 
    5257              :           /* A fatal discovery fail doesn't always mean single-lane SLP
    5258              :              isn't a possibility, so try.  */
    5259         6572 :           if (i == 0)
    5260              :             i = 1;
    5261              : 
    5262         8302 :           if (dump_enabled_p ())
    5263          885 :             dump_printf_loc (MSG_NOTE, vect_location,
    5264              :                              "Splitting SLP group at stmt %u\n", i);
    5265              : 
    5266              :           /* Analyze the stored values and pinch them together with
    5267              :              a permute node so we can preserve the whole store group.  */
    5268         8302 :           auto_vec<slp_tree> rhs_nodes;
    5269         8302 :           poly_uint64 max_nunits = 1;
    5270              : 
    5271         8302 :           unsigned int rhs_common_nlanes = 0;
    5272         8302 :           unsigned int start = 0, end = i;
    5273        37183 :           while (start < group_size)
    5274              :             {
    5275        29142 :               gcc_assert (end - start >= 1);
    5276        29142 :               vec<stmt_vec_info> substmts;
    5277        29142 :               substmts.create (end - start);
    5278        90653 :               for (unsigned j = start; j < end; ++j)
    5279        61511 :                 substmts.quick_push (scalar_stmts[j]);
    5280        29142 :               max_nunits = 1;
    5281        29142 :               node = vect_build_slp_tree (vinfo, substmts, end - start,
    5282              :                                           &max_nunits,
    5283              :                                           matches, limit, &tree_size, bst_map);
    5284        29142 :               if (node)
    5285              :                 {
    5286        23282 :                   rhs_nodes.safe_push (node);
    5287        23282 :                   vect_update_max_nunits (&max_nunits, node->max_nunits);
    5288        23282 :                   if (start == 0)
    5289         8047 :                     rhs_common_nlanes = SLP_TREE_LANES (node);
    5290        15235 :                   else if (rhs_common_nlanes != SLP_TREE_LANES (node))
    5291         1375 :                     rhs_common_nlanes = 0;
    5292        23282 :                   start = end;
    5293        23282 :                   if (want_store_lanes || force_single_lane)
    5294         5202 :                     end = start + 1;
    5295              :                   else
    5296              :                     end = group_size;
    5297              :                 }
    5298              :               else
    5299              :                 {
    5300         5860 :                   substmts.release ();
    5301         5860 :                   if (end - start == 1)
    5302              :                     {
    5303              :                       /* Single-lane discovery failed.  Free resources.  */
    5304          281 :                       for (auto node : rhs_nodes)
    5305            8 :                         vect_free_slp_tree (node);
    5306          261 :                       scalar_stmts.release ();
    5307          261 :                       if (dump_enabled_p ())
    5308           39 :                         dump_printf_loc (MSG_NOTE, vect_location,
    5309              :                                          "SLP discovery failed\n");
    5310          261 :                       return false;
    5311              :                     }
    5312              : 
    5313              :                   /* ???  It really happens that we soft-fail SLP
    5314              :                      build at a mismatch but the matching part hard-fails
    5315              :                      later.  As we know we arrived here with a group
    5316              :                      larger than one try a group of size one!  */
    5317         5599 :                   if (!matches[0])
    5318           44 :                     end = start + 1;
    5319              :                   else
    5320        12209 :                     for (unsigned j = start; j < end; j++)
    5321        12209 :                       if (!matches[j - start])
    5322              :                         {
    5323              :                           end = j;
    5324              :                           break;
    5325              :                         }
    5326              :                 }
    5327              :             }
    5328              : 
    5329              :           /* Now re-assess whether we want store lanes in case the
    5330              :              discovery ended up producing all single-lane RHSs.  */
    5331         8041 :           if (! want_store_lanes
    5332         8041 :               && rhs_common_nlanes == 1
    5333         6992 :               && ! STMT_VINFO_GATHER_SCATTER_P (stmt_info)
    5334         6992 :               && ! STMT_VINFO_STRIDED_P (stmt_info)
    5335         5257 :               && ! STMT_VINFO_SLP_VECT_ONLY (stmt_info)
    5336         5254 :               && compare_step_with_zero (vinfo, stmt_info) > 0
    5337        13238 :               && (vect_store_lanes_supported (SLP_TREE_VECTYPE (rhs_nodes[0]),
    5338              :                                               group_size, masked_p)
    5339              :                   != IFN_LAST))
    5340              :             want_store_lanes = true;
    5341              : 
    5342              :           /* Now we assume we can build the root SLP node from all stores.  */
    5343         8041 :           if (want_store_lanes)
    5344              :             {
    5345              :               /* For store-lanes feed the store node with all RHS nodes
    5346              :                  in order.  */
    5347            0 :               node = vect_create_new_slp_node (scalar_stmts,
    5348            0 :                                                SLP_TREE_CHILDREN
    5349              :                                                  (rhs_nodes[0]).length ());
    5350            0 :               SLP_TREE_VECTYPE (node) = SLP_TREE_VECTYPE (rhs_nodes[0]);
    5351            0 :               node->max_nunits = max_nunits;
    5352            0 :               node->ldst_lanes = true;
    5353            0 :               SLP_TREE_CHILDREN (node)
    5354            0 :                 .reserve_exact (SLP_TREE_CHILDREN (rhs_nodes[0]).length ()
    5355            0 :                                 + rhs_nodes.length () - 1);
    5356              :               /* First store value and possibly mask.  */
    5357            0 :               SLP_TREE_CHILDREN (node)
    5358            0 :                 .splice (SLP_TREE_CHILDREN (rhs_nodes[0]));
    5359              :               /* Rest of the store values.  All mask nodes are the same,
    5360              :                  this should be guaranteed by dataref group discovery.  */
    5361            0 :               for (unsigned j = 1; j < rhs_nodes.length (); ++j)
    5362            0 :                 SLP_TREE_CHILDREN (node)
    5363            0 :                   .quick_push (SLP_TREE_CHILDREN (rhs_nodes[j])[0]);
    5364            0 :               for (slp_tree child : SLP_TREE_CHILDREN (node))
    5365            0 :                 child->refcnt++;
    5366              :             }
    5367              :           else
    5368         8041 :             node = vect_build_slp_store_interleaving (rhs_nodes, scalar_stmts,
    5369              :                                                       max_nunits);
    5370              : 
    5371        31315 :           while (!rhs_nodes.is_empty ())
    5372        23274 :             vect_free_slp_tree (rhs_nodes.pop ());
    5373              : 
    5374              :           /* Create a new SLP instance.  */
    5375         8041 :           slp_instance new_instance = XNEW (class _slp_instance);
    5376         8041 :           SLP_INSTANCE_TREE (new_instance) = node;
    5377         8041 :           SLP_INSTANCE_LOADS (new_instance) = vNULL;
    5378         8041 :           SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
    5379         8041 :           SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
    5380         8041 :           SLP_INSTANCE_KIND (new_instance) = kind;
    5381         8041 :           new_instance->reduc_phis = NULL;
    5382         8041 :           new_instance->cost_vec = vNULL;
    5383         8041 :           new_instance->subgraph_entries = vNULL;
    5384              : 
    5385         8041 :           if (dump_enabled_p ())
    5386          846 :             dump_printf_loc (MSG_NOTE, vect_location,
    5387              :                              "SLP size %u vs. limit %u.\n",
    5388              :                              tree_size, max_tree_size);
    5389              : 
    5390         8041 :           vinfo->slp_instances.safe_push (new_instance);
    5391              : 
    5392              :           /* ???  We've replaced the old SLP_INSTANCE_GROUP_SIZE with
    5393              :              the number of scalar stmts in the root in a few places.
    5394              :              Verify that assumption holds.  */
    5395        16082 :           gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
    5396              :                         .length () == group_size);
    5397              : 
    5398         8041 :           if (dump_enabled_p ())
    5399              :             {
    5400          846 :               dump_printf_loc (MSG_NOTE, vect_location,
    5401              :                                "Final SLP tree for instance %p:\n",
    5402              :                                (void *) new_instance);
    5403          846 :               vect_print_slp_graph (MSG_NOTE, vect_location,
    5404              :                                     SLP_INSTANCE_TREE (new_instance));
    5405              :             }
    5406         8041 :           return true;
    5407         8302 :         }
    5408              :       else
    5409              :         /* Free the allocated memory.  */
    5410       252660 :         scalar_stmts.release ();
    5411              : 
    5412              :       /* Even though the first vector did not all match, we might be able to SLP
    5413              :          (some) of the remainder.  FORNOW ignore this possibility.  */
    5414              :     }
    5415              :   else
    5416              :     /* Free the allocated memory.  */
    5417            1 :     scalar_stmts.release ();
    5418              : 
    5419              :   /* Failed to SLP.  */
    5420       252661 :   if (dump_enabled_p ())
    5421           42 :     dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
    5422              :   return false;
    5423              : }
    5424              : 
    5425              : /* qsort comparator ordering SLP load nodes.  A_ and B_ point to slp_tree elements; returns -1, 0 or 1.  */
    5426              : 
    5427              : static int
    5428      2647652 : vllp_cmp (const void *a_, const void *b_)
    5429              : {
    5430      2647652 :   const slp_tree a = *(const slp_tree *)a_;
    5431      2647652 :   const slp_tree b = *(const slp_tree *)b_;
    5432      2647652 :   stmt_vec_info a0 = SLP_TREE_SCALAR_STMTS (a)[0];  /* First scalar stmt of A.  */
    5433      2647652 :   stmt_vec_info b0 = SLP_TREE_SCALAR_STMTS (b)[0];  /* First scalar stmt of B.  */
    5434      2647652 :   if (STMT_VINFO_GROUPED_ACCESS (a0)
    5435      1541665 :       && STMT_VINFO_GROUPED_ACCESS (b0)
    5436      4128011 :       && DR_GROUP_FIRST_ELEMENT (a0) == DR_GROUP_FIRST_ELEMENT (b0))
    5437              :     {
    5438              :       /* Same group: the node using more lanes sorts first.  */
    5439       344808 :       if (SLP_TREE_LANES (a) < SLP_TREE_LANES (b))
    5440              :         return 1;
    5441       336017 :       else if (SLP_TREE_LANES (a) > SLP_TREE_LANES (b))
    5442              :         return -1;
    5443              :       else
    5444              :         {
    5445              :           /* Equal lane count.  Try to order loads using the same lanes together,
    5446              :              breaking the tie with the lane number that first differs.  */
    5447       326477 :           if (!SLP_TREE_LOAD_PERMUTATION (a).exists ()
    5448       326477 :               && !SLP_TREE_LOAD_PERMUTATION (b).exists ())
    5449              :             return 0;  /* Neither node has a load permutation.  */
    5450       326477 :           else if (SLP_TREE_LOAD_PERMUTATION (a).exists ()
    5451       326477 :                    && !SLP_TREE_LOAD_PERMUTATION (b).exists ())
    5452              :             return 1;  /* Only A has a load permutation: B sorts first.  */
    5453       322422 :           else if (!SLP_TREE_LOAD_PERMUTATION (a).exists ()
    5454       322422 :                    && SLP_TREE_LOAD_PERMUTATION (b).exists ())
    5455              :             return -1;  /* Only B has a load permutation: A sorts first.  */
    5456              :           else
    5457              :             {
    5458       314944 :               for (unsigned i = 0; i < SLP_TREE_LANES (a); ++i)
    5459       314944 :                 if (SLP_TREE_LOAD_PERMUTATION (a)[i]
    5460       314944 :                     != SLP_TREE_LOAD_PERMUTATION (b)[i])
    5461              :                   {
    5462              :                     /* The node whose lane I is in-order (its value equals I) goes first, matching
    5463              :                        the no-permutation cases above; otherwise the smaller lane value goes first.  */
    5464       313632 :                     if (SLP_TREE_LOAD_PERMUTATION (a)[i] == i)
    5465              :                       return -1;
    5466       191852 :                     else if (SLP_TREE_LOAD_PERMUTATION (b)[i] == i)
    5467              :                       return 1;
    5468       100482 :                     else if (SLP_TREE_LOAD_PERMUTATION (a)[i]
    5469       100482 :                              < SLP_TREE_LOAD_PERMUTATION (b)[i])
    5470              :                       return -1;
    5471              :                     else
    5472              :                       return 1;
    5473              :                   }
    5474              :               return 0;  /* No compared lane differs.  */
    5475              :             }
    5476              :         }
    5477              :     }
    5478              :   else /* Different groups or non-groups.  */
    5479              :     {
    5480              :       /* Order groups as their first element to keep them together.  */
    5481      2302844 :       if (STMT_VINFO_GROUPED_ACCESS (a0))
    5482      2302844 :         a0 = DR_GROUP_FIRST_ELEMENT (a0);
    5483      2302844 :       if (STMT_VINFO_GROUPED_ACCESS (b0))
    5484      2302844 :         b0 = DR_GROUP_FIRST_ELEMENT (b0);
    5485      2302844 :       if (a0 == b0)
    5486              :         return 0;
    5487              :       /* Otherwise order by the gimple UID of the representative stmts.  */
    5488      2302724 :       else if (gimple_uid (STMT_VINFO_STMT (a0))
    5489      2302724 :                < gimple_uid (STMT_VINFO_STMT (b0)))
    5490              :         return -1;
    5491              :       else
    5492              :         {
    5493      1022835 :           gcc_assert (gimple_uid (STMT_VINFO_STMT (a0))
    5494              :                       != gimple_uid (STMT_VINFO_STMT (b0)));  /* Distinct stmts are expected to have distinct UIDs.  */
    5495              :           return 1;
    5496              :         }
    5497              :     }
    5498              : }
    5499              : 
    5500              : /* Return whether the load permutation of NODE is consecutive, starting
    5501              :    with value START_VAL in the first element.  If START_VAL is UINT_MAX
    5502              :    the first element's value is used.  Return false if NODE has no load permutation or an empty one.  */
    5503              : 
    5504              : bool
    5505       623134 : vect_load_perm_consecutive_p (slp_tree node, unsigned start_val)
    5506              : {
    5507       623134 :   load_permutation_t perm = SLP_TREE_LOAD_PERMUTATION (node);
    5508              : 
    5509       623134 :   if (!perm.exists () || !perm.length ())
    5510              :     return false;  /* No permutation recorded, or it has no elements.  */
    5511              : 
    5512       623134 :   if (start_val == UINT_MAX)
    5513        79408 :     start_val = perm[0];  /* Take the expected start from the permutation itself.  */
    5514              : 
    5515      1230198 :   for (unsigned int i = 0; i < perm.length (); i++)
    5516       630425 :     if (perm[i] != start_val + (unsigned int) i)  /* Element I must be START_VAL + I.  */
    5517              :       return false;
    5518              : 
    5519              :   return true;
    5520              : }
    5521              : 
    5522              : /* Process the set of LOADS that are all from the same dataref group.  */
    5523              : 
    5524              : static void
    5525       161418 : vect_lower_load_permutations (loop_vec_info loop_vinfo,
    5526              :                               scalar_stmts_to_slp_tree_map_t *bst_map,
    5527              :                               const array_slice<slp_tree> &loads,
    5528              :                               bool force_single_lane)
    5529              : {
    5530              :   /* We at this point want to lower without a fixed VF or vector
    5531              :      size in mind which means we cannot actually compute whether we
    5532              :      need three or more vectors for a load permutation yet.  So always
    5533              :      lower.  */
    5534       161418 :   stmt_vec_info first
    5535       161418 :     = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (loads[0])[0]);
    5536       161418 :   unsigned group_lanes = DR_GROUP_SIZE (first);
    5537              : 
    5538              :   /* Verify if all load permutations can be implemented with a suitably
    5539              :      large element load-lanes operation.  */
    5540       161418 :   unsigned ld_lanes_lanes = SLP_TREE_LANES (loads[0]);
    5541       161418 :   if (STMT_VINFO_STRIDED_P (first)
    5542       158971 :       || compare_step_with_zero (loop_vinfo, first) <= 0
    5543       156311 :       || exact_log2 (ld_lanes_lanes) == -1
    5544              :       /* ???  For now only support the single-lane case as there is
    5545              :          missing support on the store-lane side and code generation
    5546              :          isn't up to the task yet.  */
    5547       153530 :       || ld_lanes_lanes != 1
    5548       303978 :       || vect_load_lanes_supported (SLP_TREE_VECTYPE (loads[0]),
    5549              :                                     group_lanes / ld_lanes_lanes,
    5550              :                                     false) == IFN_LAST)
    5551              :     ld_lanes_lanes = 0;
    5552              :   else
    5553              :     /* Verify the loads access the same number of lanes aligned to
    5554              :        ld_lanes_lanes.  */
    5555            0 :     for (slp_tree load : loads)
    5556              :       {
    5557            0 :         if (SLP_TREE_LANES (load) != ld_lanes_lanes)
    5558              :           {
    5559              :             ld_lanes_lanes = 0;
    5560              :             break;
    5561              :           }
    5562            0 :         unsigned first = SLP_TREE_LOAD_PERMUTATION (load)[0];
    5563            0 :         if (first % ld_lanes_lanes != 0)
    5564              :           {
    5565              :             ld_lanes_lanes = 0;
    5566              :             break;
    5567              :           }
    5568            0 :         if (!vect_load_perm_consecutive_p (load))
    5569              :           {
    5570              :             ld_lanes_lanes = 0;
    5571              :             break;
    5572              :           }
    5573              :       }
    5574              : 
    5575              :   /* Only a power-of-two number of lanes matches interleaving with N levels.
    5576              :      ???  An even number of lanes could be reduced to 1<<ceil_log2(N)-1 lanes
    5577              :      at each step.  */
    5578       262146 :   if (ld_lanes_lanes == 0 && exact_log2 (group_lanes) == -1 && group_lanes != 3)
    5579              :     return;
    5580              : 
    5581       265193 :   for (slp_tree load : loads)
    5582              :     {
    5583              :       /* Leave masked or gather loads alone for now.  */
    5584       187209 :       if (!SLP_TREE_CHILDREN (load).is_empty ())
    5585        60470 :         continue;
    5586              : 
    5587              :       /* For single-element interleaving spanning multiple vectors avoid
    5588              :          lowering, we want to use VMAT_ELEMENTWISE later.  */
    5589       187203 :       if (ld_lanes_lanes == 0
    5590       187203 :           && SLP_TREE_LANES (load) == 1
    5591       167843 :           && !DR_GROUP_NEXT_ELEMENT (first)
    5592       266779 :           && maybe_gt (group_lanes,
    5593              :                        TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (load))))
    5594        51332 :         return;
    5595              : 
    5596              :       /* We want to pattern-match special cases here and keep those
    5597              :          alone.  Candidates are splats and load-lane.  */
    5598              : 
    5599              :       /* We need to lower only loads of less than half of the group's
    5600              :          lanes, including duplicate lanes.  Note this leaves nodes
    5601              :          with a non-1:1 load permutation around instead of canonicalizing
    5602              :          those into a load and a permute node.  Removing this early
    5603              :          check would do such canonicalization.  */
    5604       135871 :       if (SLP_TREE_LANES (load) >= (group_lanes + 1) / 2
    5605        56896 :           && ld_lanes_lanes == 0)
    5606        56896 :         continue;
    5607              : 
    5608              :       /* Build the permute to get the original load permutation order.  */
    5609        78975 :       bool contiguous = vect_load_perm_consecutive_p (load);
    5610        78975 :       lane_permutation_t final_perm;
    5611        78975 :       final_perm.create (SLP_TREE_LANES (load));
    5612       158864 :       for (unsigned i = 0; i < SLP_TREE_LANES (load); ++i)
    5613       159778 :         final_perm.quick_push (
    5614        79889 :           std::make_pair (0, SLP_TREE_LOAD_PERMUTATION (load)[i]));
    5615              : 
    5616              :       /* When the load permutation accesses a contiguous unpermuted,
    5617              :          power-of-two aligned and sized chunk leave the load alone.
    5618              :          We can likely (re-)load it more efficiently rather than
    5619              :          extracting it from the larger load.
    5620              :          ???  Long-term some of the lowering should move to where
    5621              :          the vector types involved are fixed.  */
    5622        82543 :       if (!force_single_lane
    5623        78975 :           && ld_lanes_lanes == 0
    5624        53231 :           && contiguous
    5625        52988 :           && (SLP_TREE_LANES (load) > 1 || loads.size () == 1)
    5626         6563 :           && pow2p_hwi (SLP_TREE_LANES (load))
    5627         6527 :           && pow2p_hwi (group_lanes)
    5628         3568 :           && SLP_TREE_LOAD_PERMUTATION (load)[0] % SLP_TREE_LANES (load) == 0
    5629        82543 :           && group_lanes % SLP_TREE_LANES (load) == 0)
    5630              :         {
    5631         3568 :           final_perm.release ();
    5632         3568 :           continue;
    5633              :         }
    5634              : 
    5635              :       /* First build (and possibly re-use) a load node for the
    5636              :          unpermuted group.  Gaps in the middle and on the end are
    5637              :          represented with NULL stmts.  */
    5638        75407 :       vec<stmt_vec_info> stmts;
    5639        75407 :       stmts.create (group_lanes);
    5640       268221 :       for (stmt_vec_info s = first; s; s = DR_GROUP_NEXT_ELEMENT (s))
    5641              :         {
    5642       192814 :           if (s != first)
    5643       122260 :             for (unsigned i = 1; i < DR_GROUP_GAP (s); ++i)
    5644         4853 :               stmts.quick_push (NULL);
    5645       192814 :           stmts.quick_push (s);
    5646              :         }
    5647       138200 :       for (unsigned i = 0; i < DR_GROUP_GAP (first); ++i)
    5648        62793 :         stmts.quick_push (NULL);
    5649        75407 :       poly_uint64 max_nunits = 1;
    5650        75407 :       bool *matches = XALLOCAVEC (bool, group_lanes);
    5651        75407 :       unsigned limit = 1;
    5652        75407 :       unsigned tree_size = 0;
    5653        75407 :       slp_tree l0 = vect_build_slp_tree (loop_vinfo, stmts,
    5654              :                                          group_lanes,
    5655              :                                          &max_nunits, matches, &limit,
    5656        75407 :                                          &tree_size, bst_map);
    5657        75407 :       gcc_assert (!SLP_TREE_LOAD_PERMUTATION (l0).exists ());
    5658              : 
    5659        75407 :       if (ld_lanes_lanes != 0)
    5660              :         {
    5661              :           /* ???  If this is not in sync with what get_load_store_type
    5662              :              later decides the SLP representation is not good for other
    5663              :              store vectorization methods.  */
    5664            0 :           l0->ldst_lanes = true;
    5665            0 :           load->ldst_lanes = true;
    5666              :         }
    5667              : 
    5668       234275 :       while (1)
    5669              :         {
    5670       154841 :           unsigned group_lanes = SLP_TREE_LANES (l0);
    5671       154841 :           if (ld_lanes_lanes != 0
    5672       154841 :               || SLP_TREE_LANES (load) >= (group_lanes + 1) / 2)
    5673              :             break;
    5674              : 
    5675              :           /* Try to lower by reducing the group to half its size using an
    5676              :              interleaving scheme.  For this try to compute whether all
    5677              :              elements needed for this load are in even or odd elements of
    5678              :              an even/odd decomposition with N consecutive elements.
    5679              :              Thus { e, e, o, o, e, e, o, o } would be an even/odd decomposition
    5680              :              with N == 2.  */
    5681              :           /* ???  Only an even number of lanes can be handled this way, but the
    5682              :              fallback below could work for any number.  We have to make sure
    5683              :              to round up in that case.  */
    5684        79434 :           gcc_assert ((group_lanes & 1) == 0 || group_lanes == 3);
    5685        11362 :           unsigned even = 0, odd = 0;
    5686        11362 :           if ((group_lanes & 1) == 0)
    5687              :             {
    5688        11362 :               even = (1 << ceil_log2 (group_lanes)) - 1;
    5689        11362 :               odd = even;
    5690        46125 :               for (auto l : final_perm)
    5691              :                 {
    5692        12039 :                   even &= ~l.second;
    5693        12039 :                   odd &= l.second;
    5694              :                 }
    5695              :             }
    5696              : 
    5697              :           /* Now build an even or odd extraction from the unpermuted load.  */
    5698        79434 :           lane_permutation_t perm;
    5699        79434 :           perm.create ((group_lanes + 1) / 2);
    5700        79434 :           unsigned even_level = even ? 1 << ctz_hwi (even) : 0;
    5701        79434 :           unsigned odd_level = odd ? 1 << ctz_hwi (odd) : 0;
    5702        79434 :           if (even_level
    5703        10439 :               && group_lanes % (2 * even_level) == 0
    5704              :               /* ???  When code generating permutes we do not try to pun
    5705              :                  to larger component modes so level != 1 isn't a natural
    5706              :                  even/odd extract.  Prefer one if possible.  */
    5707        10439 :               && (even_level == 1 || !odd_level || odd_level != 1))
    5708              :             {
    5709              :               /* { 0, 1, ... 4, 5 ..., } */
    5710        37447 :               for (unsigned i = 0; i < group_lanes / 2 / even_level; ++i)
    5711        59114 :                 for (unsigned j = 0; j < even_level; ++j)
    5712        29730 :                   perm.quick_push (std::make_pair (0, 2 * i * even_level + j));
    5713              :             }
    5714        68995 :           else if (odd_level)
    5715              :             {
    5716              :               /* { ..., 2, 3, ... 6, 7 } */
    5717         3269 :               gcc_assert (group_lanes % (2 * odd_level) == 0);
    5718        14261 :               for (unsigned i = 0; i < group_lanes / 2 / odd_level; ++i)
    5719        22038 :                 for (unsigned j = 0; j < odd_level; ++j)
    5720        11046 :                   perm.quick_push
    5721        11046 :                     (std::make_pair (0, (2 * i + 1) * odd_level + j));
    5722              :             }
    5723              :           else
    5724              :             {
    5725              :               /* As fallback extract all used lanes and fill to half the
    5726              :                  group size by repeating the last element.
    5727              :                  ???  This is quite a bad strategy for re-use - we could
    5728              :                  brute force our way to find more optimal filling lanes to
    5729              :                  maximize re-use when looking at all loads from the group.  */
    5730        68102 :               auto_bitmap l;
    5731       272464 :               for (auto p : final_perm)
    5732        68158 :                 bitmap_set_bit (l, p.second);
    5733        68102 :               unsigned i = 0;
    5734        68102 :               bitmap_iterator bi;
    5735       136260 :               EXECUTE_IF_SET_IN_BITMAP (l, 0, i, bi)
    5736        68158 :                   perm.quick_push (std::make_pair (0, i));
    5737       272560 :               while (perm.length () < (group_lanes + 1) / 2)
    5738        68178 :                 perm.quick_push (perm.last ());
    5739        68102 :             }
    5740              : 
    5741              :           /* Update final_perm with the intermediate permute.  */
    5742       159545 :           for (unsigned i = 0; i < final_perm.length (); ++i)
    5743              :             {
    5744        80111 :               unsigned l = final_perm[i].second;
    5745        80111 :               unsigned j;
    5746        88713 :               for (j = 0; j < perm.length (); ++j)
    5747        88713 :                 if (perm[j].second == l)
    5748              :                   {
    5749        80111 :                     final_perm[i].second = j;
    5750        80111 :                     break;
    5751              :                   }
    5752        80111 :               gcc_assert (j < perm.length ());
    5753              :             }
    5754              : 
    5755              :           /* And create scalar stmts.  */
    5756        79434 :           vec<stmt_vec_info> perm_stmts;
    5757        79434 :           perm_stmts.create (perm.length ());
    5758       256546 :           for (unsigned i = 0; i < perm.length (); ++i)
    5759       177112 :             perm_stmts.quick_push (SLP_TREE_SCALAR_STMTS (l0)[perm[i].second]);
    5760              : 
    5761        79434 :           slp_tree p = vect_create_new_slp_node (1, VEC_PERM_EXPR);
    5762        79434 :           SLP_TREE_CHILDREN (p).quick_push (l0);
    5763        79434 :           SLP_TREE_LANE_PERMUTATION (p) = perm;
    5764        79434 :           SLP_TREE_VECTYPE (p) = SLP_TREE_VECTYPE (load);
    5765        79434 :           SLP_TREE_LANES (p) = perm.length ();
    5766        79434 :           SLP_TREE_REPRESENTATIVE (p) = SLP_TREE_REPRESENTATIVE (load);
    5767              :           /* ???  As we have scalar stmts for this intermediate permute we
    5768              :              could CSE it via bst_map but we do not want to pick up
    5769              :              another SLP node with a load permutation.  We instead should
    5770              :              have a "local" CSE map here.  */
    5771        79434 :           SLP_TREE_SCALAR_STMTS (p) = perm_stmts;
    5772              : 
    5773              :           /* We now have a node for (group_lanes + 1) / 2 lanes.  */
    5774        79434 :           l0 = p;
    5775        79434 :         }
    5776              : 
    5777              :       /* And finally from the ordered reduction node create the
    5778              :          permute to shuffle the lanes into the original load-permutation
    5779              :          order.  We replace the original load node with this.  */
    5780        75407 :       SLP_TREE_CODE (load) = VEC_PERM_EXPR;
    5781        75407 :       SLP_TREE_LOAD_PERMUTATION (load).release ();
    5782        75407 :       SLP_TREE_LANE_PERMUTATION (load) = final_perm;
    5783        75407 :       SLP_TREE_CHILDREN (load).create (1);
    5784        75407 :       SLP_TREE_CHILDREN (load).quick_push (l0);
    5785              :     }
    5786              : }
    5787              : 
    5788              : /* Gather the load nodes of all SLP instances in LOOP_VINFO, group
    5789              :    those reading from the same dataref group and lower load permutations
    5790              :    that are unlikely to be supported into a series of permutes.  With
    5791              :    only single-lane SLP instances this should result in permute nodes
    5792              :    emulating an interleaving scheme.  BST_MAP and FORCE_SINGLE_LANE are
    5793              :    handed unchanged to the per-group lowering overload.  */
    5794              : 
    5795              : static void
    5796       473646 : vect_lower_load_permutations (loop_vec_info loop_vinfo,
    5797              :                               scalar_stmts_to_slp_tree_map_t *bst_map,
    5798              :                               bool force_single_lane)
    5799              : {
    5800              :   /* Gather loads across all instances, return if there are none and otherwise sort them with vllp_cmp.  */
    5801       473646 :   hash_set<slp_tree> visited;
    5802       473646 :   auto_vec<slp_tree> loads;
    5803      2178749 :   for (auto inst : loop_vinfo->slp_instances)
    5804       759757 :     vect_gather_slp_loads (loads, SLP_INSTANCE_TREE (inst), visited);
    5805       473646 :   if (loads.is_empty ())
    5806        90213 :     return;
    5807       383433 :   loads.qsort (vllp_cmp);
    5808              : 
    5809              :   /* Now process each dataref group separately; FIRSTI indexes the first load of the current run.  */
    5810       383433 :   unsigned firsti = 0;
    5811       719257 :   for (unsigned i = 1; i < loads.length (); ++i)
    5812              :     {
    5813       335824 :       slp_tree first = loads[firsti];
    5814       335824 :       slp_tree next = loads[i];
    5815       335824 :       stmt_vec_info a0 = SLP_TREE_SCALAR_STMTS (first)[0];
    5816       335824 :       stmt_vec_info b0 = SLP_TREE_SCALAR_STMTS (next)[0];
    5817       335824 :       if (STMT_VINFO_GROUPED_ACCESS (a0)
    5818       158113 :           && STMT_VINFO_GROUPED_ACCESS (b0)
    5819       480892 :           && DR_GROUP_FIRST_ELEMENT (a0) == DR_GROUP_FIRST_ELEMENT (b0))
    5820        62964 :         continue;
    5821              :       /* Now we have one or multiple SLP loads of the same group from
    5822              :          firsti to i - 1.  Only runs of grouped accesses are lowered.  */
    5823       272860 :       if (STMT_VINFO_GROUPED_ACCESS (a0))
    5824        95149 :         vect_lower_load_permutations (loop_vinfo, bst_map,
    5825        95149 :                                       make_array_slice (&loads[firsti],
    5826              :                                                         i - firsti),
    5827              :                                       force_single_lane);
    5828              :       firsti = i;  /* The next run starts at I.  */
    5829              :     }
    5830       766866 :   if (firsti < loads.length ()  /* Handle the final run, which the loop above leaves open.  */
    5831       766866 :       && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (loads[firsti])[0]))
    5832        66269 :     vect_lower_load_permutations (loop_vinfo, bst_map,
    5833        66269 :                                   make_array_slice (&loads[firsti],
    5834        66269 :                                                     loads.length () - firsti),
    5835              :                                   force_single_lane);
    5836       473646 : }
    5837              : 
    5838              : /* Check if there are stmts in the loop that can be vectorized using SLP.
    5839              :    Build SLP trees of packed scalar stmts if SLP is possible.  */
    5840              : 
    5841              : opt_result
    5842      1111744 : vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size,
    5843              :                   bool force_single_lane)
    5844              : {
    5845      1111744 :   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
    5846      1111744 :   unsigned int i;
    5847      1111744 :   stmt_vec_info first_element;
    5848      1111744 :   slp_instance instance;
    5849              : 
    5850      1111744 :   DUMP_VECT_SCOPE ("vect_analyze_slp");
    5851              : 
    5852      1111744 :   unsigned limit = max_tree_size;
    5853              : 
    5854      1111744 :   scalar_stmts_to_slp_tree_map_t *bst_map
    5855      1111744 :     = new scalar_stmts_to_slp_tree_map_t ();
    5856              : 
    5857              :   /* Find SLP sequences starting from groups of grouped stores.  */
    5858      3144096 :   FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
    5859       920877 :     if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
    5860              :                                      slp_inst_kind_store, max_tree_size, &limit,
    5861              :                                      force_single_lane)
    5862       920877 :         && loop_vinfo)
    5863              :       {
    5864          269 :         release_scalar_stmts_to_slp_tree_map (bst_map);
    5865          269 :         return opt_result::failure_at (vect_location, "SLP build failed.\n");
    5866              :       }
    5867              : 
    5868              :   /* For loops also start SLP discovery from non-grouped stores.  */
    5869      1111475 :   if (loop_vinfo)
    5870              :     {
    5871              :       data_reference_p dr;
    5872      1629947 :       FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr)
    5873      1138688 :         if (DR_IS_WRITE (dr))
    5874              :           {
    5875       370781 :             stmt_vec_info stmt_info = vinfo->lookup_dr (dr)->stmt;
    5876              :             /* Grouped stores are already handled above.  */
    5877       370781 :             if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    5878       100080 :               continue;
    5879       270701 :             vec<stmt_vec_info> stmts;
    5880       270701 :             vec<stmt_vec_info> roots = vNULL;
    5881       270701 :             vec<tree> remain = vNULL;
    5882       270701 :             stmts.create (1);
    5883       270701 :             stmts.quick_push (stmt_info);
    5884       270701 :             if (! vect_build_slp_instance (vinfo, slp_inst_kind_store,
    5885              :                                            stmts, roots, remain, max_tree_size,
    5886              :                                            &limit, bst_map, force_single_lane))
    5887              :               {
    5888         6983 :                 release_scalar_stmts_to_slp_tree_map (bst_map);
    5889         6983 :                 return opt_result::failure_at (vect_location,
    5890              :                                                "SLP build failed.\n");
    5891              :               }
    5892              :           }
    5893              : 
    5894              :       stmt_vec_info stmt_info;
    5895       491299 :       FOR_EACH_VEC_ELT (LOOP_VINFO_ALTERNATE_DEFS (loop_vinfo), i, stmt_info)
    5896              :         {
    5897           20 :           vec<stmt_vec_info> stmts;
    5898           20 :           vec<stmt_vec_info> roots = vNULL;
    5899           20 :           vec<tree> remain = vNULL;
    5900           20 :           stmts.create (1);
    5901           20 :           stmts.quick_push (stmt_info);
    5902           20 :           if (! vect_build_slp_instance (vinfo, slp_inst_kind_store,
    5903              :                                          stmts, roots, remain, max_tree_size,
    5904              :                                          &limit, bst_map, force_single_lane))
    5905              :             {
    5906            0 :               release_scalar_stmts_to_slp_tree_map (bst_map);
    5907            0 :               return opt_result::failure_at (vect_location,
    5908              :                                              "SLP build failed.\n");
    5909              :             }
    5910              :         }
    5911              :     }
    5912              : 
    5913      1104492 :   if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
    5914              :     {
    5915      1858290 :       for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
    5916              :         {
    5917      1245057 :           vect_location = bb_vinfo->roots[i].roots[0]->stmt;
    5918              :           /* Apply patterns.  */
    5919      3889721 :           for (unsigned j = 0; j < bb_vinfo->roots[i].stmts.length (); ++j)
    5920      5289328 :             bb_vinfo->roots[i].stmts[j]
    5921      2727668 :               = vect_stmt_to_vectorize (bb_vinfo->roots[i].stmts[j]);
    5922      1245057 :           if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
    5923      1245057 :                                        bb_vinfo->roots[i].stmts,
    5924      1245057 :                                        bb_vinfo->roots[i].roots,
    5925      1245057 :                                        bb_vinfo->roots[i].remain,
    5926              :                                        max_tree_size, &limit, bst_map, false))
    5927              :             {
    5928       127817 :               bb_vinfo->roots[i].roots = vNULL;
    5929       127817 :               bb_vinfo->roots[i].remain = vNULL;
    5930              :             }
    5931      1245057 :           bb_vinfo->roots[i].stmts = vNULL;
    5932              :         }
    5933              :     }
    5934              : 
    5935      1104492 :   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
    5936              :     {
    5937              :       /* Find SLP sequences starting from groups of reductions.  */
    5938       491259 :       if (!vect_analyze_slp_reductions (loop_vinfo, max_tree_size, &limit,
    5939              :                                         bst_map, force_single_lane))
    5940              :         {
    5941         2999 :           release_scalar_stmts_to_slp_tree_map (bst_map);
    5942         2999 :           return opt_result::failure_at (vect_location, "SLP build failed.\n");
    5943              :         }
    5944              : 
    5945              :       /* Make sure to vectorize only-live stmts, usually inductions.  */
    5946      2199375 :       for (edge e : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
    5947      1422268 :         for (auto gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi);
    5948       678476 :              gsi_next (&gsi))
    5949              :           {
    5950       687673 :             gphi *lc_phi = *gsi;
    5951       687673 :             tree def = gimple_phi_arg_def_from_edge (lc_phi, e);
    5952       687673 :             stmt_vec_info stmt_info;
    5953       687673 :             if (TREE_CODE (def) == SSA_NAME
    5954       575444 :                 && !virtual_operand_p (def)
    5955       298863 :                 && (stmt_info = loop_vinfo->lookup_def (def))
    5956       268104 :                 && ((stmt_info = vect_stmt_to_vectorize (stmt_info)), true)
    5957       268104 :                 && STMT_VINFO_RELEVANT (stmt_info) == vect_used_only_live
    5958       208321 :                 && STMT_VINFO_LIVE_P (stmt_info)
    5959       208321 :                 && !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info))
    5960       794177 :                 && STMT_VINFO_REDUC_IDX (stmt_info) == -1)
    5961              :               {
    5962       106417 :                 vec<stmt_vec_info> stmts;
    5963       106417 :                 vec<stmt_vec_info> roots = vNULL;
    5964       106417 :                 vec<tree> remain = vNULL;
    5965       106417 :                 stmts.create (1);
    5966       106417 :                 stmts.quick_push (vect_stmt_to_vectorize (stmt_info));
    5967       106417 :                 if (! vect_build_slp_instance (vinfo,
    5968              :                                                slp_inst_kind_reduc_group,
    5969              :                                                stmts, roots, remain,
    5970              :                                                max_tree_size, &limit,
    5971              :                                                bst_map, force_single_lane))
    5972              :                   {
    5973         9197 :                     release_scalar_stmts_to_slp_tree_map (bst_map);
    5974         9197 :                     return opt_result::failure_at (vect_location,
    5975              :                                                    "SLP build failed.\n");
    5976              :                   }
    5977              :               }
    5978         9197 :           }
    5979              : 
    5980              :       /* Find SLP sequences starting from gconds.  */
    5981      1189875 :       for (auto cond : LOOP_VINFO_LOOP_CONDS (loop_vinfo))
    5982              :         {
    5983       279149 :           auto cond_info = loop_vinfo->lookup_stmt (cond);
    5984              : 
    5985       279149 :           cond_info = vect_stmt_to_vectorize (cond_info);
    5986       279149 :           vec<stmt_vec_info> roots = vNULL;
    5987       279149 :           roots.safe_push (cond_info);
    5988       279149 :           gimple *stmt = STMT_VINFO_STMT (cond_info);
    5989       279149 :           tree args0 = gimple_cond_lhs (stmt);
    5990       279149 :           tree args1 = gimple_cond_rhs (stmt);
    5991              : 
    5992              :           /* These should be enforced by cond lowering, but if it failed
    5993              :              bail.  */
    5994       279149 :           if (gimple_cond_code (stmt) != NE_EXPR
    5995       278059 :               || TREE_TYPE (args0) != boolean_type_node
    5996       556643 :               || !integer_zerop (args1))
    5997              :             {
    5998         1655 :               roots.release ();
    5999         1655 :               release_scalar_stmts_to_slp_tree_map (bst_map);
    6000         1655 :               return opt_result::failure_at (vect_location,
    6001              :                                              "SLP build failed.\n");
    6002              :             }
    6003              : 
    6004              :           /* An argument without a loop def will be codegened from vectorizing the
    6005              :              root gcond itself.  As such we don't need to try to build an SLP tree
    6006              :              from them.  It's highly likely that the resulting SLP tree here if both
    6007              :              arguments have a def will be incompatible, but we rely on it being split
    6008              :              later on.  */
    6009       277494 :           auto varg = loop_vinfo->lookup_def (args0);
    6010       277494 :           vec<stmt_vec_info> stmts;
    6011       277494 :           vec<tree> remain = vNULL;
    6012       277494 :           stmts.create (1);
    6013       277494 :           stmts.quick_push (vect_stmt_to_vectorize (varg));
    6014              : 
    6015       277494 :           if (! vect_build_slp_instance (vinfo, slp_inst_kind_gcond,
    6016              :                                          stmts, roots, remain,
    6017              :                                          max_tree_size, &limit,
    6018              :                                          bst_map, force_single_lane))
    6019              :             {
    6020         3762 :               roots.release ();
    6021         3762 :               release_scalar_stmts_to_slp_tree_map (bst_map);
    6022         3762 :               return opt_result::failure_at (vect_location,
    6023              :                                              "SLP build failed.\n");
    6024              :             }
    6025              :         }
    6026              :     }
    6027              : 
    6028      1086879 :   hash_set<slp_tree> visited_patterns;
    6029      1086879 :   slp_tree_to_load_perm_map_t perm_cache;
    6030      1086879 :   slp_compat_nodes_map_t compat_cache;
    6031              : 
    6032              :   /* See if any patterns can be found in the SLP tree.  */
    6033      1086879 :   bool pattern_found = false;
    6034      3722199 :   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
    6035      1548441 :     pattern_found |= vect_match_slp_patterns (instance, vinfo,
    6036              :                                               &visited_patterns, &perm_cache,
    6037              :                                               &compat_cache);
    6038              : 
    6039              :   /* If any were found optimize permutations of loads.  */
    6040      1086879 :   if (pattern_found)
    6041              :     {
    6042          264 :       hash_map<slp_tree, slp_tree> load_map;
    6043         3366 :       FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
    6044              :         {
    6045         2838 :           slp_tree root = SLP_INSTANCE_TREE (instance);
    6046         2838 :           optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
    6047              :                                         &load_map, root);
    6048              :         }
    6049          264 :     }
    6050              : 
    6051              :   /* Check whether we should force some SLP instances to use load/store-lanes
    6052              :      and do so by forcing SLP re-discovery with single lanes.  We used
    6053              :      to cancel SLP when this applied to all instances in a loop but now
    6054              :      we decide this per SLP instance.  It's important to do this only
    6055              :      after SLP pattern recognition.  */
    6056      1086879 :   if (is_a <loop_vec_info> (vinfo))
    6057      1233403 :     FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
    6058       759757 :       if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
    6059       291045 :           && !SLP_INSTANCE_TREE (instance)->ldst_lanes)
    6060              :         {
    6061       291045 :           slp_tree slp_root = SLP_INSTANCE_TREE (instance);
    6062       291045 :           unsigned int group_size = SLP_TREE_LANES (slp_root);
    6063       291045 :           tree vectype = SLP_TREE_VECTYPE (slp_root);
    6064              : 
    6065       291045 :           stmt_vec_info rep_info = SLP_TREE_REPRESENTATIVE (slp_root);
    6066       291045 :           gimple *rep = STMT_VINFO_STMT (rep_info);
    6067       291045 :           bool masked = (is_gimple_call (rep)
    6068         2556 :                          && gimple_call_internal_p (rep)
    6069       293581 :                          && internal_fn_mask_index
    6070         2536 :                               (gimple_call_internal_fn (rep)) != -1);
    6071       291025 :           if (!STMT_VINFO_GROUPED_ACCESS (rep_info)
    6072        29104 :               || slp_root->ldst_lanes
    6073       320149 :               || (vect_store_lanes_supported (vectype, group_size, masked)
    6074              :                   == IFN_LAST))
    6075       291045 :             continue;
    6076              : 
    6077            0 :           auto_vec<slp_tree> loads;
    6078            0 :           hash_set<slp_tree> visited;
    6079            0 :           vect_gather_slp_loads (loads, slp_root, visited);
    6080              : 
    6081              :           /* Check whether any load in the SLP instance is possibly
    6082              :              permuted.  */
    6083            0 :           bool loads_permuted = false;
    6084            0 :           slp_tree load_node;
    6085            0 :           unsigned j;
    6086            0 :           FOR_EACH_VEC_ELT (loads, j, load_node)
    6087              :             {
    6088            0 :               if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
    6089            0 :                 continue;
    6090              :               unsigned k;
    6091              :               stmt_vec_info load_info;
    6092            0 :               FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), k, load_info)
    6093            0 :                 if (SLP_TREE_LOAD_PERMUTATION (load_node)[k] != k)
    6094              :                   {
    6095              :                     loads_permuted = true;
    6096              :                     break;
    6097              :                   }
    6098              :             }
    6099              : 
    6100              :           /* If the loads and stores can use load/store-lanes force re-discovery
    6101              :              with single lanes.  */
    6102            0 :           if (loads_permuted)
    6103              :             {
    6104            0 :               bool can_use_lanes = true;
    6105              :               bool prefer_load_lanes = false;
    6106            0 :               FOR_EACH_VEC_ELT (loads, j, load_node)
    6107            0 :                 if (STMT_VINFO_GROUPED_ACCESS
    6108              :                       (SLP_TREE_REPRESENTATIVE (load_node)))
    6109              :                   {
    6110            0 :                     stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
    6111              :                         (SLP_TREE_REPRESENTATIVE (load_node));
    6112            0 :                     rep = STMT_VINFO_STMT (stmt_vinfo);
    6113            0 :                     masked = (is_gimple_call (rep)
    6114            0 :                               && gimple_call_internal_p (rep)
    6115            0 :                               && internal_fn_mask_index
    6116            0 :                                    (gimple_call_internal_fn (rep)));
    6117              :                     /* Use SLP for strided accesses (or if we can't
    6118              :                        load-lanes).  */
    6119            0 :                     if (STMT_VINFO_STRIDED_P (stmt_vinfo)
    6120            0 :                         || compare_step_with_zero (vinfo, stmt_vinfo) <= 0
    6121            0 :                         || vect_load_lanes_supported
    6122            0 :                              (SLP_TREE_VECTYPE (load_node),
    6123            0 :                               DR_GROUP_SIZE (stmt_vinfo), masked) == IFN_LAST
    6124              :                         /* ???  During SLP re-discovery with a single lane
    6125              :                            a masked grouped load will appear permuted and
    6126              :                            discovery will fail.  We have to rework this
    6127              :                            on the discovery side - for now avoid ICEing.  */
    6128            0 :                         || masked)
    6129              :                       {
    6130              :                         can_use_lanes = false;
    6131              :                         break;
    6132              :                       }
    6133              :                     /* Make sure that the target would prefer store-lanes
    6134              :                        for at least one of the loads.
    6135              : 
    6136              :                        ??? Perhaps we should instead require this for
    6137              :                        all loads?  */
    6138            0 :                     prefer_load_lanes
    6139              :                       = (prefer_load_lanes
    6140            0 :                          || SLP_TREE_LANES (load_node) == group_size
    6141            0 :                          || (vect_slp_prefer_store_lanes_p
    6142            0 :                              (vinfo, stmt_vinfo,
    6143              :                               SLP_TREE_VECTYPE (load_node), masked,
    6144              :                               group_size, SLP_TREE_LANES (load_node))));
    6145              :                   }
    6146              : 
    6147            0 :               if (can_use_lanes && prefer_load_lanes)
    6148              :                 {
    6149            0 :                   if (dump_enabled_p ())
    6150            0 :                     dump_printf_loc (MSG_NOTE, vect_location,
    6151              :                                      "SLP instance %p can use load/store-lanes,"
    6152              :                                      " re-discovering with single-lanes\n",
    6153              :                                      (void *) instance);
    6154              : 
    6155            0 :                   stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (slp_root);
    6156              : 
    6157            0 :                   vect_free_slp_instance (instance);
    6158            0 :                   limit = max_tree_size;
    6159            0 :                   bool res = vect_analyze_slp_instance (vinfo, bst_map,
    6160              :                                                         stmt_info,
    6161              :                                                         slp_inst_kind_store,
    6162              :                                                         max_tree_size, &limit,
    6163              :                                                         true);
    6164            0 :                   gcc_assert (res);
    6165            0 :                   auto new_inst = LOOP_VINFO_SLP_INSTANCES (vinfo).pop ();
    6166            0 :                   LOOP_VINFO_SLP_INSTANCES (vinfo)[i] = new_inst;
    6167              :                 }
    6168              :             }
    6169            0 :         }
    6170              : 
    6171              :   /* When we end up with load permutations that we cannot possibly handle,
    6172              :      like those requiring three vector inputs, lower them using interleaving
    6173              :      like schemes.  */
    6174      1086879 :   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
    6175              :     {
    6176       473646 :       vect_lower_load_permutations (loop_vinfo, bst_map, force_single_lane);
    6177       473646 :       if (dump_enabled_p ())
    6178              :         {
    6179        20019 :           dump_printf_loc (MSG_NOTE, vect_location,
    6180              :                            "SLP graph after lowering permutations:\n");
    6181        20019 :           hash_set<slp_tree> visited;
    6182        89263 :           FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
    6183        29231 :             vect_print_slp_graph (MSG_NOTE, vect_location,
    6184              :                                   SLP_INSTANCE_TREE (instance), visited);
    6185        20019 :         }
    6186              :     }
    6187              : 
    6188      1086879 :   release_scalar_stmts_to_slp_tree_map (bst_map);
    6189              : 
    6190      1086879 :   if (pattern_found && dump_enabled_p ())
    6191              :     {
    6192           18 :       dump_printf_loc (MSG_NOTE, vect_location,
    6193              :                        "Pattern matched SLP tree\n");
    6194           18 :       hash_set<slp_tree> visited;
    6195           90 :       FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
    6196           36 :         vect_print_slp_graph (MSG_NOTE, vect_location,
    6197              :                               SLP_INSTANCE_TREE (instance), visited);
    6198           18 :     }
    6199              : 
    6200      1086879 :   return opt_result::success ();
    6201      1086879 : }
    6202              : 
    6203              : /* Estimates the cost of inserting layout changes into the SLP graph.
    6204              :    An impossible insertion is encoded as a DEPTH of sreal::max ().  */
    6205              : 
    6206              : struct slpg_layout_cost
    6207              : {
    6208     10489186 :   slpg_layout_cost () = default;
    6209              :   slpg_layout_cost (sreal, bool);
    6210              : 
    6211       430955 :   static slpg_layout_cost impossible () { return { sreal::max (), 0 }; }
    6212      4677176 :   bool is_possible () const { return depth != sreal::max (); }
    6213              : 
    6214              :   bool operator== (const slpg_layout_cost &) const;
    6215              :   bool operator!= (const slpg_layout_cost &) const;
    6216              : 
    6217              :   bool is_better_than (const slpg_layout_cost &, bool) const;
    6218              : 
    6219              :   void add_parallel_cost (const slpg_layout_cost &);
    6220              :   void add_serial_cost (const slpg_layout_cost &);
    6221              :   void split (unsigned int);
    6222              : 
    6223              :   /* The longest sequence of layout changes needed during any traversal
    6224              :      of the partition dag, weighted by execution frequency.
    6225              : 
    6226              :      This is the most important metric when optimizing for speed, since
    6227              :      it helps to ensure that we keep the number of operations on
    6228              :      critical paths to a minimum.  */
    6229              :   sreal depth = 0;
    6230              : 
    6231              :   /* An estimate of the total number of operations needed.  It is weighted by
    6232              :      execution frequency when optimizing for speed but not when optimizing for
    6233              :      size.  In order to avoid double-counting, a node with a fanout of N will
    6234              :      distribute 1/N of its total cost to each successor.
    6235              : 
    6236              :      This is the most important metric when optimizing for size, since
    6237              :      it helps to keep the total number of operations to a minimum.  */
    6238              :   sreal total = 0;
    6239              : };
    6240              : 
    6241              : /* Construct costs for a node with weight WEIGHT; a higher weight means
    6242              :    more frequent execution.  DEPTH is always WEIGHT.  TOTAL is WEIGHT too,
    6243              :    unless IS_FOR_SIZE (optimizing for size) and WEIGHT > 0, when it is 1.  */
    6244              : 
    6245      1109172 : slpg_layout_cost::slpg_layout_cost (sreal weight, bool is_for_size)
    6246      1109862 :   : depth (weight), total (is_for_size && weight > 0 ? 1 : weight)
    6247              : {
    6248      1109172 : }
    6249              : /* Return true if both DEPTH and TOTAL equal those of OTHER.  */
    6250              : bool
    6251            0 : slpg_layout_cost::operator== (const slpg_layout_cost &other) const
    6252              : {
    6253            0 :   return depth == other.depth && total == other.total;
    6254              : }
    6255              : /* Return true if DEPTH or TOTAL differs from that of OTHER.  */
    6256              : bool
    6257            0 : slpg_layout_cost::operator!= (const slpg_layout_cost &other) const
    6258              : {
    6259            0 :   return !operator== (other);
    6260              : }
    6261              : 
    6262              : /* Return true if these costs are strictly lower than OTHER, comparing TOTAL
    6263              :    then DEPTH if IS_FOR_SIZE (optimizing for size), else DEPTH then TOTAL.  */
    6264              : 
    6265              : bool
    6266       268700 : slpg_layout_cost::is_better_than (const slpg_layout_cost &other,
    6267              :                                   bool is_for_size) const
    6268              : {
    6269       268700 :   if (is_for_size)
    6270              :     {
    6271          301 :       if (total != other.total)
    6272          117 :         return total < other.total;
    6273          184 :       return depth < other.depth;
    6274              :     }
    6275              :   else
    6276              :     {
    6277       268399 :       if (depth != other.depth)
    6278       110562 :         return depth < other.depth;
    6279       157837 :       return total < other.total;
    6280              :     }
    6281              : }
    6282              : 
    6283              : /* Increase the costs to account for something with cost INPUT_COST happening
    6284              :    in parallel with the current costs: DEPTH is the maximum, TOTAL the sum.  */
    6285              : 
    6286              : void
    6287       326382 : slpg_layout_cost::add_parallel_cost (const slpg_layout_cost &input_cost)
    6288              : {
    6289       326382 :   depth = std::max (depth, input_cost.depth);
    6290       326382 :   total += input_cost.total;
    6291       326382 : }
    6292              : 
    6293              : /* Increase the costs to account for something with cost OTHER happening
    6294              :    in series with the current costs: both DEPTH and TOTAL are summed.  */
    6295              : 
    6296              : void
    6297      1321715 : slpg_layout_cost::add_serial_cost (const slpg_layout_cost &other)
    6298              : {
    6299      1321715 :   depth += other.depth;
    6300      1321715 :   total += other.total;
    6301      1321715 : }
    6302              : 
    6303              : /* Split TOTAL (but not DEPTH) among TIMES successors or predecessors.  */
    6304              : 
    6305              : void
    6306      1090412 : slpg_layout_cost::split (unsigned int times)
    6307              : {
    6308      1090412 :   if (times > 1)  /* TIMES of 0 or 1 leaves TOTAL as it is.  */
    6309       507182 :     total /= times;
    6310      1090412 : }
    6311              : 
    6312              : /* Information about one node in the SLP graph, for use during
    6313              :    vect_optimize_slp_pass.  Construction only sets NODE.  */
    6314              : 
    6315              : struct slpg_vertex
    6316              : {
    6317      9872731 :   slpg_vertex (slp_tree node_) : node (node_) {}
    6318              : 
    6319              :   /* The node itself.  */
    6320              :   slp_tree node;
    6321              : 
    6322              :   /* Which partition the node belongs to, or -1 if none.  Nodes outside of
    6323              :      partitions are flexible; they can have whichever layout consumers
    6324              :      want them to have.  */
    6325              :   int partition = -1;
    6326              : 
    6327              :   /* The number of nodes that directly use the result of this one
    6328              :      (i.e. the number of nodes that count this one as a child).  */
    6329              :   unsigned int out_degree = 0;
    6330              : 
    6331              :   /* The execution frequency of the node.  */
    6332              :   sreal weight = 0;
    6333              : 
    6334              :   /* The total execution frequency of all nodes that directly use the
    6335              :      result of this one.  */
    6336              :   sreal out_weight = 0;
    6337              : };
    6338              : 
    6339              : /* Information about one partition of the SLP graph, for use during
    6340              :    vect_optimize_slp_pass.  Initially empty, with no layout chosen.  */
    6341              : 
    6342              : struct slpg_partition_info
    6343              : {
    6344              :   /* The nodes in the partition occupy indices [NODE_BEGIN, NODE_END)
    6345              :      of m_partitioned_nodes.  */
    6346              :   unsigned int node_begin = 0;
    6347              :   unsigned int node_end = 0;
    6348              : 
    6349              :   /* Which layout we've chosen to use for this partition, or -1 if
    6350              :      we haven't picked one yet.  */
    6351              :   int layout = -1;
    6352              : 
    6353              :   /* The number of predecessors and successors in the partition dag.
    6354              :      The predecessors always have lower partition numbers and the
    6355              :      successors always have higher partition numbers.
    6356              : 
    6357              :      Note that the directions of these edges are not necessarily the
    6358              :      same as in the data flow graph.  For example, if an SCC has separate
    6359              :      partitions for an inner loop and an outer loop, the inner loop's
    6360              :      partition will have at least two incoming edges from the outer loop's
    6361              :      partition: one for a live-in value and one for a live-out value.
    6362              :      In data flow terms, one of these edges would also be from the outer loop
    6363              :      to the inner loop, but the other would be in the opposite direction.  */
    6364              :   unsigned int in_degree = 0;
    6365              :   unsigned int out_degree = 0;
    6366              : };
    6367              : 
    6368              : /* Information about the costs of using a particular layout for a
    6369              :    particular partition.  It can also say that the combination is
    6370              :    impossible, which is recorded in INTERNAL_COST.  */
    6371              : 
    6372              : struct slpg_partition_layout_costs
    6373              : {
    6374      1349234 :   bool is_possible () const { return internal_cost.is_possible (); }
    6375        50722 :   void mark_impossible () { internal_cost = slpg_layout_cost::impossible (); }
    6376              : 
    6377              :   /* The costs inherited from predecessor partitions.  */
    6378              :   slpg_layout_cost in_cost;
    6379              : 
    6380              :   /* The inherent cost of the layout within the node itself.  For example,
    6381              :      this is nonzero for a load if choosing a particular layout would require
    6382              :      the load to permute the loaded elements.  It is nonzero for a
    6383              :      VEC_PERM_EXPR if the permutation cannot be eliminated or converted
    6384              :      to full-vector moves.  */
    6385              :   slpg_layout_cost internal_cost;
    6386              : 
    6387              :   /* The costs inherited from successor partitions.  */
    6388              :   slpg_layout_cost out_cost;
    6389              : };
    6390              : 
    6391              : /* This class tries to optimize the layout of vectors in order to avoid
    6392              :    unnecessary shuffling.  At the moment, the set of possible layouts is
    6393              :    restricted to bijective permutations.
    6394              : 
    6395              :    The goal of the pass depends on whether we're optimizing for size or
    6396              :    for speed.  When optimizing for size, the goal is to reduce the overall
    6397              :    number of layout changes (including layout changes implied by things
    6398              :    like load permutations).  When optimizing for speed, the goal is to
    6399              :    reduce the maximum latency attributable to layout changes on any
    6400              :    non-cyclical path through the data flow graph.
    6401              : 
    6402              :    For example, when optimizing a loop nest for speed, we will prefer
    6403              :    to make layout changes outside of a loop rather than inside of a loop,
    6404              :    and will prefer to make layout changes in parallel rather than serially,
    6405              :    even if that increases the overall number of layout changes.
    6406              : 
    6407              :    The high-level procedure is:
    6408              : 
    6409              :    (1) Build a graph in which edges go from uses (parents) to definitions
    6410              :        (children).
    6411              : 
    6412              :    (2) Divide the graph into a dag of strongly-connected components (SCCs).
    6413              : 
    6414              :    (3) When optimizing for speed, partition the nodes in each SCC based
    6415              :        on their containing cfg loop.  When optimizing for size, treat
    6416              :        each SCC as a single partition.
    6417              : 
    6418              :        This gives us a dag of partitions.  The goal is now to assign a
    6419              :        layout to each partition.
    6420              : 
    6421              :    (4) Construct a set of vector layouts that are worth considering.
    6422              :        Record which nodes must keep their current layout.
    6423              : 
    6424              :    (5) Perform a forward walk over the partition dag (from loads to stores)
    6425              :        accumulating the "forward" cost of using each layout.  When visiting
    6426              :        each partition, assign a tentative choice of layout to the partition
    6427              :        and use that choice when calculating the cost of using a different
    6428              :        layout in successor partitions.
    6429              : 
    6430              :    (6) Perform a backward walk over the partition dag (from stores to loads),
    6431              :        accumulating the "backward" cost of using each layout.  When visiting
    6432              :        each partition, make a final choice of layout for that partition based
    6433              :        on the accumulated forward costs (from (5)) and backward costs
    6434              :        (from (6)).
    6435              : 
    6436              :    (7) Apply the chosen layouts to the SLP graph.
    6437              : 
    6438              :    For example, consider the SLP statements:
    6439              : 
    6440              :    S1:      a_1 = load
    6441              :        loop:
    6442              :    S2:      a_2 = PHI<a_1, a_3>
    6443              :    S3:      b_1 = load
    6444              :    S4:      a_3 = a_2 + b_1
    6445              :        exit:
    6446              :    S5:      a_4 = PHI<a_3>
    6447              :    S6:      store a_4
    6448              : 
    6449              :    S2 and S4 form an SCC and are part of the same loop.  Every other
    6450              :    statement is in a singleton SCC.  In this example there is a one-to-one
    6451              :    mapping between SCCs and partitions and the partition dag looks like this:
    6452              : 
    6453              :         S1     S3
    6454              :          \     /
    6455              :           S2+S4
    6456              :             |
    6457              :            S5
    6458              :             |
    6459              :            S6
    6460              : 
    6461              :    S2, S3 and S4 will have a higher execution frequency than the other
    6462              :    statements, so when optimizing for speed, the goal is to avoid any
    6463              :    layout changes:
    6464              : 
    6465              :    - within S3
    6466              :    - within S2+S4
    6467              :    - on the S3->S2+S4 edge
    6468              : 
    6469              :    For example, if S3 was originally a reversing load, the goal of the
    6470              :    pass is to make it an unreversed load and change the layout on the
    6471              :    S1->S2+S4 and S2+S4->S5 edges to compensate.  (Changing the layout
    6472              :    on S1->S2+S4 and S5->S6 would also be acceptable.)
    6473              : 
    6474              :    The difference between SCCs and partitions becomes important if we
    6475              :    add an outer loop:
    6476              : 
    6477              :    S1:      a_1 = ...
    6478              :        loop1:
    6479              :    S2:      a_2 = PHI<a_1, a_6>
    6480              :    S3:      b_1 = load
    6481              :    S4:      a_3 = a_2 + b_1
    6482              :        loop2:
    6483              :    S5:      a_4 = PHI<a_3, a_5>
    6484              :    S6:      c_1 = load
    6485              :    S7:      a_5 = a_4 + c_1
    6486              :        exit2:
    6487              :    S8:      a_6 = PHI<a_5>
    6488              :    S9:      store a_6
    6489              :        exit1:
    6490              : 
    6491              :    Here, S2, S4, S5, S7 and S8 form a single SCC.  However, when optimizing
    6492              :    for speed, we usually do not want restrictions in the outer loop to "infect"
    6493              :    the decision for the inner loop.  For example, if an outer-loop node
    6494              :    in the SCC contains a statement with a fixed layout, that should not
    6495              :    prevent the inner loop from using a different layout.  Conversely,
    6496              :    the inner loop should not dictate a layout to the outer loop: if the
    6497              :    outer loop does a lot of computation, then it may not be efficient to
    6498              :    do all of that computation in the inner loop's preferred layout.
    6499              : 
    6500              :    So when optimizing for speed, we partition the SCC into S2+S4+S8 (outer)
    6501              :    and S5+S7 (inner).  We also try to arrange partitions so that:
    6502              : 
    6503              :    - the partition for an outer loop comes before the partition for
    6504              :      an inner loop
    6505              : 
    6506              :    - if a sibling loop A dominates a sibling loop B, A's partition
    6507              :      comes before B's
    6508              : 
    6509              :    This gives the following partition dag for the example above:
    6510              : 
    6511              :         S1        S3
    6512              :          \        /
    6513              :           S2+S4+S8   S6
    6514              :            |   \\    /
    6515              :            |    S5+S7
    6516              :            |
    6517              :           S9
    6518              : 
    6519              :    There are two edges from S2+S4+S8 to S5+S7: one for the edge S4->S5 and
    6520              :    one for a reversal of the edge S7->S8.
    6521              : 
    6522              :    The backward walk picks a layout for S5+S7 before S2+S4+S8.  The choice
    6523              :    for S2+S4+S8 therefore has to balance the cost of using the outer loop's
    6524              :    preferred layout against the cost of changing the layout on entry to the
    6525              :    inner loop (S4->S5) and on exit from the inner loop (S7->S8 reversed).
    6526              : 
    6527              :    Although this works well when optimizing for speed, it has the downside
    6528              :    when optimizing for size that the choice of layout for S5+S7 is completely
    6529              :    independent of S9, which lessens the chance of reducing the overall number
    6530              :    of permutations.  We therefore do not partition SCCs when optimizing
    6531              :    for size.
    6532              : 
    6533              :    To give a concrete example of the difference between optimizing
    6534              :    for size and speed, consider:
    6535              : 
    6536              :    a[0] = (b[1] << c[3]) - d[1];
    6537              :    a[1] = (b[0] << c[2]) - d[0];
    6538              :    a[2] = (b[3] << c[1]) - d[3];
    6539              :    a[3] = (b[2] << c[0]) - d[2];
    6540              : 
    6541              :    There are three different layouts here: one for a, one for b and d,
    6542              :    and one for c.  When optimizing for speed it is better to permute each
    6543              :    of b, c and d into the order required by a, since those permutations
    6544              :    happen in parallel.  But when optimizing for size, it is better to:
    6545              : 
    6546              :    - permute c into the same order as b
    6547              :    - do the arithmetic
    6548              :    - permute the result into the order required by a
    6549              : 
    6550              :    This gives 2 permutations rather than 3.  */
    6551              : 
    6552              : class vect_optimize_slp_pass
    6553              : {
    6554              : public:
    6555       678936 :   vect_optimize_slp_pass (vec_info *vinfo) : m_vinfo (vinfo) {}
    6556              :   void run ();
    6557              : 
    6558              : private:
    6559              :   /* Graph building.  */
    6560              :   struct loop *containing_loop (slp_tree);
    6561              :   bool is_cfg_latch_edge (graph_edge *);
    6562              :   void build_vertices (hash_set<slp_tree> &, slp_tree);
    6563              :   void build_vertices ();
    6564              :   void build_graph ();
    6565              : 
    6566              :   /* Partitioning.  */
    6567              :   void create_partitions ();
    6568              :   template<typename T> void for_each_partition_edge (unsigned int, T);
    6569              : 
    6570              :   /* Layout selection.  */
    6571              :   bool is_compatible_layout (slp_tree, unsigned int);
    6572              :   bool is_compatible_layout (const slpg_partition_info &, unsigned int);
    6573              :   int change_layout_cost (slp_tree, unsigned int, unsigned int);
    6574              :   slpg_partition_layout_costs &partition_layout_costs (unsigned int,
    6575              :                                                        unsigned int);
    6576              :   void change_vec_perm_layout (slp_tree, lane_permutation_t &,
    6577              :                                int, unsigned int);
    6578              :   int internal_node_cost (slp_tree, int, unsigned int);
    6579              :   void start_choosing_layouts ();
    6580              :   bool legitimize ();
    6581              : 
    6582              :   /* Cost propagation.  */
    6583              :   slpg_layout_cost edge_layout_cost (graph_edge *, unsigned int,
    6584              :                                      unsigned int, unsigned int);
    6585              :   slpg_layout_cost total_in_cost (unsigned int);
    6586              :   slpg_layout_cost forward_cost (graph_edge *, unsigned int, unsigned int);
    6587              :   slpg_layout_cost backward_cost (graph_edge *, unsigned int, unsigned int);
    6588              :   void forward_pass ();
    6589              :   void backward_pass ();
    6590              : 
    6591              :   /* Rematerialization.  */
    6592              :   slp_tree get_result_with_layout (slp_tree, unsigned int);
    6593              :   void materialize ();
    6594              : 
    6595              :   /* Clean-up.  */
    6596              :   void remove_redundant_permutations ();
    6597              : 
    6598              :   /* Masked load lanes discovery.  */
    6599              :   void decide_masked_load_lanes ();
    6600              : 
    6601              :   void dump ();
    6602              : 
    6603              :   vec_info *m_vinfo;
    6604              : 
    6605              :   /* True if we should optimize the graph for size, false if we should
    6606              :      optimize it for speed.  (It wouldn't be easy to make this decision
    6607              :      more locally.)  */
    6608              :   bool m_optimize_size;
    6609              : 
    6610              :   /* A graph of all SLP nodes, with edges leading from uses to definitions.
    6611              :      In other words, a node's predecessors are its slp_tree parents and
    6612              :      a node's successors are its slp_tree children.  */
    6613              :   graph *m_slpg = nullptr;
    6614              : 
    6615              :   /* The vertices of M_SLPG, indexed by slp_tree::vertex.  */
    6616              :   auto_vec<slpg_vertex> m_vertices;
    6617              : 
    6618              :   /* The list of all leaves of M_SLPG, such as external definitions, constants,
    6619              :      and loads.  */
    6620              :   auto_vec<int> m_leafs;
    6621              : 
    6622              :   /* This array has one entry for every vector layout that we're considering.
    6623              :      Element 0 is null and indicates "no change".  Other entries describe
    6624              :      permutations that are inherent in the current graph and that we would
    6625              :      like to reverse if possible.
    6626              : 
    6627              :      For example, a permutation { 1, 2, 3, 0 } means that something has
    6628              :      effectively been permuted in that way, such as a load group
    6629              :      { a[1], a[2], a[3], a[0] } (viewed as a permutation of a[0:3]).
    6630              :      We'd then like to apply the reverse permutation { 3, 0, 1, 2 }
    6631              :      in order to put things "back" in order.  */
    6632              :   auto_vec<vec<unsigned> > m_perms;
    6633              : 
    6634              :   /* A partitioning of the nodes for which a layout must be chosen.
    6635              :      Each partition represents an <SCC, cfg loop> pair; that is,
    6636              :      nodes in different SCCs belong to different partitions, and nodes
    6637              :      within an SCC can be further partitioned according to a containing
    6638              :      cfg loop.  Partition <SCC1, L1> comes before <SCC2, L2> if:
    6639              : 
    6640              :      - SCC1 != SCC2 and SCC1 is a predecessor of SCC2 in a forward walk
    6641              :        from leaves (such as loads) to roots (such as stores).
    6642              : 
    6643              :      - SCC1 == SCC2 and L1's header strictly dominates L2's header.  */
    6644              :   auto_vec<slpg_partition_info> m_partitions;
    6645              : 
    6646              :   /* The list of all nodes for which a layout must be chosen.  Nodes for
    6647              :      partition P come before the nodes for partition P+1.  Nodes within a
    6648              :      partition are in reverse postorder.  */
    6649              :   auto_vec<unsigned int> m_partitioned_nodes;
    6650              : 
    6651              :   /* Index P * num-layouts + L contains the cost of using layout L
    6652              :      for partition P.  */
    6653              :   auto_vec<slpg_partition_layout_costs> m_partition_layout_costs;
    6654              : 
    6655              :   /* Index N * num-layouts + L, if nonnull, is a node that provides the
    6656              :      original output of node N adjusted to have layout L.  */
    6657              :   auto_vec<slp_tree> m_node_layouts;
    6658              : };
    6659              : 
    6660              : /* Add NODE and every node reachable from it through SLP_TREE_CHILDREN to
    6661              :    m_vertices, skipping nodes already in VISITED, and record leaves in m_leafs.
    6662              :    Clear m_optimize_size if a node's statement is in a block optimized for speed.  */
    6663              : 
    6664              : void
    6665     10674921 : vect_optimize_slp_pass::build_vertices (hash_set<slp_tree> &visited,
    6666              :                                         slp_tree node)
    6667              : {
    6668     10674921 :   unsigned i;
    6669     10674921 :   slp_tree child;
    6670              : 
    6671     10674921 :   if (visited.add (node))
    6672     10674921 :     return;  /* NODE was already in VISITED.  */
    6673              : 
    6674      9872731 :   if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
    6675              :     {
    6676      7784818 :       basic_block bb = gimple_bb (vect_orig_stmt (rep)->stmt);
    6677      6932394 :       if (optimize_bb_for_speed_p (bb))
    6678      6812674 :         m_optimize_size = false;
    6679              :     }
    6680              : 
    6681      9872731 :   node->vertex = m_vertices.length ();  /* Index of the entry pushed next.  */
    6682      9872731 :   m_vertices.safe_push (slpg_vertex (node));
    6683              : 
    6684      9872731 :   bool leaf = true;
    6685      9872731 :   bool force_leaf = false;
    6686     18487438 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    6687      8614707 :     if (child)
    6688              :       {
    6689      7751959 :         leaf = false;
    6690      7751959 :         build_vertices (visited, child);
    6691              :       }
    6692              :     else
    6693              :       force_leaf = true;  /* NULL child; handled as described below.  */
    6694              :   /* Since SLP discovery works along use-def edges all cycles have an
    6695              :      entry - but there's the exception of cycles where we do not handle
    6696              :      the entry explicitly (but with a NULL SLP node), like some reductions
    6697              :      and inductions.  Force those SLP PHIs to act as leafs to make them
    6698              :      backwards reachable.  */
    6699      9872731 :   if (leaf || force_leaf)
    6700      4879546 :     m_leafs.safe_push (node->vertex);
    6701              : }
    6702              : 
    6703              : /* Rebuild m_vertices and m_leafs from the trees of all SLP instances in m_vinfo.  */
    6704              : 
    6705              : void
    6706      1357872 : vect_optimize_slp_pass::build_vertices ()
    6707              : {
    6708      1357872 :   hash_set<slp_tree> visited;  /* Shared by all instance walks below.  */
    6709      1357872 :   unsigned i;
    6710      1357872 :   slp_instance instance;
    6711      1357872 :   m_vertices.truncate (0);  /* Discard the results of any earlier call.  */
    6712      1357872 :   m_leafs.truncate (0);
    6713      6996578 :   FOR_EACH_VEC_ELT (m_vinfo->slp_instances, i, instance)
    6714      2922962 :     build_vertices (visited, SLP_INSTANCE_TREE (instance));
    6715      1357872 : }
    6716              : 
    6717              : /* Apply (reverse) bijective PERM to VEC in place; REVERSE selects the direction.  */
    6718              : 
    6719              : template <class T>
    6720              : static void
    6721       172837 : vect_slp_permute (vec<unsigned> perm,
    6722              :                   vec<T> &vec, bool reverse)
    6723              : {
    6724       172837 :   auto_vec<T, 64> saved;  /* Copy of VEC's original contents.  */
    6725       172837 :   saved.create (vec.length ());
    6726       572225 :   for (unsigned i = 0; i < vec.length (); ++i)
    6727       399388 :     saved.quick_push (vec[i]);
    6728              : 
    6729       172837 :   if (reverse)
    6730              :     {
    6731      1135438 :       for (unsigned i = 0; i < vec.length (); ++i)
    6732       398056 :         vec[perm[i]] = saved[i];  /* Old element I moves to index PERM[I].  */
    6733       570295 :       for (unsigned i = 0; i < vec.length (); ++i)
    6734       699787 :         gcc_assert (vec[perm[i]] == saved[i]);  /* No element was overwritten.  */
    6735              :     }
    6736              :   else
    6737              :     {
    6738         3860 :       for (unsigned i = 0; i < vec.length (); ++i)
    6739         1332 :         vec[i] = saved[perm[i]];  /* Index I receives old element PERM[I].  */
    6740       174169 :       for (unsigned i = 0; i < vec.length (); ++i)
    6741         1998 :         gcc_assert (vec[i] == saved[perm[i]]);
    6742              :     }
    6743       172837 : }
    6744              : 
    6745              : /* Return the cfg loop that contains NODE's representative statement.  */
    6746              : 
    6747              : struct loop *
    6748      3869483 : vect_optimize_slp_pass::containing_loop (slp_tree node)
    6749              : {
    6750      3869483 :   stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
    6751      3869483 :   if (!rep)
    6752         5133 :     return ENTRY_BLOCK_PTR_FOR_FN (cfun)->loop_father;  /* No statement: use the entry block's loop.  */
    6753      4303735 :   return gimple_bb (vect_orig_stmt (rep)->stmt)->loop_father;
    6754              : }
    6755              : 
    6756              : /* Return true if UD (an edge from a use to a definition) is associated
    6757              :    with a loop latch edge in the cfg.  */
    6758              : 
    6759              : bool
    6760      7751959 : vect_optimize_slp_pass::is_cfg_latch_edge (graph_edge *ud)
    6761              : {
    6762      7751959 :   slp_tree use = m_vertices[ud->src].node;
    6763      7751959 :   slp_tree def = m_vertices[ud->dest].node;
    6764      7751959 :   if ((SLP_TREE_DEF_TYPE (use) != vect_internal_def
    6765      7751959 :        || SLP_TREE_PERMUTE_P (use))
    6766      7440800 :       || SLP_TREE_DEF_TYPE (def) != vect_internal_def)
    6767              :     return false;  /* Both ends must be internal defs and USE must not be a permute.  */
    6768              : 
    6769      4513592 :   stmt_vec_info use_rep = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (use));
    6770      4513592 :   return (is_a<gphi *> (use_rep->stmt)  /* USE is a PHI...  */
    6771       376248 :           && bb_loop_header_p (gimple_bb (use_rep->stmt))  /* ...in a loop header...  */
    6772      4724988 :           && containing_loop (def) == containing_loop (use));  /* ...fed from the same loop.  */
    6773              : }
    6774              : 
    6775              : /* Build m_slpg with an edge from each SLP node to each non-null child.  Mark
    6776              :    edges that correspond to cfg loop latch edges with a nonnull data field.  */
    6777              : 
    6778              : void
    6779      1357872 : vect_optimize_slp_pass::build_graph ()
    6780              : {
    6781      1357872 :   m_optimize_size = true;  /* Until build_vertices finds otherwise.  */
    6782      1357872 :   build_vertices ();
    6783              : 
    6784      2715744 :   m_slpg = new_graph (m_vertices.length ());
    6785     13946347 :   for (slpg_vertex &v : m_vertices)
    6786     29517700 :     for (slp_tree child : SLP_TREE_CHILDREN (v.node))
    6787      8614707 :       if (child)
    6788              :         {
    6789      7751959 :           graph_edge *ud = add_edge (m_slpg, v.node->vertex, child->vertex);
    6790      7751959 :           if (is_cfg_latch_edge (ud))
    6791       202676 :             ud->data = this;  /* Nonnull marker; tested by skip_cfg_latch_edges.  */
    6792              :         }
    6793      1357872 : }
    6794              : 
    6795              : /* Return true if E corresponds to a loop latch edge in the cfg, i.e. has nonnull data.  */
    6796              : 
    6797              : static bool
    6798      3977038 : skip_cfg_latch_edges (graph_edge *e)
    6799              : {
    6800      3977038 :   return e->data;
    6801              : }
    6802              : 
    6803              : /* Create the node partitions, filling m_partitions and m_partitioned_nodes.  */
    6804              : 
    6805              : void
    6806       678936 : vect_optimize_slp_pass::create_partitions ()
    6807              : {
    6808              :   /* Calculate a postorder of the graph, ignoring edges that correspond
    6809              :      to natural latch edges in the cfg.  Reading the vector from the end
    6810              :      to the beginning gives the reverse postorder.  */
    6811       678936 :   auto_vec<int> initial_rpo;
    6812      1357872 :   graphds_dfs (m_slpg, &m_leafs[0], m_leafs.length (), &initial_rpo,
    6813              :                false, NULL, skip_cfg_latch_edges);
    6814      2036808 :   gcc_assert (initial_rpo.length () == m_vertices.length ());
    6815              : 
    6816              :   /* Calculate the strongly connected components of the graph.  */
    6817       678936 :   auto_vec<int> scc_grouping;
    6818       678936 :   unsigned int num_sccs = graphds_scc (m_slpg, NULL, NULL, &scc_grouping);
    6819              : 
    6820              :   /* Create a new index order in which all nodes from the same SCC are
    6821              :      consecutive.  Use scc_pos to record the index of the first node in
    6822              :      each SCC.  */
    6823       678936 :   auto_vec<unsigned int> scc_pos (num_sccs);
    6824       678936 :   int last_component = -1;
    6825       678936 :   unsigned int node_count = 0;
    6826      6972896 :   for (unsigned int node_i : scc_grouping)
    6827              :     {
    6828      4936088 :       if (last_component != m_slpg->vertices[node_i].component)
    6829              :         {
    6830      4808743 :           last_component = m_slpg->vertices[node_i].component;
    6831      9617486 :           gcc_assert (last_component == int (scc_pos.length ()));
    6832      4808743 :           scc_pos.quick_push (node_count);
    6833              :         }
    6834      4936088 :       node_count += 1;
    6835              :     }
    6836      1357872 :   gcc_assert (node_count == initial_rpo.length ()
    6837              :               && last_component + 1 == int (num_sccs));
    6838              : 
    6839              :   /* Use m_partitioned_nodes to group nodes into SCC order, with the nodes
    6840              :      inside each SCC following the RPO we calculated above.  The fact that
    6841              :      we ignored natural latch edges when calculating the RPO should ensure
    6842              :      that, for natural loop nests:
    6843              : 
    6844              :      - the first node that we encounter in a cfg loop is the loop header phi
    6845              :      - the loop header phis are in dominance order
    6846              : 
    6847              :      Arranging for this is an optimization (see below) rather than a
    6848              :      correctness issue.  Unnatural loops with a tangled mess of backedges
    6849              :      will still work correctly, but might give poorer results.
    6850              : 
    6851              :      Also update scc_pos so that it gives 1 + the index of the last node
    6852              :      in the SCC.  */
    6853       678936 :   m_partitioned_nodes.safe_grow (node_count);
    6854      6293960 :   for (unsigned int old_i = initial_rpo.length (); old_i-- > 0;)
    6855              :     {
    6856      4936088 :       unsigned int node_i = initial_rpo[old_i];
    6857      4936088 :       unsigned int new_i = scc_pos[m_slpg->vertices[node_i].component]++;
    6858      4936088 :       m_partitioned_nodes[new_i] = node_i;
    6859              :     }
    6860              : 
    6861              :   /* When optimizing for speed, partition each SCC based on the containing
    6862              :      cfg loop.  The order we constructed above should ensure that, for natural
    6863              :      cfg loops, we'll create sub-SCC partitions for outer loops before
    6864              :      the corresponding sub-SCC partitions for inner loops.  Similarly,
    6865              :      when one sibling loop A dominates another sibling loop B, we should
    6866              :      create a sub-SCC partition for A before a sub-SCC partition for B.
    6867              : 
    6868              :      As above, nothing depends for correctness on whether this achieves
    6869              :      a natural nesting, but we should get better results when it does.  */
    6870      1357872 :   m_partitions.reserve (m_vertices.length ());
    6871       678936 :   unsigned int next_partition_i = 0;
    6872       678936 :   hash_map<struct loop *, int> loop_partitions;
    6873       678936 :   unsigned int rpo_begin = 0;
    6874       678936 :   unsigned int num_partitioned_nodes = 0;
    6875      6845551 :   for (unsigned int rpo_end : scc_pos)
    6876              :     {
    6877      4808743 :       loop_partitions.empty ();  /* The loop-to-partition map is per SCC.  */
    6878              :       unsigned int partition_i = next_partition_i;
    6879      9744831 :       for (unsigned int rpo_i = rpo_begin; rpo_i < rpo_end; ++rpo_i)
    6880              :         {
    6881              :           /* Handle externals and constants optimistically throughout.
    6882              :              But treat existing vectors as fixed since we do not handle
    6883              :              permuting them.  */
    6884      4936088 :           unsigned int node_i = m_partitioned_nodes[rpo_i];
    6885      4936088 :           auto &vertex = m_vertices[node_i];
    6886      4936088 :           if ((SLP_TREE_DEF_TYPE (vertex.node) == vect_external_def
    6887       496061 :                && !SLP_TREE_VEC_DEFS (vertex.node).exists ())
    6888      4938123 :               || SLP_TREE_DEF_TYPE (vertex.node) == vect_constant_def)
    6889      1465035 :             vertex.partition = -1;  /* Not assigned to any partition.  */
    6890              :           else
    6891              :             {
    6892      3471053 :               bool existed;
    6893      3471053 :               if (m_optimize_size)
    6894        24362 :                 existed = next_partition_i > partition_i;  /* One partition per SCC.  */
    6895              :               else
    6896              :                 {
    6897      3446691 :                   struct loop *loop = containing_loop (vertex.node);
    6898      3446691 :                   auto &entry = loop_partitions.get_or_insert (loop, &existed);
    6899      3446691 :                   if (!existed)
    6900      3320393 :                     entry = next_partition_i;
    6901      3446691 :                   partition_i = entry;
    6902              :                 }
    6903      3471053 :               if (!existed)
    6904              :                 {
    6905      3344665 :                   m_partitions.quick_push (slpg_partition_info ());
    6906      3344665 :                   next_partition_i += 1;
    6907              :                 }
    6908      3471053 :               vertex.partition = partition_i;
    6909      3471053 :               num_partitioned_nodes += 1;
    6910      3471053 :               m_partitions[partition_i].node_end += 1;  /* Used as a node count for now.  */
    6911              :             }
    6912              :         }
    6913      4808743 :       rpo_begin = rpo_end;
    6914              :     }
    6915              : 
    6916              :   /* Assign ranges of consecutive node indices to each partition,
    6917              :      in partition order.  Start with node_end being the same as
    6918              :      node_begin so that the next loop can use it as a counter.  */
    6919       678936 :   unsigned int node_begin = 0;
    6920      5381473 :   for (auto &partition : m_partitions)
    6921              :     {
    6922      3344665 :       partition.node_begin = node_begin;
    6923      3344665 :       node_begin += partition.node_end;
    6924      3344665 :       partition.node_end = partition.node_begin;
    6925              :     }
    6926       678936 :   gcc_assert (node_begin == num_partitioned_nodes);
    6927              : 
    6928              :   /* Finally build the list of nodes in partition order.  */
    6929       678936 :   m_partitioned_nodes.truncate (num_partitioned_nodes);
    6930      5615024 :   for (unsigned int node_i = 0; node_i < m_vertices.length (); ++node_i)
    6931              :     {
    6932      4936088 :       int partition_i = m_vertices[node_i].partition;
    6933      4936088 :       if (partition_i >= 0)
    6934              :         {
    6935      3471053 :           unsigned int order_i = m_partitions[partition_i].node_end++;
    6936      3471053 :           m_partitioned_nodes[order_i] = node_i;
    6937              :         }
    6938              :     }
    6939       678936 : }
    6940              : 
    6941              : /* Look for edges from earlier partitions into node NODE_I and edges from
    6942              :    node NODE_I into later partitions.  Call:
    6943              : 
    6944              :       FN (ud, other_node_i)
    6945              : 
    6946              :    for each such use-to-def edge ud, where other_node_i is the node at the
    6947              :    other end of the edge.  */
    6948              : 
    6949              : template<typename T>
    6950              : void
    6951      3856908 : vect_optimize_slp_pass::for_each_partition_edge (unsigned int node_i, T fn)
    6952              : {
    6953      3856908 :   int partition_i = m_vertices[node_i].partition;
    6954      3856908 :   for (graph_edge *pred = m_slpg->vertices[node_i].pred;
    6955      6668551 :        pred; pred = pred->pred_next)
    6956              :     {
    6957      2811643 :       int src_partition_i = m_vertices[pred->src].partition;
    6958      2811643 :       if (src_partition_i >= 0 && src_partition_i != partition_i)
    6959      2488590 :         fn (pred, pred->src);  /* The other node is in a different partition.  */
    6960              :     }
    6961      3856908 :   for (graph_edge *succ = m_slpg->vertices[node_i].succ;
    6962      8266592 :        succ; succ = succ->succ_next)
    6963              :     {
    6964      4409684 :       int dest_partition_i = m_vertices[succ->dest].partition;
    6965      4409684 :       if (dest_partition_i >= 0 && dest_partition_i != partition_i)
    6966      2516189 :         fn (succ, succ->dest);  /* Likewise for outgoing edges.  */
    6967              :     }
    6968      3856908 : }
    6969              : 
    6970              : /* Return true if layout LAYOUT_I is compatible with the number of SLP lanes
    6971              :    that NODE would operate on.  This test is independent of NODE's actual
    6972              :    operation.  */
    6973              : 
    6974              : bool
    6975      1503197 : vect_optimize_slp_pass::is_compatible_layout (slp_tree node,
    6976              :                                               unsigned int layout_i)
    6977              : {
    6978      1503197 :   if (layout_i == 0)
    6979              :     return true;  /* Layout 0 is always accepted.  */
    6980              : 
    6981       848720 :   if (SLP_TREE_LANES (node) != m_perms[layout_i].length ())
    6982        14558 :     return false;  /* The permutation length differs from the lane count.  */
    6983              : 
    6984              :   return true;
    6985              : }
    6986              : 
    6987              : /* Return true if layout LAYOUT_I is compatible with the number of SLP lanes
    6988              :    that NODE would operate on for each NODE in PARTITION.
    6989              :    This test is independent of NODE's actual operations.  */
    6990              : 
    6991              : bool
    6992        16238 : vect_optimize_slp_pass::is_compatible_layout (const slpg_partition_info
    6993              :                                                 &partition,
    6994              :                                               unsigned int layout_i)
    6995              : {
    6996        32748 :   for (unsigned int order_i = partition.node_begin;
    6997        32748 :        order_i < partition.node_end; ++order_i)
    6998              :     {
    6999        16577 :       unsigned int node_i = m_partitioned_nodes[order_i];
    7000        16577 :       auto &vertex = m_vertices[node_i];
    7001              : 
    7002              :       /* The layout is incompatible if it is individually incompatible
    7003              :          with any node in the partition.  */
    7004        16577 :       if (!is_compatible_layout (vertex.node, layout_i))
    7005              :         return false;
    7006              :     }
    7007              :   return true;  /* Every node in PARTITION accepted the layout.  */
    7008              : }
    7009              : 
    7010              : /* Return the cost (in arbitrary units) of going from layout FROM_LAYOUT_I
    7011              :    to layout TO_LAYOUT_I for a node like NODE.  Return -1 if either of the
    7012              :    layouts is incompatible with NODE or if the change is not possible for
    7013              :    some other reason.
    7014              : 
    7015              :    The properties taken from NODE include the number of lanes and the
    7016              :    vector type.  The actual operation doesn't matter.  */
    7017              : 
    7018              : int
    7019       638892 : vect_optimize_slp_pass::change_layout_cost (slp_tree node,
    7020              :                                             unsigned int from_layout_i,
    7021              :                                             unsigned int to_layout_i)
    7022              : {
    7023       638892 :   if (!is_compatible_layout (node, from_layout_i)
    7024       638892 :       || !is_compatible_layout (node, to_layout_i))
    7025          545 :     return -1;
    7026              : 
    7027       638347 :   if (from_layout_i == to_layout_i)
    7028              :     return 0;  /* Nothing to change.  */
    7029              : 
    7030       262952 :   auto_vec<slp_tree, 1> children (1);
    7031       262952 :   children.quick_push (node);  /* NODE is the only input of the trial permute.  */
    7032       262952 :   auto_lane_permutation_t perm (SLP_TREE_LANES (node));
    7033       262952 :   if (from_layout_i > 0)
    7034       750666 :     for (unsigned int i : m_perms[from_layout_i])
    7035       333747 :       perm.quick_push ({ 0, i });  /* Lanes of input 0 in FROM_LAYOUT_I order.  */
    7036              :   else
    7037       404712 :     for (unsigned int i = 0; i < SLP_TREE_LANES (node); ++i)
    7038       280733 :       perm.quick_push ({ 0, i });  /* Layout 0: identity order.  */
    7039       262952 :   if (to_layout_i > 0)
    7040       124406 :     vect_slp_permute (m_perms[to_layout_i], perm, true);
    7041       262952 :   auto count = vectorizable_slp_permutation_1 (m_vinfo, nullptr, node, perm,
    7042              :                                                children, false);
    7043       262952 :   if (count >= 0)
    7044       258743 :     return MAX (count, 1);  /* A real layout change costs at least 1.  */
    7045              : 
    7046              :   /* ??? In principle we could try changing via layout 0, giving two
    7047              :      layout changes rather than 1.  Doing that would require
    7048              :      corresponding support in get_result_with_layout.  */
    7049              :   return -1;
    7050       262952 : }
    7051              : 
    7052              : /* Return (by reference) the costs of assigning layout LAYOUT_I to partition PARTITION_I.  */
    7053              : 
    7054              : inline slpg_partition_layout_costs &
    7055       927206 : vect_optimize_slp_pass::partition_layout_costs (unsigned int partition_i,
    7056              :                                                 unsigned int layout_i)
    7057              : {
    7058      1854412 :   return m_partition_layout_costs[partition_i * m_perms.length () + layout_i];
    7059              : }
    7060              : 
    7061              : /* Change PERM in one of two ways:
    7062              : 
    7063              :    - if IN_LAYOUT_I < 0, accept input operand I in the layout that has been
    7064              :      chosen for child I of NODE.
    7065              : 
    7066              :    - if IN_LAYOUT_I >= 0, accept all input operands with that layout.
    7067              : 
    7068              :    In both cases, arrange for the output to have layout OUT_LAYOUT_I.  */
    7069              : 
    7070              : void
    7071        30181 : vect_optimize_slp_pass::
    7072              : change_vec_perm_layout (slp_tree node, lane_permutation_t &perm,
    7073              :                         int in_layout_i, unsigned int out_layout_i)
    7074              : {
    7075       175721 :   for (auto &entry : perm)
    7076              :     {
    7077        85178 :       int this_in_layout_i = in_layout_i;
    7078        85178 :       if (this_in_layout_i < 0)
    7079              :         {
    7080        59111 :           slp_tree in_node = SLP_TREE_CHILDREN (node)[entry.first];
    7081        59111 :           unsigned int in_partition_i = m_vertices[in_node->vertex].partition;
    7082        59111 :           if (in_partition_i == -1u)
    7083          329 :             continue;  /* Input has no partition: leave ENTRY unchanged.  */
    7084        58782 :           this_in_layout_i = m_partitions[in_partition_i].layout;
    7085              :         }
    7086        84849 :       if (this_in_layout_i > 0)
    7087        19065 :         entry.second = m_perms[this_in_layout_i][entry.second];  /* Remap the input lane.  */
    7088              :     }
    7089        30181 :   if (out_layout_i > 0)
    7090         7031 :     vect_slp_permute (m_perms[out_layout_i], perm, true);
    7091        30181 : }
    7092              : 
    7093              : /* Check whether the target allows NODE to be rearranged so that the node's
    7094              :    output has layout OUT_LAYOUT_I.  Return the cost of the change if so,
    7095              :    in the same arbitrary units as for change_layout_cost.  Return -1 otherwise.
    7096              : 
    7097              :    If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I < 0, also check whether
    7098              :    NODE can adapt to the layout changes that have (perhaps provisionally)
    7099              :    been chosen for NODE's children, so that no extra permutations are
    7100              :    needed on either the input or the output of NODE.
    7101              : 
    7102              :    If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I >= 0, instead assume
    7103              :    that all inputs will be forced into layout IN_LAYOUT_I beforehand.
    7104              : 
    7105              :    IN_LAYOUT_I has no meaning for other types of node.
    7106              : 
    7107              :    Keeping the node as-is is always valid.  If the target doesn't appear
    7108              :    to support the node as-is, but might realistically support other layouts,
    7109              :    then layout 0 instead has the cost of a worst-case permutation.  On the
    7110              :    one hand, this ensures that every node has at least one valid layout,
    7111              :    avoiding what would otherwise be an awkward special case.  On the other,
    7112              :    it still encourages the pass to change an invalid pre-existing layout
    7113              :    choice into a valid one.  */
    7114              : 
    7115              : int
    7116       205074 : vect_optimize_slp_pass::internal_node_cost (slp_tree node, int in_layout_i,
    7117              :                                             unsigned int out_layout_i)
    7118              : {
    7119       205074 :   const int fallback_cost = 1;
    7120              : 
    7121       205074 :   if (SLP_TREE_PERMUTE_P (node))
    7122              :     {
    7123        25071 :       auto_lane_permutation_t tmp_perm;
    7124        25071 :       tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
    7125              : 
    7126              :       /* Check that the child nodes support the chosen layout.  Checking
    7127              :          the first child is enough, since any second child would have the
    7128              :          same shape.  */
    7129        25071 :       auto first_child = SLP_TREE_CHILDREN (node)[0];
    7130        25071 :       if (in_layout_i > 0
    7131        25071 :           && !is_compatible_layout (first_child, in_layout_i))
    7132              :         return -1;
    7133              : 
    7134        24531 :       change_vec_perm_layout (node, tmp_perm, in_layout_i, out_layout_i);
    7135        49062 :       int count = vectorizable_slp_permutation_1 (m_vinfo, nullptr,
    7136              :                                                   node, tmp_perm,
    7137        24531 :                                                   SLP_TREE_CHILDREN (node),
    7138              :                                                   false);
    7139        24531 :       if (count < 0)
    7140              :         {
    7141         1510 :           if (in_layout_i == 0 && out_layout_i == 0)
    7142              :             {
    7143              :               /* Use the fallback cost if the node could in principle support
    7144              :                  some nonzero layout for both the inputs and the outputs.
    7145              :                  Otherwise assume that the node will be rejected later
    7146              :                  and rebuilt from scalars.  */
    7147          367 :               if (SLP_TREE_LANES (node) == SLP_TREE_LANES (first_child))
    7148              :                 return fallback_cost;
    7149          297 :               return 0;
    7150              :             }
    7151              :           return -1;
    7152              :         }
    7153              : 
    7154              :       /* We currently have no way of telling whether the new layout is cheaper
    7155              :          or more expensive than the old one.  But at least in principle,
    7156              :          it should be worth making zero permutations (whole-vector shuffles)
    7157              :          cheaper than real permutations, in case the pass is able to remove
    7158              :          the latter.  */
    7159        23021 :       return count == 0 ? 0 : 1;
    7160        25071 :     }
    7161              : 
    7162       180003 :   stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
    7163       180003 :   if (rep
    7164       179169 :       && STMT_VINFO_DATA_REF (rep)
    7165        56756 :       && DR_IS_READ (STMT_VINFO_DATA_REF (rep))
    7166       219605 :       && SLP_TREE_LOAD_PERMUTATION (node).exists ())
    7167              :     {
    7168        32527 :       auto_load_permutation_t tmp_perm;
    7169        32527 :       tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
    7170        32527 :       if (out_layout_i > 0)
    7171        12252 :         vect_slp_permute (m_perms[out_layout_i], tmp_perm, true);
    7172              : 
    7173        32527 :       poly_uint64 vf = 1;
    7174        32527 :       if (auto loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo))
    7175        12152 :         vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
    7176        32527 :       unsigned int n_perms;
    7177        32527 :       if (!vect_transform_slp_perm_load_1 (m_vinfo, node, tmp_perm, vNULL,
    7178              :                                            nullptr, vf, true, false, &n_perms))
    7179              :         {
    7180         1503 :           auto rep = SLP_TREE_REPRESENTATIVE (node);
    7181         1503 :           if (out_layout_i == 0)
    7182              :             {
    7183              :               /* Use the fallback cost if the load is an N-to-N permutation.
    7184              :                  Otherwise assume that the node will be rejected later
    7185              :                  and rebuilt from scalars.  */
    7186         1097 :               if (STMT_VINFO_GROUPED_ACCESS (rep)
    7187         2194 :                   && (DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (rep))
    7188         1097 :                       == SLP_TREE_LANES (node)))
    7189          595 :                 return fallback_cost;
    7190              :               return 0;
    7191              :             }
    7192              :           return -1;
    7193              :         }
    7194              : 
    7195              :       /* See the comment above the corresponding VEC_PERM_EXPR handling.  */
    7196        31024 :       return n_perms == 0 ? 0 : 1;
    7197        32527 :     }
    7198              : 
    7199              :   return 0;
    7200              : }
    7201              : 
    7202              : /* Decide which element layouts we should consider using.  Calculate the
    7203              :    weights associated with inserting layout changes on partition edges.
    7204              :    Also mark partitions that cannot change layout, by setting their
    7205              :    layout to zero.
                      : 
                      :    The function works in stages:
                      : 
                      :    - Push layout 0 ("no change") onto M_PERMS.
                      :    - Walk M_PARTITIONED_NODES and, for nodes with a load permutation
                      :      or for single-child VEC_PERM_EXPR nodes, derive a candidate
                      :      layout from the existing permutation.  New layouts are appended
                      :      to M_PERMS (deduplicated via a local hash map) until
                      :      param_vect_max_layout_candidates is reached; after that only
                      :      already-recorded layouts are reused.
                      :    - Size M_PARTITION_LAYOUT_COSTS to one entry per
                      :      (partition, layout) pair.
                      :    - Force layout 0 for the partition containing the
                      :      SLP_INSTANCE_TREE node of constructor instances and of
                      :      reduction chains that need a fold-left reduction.
                      :    - Walk M_PARTITIONED_NODES again to set vertex weights, force
                      :      layout 0 for nodes whose layout cannot change, and accumulate
                      :      the partition in/out degrees and the vertex out-weights and
                      :      out-degrees.  */
    7206              : 
    7207              : void
    7208       678936 : vect_optimize_slp_pass::start_choosing_layouts ()
    7209              : {
    7210              :   /* Used to assign unique permutation indices.  */
    7211       678936 :   using perm_hash = unbounded_hashmap_traits<
    7212              :     vec_free_hash_base<int_hash_base<unsigned>>,
    7213              :     int_hash<int, -1, -2>
    7214              :   >;
    7215       678936 :   hash_map<vec<unsigned>, int, perm_hash> layout_ids;
    7216              : 
    7217              :   /* Layout 0 is "no change".  */
    7218       678936 :   m_perms.safe_push (vNULL);
    7219              : 
    7220              :   /* Create layouts from existing permutations.  */
    7221       678936 :   auto_load_permutation_t tmp_perm;
    7222      5507861 :   for (unsigned int node_i : m_partitioned_nodes)
    7223              :     {
    7224              :       /* Leaves also double as entries to the reverse graph.  Allow the
    7225              :          layout of those to be changed.
                      : 
                      :          Note that the assignment below sets the partition's layout to 0
                      :          for nodes that have no successor edges in M_SLPG; the load
                      :          handling further down may then reset it to -1.  */
    7226      3471053 :       auto &vertex = m_vertices[node_i];
    7227      3471053 :       auto &partition = m_partitions[vertex.partition];
    7228      3471053 :       if (!m_slpg->vertices[node_i].succ)
    7229       884166 :         partition.layout = 0;
    7230              : 
    7231              :       /* Loads and VEC_PERM_EXPRs are the only things generating permutes.
                      :          TMP_PERM collects the node's existing permutation indices, with
                      :          IMIN and IMAX tracking the smallest and largest index seen.
                      :          IMIN starts at the group size plus one for loads and at the
                      :          child's (constant) number of vector subparts for
                      :          VEC_PERM_EXPRs; IMAX starts at zero.  */
    7232      3471053 :       slp_tree node = vertex.node;
    7233      3471053 :       stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
    7234      3471053 :       slp_tree child;
    7235      3471053 :       unsigned HOST_WIDE_INT imin, imax = 0;
    7236      3471053 :       bool any_permute = false;
    7237      3471053 :       tmp_perm.truncate (0);
    7238      3471053 :       if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
    7239              :         {
    7240              :           /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the node
    7241              :              unpermuted, record a layout that reverses this permutation.
    7242              : 
    7243              :              We would need more work to cope with loads that are internally
    7244              :              permuted and also have inputs (such as masks for
    7245              :              IFN_MASK_LOADs).  */
    7246       594481 :           gcc_assert (partition.layout == 0 && !m_slpg->vertices[node_i].succ);
    7247       594481 :           if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
    7248              :             {
    7249       422669 :               partition.layout = -1;
    7250      3454909 :               continue;
    7251              :             }
    7252       171812 :           dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
    7253       171812 :           imin = DR_GROUP_SIZE (dr_stmt) + 1;
    7254       171812 :           tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
    7255              :         }
    7256      5634878 :       else if (SLP_TREE_PERMUTE_P (node)
    7257       136783 :                && SLP_TREE_CHILDREN (node).length () == 1
    7258       118266 :                && (child = SLP_TREE_CHILDREN (node)[0])
    7259      2994838 :                && (TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (child))
    7260       118266 :                    .is_constant (&imin)))
    7261              :         {
    7262              :           /* If the child has the same vector size as this node,
    7263              :              reversing the permutation can make the permutation a no-op.
    7264              :              In other cases it can change a true permutation into a
    7265              :              full-vector extract.  */
    7266       118266 :           tmp_perm.reserve (SLP_TREE_LANES (node));
    7267       317572 :           for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
    7268       199306 :             tmp_perm.quick_push (SLP_TREE_LANE_PERMUTATION (node)[j].second);
    7269              :         }
    7270              :       else
    7271      2758306 :         continue;
    7272              : 
    7273       764972 :       for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
    7274              :         {
    7275       474894 :           unsigned idx = tmp_perm[j];
    7276       474894 :           imin = MIN (imin, idx);
    7277       474894 :           imax = MAX (imax, idx);
    7278       474894 :           if (idx - tmp_perm[0] != j)
    7279       138639 :             any_permute = true;
    7280              :         }
    7281              :       /* If the span doesn't match we'd disrupt VF computation, avoid
    7282              :          that for now.  */
    7283       290078 :       if (imax - imin + 1 != SLP_TREE_LANES (node))
    7284        82680 :         continue;
    7285              :       /* If there's no permute no need to split one out.  In this case
    7286              :          we can consider turning a load into a permuted load, if that
    7287              :          turns out to be cheaper than alternatives.  */
    7288       207398 :       if (!any_permute)
    7289              :         {
    7290       191105 :           partition.layout = -1;
    7291       191105 :           continue;
    7292              :         }
    7293              : 
    7294              :       /* For now only handle true permutes, like
    7295              :          vect_attempt_slp_rearrange_stmts did.  This allows us to be lazy
    7296              :          when permuting constants and invariants keeping the permute
    7297              :          bijective.  */
    7298        16293 :       auto_sbitmap load_index (SLP_TREE_LANES (node));
    7299        16293 :       bitmap_clear (load_index);
    7300        63259 :       for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
    7301        46966 :         bitmap_set_bit (load_index, tmp_perm[j] - imin);
    7302              :       unsigned j;
    7303        62430 :       for (j = 0; j < SLP_TREE_LANES (node); ++j)
    7304        46286 :         if (!bitmap_bit_p (load_index, j))
    7305              :           break;
    7306        16293 :       if (j != SLP_TREE_LANES (node))
    7307          149 :         continue;
    7308              : 
    7309        16144 :       vec<unsigned> perm = vNULL;
    7310        16144 :       perm.safe_grow (SLP_TREE_LANES (node), true);
    7311        62159 :       for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
    7312        46015 :         perm[j] = tmp_perm[j] - imin;
    7313              : 
    7314        32288 :       if (int (m_perms.length ()) >= param_vect_max_layout_candidates)
    7315              :         {
    7316              :           /* Continue to use existing layouts, but don't add any more.  */
    7317            0 :           int *entry = layout_ids.get (perm);
    7318            0 :           partition.layout = entry ? *entry : 0;
    7319            0 :           perm.release ();
    7320              :         }
    7321              :       else
    7322              :         {
    7323        16144 :           bool existed;
    7324        16144 :           int &layout_i = layout_ids.get_or_insert (perm, &existed);
    7325        16144 :           if (existed)
    7326         5553 :             perm.release ();
    7327              :           else
    7328              :             {
    7329        10591 :               layout_i = m_perms.length ();
    7330        10591 :               m_perms.safe_push (perm);
    7331              :             }
    7332        16144 :           partition.layout = layout_i;
    7333              :         }
    7334        16293 :     }
    7335              : 
    7336              :   /* Initially assume that every layout is possible and has zero cost
    7337              :      in every partition.  */
    7338       678936 :   m_partition_layout_costs.safe_grow_cleared (m_partitions.length ()
    7339      1357872 :                                               * m_perms.length ());
    7340              : 
    7341              :   /* We have to mark outgoing permutations facing non-associating-reduction
    7342              :      graph entries that are not represented as to be materialized.
    7343              :      slp_inst_kind_bb_reduc currently only covers associatable reductions.  */
    7344      3498289 :   for (slp_instance instance : m_vinfo->slp_instances)
    7345      1461481 :     if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor)
    7346              :       {
    7347         6409 :         unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
    7348         6409 :         m_partitions[m_vertices[node_i].partition].layout = 0;
    7349              :       }
    7350      1455072 :     else if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_reduc_chain)
    7351              :       {
    7352         2300 :         stmt_vec_info stmt_info
    7353         2300 :           = SLP_TREE_REPRESENTATIVE (SLP_INSTANCE_TREE (instance));
    7354         2300 :         vect_reduc_info reduc_info
    7355         2300 :           = info_for_reduction (as_a <loop_vec_info> (m_vinfo),
    7356              :                                 SLP_INSTANCE_TREE (instance));
    7357         2300 :         if (needs_fold_left_reduction_p (TREE_TYPE
    7358              :                                            (gimple_get_lhs (stmt_info->stmt)),
    7359              :                                          VECT_REDUC_INFO_CODE (reduc_info)))
    7360              :           {
    7361           97 :             unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
    7362           97 :             m_partitions[m_vertices[node_i].partition].layout = 0;
    7363              :           }
    7364              :       }
    7365              : 
    7366              :   /* Check which layouts each node and partition can handle.  Calculate the
    7367              :      weights associated with inserting layout changes on edges.  */
    7368      5507861 :   for (unsigned int node_i : m_partitioned_nodes)
    7369              :     {
    7370      3471053 :       auto &vertex = m_vertices[node_i];
    7371      3471053 :       auto &partition = m_partitions[vertex.partition];
    7372      3471053 :       slp_tree node = vertex.node;
    7373              : 
    7374      3471053 :       if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
    7375              :         {
    7376      3465920 :           vertex.weight = vect_slp_node_weight (node);
    7377              : 
    7378              :           /* We do not handle stores with a permutation, so all
    7379              :              incoming permutations must have been materialized.
    7380              : 
    7381              :              We also don't handle masked grouped loads, which lack a
    7382              :              permutation vector.  In this case the memory locations
    7383              :              form an implicit second input to the loads, on top of the
    7384              :              explicit mask input, and the memory input's layout cannot
    7385              :              be changed.
    7386              : 
    7387              :              On the other hand, we do support permuting gather loads and
    7388              :              masked gather loads, where each scalar load is independent
    7389              :              of the others.  This can be useful if the address/index input
    7390              :              benefits from permutation.  */
    7391      3465920 :           if (STMT_VINFO_DATA_REF (rep)
    7392      1756908 :               && STMT_VINFO_GROUPED_ACCESS (rep)
    7393      4556285 :               && !SLP_TREE_LOAD_PERMUTATION (node).exists ())
    7394       918553 :             partition.layout = 0;
    7395              : 
    7396              :           /* We cannot change the layout of an operation that is
    7397              :              not independent on lanes.  Note this is an explicit
    7398              :              negative list since that's much shorter than the respective
    7399              :              positive one but it's critical to keep maintaining it.  */
    7400      3465920 :           if (is_gimple_call (STMT_VINFO_STMT (rep)))
    7401        31608 :             switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep)))
    7402              :               {
    7403         1091 :               case CFN_COMPLEX_ADD_ROT90:
    7404         1091 :               case CFN_COMPLEX_ADD_ROT270:
    7405         1091 :               case CFN_COMPLEX_MUL:
    7406         1091 :               case CFN_COMPLEX_MUL_CONJ:
    7407         1091 :               case CFN_VEC_ADDSUB:
    7408         1091 :               case CFN_VEC_FMADDSUB:
    7409         1091 :               case CFN_VEC_FMSUBADD:
    7410         1091 :                 partition.layout = 0;
    7411              :               default:;
    7412              :               }
    7413              :         }
    7414              : 
    7415      7809521 :       auto process_edge = [&](graph_edge *ud, unsigned int other_node_i)
    7416              :         {
    7417      4338468 :           auto &other_vertex = m_vertices[other_node_i];
    7418              : 
    7419              :           /* Count the number of edges from earlier partitions and the number
    7420              :              of edges to later partitions.  */
    7421      4338468 :           if (other_vertex.partition < vertex.partition)
    7422      2169234 :             partition.in_degree += 1;
    7423              :           else
    7424      2169234 :             partition.out_degree += 1;
    7425              : 
    7426              :           /* If the current node uses the result of OTHER_NODE_I, accumulate
    7427              :              the effects of that.  */
    7428      4338468 :           if (ud->src == int (node_i))
    7429              :             {
    7430      2169234 :               other_vertex.out_weight += vertex.weight;
    7431      2169234 :               other_vertex.out_degree += 1;
    7432              :             }
    7433      7809521 :         };
    7434      3471053 :       for_each_partition_edge (node_i, process_edge);
    7435              :     }
    7436       678936 : }
    7437              : 
    7438              : /* Return the incoming costs for node NODE_I, assuming that each input keeps
    7439              :    its current (provisional) choice of layout.  The inputs do not necessarily
    7440              :    have the same layout as each other.
                      : 
                      :    Only edges to vertices in earlier partitions contribute.  For each
                      :    such edge, the earlier partition's in_cost and internal_cost for
                      :    its current layout are combined serially, split by that partition's
                      :    out_degree, and then added in parallel to the running total.  */
    7441              : 
    7442              : slpg_layout_cost
    7443         3076 : vect_optimize_slp_pass::total_in_cost (unsigned int node_i)
    7444              : {
    7445         3076 :   auto &vertex = m_vertices[node_i];
    7446         3076 :   slpg_layout_cost cost;
    7447        11238 :   auto add_cost = [&](graph_edge *, unsigned int other_node_i)
    7448              :     {
    7449         8162 :       auto &other_vertex = m_vertices[other_node_i];
    7450         8162 :       if (other_vertex.partition < vertex.partition)
    7451              :         {
    7452         5241 :           auto &other_partition = m_partitions[other_vertex.partition];
    7453        10482 :           auto &other_costs = partition_layout_costs (other_vertex.partition,
    7454         5241 :                                                       other_partition.layout);
    7455         5241 :           slpg_layout_cost this_cost = other_costs.in_cost;
    7456         5241 :           this_cost.add_serial_cost (other_costs.internal_cost);
    7457         5241 :           this_cost.split (other_partition.out_degree);
    7458         5241 :           cost.add_parallel_cost (this_cost);
    7459              :         }
    7460        11238 :     };
    7461         3076 :   for_each_partition_edge (node_i, add_cost);
    7462         3076 :   return cost;
    7463              : }
    7464              : 
    7465              : /* Return the cost of switching between layout LAYOUT1_I (at node NODE1_I)
    7466              :    and layout LAYOUT2_I on cross-partition use-to-def edge UD.  Return
    7467              :    slpg_layout_cost::impossible () if the change isn't possible.
                      : 
                      :    UD->dest is treated as the definition and UD->src as the use.
                      :    LAYOUT1_I belongs to whichever end of UD is NODE1_I and LAYOUT2_I
                      :    to the other end.  The cost is a vertex weight multiplied by the
                      :    factor returned by change_layout_cost for the definition node; a
                      :    negative factor means that the change is impossible.  */
    7468              : 
    7469              : slpg_layout_cost
    7470       638892 : vect_optimize_slp_pass::
    7471              : edge_layout_cost (graph_edge *ud, unsigned int node1_i, unsigned int layout1_i,
    7472              :                   unsigned int layout2_i)
    7473              : {
    7474       638892 :   auto &def_vertex = m_vertices[ud->dest];
    7475       638892 :   auto &use_vertex = m_vertices[ud->src];
    7476       638892 :   auto def_layout_i = ud->dest == int (node1_i) ? layout1_i : layout2_i;
    7477       638892 :   auto use_layout_i = ud->dest == int (node1_i) ? layout2_i : layout1_i;
    7478       638892 :   auto factor = change_layout_cost (def_vertex.node, def_layout_i,
    7479              :                                     use_layout_i);
    7480       638892 :   if (factor < 0)
    7481         4754 :     return slpg_layout_cost::impossible ();
    7482              : 
    7483              :   /* We have a choice of putting the layout change at the site of the
    7484              :      definition or at the site of the use.  Prefer the former when
    7485              :      optimizing for size or when the execution frequency of the
    7486              :      definition is no greater than the combined execution frequencies of
    7487              :      the uses.  When putting the layout change at the site of the definition,
    7488              :      divvy up the cost among all consumers.  */
    7489       634138 :   if (m_optimize_size || def_vertex.weight <= def_vertex.out_weight)
    7490              :     {
    7491       616444 :       slpg_layout_cost cost = { def_vertex.weight * factor, m_optimize_size };
    7492       616444 :       cost.split (def_vertex.out_degree);
    7493       616444 :       return cost;
    7494              :     }
    7495        17694 :   return { use_vertex.weight * factor, m_optimize_size };
    7496              : }
    7497              : 
    7498              : /* UD represents a use-def link between FROM_NODE_I and a node in a later
    7499              :    partition; FROM_NODE_I could be the definition node or the use node.
    7500              :    The node at the other end of the link wants to use layout TO_LAYOUT_I.
    7501              :    Return the cost of any necessary fix-ups on edge UD, or return
    7502              :    slpg_layout_cost::impossible () if the change isn't possible.
    7503              : 
    7504              :    At this point, FROM_NODE_I's partition has chosen the cheapest
    7505              :    layout based on the information available so far, but this choice
    7506              :    is only provisional.
                      : 
                      :    Two alternatives are costed.  In the first, FROM_NODE_I's partition
                      :    keeps its current layout: its in_cost and internal_cost are combined
                      :    serially, split by the partition's out_degree, and the edge cost is
                      :    then added serially.  In the second, the partition switches to
                      :    TO_LAYOUT_I itself, using the costs recorded for that layout and no
                      :    edge cost.  The second replaces the first when the first is
                      :    impossible or when is_better_than says that it is cheaper.
                      : 
                      :    If the edge change is impossible while the partition's current
                      :    layout is 0, the function returns impossible without considering
                      :    the second alternative.  */
    7507              : 
    7508              : slpg_layout_cost
    7509       169523 : vect_optimize_slp_pass::forward_cost (graph_edge *ud, unsigned int from_node_i,
    7510              :                                       unsigned int to_layout_i)
    7511              : {
    7512       169523 :   auto &from_vertex = m_vertices[from_node_i];
    7513       169523 :   unsigned int from_partition_i = from_vertex.partition;
    7514       169523 :   slpg_partition_info &from_partition = m_partitions[from_partition_i];
    7515       169523 :   gcc_assert (from_partition.layout >= 0);
    7516              : 
    7517              :   /* First calculate the cost on the assumption that FROM_PARTITION sticks
    7518              :      with its current layout preference.  */
    7519       169523 :   slpg_layout_cost cost = slpg_layout_cost::impossible ();
    7520       169523 :   auto edge_cost = edge_layout_cost (ud, from_node_i,
    7521       169523 :                                      from_partition.layout, to_layout_i);
    7522       169523 :   if (edge_cost.is_possible ())
    7523              :     {
    7524       334094 :       auto &from_costs = partition_layout_costs (from_partition_i,
    7525       167047 :                                                  from_partition.layout);
    7526       167047 :       cost = from_costs.in_cost;
    7527       167047 :       cost.add_serial_cost (from_costs.internal_cost);
    7528       167047 :       cost.split (from_partition.out_degree);
    7529       167047 :       cost.add_serial_cost (edge_cost);
    7530              :     }
    7531         2476 :   else if (from_partition.layout == 0)
    7532              :     /* We must allow the source partition to have layout 0 as a fallback,
    7533              :        in case all other options turn out to be impossible.  */
    7534         2476 :     return cost;
    7535              : 
    7536              :   /* Take the minimum of that cost and the cost that applies if
    7537              :      FROM_PARTITION instead switches to TO_LAYOUT_I.  */
    7538       167047 :   auto &direct_layout_costs = partition_layout_costs (from_partition_i,
    7539              :                                                       to_layout_i);
    7540       167047 :   if (direct_layout_costs.is_possible ())
    7541              :     {
    7542       147586 :       slpg_layout_cost direct_cost = direct_layout_costs.in_cost;
    7543       147586 :       direct_cost.add_serial_cost (direct_layout_costs.internal_cost);
    7544       147586 :       direct_cost.split (from_partition.out_degree);
    7545       147586 :       if (!cost.is_possible ()
    7546       147586 :           || direct_cost.is_better_than (cost, m_optimize_size))
    7547        33660 :         cost = direct_cost;
    7548              :     }
    7549              : 
    7550       167047 :   return cost;
    7551              : }
    7552              : 
    7553              : /* UD represents a use-def link between TO_NODE_I and a node in an earlier
    7554              :    partition; TO_NODE_I could be the definition node or the use node.
    7555              :    The node at the other end of the link wants to use layout FROM_LAYOUT_I;
    7556              :    return the cost of any necessary fix-ups on edge UD, or
    7557              :    slpg_layout_cost::impossible () if the choice cannot be made.
    7558              : 
    7559              :    At this point, TO_NODE_I's partition has a fixed choice of layout.
                      : 
                      :    If TO_NODE_I is the use end of UD and is a VEC_PERM_EXPR node, the
                      :    function first asks internal_node_cost whether the node can absorb
                      :    the change.  To do so it temporarily sets the layout of the
                      :    definition's partition to FROM_LAYOUT_I and restores the old value
                      :    straight afterwards.  A nonnegative factor gives a cost of the
                      :    partition's out_cost plus the node weight times that factor, split
                      :    by the partition's in_degree.
                      : 
                      :    Otherwise the cost is the partition's out_cost plus internal_cost,
                      :    split by in_degree, plus the cost of a layout change on UD.  */
    7560              : 
    7561              : slpg_layout_cost
    7562       154094 : vect_optimize_slp_pass::backward_cost (graph_edge *ud, unsigned int to_node_i,
    7563              :                                        unsigned int from_layout_i)
    7564              : {
    7565       154094 :   auto &to_vertex = m_vertices[to_node_i];
    7566       154094 :   unsigned int to_partition_i = to_vertex.partition;
    7567       154094 :   slpg_partition_info &to_partition = m_partitions[to_partition_i];
    7568       154094 :   gcc_assert (to_partition.layout >= 0);
    7569              : 
    7570              :   /* If TO_NODE_I is a VEC_PERM_EXPR consumer, see whether it can be
    7571              :      adjusted for this input having layout FROM_LAYOUT_I.  Assume that
    7572              :      any other inputs keep their current choice of layout.  */
    7573       154094 :   auto &to_costs = partition_layout_costs (to_partition_i,
    7574              :                                            to_partition.layout);
    7575       154094 :   if (ud->src == int (to_node_i)
    7576       153892 :       && SLP_TREE_PERMUTE_P (to_vertex.node))
    7577              :     {
    7578         9377 :       auto &from_partition = m_partitions[m_vertices[ud->dest].partition];
    7579         9377 :       auto old_layout = from_partition.layout;
    7580         9377 :       from_partition.layout = from_layout_i;
    7581        18754 :       int factor = internal_node_cost (to_vertex.node, -1,
    7582         9377 :                                        to_partition.layout);
    7583         9377 :       from_partition.layout = old_layout;
    7584         9377 :       if (factor >= 0)
    7585              :         {
    7586         8747 :           slpg_layout_cost cost = to_costs.out_cost;
    7587        17494 :           cost.add_serial_cost ({ to_vertex.weight * factor,
    7588         8747 :                                   m_optimize_size });
    7589         8747 :           cost.split (to_partition.in_degree);
    7590         8747 :           return cost;
    7591              :         }
    7592              :     }
    7593              : 
    7594              :   /* Compute the cost if we insert any necessary layout change on edge UD.  */
    7595       145347 :   auto edge_cost = edge_layout_cost (ud, to_node_i,
    7596       145347 :                                      to_partition.layout, from_layout_i);
    7597       145347 :   if (edge_cost.is_possible ())
    7598              :     {
    7599       145347 :       slpg_layout_cost cost = to_costs.out_cost;
    7600       145347 :       cost.add_serial_cost (to_costs.internal_cost);
    7601       145347 :       cost.split (to_partition.in_degree);
    7602       145347 :       cost.add_serial_cost (edge_cost);
    7603       145347 :       return cost;
    7604              :     }
    7605              : 
    7606            0 :   return slpg_layout_cost::impossible ();
    7607              : }
    7608              : 
    7609              : /* Make a forward pass through the partitions, accumulating input costs.
    7610              :    Make a tentative (provisional) choice of layout for each partition,
    7611              :    ensuring that this choice still allows later partitions to keep
    7612              :    their original layout.  */
    7613              : 
    7614              : void
    7615         5219 : vect_optimize_slp_pass::forward_pass ()
    7616              : {
    7617       108197 :   for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
    7618              :        ++partition_i)
    7619              :     {
    7620       102978 :       auto &partition = m_partitions[partition_i];
    7621              : 
    7622              :       /* If the partition consists of a single VEC_PERM_EXPR, precompute
    7623              :          the incoming cost that would apply if every predecessor partition
    7624              :          keeps its current layout.  This is used within the loop below.  */
    7625       102978 :       slpg_layout_cost in_cost;
    7626       102978 :       slp_tree single_node = nullptr;
    7627       102978 :       if (partition.node_end == partition.node_begin + 1)
    7628              :         {
    7629        96752 :           unsigned int node_i = m_partitioned_nodes[partition.node_begin];
    7630        96752 :           single_node = m_vertices[node_i].node;
    7631        96752 :           if (SLP_TREE_PERMUTE_P (single_node))
    7632         3076 :             in_cost = total_in_cost (node_i);
    7633              :         }
    7634              : 
    7635              :       /* Go through the possible layouts.  Decide which ones are valid
    7636              :          for this partition and record which of the valid layouts has
    7637              :          the lowest cost.  */
    7638       102978 :       unsigned int min_layout_i = 0;
    7639       102978 :       slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
    7640       314787 :       for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
    7641              :         {
    7642       211809 :           auto &layout_costs = partition_layout_costs (partition_i, layout_i);
    7643       211809 :           if (!layout_costs.is_possible ())
    7644        50722 :             continue;
    7645              : 
    7646              :           /* If the recorded layout is already 0 then the layout cannot
    7647              :              change.  */
    7648       211809 :           if (partition.layout == 0 && layout_i != 0)
    7649              :             {
    7650        34200 :               layout_costs.mark_impossible ();
    7651        34200 :               continue;
    7652              :             }
    7653              : 
    7654       177609 :           bool is_possible = true;
    7655       366951 :           for (unsigned int order_i = partition.node_begin;
    7656       366951 :                order_i < partition.node_end; ++order_i)
    7657              :             {
    7658       203764 :               unsigned int node_i = m_partitioned_nodes[order_i];
    7659       203764 :               auto &vertex = m_vertices[node_i];
    7660              : 
    7661              :               /* Reject the layout if it is individually incompatible
    7662              :                  with any node in the partition.  */
    7663       203764 :               if (!is_compatible_layout (vertex.node, layout_i))
    7664              :                 {
    7665        13406 :                   is_possible = false;
    7666        14422 :                   break;
    7667              :                 }
    7668              : 
    7669       516967 :               auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
    7670              :                 {
    7671       326609 :                   auto &other_vertex = m_vertices[other_node_i];
    7672       326609 :                   if (other_vertex.partition < vertex.partition)
    7673              :                     {
    7674              :                       /* Accumulate the incoming costs from earlier
    7675              :                          partitions, plus the cost of any layout changes
    7676              :                          on UD itself.  */
    7677       169523 :                       auto cost = forward_cost (ud, other_node_i, layout_i);
    7678       169523 :                       if (!cost.is_possible ())
    7679         2476 :                         is_possible = false;
    7680              :                       else
    7681       167047 :                         layout_costs.in_cost.add_parallel_cost (cost);
    7682              :                     }
    7683              :                   else
    7684              :                     /* Reject the layout if it would make layout 0 impossible
    7685              :                        for later partitions.  This amounts to testing that the
    7686              :                        target supports reversing the layout change on edges
    7687              :                        to later partitions.
    7688              : 
    7689              :                        In principle, it might be possible to push a layout
    7690              :                        change all the way down a graph, so that it never
    7691              :                        needs to be reversed and so that the target doesn't
    7692              :                        need to support the reverse operation.  But it would
    7693              :                        be awkward to bail out if we hit a partition that
    7694              :                        does not support the new layout, especially since
    7695              :                        we are not dealing with a lattice.  */
    7696       157086 :                     is_possible &= edge_layout_cost (ud, other_node_i, 0,
    7697       157086 :                                                      layout_i).is_possible ();
    7698       516967 :                 };
    7699       190358 :               for_each_partition_edge (node_i, add_cost);
    7700              : 
    7701              :               /* Accumulate the cost of using LAYOUT_I within NODE,
    7702              :                  both for the inputs and the outputs.  */
    7703       190358 :               int factor = internal_node_cost (vertex.node, layout_i,
    7704              :                                                layout_i);
    7705       190358 :               if (factor < 0)
    7706              :                 {
    7707         1016 :                   is_possible = false;
    7708         1016 :                   break;
    7709              :                 }
    7710       189342 :               else if (factor)
    7711        30436 :                 layout_costs.internal_cost.add_serial_cost
    7712        30436 :                   ({ vertex.weight * factor, m_optimize_size });
    7713              :             }
    7714       177609 :           if (!is_possible)
    7715              :             {
    7716        16522 :               layout_costs.mark_impossible ();
    7717        16522 :               continue;
    7718              :             }
    7719              : 
    7720              :           /* Combine the incoming and partition-internal costs.  */
    7721       161087 :           slpg_layout_cost combined_cost = layout_costs.in_cost;
    7722       161087 :           combined_cost.add_serial_cost (layout_costs.internal_cost);
    7723              : 
    7724              :           /* If this partition consists of a single VEC_PERM_EXPR, see
    7725              :              if the VEC_PERM_EXPR can be changed to support output layout
    7726              :              LAYOUT_I while keeping all the provisional choices of input
    7727              :              layout.  */
    7728       161087 :           if (single_node && SLP_TREE_PERMUTE_P (single_node))
    7729              :             {
    7730         5339 :               int factor = internal_node_cost (single_node, -1, layout_i);
    7731         5339 :               if (factor >= 0)
    7732              :                 {
    7733         4896 :                   auto weight = m_vertices[single_node->vertex].weight;
    7734         4896 :                   slpg_layout_cost internal_cost
    7735         4896 :                     = { weight * factor, m_optimize_size };
    7736              : 
    7737         4896 :                   slpg_layout_cost alt_cost = in_cost;
    7738         4896 :                   alt_cost.add_serial_cost (internal_cost);
    7739         4896 :                   if (alt_cost.is_better_than (combined_cost, m_optimize_size))
    7740              :                     {
    7741         1531 :                       combined_cost = alt_cost;
    7742         1531 :                       layout_costs.in_cost = in_cost;
    7743         1531 :                       layout_costs.internal_cost = internal_cost;
    7744              :                     }
    7745              :                 }
    7746              :             }
    7747              : 
    7748              :           /* Record the layout with the lowest cost.  Prefer layout 0 in
    7749              :              the event of a tie between it and another layout.  */
    7750       161087 :           if (!min_layout_cost.is_possible ()
    7751        58109 :               || combined_cost.is_better_than (min_layout_cost,
    7752        58109 :                                                m_optimize_size))
    7753              :             {
    7754       115342 :               min_layout_i = layout_i;
    7755       115342 :               min_layout_cost = combined_cost;
    7756              :             }
    7757              :         }
    7758              : 
    7759              :       /* This loop's handling of earlier partitions should ensure that
    7760              :          choosing the original layout for the current partition is no
    7761              :          less valid than it was in the original graph, even with the
    7762              :          provisional layout choices for those earlier partitions.  */
    7763       102978 :       gcc_assert (min_layout_cost.is_possible ());
    7764       102978 :       partition.layout = min_layout_i;
    7765              :     }
    7766         5219 : }
    7767              : 
    7768              : /* Make a backward pass through the partitions, accumulating output costs.
    7769              :    Make a final choice of layout for each partition.  */
                        /* Partitions are visited from last to first.  For each layout that is
                           still marked possible for a partition, the costs reported by
                           backward_cost for edges to later partitions are added to OUT_COST,
                           and the layout is rejected if an edge to a partition that is not
                           later cannot be given that partition's currently recorded layout.
                           The surviving layout with the best combined IN_COST, INTERNAL_COST
                           and OUT_COST is stored in the partition's LAYOUT field, replacing
                           the value that was recorded there before.  */
    7770              : 
    7771              : void
    7772         5219 : vect_optimize_slp_pass::backward_pass ()
    7773              : {
    7774       113416 :   for (unsigned int partition_i = m_partitions.length (); partition_i-- > 0;)
    7775              :     {
    7776       102978 :       auto &partition = m_partitions[partition_i];
    7777              : 
                              /* Best candidate found so far and its combined cost.  The cost
                                 starts out as "impossible", meaning no candidate yet.  */
    7778       102978 :       unsigned int min_layout_i = 0;
    7779       102978 :       slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
    7780       314787 :       for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
    7781              :         {
                                  /* Skip layouts that have already been ruled out for this
                                     partition.  */
    7782       211809 :           auto &layout_costs = partition_layout_costs (partition_i, layout_i);
    7783       211809 :           if (!layout_costs.is_possible ())
    7784        50722 :             continue;
    7785              : 
    7786              :           /* Accumulate the costs from successor partitions.  */
    7787       161087 :           bool is_possible = true;
    7788       348292 :           for (unsigned int order_i = partition.node_begin;
    7789       348292 :                order_i < partition.node_end; ++order_i)
    7790              :             {
    7791       187205 :               unsigned int node_i = m_partitioned_nodes[order_i];
    7792       187205 :               auto &vertex = m_vertices[node_i];
                                      /* Callback for for_each_partition_edge.  UD is the edge and
                                         OTHER_NODE_I the index of the vertex at its other end.  It
                                         updates IS_POSSIBLE and LAYOUT_COSTS.out_cost, which it
                                         captures by reference.  */
    7793       508235 :               auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
    7794              :                 {
    7795       321030 :                   auto &other_vertex = m_vertices[other_node_i];
    7796       321030 :                   auto &other_partition = m_partitions[other_vertex.partition];
    7797       321030 :                   if (other_vertex.partition > vertex.partition)
    7798              :                     {
    7799              :                       /* Accumulate the incoming costs from later
    7800              :                          partitions, plus the cost of any layout changes
    7801              :                          on UD itself.  */
    7802       154094 :                       auto cost = backward_cost (ud, other_node_i, layout_i);
    7803       154094 :                       if (!cost.is_possible ())
    7804            0 :                         is_possible = false;
    7805              :                       else
    7806       154094 :                         layout_costs.out_cost.add_parallel_cost (cost);
    7807              :                     }
    7808              :                   else
    7809              :                     /* Make sure that earlier partitions can (if necessary
    7810              :                        or beneficial) keep the layout that they chose in
    7811              :                        the forward pass.  This ensures that there is at
    7812              :                        least one valid choice of layout.  */
                                            /* Partitions are processed last to first, so this pass
                                               has not yet overwritten the layout read from
                                               OTHER_PARTITION for any partition that precedes the
                                               current one.  */
    7813       166936 :                     is_possible &= edge_layout_cost (ud, other_node_i,
    7814       166936 :                                                      other_partition.layout,
    7815       166936 :                                                      layout_i).is_possible ();
    7816       508235 :                 };
    7817       187205 :               for_each_partition_edge (node_i, add_cost);
    7818              :             }
                                  /* Rule the layout out for good if any edge check failed.  */
    7819       161087 :           if (!is_possible)
    7820              :             {
    7821            0 :               layout_costs.mark_impossible ();
    7822            0 :               continue;
    7823              :             }
    7824              : 
    7825              :           /* Locally combine the costs from the forward and backward passes.
    7826              :              (This combined cost is not passed on, since that would lead
    7827              :              to double counting.)  */
    7828       161087 :           slpg_layout_cost combined_cost = layout_costs.in_cost;
    7829       161087 :           combined_cost.add_serial_cost (layout_costs.internal_cost);
    7830       161087 :           combined_cost.add_serial_cost (layout_costs.out_cost);
    7831              : 
    7832              :           /* Record the layout with the lowest cost.  Prefer layout 0 in
    7833              :              the event of a tie between it and another layout.  */
    7834       161087 :           if (!min_layout_cost.is_possible ()
    7835        58109 :               || combined_cost.is_better_than (min_layout_cost,
    7836        58109 :                                                m_optimize_size))
    7837              :             {
    7838       108446 :               min_layout_i = layout_i;
    7839       108446 :               min_layout_cost = combined_cost;
    7840              :             }
    7841              :         }
    7842              : 
                              /* At least one layout must have survived.  Record the winner as
                                 the partition's layout.  */
    7843       102978 :       gcc_assert (min_layout_cost.is_possible ());
    7844       102978 :       partition.layout = min_layout_i;
    7845              :     }
    7846         5219 : }
    7847              : 
    7848              : /* Return a node that applies layout TO_LAYOUT_I to the original form of NODE.
    7849              :    NODE already has the layout that was selected for its partition.  */
                        /* Results are cached in m_node_layouts, indexed by NODE's vertex number
                           and TO_LAYOUT_I.  The one exception is a node handled by the second
                           branch below whose partition already uses TO_LAYOUT_I: NODE itself is
                           then returned without being entered into the cache.  When a new
                           VEC_PERM_EXPR node is created, the reference count of each of its
                           children is incremented.  */
    7850              : 
    7851              : slp_tree
    7852       146078 : vect_optimize_slp_pass::get_result_with_layout (slp_tree node,
    7853              :                                                 unsigned int to_layout_i)
    7854              : {
                          /* Look for a previously recorded result for this node and layout.  */
    7855       146078 :   unsigned int result_i = node->vertex * m_perms.length () + to_layout_i;
    7856       146078 :   slp_tree result = m_node_layouts[result_i];
    7857       146078 :   if (result)
    7858              :     return result;
    7859              : 
    7860       145545 :   if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
    7861       145545 :       || (SLP_TREE_DEF_TYPE (node) == vect_external_def
    7862              :           /* We can't permute vector defs in place.  */
    7863        16133 :           && SLP_TREE_VEC_DEFS (node).is_empty ()))
    7864              :     {
    7865              :       /* If the vector is uniform or unchanged, there's nothing to do.  */
    7866        32600 :       if (to_layout_i == 0 || vect_slp_tree_uniform_p (node))
    7867              :         result = node;
    7868              :       else
    7869              :         {
                                  /* Build a new node from a copy of NODE's scalar operands.
                                     NOTE(review): the copy is permuted only after it has been
                                     passed to vect_create_new_slp_node, which presumably relies
                                     on the new node sharing SCALAR_OPS's storage -- confirm.  */
    7870         1450 :           auto scalar_ops = SLP_TREE_SCALAR_OPS (node).copy ();
    7871         1450 :           result = vect_create_new_slp_node (scalar_ops);
    7872         1450 :           vect_slp_permute (m_perms[to_layout_i], scalar_ops, true);
    7873              :         }
    7874              :     }
    7875              :   else
    7876              :     {
                              /* All other nodes.  FROM_LAYOUT_I is the layout recorded for
                                 NODE's partition; if it is already the requested layout, NODE
                                 can be used as-is.  */
    7877       112945 :       unsigned int partition_i = m_vertices[node->vertex].partition;
    7878       112945 :       unsigned int from_layout_i = m_partitions[partition_i].layout;
    7879       112945 :       if (from_layout_i == to_layout_i)
    7880       112388 :         return node;
    7881              : 
    7882              :       /* If NODE is itself a VEC_PERM_EXPR, try to create a parallel
    7883              :          permutation instead of a serial one.  Leave the new permutation
    7884              :          in TMP_PERM on success.  */
    7885          557 :       auto_lane_permutation_t tmp_perm;
    7886          557 :       unsigned int num_inputs = 1;
    7887          557 :       if (SLP_TREE_PERMUTE_P (node))
    7888              :         {
                                  /* Start from NODE's own lane permutation and apply the
                                     FROM_LAYOUT_I and TO_LAYOUT_I adjustments to it.
                                     NOTE(review): the final bool argument of vect_slp_permute
                                     presumably selects the direction in which the permutation
                                     is applied -- confirm.  */
    7889            7 :           tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
    7890            7 :           if (from_layout_i != 0)
    7891            7 :             vect_slp_permute (m_perms[from_layout_i], tmp_perm, false);
    7892            7 :           if (to_layout_i != 0)
    7893            4 :             vect_slp_permute (m_perms[to_layout_i], tmp_perm, true);
                                  /* If the adjusted permutation is accepted, the new node takes
                                     all of NODE's children as inputs.  Otherwise empty TMP_PERM
                                     so that the code below builds a permutation whose only
                                     input is NODE itself.  */
    7894            7 :           if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
    7895              :                                               tmp_perm,
    7896            7 :                                               SLP_TREE_CHILDREN (node),
    7897              :                                               false) >= 0)
    7898            7 :             num_inputs = SLP_TREE_CHILDREN (node).length ();
    7899              :           else
    7900            0 :             tmp_perm.truncate (0);
    7901              :         }
    7902              : 
    7903          557 :       if (dump_enabled_p ())
    7904              :         {
    7905           70 :           if (tmp_perm.length () > 0)
    7906            6 :             dump_printf_loc (MSG_NOTE, vect_location,
    7907              :                              "duplicating permutation node %p with"
    7908              :                              " layout %d\n",
    7909              :                              (void *) node, to_layout_i);
    7910              :           else
    7911           64 :             dump_printf_loc (MSG_NOTE, vect_location,
    7912              :                              "inserting permutation node in place of %p\n",
    7913              :                              (void *) node);
    7914              :         }
    7915              : 
                              /* Create the replacement VEC_PERM_EXPR node.  Its scalar
                                 statements, if NODE has any, are NODE's statements with the
                                 same two layout adjustments applied.  */
    7916          557 :       unsigned int num_lanes = SLP_TREE_LANES (node);
    7917          557 :       result = vect_create_new_slp_node (num_inputs, VEC_PERM_EXPR);
    7918          557 :       if (SLP_TREE_SCALAR_STMTS (node).length ())
    7919              :         {
    7920          556 :           auto &stmts = SLP_TREE_SCALAR_STMTS (result);
    7921          556 :           stmts.safe_splice (SLP_TREE_SCALAR_STMTS (node));
    7922          556 :           if (from_layout_i != 0)
    7923          299 :             vect_slp_permute (m_perms[from_layout_i], stmts, false);
    7924          556 :           if (to_layout_i != 0)
    7925          261 :             vect_slp_permute (m_perms[to_layout_i], stmts, true);
    7926              :         }
    7927          557 :       SLP_TREE_REPRESENTATIVE (result) = SLP_TREE_REPRESENTATIVE (node);
    7928          557 :       SLP_TREE_LANES (result) = num_lanes;
    7929          557 :       SLP_TREE_VECTYPE (result) = SLP_TREE_VECTYPE (node);
                              /* NOTE(review): -1 presumably marks the new node as having no
                                 vertex of its own in the graph -- confirm.  */
    7930          557 :       result->vertex = -1;
    7931              : 
    7932          557 :       auto &lane_perm = SLP_TREE_LANE_PERMUTATION (result);
    7933          557 :       if (tmp_perm.length ())
    7934              :         {
                                  /* Parallel form: reuse the adjusted permutation together with
                                     NODE's own children.  */
    7935            7 :           lane_perm.safe_splice (tmp_perm);
    7936            7 :           SLP_TREE_CHILDREN (result).safe_splice (SLP_TREE_CHILDREN (node));
    7937              :         }
    7938              :       else
    7939              :         {
                                  /* Serial form: build the permutation { 0, 0 }, { 0, 1 }, ...
                                     with one entry per lane, apply the layout adjustments to
                                     it, and make NODE the only child.  */
    7940          550 :           lane_perm.create (num_lanes);
    7941         1714 :           for (unsigned j = 0; j < num_lanes; ++j)
    7942         1164 :             lane_perm.quick_push ({ 0, j });
    7943          550 :           if (from_layout_i != 0)
    7944          292 :             vect_slp_permute (m_perms[from_layout_i], lane_perm, false);
    7945          550 :           if (to_layout_i != 0)
    7946          258 :             vect_slp_permute (m_perms[to_layout_i], lane_perm, true);
    7947          550 :           SLP_TREE_CHILDREN (result).safe_push (node);
    7948              :         }
                              /* The new node now refers to each of its children, so bump
                                 their reference counts.  */
    7949         2232 :       for (slp_tree child : SLP_TREE_CHILDREN (result))
    7950          561 :         child->refcnt++;
    7951          557 :     }
                          /* Remember the result for later requests with the same node and
                             layout.  */
    7952        33157 :   m_node_layouts[result_i] = result;
    7953        33157 :   return result;
    7954              : }
    7955              : 
    7956              : /* Apply the chosen vector layouts to the SLP graph.  */
                        /* This runs in three steps.  First, each partitioned node's scalar
                           statements and its load or lane permutation are rewritten for the
                           layout recorded for the node's partition.  Then
                           remove_redundant_permutations is called.  Finally, the children of
                           every node that was not fully handled in the first step are replaced
                           by the nodes that get_result_with_layout returns for the layout of
                           the parent's partition.  */
    7957              : 
    7958              : void
    7959        10174 : vect_optimize_slp_pass::materialize ()
    7960              : {
    7961              :   /* We no longer need the costs, so avoid having two O(N * P) arrays
    7962              :      live at the same time.  */
    7963        10174 :   m_partition_layout_costs.release ();
                          /* One slot per (vertex, layout) pair for get_result_with_layout to
                             use as its cache.  */
    7964        30522 :   m_node_layouts.safe_grow_cleared (m_vertices.length () * m_perms.length ());
    7965              : 
                          /* FULLY_FOLDED records the VEC_PERM_EXPR nodes whose input layouts were
                             absorbed into their lane permutation.  The final loop below leaves
                             the children of those nodes alone.  */
    7966        20348 :   auto_sbitmap fully_folded (m_vertices.length ());
    7967        10174 :   bitmap_clear (fully_folded);
    7968       156254 :   for (unsigned int node_i : m_partitioned_nodes)
    7969              :     {
    7970       125732 :       auto &vertex = m_vertices[node_i];
    7971       125732 :       slp_tree node = vertex.node;
    7972       125732 :       int layout_i = m_partitions[vertex.partition].layout;
    7973       125732 :       gcc_assert (layout_i >= 0);
    7974              : 
    7975              :       /* Rearrange the scalar statements to match the chosen layout.  */
    7976       125732 :       if (layout_i > 0)
    7977        13340 :         vect_slp_permute (m_perms[layout_i],
    7978        13340 :                           SLP_TREE_SCALAR_STMTS (node), true);
    7979              : 
    7980              :       /* Update load and lane permutations.  */
    7981       125732 :       if (SLP_TREE_PERMUTE_P (node))
    7982              :         {
    7983              :           /* First try to absorb the input vector layouts.  If that fails,
    7984              :              force the inputs to have layout LAYOUT_I too.  We checked that
    7985              :              that was possible before deciding to use nonzero output layouts.
    7986              :              (Note that at this stage we don't really have any guarantee that
    7987              :              the target supports the original VEC_PERM_EXPR.)  */
                                  /* Work on a copy so that PERM is left untouched if the
                                     adjusted permutation is not accepted.  */
    7988         5283 :           auto &perm = SLP_TREE_LANE_PERMUTATION (node);
    7989         5283 :           auto_lane_permutation_t tmp_perm;
    7990         5283 :           tmp_perm.safe_splice (perm);
    7991         5283 :           change_vec_perm_layout (node, tmp_perm, -1, layout_i);
    7992         5283 :           if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
    7993              :                                               tmp_perm,
    7994         5283 :                                               SLP_TREE_CHILDREN (node),
    7995              :                                               false) >= 0)
    7996              :             {
    7997         4916 :               if (dump_enabled_p ()
    7998         5836 :                   && !std::equal (tmp_perm.begin (), tmp_perm.end (),
    7999              :                                   perm.begin ()))
    8000           58 :                 dump_printf_loc (MSG_NOTE, vect_location,
    8001              :                                  "absorbing input layouts into %p\n",
    8002              :                                  (void *) node);
                                      /* Install the adjusted permutation and mark the node so
                                         that its children are not converted later.  */
    8003        27719 :               std::copy (tmp_perm.begin (), tmp_perm.end (), perm.begin ());
    8004         4916 :               bitmap_set_bit (fully_folded, node_i);
    8005              :             }
    8006              :           else
    8007              :             {
    8008              :               /* Not MSG_MISSED because it would make no sense to users.  */
    8009          367 :               if (dump_enabled_p ())
    8010           46 :                 dump_printf_loc (MSG_NOTE, vect_location,
    8011              :                                  "failed to absorb input layouts into %p\n",
    8012              :                                  (void *) node);
                                      /* Adjust PERM for LAYOUT_I on both the input and the output
                                         side.  The node is not marked in FULLY_FOLDED, so the
                                         final loop below converts its children.  */
    8013          367 :               change_vec_perm_layout (nullptr, perm, layout_i, layout_i);
    8014              :             }
    8015         5283 :         }
    8016              :       else
    8017              :         {
    8018       120449 :           gcc_assert (!SLP_TREE_LANE_PERMUTATION (node).exists ());
    8019       120449 :           auto &load_perm = SLP_TREE_LOAD_PERMUTATION (node);
    8020       120449 :           if (layout_i > 0)
    8021              :             /* ???  When we handle non-bijective permutes the idea
    8022              :                is that we can force the load-permutation to be
    8023              :                { min, min + 1, min + 2, ... max }.  But then the
    8024              :                scalar defs might no longer match the lane content
    8025              :                which means wrong-code with live lane vectorization.
    8026              :                So we possibly have to have NULL entries for those.  */
    8027        13237 :             vect_slp_permute (m_perms[layout_i], load_perm, true);
    8028              :         }
    8029              :     }
    8030              : 
    8031              :   /* Do this before any nodes disappear, since it involves a walk
    8032              :      over the leaves.  */
    8033        10174 :   remove_redundant_permutations ();
    8034              : 
    8035              :   /* Replace each child with a correctly laid-out version.  */
    8036       156254 :   for (unsigned int node_i : m_partitioned_nodes)
    8037              :     {
    8038              :       /* Skip nodes that have already been handled above.  */
    8039       125732 :       if (bitmap_bit_p (fully_folded, node_i))
    8040         4916 :         continue;
    8041              : 
                              /* IN_LAYOUT_I is the layout that every child of this node is
                                 asked to provide.  */
    8042       120816 :       auto &vertex = m_vertices[node_i];
    8043       120816 :       int in_layout_i = m_partitions[vertex.partition].layout;
    8044       120816 :       gcc_assert (in_layout_i >= 0);
    8045              : 
    8046              :       unsigned j;
    8047              :       slp_tree child;
    8048       365075 :       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (vertex.node), j, child)
    8049              :         {
    8050       152094 :           if (!child)
    8051         6016 :             continue;
    8052              : 
    8053       146078 :           slp_tree new_child = get_result_with_layout (child, in_layout_i);
    8054       146078 :           if (new_child != child)
    8055              :             {
                                      /* Swap the child for its replacement and take a reference
                                         on the replacement.  NOTE(review): vect_free_slp_tree
                                         presumably drops the reference that this node held on
                                         the old child -- confirm.  */
    8056         2248 :               vect_free_slp_tree (child);
    8057         2248 :               SLP_TREE_CHILDREN (vertex.node)[j] = new_child;
    8058         2248 :               new_child->refcnt += 1;
    8059              :             }
    8060              :         }
    8061              :     }
    8062        10174 : }
    8063              : 
    8064              : /* Elide load permutations that are not necessary.  Such permutations might
    8065              :    be pre-existing, rather than created by the layout optimizations.  */
                        /* Only the nodes listed in m_leafs that have a load permutation are
                           examined.  A permutation is removed by releasing the node's
                           SLP_TREE_LOAD_PERMUTATION vector.  The test that decides whether a
                           permutation is unnecessary differs between basic-block and loop
                           vectorization.  */
    8066              : 
    8067              : void
    8068       678936 : vect_optimize_slp_pass::remove_redundant_permutations ()
    8069              : {
    8070      4476581 :   for (unsigned int node_i : m_leafs)
    8071              :     {
                              /* Nothing to do for leaves without a load permutation.  */
    8072      2439773 :       slp_tree node = m_vertices[node_i].node;
    8073      2439773 :       if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
    8074      1845292 :         continue;
    8075              : 
    8076              :       /* In basic block vectorization we allow any subchain of an interleaving
    8077              :          chain.
    8078              :          FORNOW: not in loop SLP because of realignment complications.  */
    8079       594481 :       if (is_a <bb_vec_info> (m_vinfo))
    8080              :         {
                                  /* SUBCHAIN_P stays true only if every scalar statement after
                                     the first is non-null, is the DR_GROUP_NEXT_ELEMENT of its
                                     predecessor and has a DR_GROUP_GAP of 1.  */
    8081       154543 :           bool subchain_p = true;
    8082              :           stmt_vec_info next_load_info = NULL;
    8083              :           stmt_vec_info load_info;
    8084              :           unsigned j;
    8085       154543 :           FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
    8086              :             {
    8087       125999 :               if (j != 0
    8088       125999 :                   && (next_load_info != load_info
    8089        60303 :                       || ! load_info
    8090        60303 :                       || DR_GROUP_GAP (load_info) != 1))
    8091              :                 {
    8092              :                   subchain_p = false;
    8093              :                   break;
    8094              :                 }
    8095       103788 :               next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
    8096              :             }
    8097        50755 :           if (subchain_p)
    8098              :             {
    8099        28544 :               SLP_TREE_LOAD_PERMUTATION (node).release ();
    8100        28544 :               continue;
    8101              :             }
    8102              :         }
    8103              :       else
    8104              :         {
                                  /* THIS_LOAD_PERMUTED is true unless
                                     vect_load_perm_consecutive_p, called with a second argument
                                     of 0, accepts the node's load permutation.  */
    8105       543726 :           loop_vec_info loop_vinfo = as_a<loop_vec_info> (m_vinfo);
    8106       543726 :           bool this_load_permuted = !vect_load_perm_consecutive_p (node, 0);
    8107              :           /* When this isn't a grouped access we know it's single element
    8108              :              and contiguous.  */
    8109       543726 :           if (!STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (node)[0]))
    8110              :             {
                                      /* Drop the permutation only if it is consecutive and either
                                         the vectorization factor is known to be 1 or the node has
                                         a single lane.  Either way, the grouped-access checks
                                         below are skipped.  */
    8111       422669 :               if (!this_load_permuted
    8112       422669 :                   && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
    8113       421914 :                       || SLP_TREE_LANES (node) == 1))
    8114       421916 :                 SLP_TREE_LOAD_PERMUTATION (node).release ();
    8115       422669 :               continue;
    8116              :             }
                                  /* For grouped accesses, the size and gap are read from the
                                     first element of the group.  */
    8117       121057 :           stmt_vec_info first_stmt_info
    8118       121057 :             = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
    8119       121562 :           if (!this_load_permuted
    8120              :               /* The load requires permutation when unrolling exposes
    8121              :                  a gap either because the group is larger than the SLP
    8122              :                  group-size or because there is a gap between the groups.  */
    8123       121057 :               && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
    8124        98665 :                   || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
    8125          140 :                       && DR_GROUP_GAP (first_stmt_info) == 0)))
    8126              :             {
    8127          505 :               SLP_TREE_LOAD_PERMUTATION (node).release ();
    8128          505 :               continue;
    8129              :             }
    8130              :         }
    8131              :     }
    8132       678936 : }
    8133              : 
    8134              : /* Print the partition graph and layout information to the dump file.
                           The output first lists every layout in m_perms from index 1
                           onwards and then, for each partition, its chosen layout, its
                           nodes, its edges and the costs recorded for each layout.  */
    8135              : 
    8136              : void
    8137          679 : vect_optimize_slp_pass::dump ()
    8138              : {
    8139          679 :   dump_printf_loc (MSG_NOTE, vect_location,
    8140              :                    "SLP optimize permutations:\n");
                          /* The loop starts at 1, so layout 0 is not printed.  */
    8141         1371 :   for (unsigned int layout_i = 1; layout_i < m_perms.length (); ++layout_i)
    8142              :     {
    8143          692 :       dump_printf_loc (MSG_NOTE, vect_location, "  %d: { ", layout_i);
    8144          692 :       const char *sep = "";
    8145         5909 :       for (unsigned int idx : m_perms[layout_i])
    8146              :         {
    8147         3833 :           dump_printf (MSG_NOTE, "%s%d", sep, idx);
    8148         3833 :           sep = ", ";
    8149              :         }
    8150          692 :       dump_printf (MSG_NOTE, " }\n");
    8151              :     }
    8152          679 :   dump_printf_loc (MSG_NOTE, vect_location,
    8153              :                    "SLP optimize partitions:\n");
    8154         5659 :   for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
    8155              :        ++partition_i)
    8156              :     {
    8157         4980 :       auto &partition = m_partitions[partition_i];
    8158         4980 :       dump_printf_loc (MSG_NOTE, vect_location,  "  -------------\n");
    8159         4980 :       dump_printf_loc (MSG_NOTE, vect_location,
    8160              :                        "  partition %d (layout %d):\n",
    8161              :                        partition_i, partition.layout);
    8162         4980 :       dump_printf_loc (MSG_NOTE, vect_location, "    nodes:\n");
                              /* The partition's nodes are the entries of m_partitioned_nodes
                                 in the range [node_begin, node_end).  */
    8163        10196 :       for (unsigned int order_i = partition.node_begin;
    8164        10196 :            order_i < partition.node_end; ++order_i)
    8165              :         {
    8166         5216 :           auto &vertex = m_vertices[m_partitioned_nodes[order_i]];
    8167        10432 :           dump_printf_loc (MSG_NOTE, vect_location, "      - %p:\n",
    8168         5216 :                            (void *) vertex.node);
    8169         5216 :           dump_printf_loc (MSG_NOTE, vect_location,
    8170              :                            "          weight: %f\n",
    8171              :                            vertex.weight.to_double ());
                                  /* The out weight is only printed for a nonzero out degree.  */
    8172         5216 :           if (vertex.out_degree)
    8173         4083 :             dump_printf_loc (MSG_NOTE, vect_location,
    8174              :                              "          out weight: %f (degree %d)\n",
    8175              :                              vertex.out_weight.to_double (),
    8176              :                              vertex.out_degree);
                                  /* Permute nodes are labelled as VEC_PERM_EXPR; other nodes
                                     print their representative statement if they have one.  */
    8177         5216 :           if (SLP_TREE_PERMUTE_P (vertex.node))
    8178          506 :             dump_printf_loc (MSG_NOTE, vect_location,
    8179              :                              "          op: VEC_PERM_EXPR\n");
    8180         4710 :           else if (auto rep = SLP_TREE_REPRESENTATIVE (vertex.node))
    8181         4692 :             dump_printf_loc (MSG_NOTE, vect_location,
    8182              :                              "          op template: %G", rep->stmt);
    8183              :         }
    8184         4980 :       dump_printf_loc (MSG_NOTE, vect_location, "    edges:\n");
    8185        10196 :       for (unsigned int order_i = partition.node_begin;
    8186        10196 :            order_i < partition.node_end; ++order_i)
    8187              :         {
    8188         5216 :           unsigned int node_i = m_partitioned_nodes[order_i];
    8189         5216 :           auto &vertex = m_vertices[node_i];
                                  /* An edge whose other end lies in a lower-numbered partition
                                     is printed as pointing at this node, any other edge as
                                     pointing away from it.  The partition number in brackets
                                     is always that of the other node.  */
    8190        15726 :           auto print_edge = [&](graph_edge *, unsigned int other_node_i)
    8191              :             {
    8192        10510 :               auto &other_vertex = m_vertices[other_node_i];
    8193        10510 :               if (other_vertex.partition < vertex.partition)
    8194         5255 :                 dump_printf_loc (MSG_NOTE, vect_location,
    8195              :                                  "      - %p [%d] --> %p\n",
    8196         5255 :                                  (void *) other_vertex.node,
    8197              :                                  other_vertex.partition,
    8198         5255 :                                  (void *) vertex.node);
    8199              :               else
    8200         5255 :                 dump_printf_loc (MSG_NOTE, vect_location,
    8201              :                                  "      - %p --> [%d] %p\n",
    8202         5255 :                                  (void *) vertex.node,
    8203              :                                  other_vertex.partition,
    8204         5255 :                                  (void *) other_vertex.node);
    8205        15726 :             };
    8206         5216 :           for_each_partition_edge (node_i, print_edge);
    8207              :         }
    8208              : 
                              /* Print the costs of every layout for this partition.  The
                                 layout equal to partition.layout is marked with "(*)" and
                                 layouts that are not possible are reported as rejected.  */
    8209        15139 :       for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
    8210              :         {
    8211        10159 :           auto &layout_costs = partition_layout_costs (partition_i, layout_i);
    8212        10159 :           if (layout_costs.is_possible ())
    8213              :             {
    8214         8380 :               dump_printf_loc (MSG_NOTE, vect_location,
    8215              :                                "    layout %d:%s\n", layout_i,
    8216         8380 :                                partition.layout == int (layout_i)
    8217              :                                ? " (*)" : "");
                                      /* The combined cost is the input cost with the internal
                                         and output costs added via add_serial_cost.  The four
                                         lines below print input, internal, output and
                                         combined cost in that order.  */
    8218         8380 :               slpg_layout_cost combined_cost = layout_costs.in_cost;
    8219         8380 :               combined_cost.add_serial_cost (layout_costs.internal_cost);
    8220         8380 :               combined_cost.add_serial_cost (layout_costs.out_cost);
    8221              : #define TEMPLATE "{depth: %f, total: %f}"
    8222         8380 :               dump_printf_loc (MSG_NOTE, vect_location,
    8223              :                                "        " TEMPLATE "\n",
    8224              :                                layout_costs.in_cost.depth.to_double (),
    8225              :                                layout_costs.in_cost.total.to_double ());
    8226         8380 :               dump_printf_loc (MSG_NOTE, vect_location,
    8227              :                                "      + " TEMPLATE "\n",
    8228              :                                layout_costs.internal_cost.depth.to_double (),
    8229              :                                layout_costs.internal_cost.total.to_double ());
    8230         8380 :               dump_printf_loc (MSG_NOTE, vect_location,
    8231              :                                "      + " TEMPLATE "\n",
    8232              :                                layout_costs.out_cost.depth.to_double (),
    8233              :                                layout_costs.out_cost.total.to_double ());
    8234         8380 :               dump_printf_loc (MSG_NOTE, vect_location,
    8235              :                                "      = " TEMPLATE "\n",
    8236              :                                combined_cost.depth.to_double (),
    8237              :                                combined_cost.total.to_double ());
    8238              : #undef TEMPLATE
    8239              :             }
    8240              :           else
    8241         1779 :             dump_printf_loc (MSG_NOTE, vect_location,
    8242              :                              "    layout %d: rejected\n", layout_i);
    8243              :         }
    8244              :     }
    8245          679 : }
    8246              : 
    8247              : /* Masked load lanes discovery.  Look for internal, non-permute nodes
                           whose representative is a grouped IFN_MASK_LOAD call for which
                           vect_load_lanes_supported does not return IFN_LAST, whose mask
                           child is a single-input permute with only {0, 0} lane permutation
                           entries and whose consumers are all single-lane permutes.  Such
                           nodes and their consumers get ldst_lanes set and the mask child
                           is replaced by the input of that permute.  */
    8248              : 
    8249              : void
    8250       678936 : vect_optimize_slp_pass::decide_masked_load_lanes ()
    8251              : {
    8252      6973451 :   for (auto v : m_vertices)
    8253              :     {
    8254      4936643 :       slp_tree node = v.node;
                              /* Only internal definitions that are not permutes qualify.  */
    8255      4936643 :       if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
    8256      3469568 :           || SLP_TREE_PERMUTE_P (node))
    8257      1604413 :         continue;
    8258      3332230 :       stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
                              /* The representative has to be a grouped access done by an
                                 IFN_MASK_LOAD internal call.  */
    8259      1637392 :       if (! STMT_VINFO_GROUPED_ACCESS (stmt_info)
    8260              :           /* The mask has to be uniform.  */
    8261       971442 :           || STMT_VINFO_SLP_VECT_ONLY (stmt_info)
    8262       971301 :           || ! is_a <gcall *> (STMT_VINFO_STMT (stmt_info))
    8263      3332315 :           || ! gimple_call_internal_p (STMT_VINFO_STMT (stmt_info),
    8264              :                                        IFN_MASK_LOAD))
    8265      3332197 :         continue;
                              /* The remaining checks are made on the first element of the
                                 group: it must not be strided, compare_step_with_zero must
                                 be positive and load-lanes must be supported for the group
                                 size.  */
    8266           33 :       stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
    8267           66 :       if (STMT_VINFO_STRIDED_P (stmt_info)
    8268           33 :           || compare_step_with_zero (m_vinfo, stmt_info) <= 0
    8269           63 :           || vect_load_lanes_supported (SLP_TREE_VECTYPE (node),
    8270           30 :                                         DR_GROUP_SIZE (stmt_info),
    8271              :                                         true) == IFN_LAST)
    8272           33 :         continue;
    8273              : 
    8274              :       /* Uniform masks need to be suitably represented.  */
    8275            0 :       slp_tree mask = SLP_TREE_CHILDREN (node)[0];
    8276            0 :       if (!SLP_TREE_PERMUTE_P (mask)
    8277            0 :           || SLP_TREE_CHILDREN (mask).length () != 1)
    8278            0 :         continue;
                              /* Every lane permutation entry of the mask has to be {0, 0}.  */
    8279            0 :       bool match = true;
    8280            0 :       for (auto perm : SLP_TREE_LANE_PERMUTATION (mask))
    8281            0 :         if (perm.first != 0 || perm.second != 0)
    8282              :           {
    8283              :             match = false;
    8284              :             break;
    8285              :           }
    8286            0 :       if (!match)
    8287            0 :         continue;
    8288              : 
    8289              :       /* Now see if the consumer side matches.  */
    8290            0 :       for (graph_edge *pred = m_slpg->vertices[node->vertex].pred;
    8291            0 :            pred; pred = pred->pred_next)
    8292              :         {
    8293            0 :           slp_tree pred_node = m_vertices[pred->src].node;
    8294              :           /* All consumers should be a permute with a single outgoing lane.  */
    8295            0 :           if (!SLP_TREE_PERMUTE_P (pred_node)
    8296            0 :               || SLP_TREE_LANES (pred_node) != 1)
    8297              :             {
    8298              :               match = false;
    8299              :               break;
    8300              :             }
    8301            0 :           gcc_assert (SLP_TREE_CHILDREN (pred_node).length () == 1);
    8302              :         }
    8303            0 :       if (!match)
    8304            0 :         continue;
    8305              :       /* Now we can mark the nodes as to use load lanes.  */
    8306            0 :       node->ldst_lanes = true;
    8307            0 :       for (graph_edge *pred = m_slpg->vertices[node->vertex].pred;
    8308            0 :            pred; pred = pred->pred_next)
    8309            0 :         m_vertices[pred->src].node->ldst_lanes = true;
    8310              :       /* The catch is we have to massage the mask.  Uniform masks have
    8311              :          been arranged to be represented by a splat VEC_PERM, which we
    8312              :          can now simply elide as we cannot easily re-do SLP
    8313              :          discovery here.  */
                              /* NOTE(review): the reference on NEW_MASK is taken before MASK
                                 is freed, presumably so that freeing MASK does not release
                                 NEW_MASK as well - confirm against vect_free_slp_tree.  */
    8314            0 :       slp_tree new_mask = SLP_TREE_CHILDREN (mask)[0];
    8315            0 :       SLP_TREE_REF_COUNT (new_mask)++;
    8316            0 :       SLP_TREE_CHILDREN (node)[0] = new_mask;
    8317            0 :       vect_free_slp_tree (mask);
    8318              :     }
    8319       678936 : }
    8320              : 
    8321              : /* Perform legitimizing attempts.  This is intended to improve the
    8322              :    situation when layout 0 is not valid which is a situation the cost
    8323              :    based propagation does not handle well.
    8324              :    Return true if further layout optimization is possible, false if
    8325              :    the layout configuration should be considered final.  When false is
                           returned every partition has been assigned the same layout.  */
    8326              : 
    8327              : bool
    8328        10174 : vect_optimize_slp_pass::legitimize ()
    8329              : {
    8330              :   /* Perform a very simple legitimizing attempt by attempting to choose
    8331              :      a single layout for all partitions that will make all permutations
    8332              :      a noop.  That should also be the optimal layout choice in case
    8333              :      layout zero is legitimate.
    8334              :      ???  Disconnected components of the SLP graph could have distinct
    8335              :      single layouts.  */
                          /* SINGLE_LAYOUT_I is -1 while no partition with a layout other than
                             -1 has been seen.  DEFERRED_UP_TO is the index of the partition
                             the candidate layout was last taken from; the partitions before
                             it were visited while the candidate was still -1 and therefore
                             skipped the compatibility check in the first loop.  */
    8336        10174 :   int single_layout_i = -1;
    8337        10174 :   unsigned deferred_up_to = -1U;
    8338        29607 :   for (unsigned partition_i = 0; partition_i < m_partitions.length ();
    8339              :        ++partition_i)
    8340              :     {
    8341        24649 :       auto &partition = m_partitions[partition_i];
    8342        24649 :       if (single_layout_i == -1)
    8343              :         {
    8344        13544 :           single_layout_i = partition.layout;
    8345        13544 :           deferred_up_to = partition_i;
    8346              :         }
    8347        11105 :       else if (partition.layout == single_layout_i || partition.layout == -1)
    8348              :         ;
    8349              :       else
                                /* Two partitions ask for different layouts; zero makes the
                                   check below give up on a single layout.  */
    8350              :         single_layout_i = 0;
    8351        21900 :       if (single_layout_i == 0)
    8352              :         return true;
    8353              : 
    8354        19497 :       if (single_layout_i != -1
    8355        19497 :           && !is_compatible_layout (partition, single_layout_i))
    8356              :         return true;
    8357              :     }
    8358              : 
                          /* No candidate was found: every partition had layout -1 or there
                             are no partitions.  */
    8359         4958 :   if (single_layout_i <= 0)
    8360              :     return true;
    8361              : 
                          /* Now check the partitions that were skipped in the first loop.  */
    8362         5066 :   for (unsigned partition_i = 0; partition_i < deferred_up_to; ++partition_i)
    8363          111 :     if (!is_compatible_layout (m_partitions[partition_i],
    8364              :                                single_layout_i))
    8365              :       return true;
    8366              : 
                          /* All partitions are compatible; commit the single layout.  */
    8367        12503 :   for (unsigned partition_i = 0; partition_i < m_partitions.length ();
    8368              :        ++partition_i)
    8369              :     {
    8370         7548 :       auto &partition = m_partitions[partition_i];
    8371         7548 :       partition.layout = single_layout_i;
    8372              :     }
    8373              : 
    8374              :   return false;
    8375              : }
    8376              : 
    8377              : /* Main entry point for the SLP graph optimization pass.  Builds the
                           graph and its partitions, chooses and materializes layouts when
                           m_perms holds more than one layout (otherwise only removes
                           redundant permutations) and finally runs masked load lanes
                           discovery on a rebuilt graph.  */
    8378              : 
    8379              : void
    8380       678936 : vect_optimize_slp_pass::run ()
    8381              : {
    8382       678936 :   build_graph ();
    8383       678936 :   create_partitions ();
    8384       678936 :   start_choosing_layouts ();
    8385       678936 :   if (m_perms.length () > 1)
    8386              :     {
                              /* The forward and backward passes only run when legitimize
                                 reports that further layout optimization is possible.  */
    8387        10174 :       if (legitimize ())
    8388              :         {
    8389         5219 :           forward_pass ();
    8390         5219 :           backward_pass ();
    8391              :         }
    8392        10174 :       if (dump_enabled_p ())
    8393          679 :         dump ();
    8394        10174 :       materialize ();
                              /* Release every recorded layout permutation.  */
    8395        41113 :       while (!m_perms.is_empty ())
    8396        20765 :         m_perms.pop ().release ();
    8397              :     }
    8398              :   else
    8399       668762 :     remove_redundant_permutations ();
                          /* NOTE(review): the graph is freed and built again before masked
                             load lanes discovery, presumably because the steps above may
                             have changed the SLP graph - confirm.  */
    8400       678936 :   free_graph (m_slpg);
    8401       678936 :   build_graph ();
    8402       678936 :   decide_masked_load_lanes ();
    8403       678936 :   free_graph (m_slpg);
    8404       678936 : }
    8405              : 
    8406              : /* Apply CSE to NODE and its children using BST_MAP.  NODE is passed by
                           reference because it is replaced by the already recorded node when
                           BST_MAP holds a different node for the same scalar stmts.  */
    8407              : 
    8408              : static void
    8409      5334030 : vect_cse_slp_nodes (scalar_stmts_to_slp_tree_map_t *bst_map, slp_tree& node)
    8410              : {
    8411      5334030 :   bool put_p = false;
    8412      5334030 :   if (SLP_TREE_DEF_TYPE (node) == vect_internal_def
    8413              :       /* Besides some VEC_PERM_EXPR, two-operator nodes also
    8414              :          lack scalar stmts and thus CSE doesn't work via bst_map.  Ideally
    8415              :          we'd have something that works for all internal and external
                                 nodes.  */
    8416      5334030 :       && !SLP_TREE_SCALAR_STMTS (node).is_empty ())
    8417              :     {
    8418      3841154 :       slp_tree *leader = bst_map->get (SLP_TREE_SCALAR_STMTS (node));
    8419      3841154 :       if (leader)
    8420              :         {
    8421              :           /* We've visited this node already.  A null entry is the
                                     placeholder stored below while a node's children are
                                     still being processed.  */
    8422       399947 :           if (!*leader || *leader == node)
    8423              :             return;
    8424              : 
    8425         2809 :           if (dump_enabled_p ())
    8426          924 :             dump_printf_loc (MSG_NOTE, vect_location,
    8427              :                              "re-using SLP tree %p for %p\n",
    8428              :                              (void *)*leader, (void *)node);
                                  /* Drop NODE and make the caller's reference point at the
                                     leader, which gains a reference for that.  */
    8429         2809 :           vect_free_slp_tree (node);
    8430         2809 :           (*leader)->refcnt += 1;
    8431         2809 :           node = *leader;
    8432         2809 :           return;
    8433              :         }
    8434              : 
    8435              :       /* Avoid creating a cycle by populating the map only after recursion.  */
                              /* NOTE(review): the extra reference presumably accounts for the
                                 map entry filled in at the end of this function - confirm
                                 against release_scalar_stmts_to_slp_tree_map.  */
    8436      3441207 :       bst_map->put (SLP_TREE_SCALAR_STMTS (node).copy (), nullptr);
    8437      3441207 :       node->refcnt += 1;
    8438      3441207 :       put_p = true;
    8439              :       /* And recurse.  */
    8440              :     }
    8441              : 
                          /* Children are iterated by reference so that a child can be
                             replaced in place by its leader.  */
    8442     14746424 :   for (slp_tree &child : SLP_TREE_CHILDREN (node))
    8443      4303917 :     if (child)
    8444      3872549 :       vect_cse_slp_nodes (bst_map, child);
    8445              : 
    8446              :   /* Now record the node for CSE in other siblings.  */
    8447      4934083 :   if (put_p)
    8448      3441207 :     *bst_map->get (SLP_TREE_SCALAR_STMTS (node)) = node;
    8449              : }
    8450              : 
    8451              : /* Optimize the SLP graph of VINFO.  Does nothing when VINFO has no SLP
                           instances.  Otherwise runs vect_optimize_slp_pass and then applies
                           CSE to the tree of every instance.  */
    8452              : 
    8453              : void
    8454      1025600 : vect_optimize_slp (vec_info *vinfo)
    8455              : {
    8456      1025600 :   if (vinfo->slp_instances.is_empty ())
    8457              :     return;
    8458       678936 :   vect_optimize_slp_pass (vinfo).run ();
    8459              : 
    8460              :   /* Apply CSE again to nodes after permute optimization.  */
                          /* A single map is shared by all instances and released at the
                             end.  */
    8461       678936 :   scalar_stmts_to_slp_tree_map_t *bst_map
    8462       678936 :     = new scalar_stmts_to_slp_tree_map_t ();
    8463              : 
    8464      3498289 :   for (auto inst : vinfo->slp_instances)
    8465      1461481 :     vect_cse_slp_nodes (bst_map, SLP_INSTANCE_TREE (inst));
    8466              : 
    8467       678936 :   release_scalar_stmts_to_slp_tree_map (bst_map);
    8468              : }
    8469              : 
    8470              : /* Gather loads reachable from the individual SLP graph entries.  For
                           each SLP instance of VINFO the overload taking a node is called
                           with the instance's load vector and tree.  */
    8471              : 
    8472              : void
    8473      1025600 : vect_gather_slp_loads (vec_info *vinfo)
    8474              : {
    8475      1025600 :   unsigned i;
    8476      1025600 :   slp_instance instance;
    8477      2487081 :   FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
    8478              :     {
                              /* The visited set is local to the loop body, so every instance
                                 starts with an empty one.  */
    8479      1461481 :       hash_set<slp_tree> visited;
    8480      1461481 :       vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
    8481              :                              SLP_INSTANCE_TREE (instance), visited);
    8482      1461481 :     }
    8483      1025600 : }
    8484              : 
    8485              : /* For NODE update VF based on the number of lanes and the vector types
    8486              :    used.  VF is updated in place to a common multiple of its old value
                           and the unrolling factors computed for NODE and the nodes below
                           it.  VISITED records the nodes already handled so that each is
                           processed once.  Null nodes and nodes that are not
                           vect_internal_def are ignored.  */
    8487              : 
    8488              : static void
    8489      4228395 : vect_update_slp_vf_for_node (slp_tree node, poly_uint64 &vf,
    8490              :                              hash_set<slp_tree> &visited)
    8491              : {
    8492      4228395 :   if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    8493      1519723 :     return;
    8494      3072818 :   if (visited.add (node))
    8495              :     return;
    8496              : 
                          /* Children are processed before NODE itself.  */
    8497     10272516 :   for (slp_tree child : SLP_TREE_CHILDREN (node))
    8498      3468638 :     vect_update_slp_vf_for_node (child, vf, visited);
    8499              : 
    8500              :   /* We do not visit SLP nodes for constants or externals - those neither
    8501              :      have a vector type set yet (vectorizable_* does this) nor do they
    8502              :      have max_nunits set.  Instead we rely on internal nodes max_nunit
    8503              :      to cover constant/external operands.
    8504              :      Note that when we stop using fixed size vectors externs and constants
    8505              :      shouldn't influence the (minimum) vectorization factor, instead
    8506              :      vectorizable_* should honor the vectorization factor when trying to
    8507              :      assign vector types to constants and externals and cause iteration
    8508              :      to a higher vectorization factor when required.  */
    8509      2708672 :   poly_uint64 node_vf
    8510      2708672 :     = calculate_unrolling_factor (node->max_nunits, SLP_TREE_LANES (node));
    8511      2708672 :   vf = force_common_multiple (vf, node_vf);
    8512              : 
    8513              :   /* For permute nodes that are fed from externs or constants we have to
    8514              :      consider their number of lanes as well.  Likewise for store-lanes.  */
                          /* The child's lane count is combined with NODE's max_nunits here
                             because only children that are not internal defs are
                             considered.  */
    8515      2708672 :   if (SLP_TREE_PERMUTE_P (node) || node->ldst_lanes)
    8516       710178 :     for (slp_tree child : SLP_TREE_CHILDREN (node))
    8517       191136 :       if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
    8518              :         {
    8519         3585 :           poly_uint64 child_vf
    8520         3585 :             = calculate_unrolling_factor (node->max_nunits,
    8521              :                                           SLP_TREE_LANES (child));
    8522         3585 :           vf = force_common_multiple (vf, child_vf);
    8523              :         }
    8524              : }
    8525              : 
    8526              : /* For each possible SLP instance decide whether to SLP it and calculate overall
    8527              :    unrolling factor needed to SLP the loop.  Return TRUE if decided to SLP at
    8528              :    least one instance.  The computed unrolling factor is stored as the
                           vectorization factor of LOOP_VINFO whatever the return value.  */
    8529              : 
    8530              : bool
    8531       473646 : vect_make_slp_decision (loop_vec_info loop_vinfo)
    8532              : {
    8533       473646 :   unsigned int i;
    8534       473646 :   poly_uint64 unrolling_factor = 1;
    8535       473646 :   const vec<slp_instance> &slp_instances
    8536              :     = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
    8537       473646 :   slp_instance instance;
    8538       473646 :   int decided_to_slp = 0;
    8539              : 
    8540       473646 :   DUMP_VECT_SCOPE ("vect_make_slp_decision");
    8541              : 
                          /* One visited set is shared by all instances, so a node reachable
                             from several instances is processed once.  */
    8542       473646 :   hash_set<slp_tree> visited;
    8543      1233403 :   FOR_EACH_VEC_ELT (slp_instances, i, instance)
    8544              :     {
    8545       759757 :       slp_tree root = SLP_INSTANCE_TREE (instance);
    8546              : 
    8547              :       /* All unroll factors have the form:
    8548              : 
    8549              :            GET_MODE_SIZE (vinfo->vector_mode) * X
    8550              : 
    8551              :          for some rational X, so they must have a common multiple.  */
    8552       759757 :       vect_update_slp_vf_for_node (root, unrolling_factor, visited);
    8553              : 
    8554              :       /* If all instances ended up with vector(1) T roots make sure to
    8555              :          not vectorize.  RVV for example relies on loop vectorization
    8556              :          when some instances are essentially kept scalar.  See PR121048.  */
                              /* Only roots whose vector type is known to have more than one
                                 subpart are counted.  */
    8557       759757 :       if (SLP_TREE_VECTYPE (root)
    8558       759757 :           && known_gt (TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (root)), 1U))
    8559       621581 :         decided_to_slp++;
    8560              :     }
    8561              : 
    8562       473646 :   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = unrolling_factor;
    8563              : 
    8564       473646 :   if (decided_to_slp && dump_enabled_p ())
    8565              :     {
    8566        19116 :       dump_printf_loc (MSG_NOTE, vect_location,
    8567              :                        "Decided to SLP %d instances. Unrolling factor ",
    8568              :                        decided_to_slp);
    8569        19116 :       dump_dec (MSG_NOTE, unrolling_factor);
    8570        19116 :       dump_printf (MSG_NOTE, "\n");
    8571              :     }
    8572              : 
    8573       473646 :   return (decided_to_slp > 0);
    8574       473646 : }
    8575              : 
    8576              : /* Initialize a bb_vec_info struct for the statements in BBS basic blocks.
                           Every statement of every block, and every PHI of all blocks but
                           the first, gets its UID set to 0.  The PHIs and the non-debug
                           statements are also registered with add_stmt.  */
    8577              : 
    8578      2183721 : _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
    8579              :   : vec_info (vec_info::bb, shared),
    8580      2183721 :     roots (vNULL)
    8581              : {
    8582              :   /* The region we are operating on.  bbs[0] is the entry, excluding
    8583              :      its PHI nodes.  In the future we might want to track an explicit
    8584              :      entry edge to cover bbs[0] PHI nodes and have a region entry
    8585              :      insert location.  */
                          /* Only the address and length of _BBS are stored, not a copy of
                             its elements.  */
    8586      2183721 :   bbs = _bbs.address ();
    8587      2183721 :   nbbs = _bbs.length ();
    8588              : 
    8589     17491773 :   for (unsigned i = 0; i < nbbs; ++i)
    8590              :     {
                              /* PHIs of the entry block are not part of the region.  */
    8591     15308052 :       if (i != 0)
    8592     19926052 :         for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
    8593      6801721 :              gsi_next (&si))
    8594              :           {
    8595      6801721 :             gphi *phi = si.phi ();
    8596      6801721 :             gimple_set_uid (phi, 0);
    8597      6801721 :             add_stmt (phi);
    8598              :           }
    8599     30616104 :       for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
    8600    134643934 :            !gsi_end_p (gsi); gsi_next (&gsi))
    8601              :         {
    8602    119335882 :           gimple *stmt = gsi_stmt (gsi);
                                  /* Debug statements get their UID set as well but are not
                                     added.  */
    8603    119335882 :           gimple_set_uid (stmt, 0);
    8604    119335882 :           if (is_gimple_debug (stmt))
    8605     74311225 :             continue;
    8606     45024657 :           add_stmt (stmt);
    8607              :         }
    8608              :     }
    8609      2183721 : }
    8610              : 
    8611              : 
    8612              : /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
    8613              :    stmts in the basic block.  The body below resets the UIDs of the
                           region's statements and PHIs to -1 and releases the recorded roots.
                           NOTE(review): freeing the stmt_vec_infos is not visible here and
                           presumably happens in the base class destructor - confirm.  */
    8614              : 
    8615      2183721 : _bb_vec_info::~_bb_vec_info ()
    8616              : {
    8617              :   /* Reset region marker.  */
    8618     17491773 :   for (unsigned i = 0; i < nbbs; ++i)
    8619              :     {
                              /* As in the constructor, PHIs of the first block are left
                                 alone.  */
    8620     15308052 :       if (i != 0)
    8621     19941839 :         for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
    8622      6817508 :              gsi_next (&si))
    8623              :           {
    8624      6817508 :             gphi *phi = si.phi ();
    8625      6817508 :             gimple_set_uid (phi, -1);
    8626              :           }
    8627     30616104 :       for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
    8628    134586827 :            !gsi_end_p (gsi); gsi_next (&gsi))
    8629              :         {
    8630    119278775 :           gimple *stmt = gsi_stmt (gsi);
    8631    119278775 :           gimple_set_uid (stmt, -1);
    8632              :         }
    8633              :     }
    8634              : 
                          /* Release the vectors held by each root entry, then the root
                             vector itself.  */
    8635      3428778 :   for (unsigned i = 0; i < roots.length (); ++i)
    8636              :     {
    8637      1245057 :       roots[i].stmts.release ();
    8638      1245057 :       roots[i].roots.release ();
    8639      1245057 :       roots[i].remain.release ();
    8640              :     }
    8641      2183721 :   roots.release ();
    8642      2183721 : }
    8643              : 
    8644              : /* Subroutine of vect_slp_analyze_node_operations.  Handle the root of NODE,
    8645              :    given that child nodes have already been processed, and that
    8646              :    their def types currently match their SLP node's def type.
                           Return true if the analysis of NODE succeeded, false otherwise.
                           COST_VEC is passed on to the analysis routines called here.  */
    8647              : 
    8648              : static bool
    8649      2784748 : vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
    8650              :                                     slp_instance node_instance,
    8651              :                                     stmt_vector_for_cost *cost_vec)
    8652              : {
    8653              :   /* Handle purely internal nodes.  */
    8654      2784748 :   if (SLP_TREE_PERMUTE_P (node))
    8655              :     {
    8656       121226 :       if (!vectorizable_slp_permutation (vinfo, NULL, node, cost_vec))
    8657              :         return false;
    8658              : 
                              /* Each live scalar stmt of the permute node also has to pass
                                 vectorizable_live_operation.  Entries of the scalar stmt
                                 vector may be null and are skipped.  */
    8659              :       stmt_vec_info slp_stmt_info;
    8660              :       unsigned int i;
    8661       319278 :       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
    8662              :         {
    8663       199339 :           if (slp_stmt_info
    8664       193775 :               && STMT_VINFO_LIVE_P (slp_stmt_info)
    8665       199339 :               && !vectorizable_live_operation (vinfo, slp_stmt_info, node,
    8666              :                                                node_instance, i,
    8667              :                                                false, cost_vec))
    8668              :             return false;
    8669              :         }
    8670       119939 :       SLP_TREE_TYPE (node) = permute_info_type;
    8671       119939 :       return true;
    8672              :     }
    8673              : 
                          /* All other nodes are analyzed by vect_analyze_stmt.  */
    8674      2663522 :   return vect_analyze_stmt (vinfo, node, node_instance, cost_vec);
    8675              : }
    8676              : 
                        /* Comparison callback taking pointers to two ints A_ and B_.  Returns
                           the difference of the first and the second value, which orders
                           ints ascending.
                           NOTE(review): the subtraction can overflow for operands of large
                           magnitude and opposite sign; the use visible in this file sorts
                           basic block indexes.  */
    8677              : static int
    8678      1847975 : sort_ints (const void *a_, const void *b_)
    8679              : {
    8680      1847975 :   int a = *(const int *)a_;
    8681      1847975 :   int b = *(const int *)b_;
    8682      1847975 :   return a - b;
    8683              : }
    8684              : 
    8685              : /* Verify if we can externalize a set of internal defs.  Returns false when STMTS contains a NULL entry or when the blocks holding the defs do not pass the dominance checks below; returns true when all defs are in one block.  */
    8686              : 
    8687              : static bool
    8688       371746 : vect_slp_can_convert_to_external (const vec<stmt_vec_info> &stmts)
    8689              : {
    8690              :   /* Constant generation uses get_later_stmt which can only handle
    8691              :      defs from the same BB or a set of defs that can be ordered
    8692              :      with a dominance query.  */
    8693       371746 :   basic_block bb = NULL;
    8694       371746 :   bool all_same = true;
    8695       371746 :   auto_vec<int> bbs;
    8696       743492 :   bbs.reserve_exact (stmts.length ());
    8697      2004470 :   for (stmt_vec_info stmt : stmts)
    8698              :     {
    8699       889232 :       if (!stmt) /* A NULL entry makes the whole set unsuitable.  */
    8700              :         return false;
    8701       889232 :       else if (!bb) /* First entry: remember its block as the reference.  */
    8702       371746 :         bb = gimple_bb (stmt->stmt);
    8703       517486 :       else if (gimple_bb (stmt->stmt) != bb)
    8704       172873 :         all_same = false;
    8705       889232 :       bbs.quick_push (gimple_bb (stmt->stmt)->index); /* One index per entry; space was reserved above.  */
    8706              :     }
    8707       371746 :   if (all_same)
    8708              :     return true;
    8709              : 
    8710              :   /* Produce a vector of unique BB indexes for the defs: sort, then compact in place keeping the first of each run of equal values.  */
    8711       129106 :   bbs.qsort (sort_ints);
    8712              :   unsigned i, j;
    8713       315038 :   for (i = 1, j = 1; i < bbs.length (); ++i)
    8714       185932 :     if (bbs[i] != bbs[j-1])
    8715       137766 :       bbs[j++] = bbs[i];
    8716       129106 :   gcc_assert (j >= 2); /* ALL_SAME is false here, so at least two distinct indexes are expected.  */
    8717       129106 :   bbs.truncate (j);
    8718              :   /* With exactly two blocks either dominance direction is acceptable.  */
    8719       258212 :   if (bbs.length () == 2)
    8720       125690 :     return (dominated_by_p (CDI_DOMINATORS,
    8721       125690 :                             BASIC_BLOCK_FOR_FN (cfun, bbs[0]),
    8722       125690 :                             BASIC_BLOCK_FOR_FN (cfun, bbs[1]))
    8723       245767 :             || dominated_by_p (CDI_DOMINATORS,
    8724       120077 :                                BASIC_BLOCK_FOR_FN (cfun, bbs[1]),
    8725       120077 :                                BASIC_BLOCK_FOR_FN (cfun, bbs[0])));
    8726              : 
    8727              :   /* ???  For more than two BBs we can sort the vector and verify the
    8728              :      result is a total order.  But we can't use vec::qsort with a
    8729              :      compare function using a dominance query since there's no way to
    8730              :      signal failure and any fallback for an unordered pair would
    8731              :      fail qsort_chk later.
    8732              :      For now simply hope that ordering after BB index provides the
    8733              :      best candidate total order.  If required we can implement our
    8734              :      own mergesort or export an entry without checking.  */
    8735       387185 :   for (unsigned i = 1; i < bbs.length (); ++i) /* Each block must be dominated by its predecessor in index order.  */
    8736        12052 :     if (!dominated_by_p (CDI_DOMINATORS,
    8737        12052 :                          BASIC_BLOCK_FOR_FN (cfun, bbs[i]),
    8738        12052 :                          BASIC_BLOCK_FOR_FN (cfun, bbs[i-1])))
    8739              :       return false;
    8740              : 
    8741              :   return true;
    8742       371746 : }
    8743              : 
    8744              : /* Try to build NODE from scalars, returning true on success.
    8745              :    NODE_INSTANCE is the SLP instance that contains NODE.  On success NODE becomes a vect_external_def whose scalar operands are the LHSs of its original scalar stmts; on failure NODE is left unmodified.  */
    8746              : 
    8747              : static bool
    8748       559858 : vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
    8749              :                               slp_instance node_instance)
    8750              : {
    8751       559858 :   stmt_vec_info stmt_info;
    8752       559858 :   unsigned int i;
    8753              :   /* Conversion is only attempted when VINFO is a bb_vec_info, never for the instance root, and only for nodes whose scalar stmts vector exists and contains no pattern stmts.  */
    8754       559858 :   if (!is_a <bb_vec_info> (vinfo)
    8755        69178 :       || node == SLP_INSTANCE_TREE (node_instance)
    8756        20394 :       || !SLP_TREE_SCALAR_STMTS (node).exists ()
    8757        20353 :       || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node))
    8758              :       /* Force the mask use to be built from scalars instead.  */
    8759        18326 :       || VECTOR_BOOLEAN_TYPE_P (SLP_TREE_VECTYPE (node))
    8760       578037 :       || !vect_slp_can_convert_to_external (SLP_TREE_SCALAR_STMTS (node)))
    8761       541679 :     return false;
    8762              : 
    8763        18179 :   if (dump_enabled_p ())
    8764           74 :     dump_printf_loc (MSG_NOTE, vect_location,
    8765              :                      "Building vector operands of %p from scalars instead\n",
    8766              :                      (void *) node);
    8767              : 
    8768              :   /* Don't remove and free the child nodes here, since they could be
    8769              :      referenced by other structures.  The analysis and scheduling phases
    8770              :      (need to) ignore child nodes of anything that isn't vect_internal_def.  */
    8771        18179 :   unsigned int group_size = SLP_TREE_LANES (node);
    8772        18179 :   SLP_TREE_DEF_TYPE (node) = vect_external_def;
    8773              :   /* Invariants get their vector type from the uses.  */
    8774        18179 :   SLP_TREE_VECTYPE (node) = NULL_TREE;
    8775        18179 :   SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
    8776        18179 :   SLP_TREE_LOAD_PERMUTATION (node).release ();
    8777        63951 :   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info) /* Lane I's operand is the LHS of the original (non-pattern) stmt of lane I.  */
    8778              :     {
    8779        45772 :       tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
    8780        45772 :       SLP_TREE_SCALAR_OPS (node)[i] = lhs;
    8781              :     }
    8782              :   return true;
    8783              : }
    8784              : 
    8785              : /* Return true if all elements of the slice are the same, i.e. every element compares operand_equal_p to element 0; slices of length 0 or 1 trivially qualify.  */
    8786              : bool
    8787       472141 : vect_scalar_ops_slice::all_same_p () const
    8788              : {
    8789       519104 :   for (unsigned int i = 1; i < length; ++i)
    8790       437519 :     if (!operand_equal_p (op (0), op (i)))
    8791              :       return false;
    8792              :   return true;
    8793              : }
    8794              : 
    8795              : hashval_t /* Hash of slice S: every element is folded in order into one value with iterative_hash_expr, starting from 0.  */
    8796       403915 : vect_scalar_ops_slice_hash::hash (const value_type &s)
    8797              : {
    8798       403915 :   hashval_t hash = 0;
    8799      1553131 :   for (unsigned i = 0; i < s.length; ++i)
    8800      1149216 :     hash = iterative_hash_expr (s.op (i), hash);
    8801       403915 :   return hash;
    8802              : }
    8803              : 
    8804              : bool /* Return true if slices S1 and S2 have the same length and their elements are pairwise operand_equal_p.  */
    8805       219272 : vect_scalar_ops_slice_hash::equal (const value_type &s1,
    8806              :                                    const compare_type &s2)
    8807              : {
    8808       219272 :   if (s1.length != s2.length)
    8809              :     return false;
    8810       383515 :   for (unsigned i = 0; i < s1.length; ++i)
    8811       333855 :     if (!operand_equal_p (s1.op (i), s2.op (i)))
    8812              :       return false;
    8813              :   return true;
    8814              : }
    8815              : 
    8816              : /* Compute the prologue cost for invariant or constant operands represented
    8817              :    by NODE, recording one vect_prologue entry in COST_VEC for each distinct vector that has to be built (nothing is recorded for an existing vector).  */
    8818              : 
    8819              : static void
    8820      1099348 : vect_prologue_cost_for_slp (vec_info *vinfo, slp_tree node,
    8821              :                             stmt_vector_for_cost *cost_vec)
    8822              : {
    8823              :   /* There's a special case of an existing vector, that costs nothing.  */
    8824      1099348 :   if (SLP_TREE_SCALAR_OPS (node).length () == 0
    8825      1099348 :       && !SLP_TREE_VEC_DEFS (node).is_empty ())
    8826         1425 :     return;
    8827              :   /* Without looking at the actual initializer a vector of
    8828              :      constants can be implemented as load from the constant pool.
    8829              :      When all elements are the same we can use a splat.  */
    8830      1097923 :   tree vectype = SLP_TREE_VECTYPE (node);
    8831      1097923 :   unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
    8832      1097923 :   unsigned HOST_WIDE_INT const_nunits;
    8833      1097923 :   unsigned nelt_limit;
    8834      1097923 :   unsigned nvectors = vect_get_num_copies (vinfo, node);
    8835      1097923 :   auto ops = &SLP_TREE_SCALAR_OPS (node);
    8836      1097923 :   auto_vec<unsigned int> starts (nvectors); /* Start offsets into OPS of the slices that get costed.  */
    8837      1097923 :   if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
    8838      1097923 :       && ! multiple_p (const_nunits, group_size))
    8839              :     {
    8840        64056 :       nelt_limit = const_nunits;
    8841        64056 :       hash_set<vect_scalar_ops_slice_hash> vector_ops; /* Slices seen so far; only the first occurrence of a slice is queued for costing.  */
    8842       266282 :       for (unsigned int i = 0; i < nvectors; ++i)
    8843       202226 :         if (!vector_ops.add ({ ops, i * nelt_limit, nelt_limit }))
    8844       152566 :           starts.quick_push (i * nelt_limit);
    8845        64056 :     }
    8846              :   else
    8847              :     {
    8848              :       /* If either the vector has variable length or the vectors
    8849              :          are composed of repeated whole groups we only need to
    8850              :          cost construction once.  All vectors will be the same.  */
    8851      1033867 :       nelt_limit = group_size;
    8852      1033867 :       starts.quick_push (0);
    8853              :     }
    8854              :   /* ???  We're just tracking whether vectors in a single node are the same.
    8855              :      Ideally we'd do something more global.  */
    8856      1097923 :   bool passed = false; /* Set once NODE has been handed to record_stmt_cost for a non-load kind.  */
    8857      4480202 :   for (unsigned int start : starts)
    8858              :     {
    8859      1186433 :       vect_cost_for_stmt kind;
    8860      1186433 :       if (SLP_TREE_DEF_TYPE (node) == vect_constant_def)
    8861              :         kind = vector_load;
    8862       472141 :       else if (vect_scalar_ops_slice { ops, start, nelt_limit }.all_same_p ())
    8863              :         kind = scalar_to_vec;
    8864              :       else
    8865       390556 :         kind = vec_construct;
    8866              :       /* The target cost hook has no idea which part of the SLP node
    8867              :          we are costing so avoid passing it down more than once.  Pass
    8868              :          it to the first vec_construct or scalar_to_vec part since for those
    8869              :          the x86 backend tries to account for GPR to XMM register moves.  */
    8870      1186433 :       record_stmt_cost (cost_vec, 1, kind, nullptr,
    8871      1186433 :                         (kind != vector_load && !passed) ? node : nullptr,
    8872              :                         vectype, 0, vect_prologue);
    8873      1186433 :       if (kind != vector_load)
    8874       472141 :         passed = true;
    8875              :     }
    8876      1097923 : }
    8877              : 
    8878              : /* Analyze statements contained in SLP tree NODE after recursively analyzing
    8879              :    the subtree.  NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
    8880              :    VISITED_SET and VISITED_VEC record the nodes analyzed so far; when NODE fails, the entries added for NODE and during its recursion are removed again and COST_VEC is truncated to its length before the children were analyzed.  A failed node may still be accepted if it can be converted to an external def.
    8881              :    Return true if the operations are supported.  */
    8882              : 
    8883              : static bool
    8884      5149053 : vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
    8885              :                                   slp_instance node_instance,
    8886              :                                   hash_set<slp_tree> &visited_set,
    8887              :                                   vec<slp_tree> &visited_vec,
    8888              :                                   stmt_vector_for_cost *cost_vec)
    8889              : {
    8890      5149053 :   int i, j;
    8891      5149053 :   slp_tree child;
    8892              : 
    8893              :   /* Assume we can code-generate all invariants.  A NULL NODE is accepted as well.  */
    8894      5149053 :   if (!node
    8895      4774933 :       || SLP_TREE_DEF_TYPE (node) == vect_constant_def
    8896      4008320 :       || SLP_TREE_DEF_TYPE (node) == vect_external_def)
    8897              :     return true;
    8898              : 
    8899      3465867 :   if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
    8900              :     {
    8901            5 :       if (dump_enabled_p ())
    8902            0 :         dump_printf_loc (MSG_NOTE, vect_location,
    8903              :                          "Failed cyclic SLP reference in %p\n", (void *) node);
    8904            5 :       return false;
    8905              :     }
    8906      3465862 :   gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
    8907              : 
    8908              :   /* If we already analyzed the exact same set of scalar stmts we're done.
    8909              :      We share the generated vector stmts for those.  */
    8910      3465862 :   if (visited_set.add (node))
    8911              :     return true;
    8912      3096249 :   visited_vec.safe_push (node);
    8913              :   /* Remember the rollback points; VISITED_REC_START is taken after NODE itself was pushed, which is why the undo loop below compares with >=.  */
    8914      3096249 :   bool res = true;
    8915      3096249 :   unsigned visited_rec_start = visited_vec.length ();
    8916      3096249 :   unsigned cost_vec_rec_start = cost_vec->length ();
    8917      3096249 :   bool seen_non_constant_child = false;
    8918      6631867 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    8919              :     {
    8920      3846905 :       res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
    8921              :                                               visited_set, visited_vec,
    8922              :                                               cost_vec);
    8923      3846905 :       if (!res)
    8924              :         break;
    8925      3535618 :       if (child && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
    8926      3535618 :         seen_non_constant_child = true;
    8927              :     }
    8928              :   /* We're having difficulties scheduling nodes with just constant
    8929              :      operands and no scalar stmts since we then cannot compute a stmt
    8930              :      insertion place.  */
    8931      3096249 :   if (res
    8932      3096249 :       && !seen_non_constant_child
    8933      3096249 :       && SLP_TREE_SCALAR_STMTS (node).is_empty ())
    8934              :     {
    8935          214 :       if (dump_enabled_p ())
    8936            6 :         dump_printf_loc (MSG_NOTE, vect_location,
    8937              :                          "Cannot vectorize all-constant op node %p\n",
    8938              :                          (void *) node);
    8939              :       res = false;
    8940              :     }
    8941              :   /* Only analyze NODE itself once all of its children were accepted.  */
    8942      3096035 :   if (res)
    8943      2784748 :     res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
    8944              :                                               cost_vec);
    8945              :   /* If analysis failed we have to pop all recursive visited nodes
    8946              :      plus ourselves.  */
    8947      3096249 :   if (!res)
    8948              :     {
    8949      2803354 :       while (visited_vec.length () >= visited_rec_start)
    8950       841819 :         visited_set.remove (visited_vec.pop ());
    8951       559858 :       cost_vec->truncate (cost_vec_rec_start);
    8952              :     }
    8953              : 
    8954              :   /* When the node can be vectorized cost invariant nodes it references.
    8955              :      This is not done in DFS order to allow the referring node
    8956              :      vectorizable_* calls to nail down the invariant nodes vector type
    8957              :      and possibly unshare it if it needs a different vector type than
    8958              :      other referrers.  */
    8959      3096249 :   if (res)
    8960      5759733 :     FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
    8961      3223342 :       if (child
    8962      2915581 :           && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
    8963      2915581 :               || SLP_TREE_DEF_TYPE (child) == vect_external_def)
    8964              :           /* Perform usual caching, note code-generation still
    8965              :              code-gens these nodes multiple times but we expect
    8966              :              to CSE them later.  */
    8967      4409206 :           && !visited_set.add (child))
    8968              :         {
    8969      1142461 :           visited_vec.safe_push (child);
    8970              :           /* ???  After auditing more code paths make a "default"
    8971              :              and push the vector type from NODE to all children
    8972              :              if it is not already set.  */
    8973              :           /* Compute the number of vectors to be generated.  */
    8974      1142461 :           tree vector_type = SLP_TREE_VECTYPE (child);
    8975      1142461 :           if (!vector_type)
    8976              :             {
    8977              :               /* Masked loads can have an undefined (default SSA definition)
    8978              :                  else operand.  We do not need to cost it.  */
    8979        43113 :               vec<tree> ops = SLP_TREE_SCALAR_OPS (child);
    8980        44548 :               if (SLP_TREE_TYPE (node) == load_vec_info_type
    8981        44548 :                   && ((ops.length ()
    8982         1435 :                        && TREE_CODE (ops[0]) == SSA_NAME
    8983            0 :                        && SSA_NAME_IS_DEFAULT_DEF (ops[0])
    8984            0 :                        && VAR_P (SSA_NAME_VAR (ops[0])))
    8985         1435 :                       || SLP_TREE_DEF_TYPE (child) == vect_constant_def))
    8986         1435 :                 continue;
    8987              : 
    8988              :               /* For shifts with a scalar argument we don't need
    8989              :                  to cost or code-generate anything.
    8990              :                  ???  Represent this more explicitly.  */
    8991        41678 :               gcc_assert (SLP_TREE_TYPE (node) == shift_vec_info_type
    8992              :                           && j == 1);
    8993        41678 :               continue;
    8994        41678 :             }
    8995              : 
    8996              :           /* And cost them.  */
    8997      1099348 :           vect_prologue_cost_for_slp (vinfo, child, cost_vec);
    8998              :         }
    8999              : 
    9000              :   /* If this node or any of its children can't be vectorized, try pruning
    9001              :      the tree here rather than felling the whole thing.  */
    9002       559858 :   if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
    9003              :     {
    9004              :       /* We'll need to revisit this for invariant costing and number
    9005              :          of vectorized stmt setting.   */
    9006              :       res = true;
    9007              :     }
    9008              : 
    9009              :   return res;
    9010              : }
    9011              : 
    9012              : /* Mark lanes of NODE that are live outside of the basic-block vectorized
    9013              :    region and that can be vectorized using vectorizable_live_operation
    9014              :    with STMT_VINFO_LIVE_P.  Not handled live operations will cause the
    9015              :    scalar code computing it to be retained.  VISITED holds the SLP nodes already walked and SVISITED the stmts already handled; COST_VEC and INSTANCE are passed on to vectorizable_live_operation.  Recurses into the vect_internal_def children of NODE.  */
    9016              : 
    9017              : static void
    9018       909127 : vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
    9019              :                              slp_instance instance,
    9020              :                              stmt_vector_for_cost *cost_vec,
    9021              :                              hash_set<stmt_vec_info> &svisited,
    9022              :                              hash_set<slp_tree> &visited)
    9023              : {
    9024       909127 :   if (visited.add (node))
    9025        42449 :     return;
    9026              : 
    9027       866678 :   unsigned i;
    9028       866678 :   stmt_vec_info stmt_info;
    9029       866678 :   stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node); /* Reference point for the dominance check on uses below.  */
    9030      3140826 :   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    9031              :     {
    9032      2274148 :       if (!stmt_info || svisited.contains (stmt_info))
    9033        48153 :         continue;
    9034      2249816 :       stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
    9035      2249816 :       if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
    9036        11999 :           && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
    9037              :         /* Only the pattern root stmt computes the original scalar value.  */
    9038         8933 :         continue;
    9039      2240883 :       if (!PURE_SLP_STMT (orig_stmt_info))
    9040              :         /* Iff the stmt is not part of the vector coverage because it or
    9041              :            uses of it are used by SLP graph leafs as extern input there is
    9042              :            no point in trying to live code-generate from a vector stmt as
    9043              :            the scalar stmt will survive anyway.  */
    9044        14888 :         continue;
    9045      2225995 :       bool mark_visited = true; /* Cleared when a live operation for this stmt is rejected, so the stmt is not added to SVISITED.  */
    9046      2225995 :       gimple *orig_stmt = orig_stmt_info->stmt;
    9047      2225995 :       ssa_op_iter op_iter;
    9048      2225995 :       def_operand_p def_p;
    9049      4940717 :       FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
    9050              :         {
    9051              :           /* We have to verify whether we can insert the lane extract
    9052              :              before all uses.  The following is a conservative approximation.
    9053              :              We cannot put this into vectorizable_live_operation because
    9054              :              iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
    9055              :              doesn't work.
    9056              :              Note that while the fact that we emit code for loads at the
    9057              :              first load should make this a non-problem leafs we construct
    9058              :              from scalars are vectorized after the last scalar def.
    9059              :              ???  If we'd actually compute the insert location during
    9060              :              analysis we could use sth less conservative than the last
    9061              :              scalar stmt in the node for the dominance check.  */
    9062              :           /* ???  What remains is "live" uses in vector CTORs in the same
    9063              :              SLP graph which is where those uses can end up code-generated
    9064              :              right after their definition instead of close to their original
    9065              :              use.  But that would restrict us to code-generate lane-extracts
    9066              :              from the latest stmt in a node.  So we compensate for this
    9067              :              during code-generation, simply not replacing uses for those
    9068              :              hopefully rare cases.  */
    9069       488727 :           imm_use_iterator use_iter;
    9070       488727 :           gimple *use_stmt;
    9071       488727 :           stmt_vec_info use_stmt_info;
    9072              :           /* A def is live when it has a non-debug use whose stmt is unknown to BB_VINFO or is not a pure SLP stmt.  */
    9073       488727 :           bool live_p = false;
    9074       488727 :           bool can_insert = true;
    9075      1889544 :           FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
    9076       927962 :             if (!is_gimple_debug (use_stmt)
    9077       927962 :                 && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
    9078       690868 :                     || !PURE_SLP_STMT (use_stmt_info)))
    9079              :               {
    9080       146178 :                 live_p = true;
    9081       146178 :                 if (!vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
    9082              :                   {
    9083        15872 :                     if (dump_enabled_p ())
    9084           34 :                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    9085              :                                        "Cannot determine insertion place for "
    9086              :                                        "lane extract\n");
    9087              :                     can_insert = false;
    9088              :                     break;
    9089              :                   }
    9090       488727 :               }
    9091       488727 :           if (live_p && can_insert)
    9092              :             {
    9093              :               /* Only record a live stmt when we can replace all uses.  We
    9094              :                  record from which SLP tree we vectorize the uses, so we'll
    9095              :                  cost once and can deal with the case that not all SLP nodes
    9096              :                  may be suitable for code-generation of all live uses.
    9097              :                  ???  But we never split up the work between multiple SLP
    9098              :                  nodes.  */
    9099        64904 :               STMT_VINFO_LIVE_P (stmt_info) = true;
    9100        64904 :               if (!vectorizable_live_operation (bb_vinfo, stmt_info, node,
    9101              :                                                 instance, i, false, cost_vec))
    9102              :                 {
    9103            0 :                   STMT_VINFO_LIVE_P (stmt_info) = false;
    9104            0 :                   mark_visited = false;
    9105              :                 }
    9106              :             }
    9107              :         }
    9108      2225995 :       if (mark_visited)
    9109      2225995 :         svisited.add (stmt_info);
    9110              :     }
    9111              :   /* Continue with the internal-def children; other children are not walked.  */
    9112              :   slp_tree child;
    9113      2499491 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    9114       872499 :     if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
    9115       228578 :       vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance, cost_vec,
    9116              :                                    svisited, visited);
    9117              : }
    9118              : 
    9119              : /* Traverse all slp instances of BB_VINFO, and mark lanes of every node that
    9120              :    are live outside of the basic-block vectorized region and that can be
    9121              :    vectorized using vectorizable_live_operation with STMT_VINFO_LIVE_P.  Costs are recorded in each instance's own cost_vec.  */
    9122              : 
    9123              : static void
    9124       234430 : vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo)
    9125              : {
    9126       234430 :   if (bb_vinfo->slp_instances.is_empty ())
    9127            0 :     return;
    9128              :   /* Both sets are shared by all instances, so a node or stmt handled for one instance is skipped for the following ones.  */
    9129       234430 :   hash_set<slp_tree> visited;
    9130       234430 :   hash_set<stmt_vec_info> svisited;
    9131      1383839 :   for (slp_instance instance : bb_vinfo->slp_instances)
    9132              :     {
    9133       680549 :       if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()) /* When there are root stmts the first one is marked live.  */
    9134        28766 :         STMT_VINFO_LIVE_P (SLP_INSTANCE_ROOT_STMTS (instance)[0]) = true;
    9135       680549 :       vect_location = instance->location ();
    9136       680549 :       vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
    9137              :                                    instance, &instance->cost_vec,
    9138              :                                    svisited, visited);
    9139              :     }
    9140       234430 : }
    9141              : 
    9142              : /* Determine whether we can vectorize the reduction epilogue for INSTANCE.  On success the estimated epilogue cost is recorded in COST_VEC and true is returned; otherwise nothing is recorded and false is returned.  */
    9143              : 
    9144              : static bool
    9145        73941 : vectorizable_bb_reduc_epilogue (slp_instance instance,
    9146              :                                 stmt_vector_for_cost *cost_vec)
    9147              : {
    9148        73941 :   gassign *stmt = as_a <gassign *> (instance->root_stmts[0]->stmt);
    9149        73941 :   enum tree_code reduc_code = gimple_assign_rhs_code (stmt);
    9150        73941 :   if (reduc_code == MINUS_EXPR) /* A MINUS_EXPR root is looked up as a PLUS_EXPR reduction.  */
    9151            0 :     reduc_code = PLUS_EXPR;
    9152        73941 :   internal_fn reduc_fn;
    9153        73941 :   tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
    9154        73941 :   if (!vectype
    9155        73929 :       || !reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
    9156        73929 :       || reduc_fn == IFN_LAST
    9157        73929 :       || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH)
    9158       108280 :       || !useless_type_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
    9159        34339 :                                      TREE_TYPE (vectype)))
    9160              :     {
    9161        50006 :       if (dump_enabled_p ())
    9162          309 :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    9163              :                          "not vectorized: basic block reduction epilogue "
    9164              :                          "operation unsupported.\n");
    9165        50006 :       return false;
    9166              :     }
    9167              : 
    9168              :   /* There's no way to cost a horizontal vector reduction via REDUC_FN so
    9169              :      cost log2 vector operations plus shuffles and one extraction.  */
    9170        23935 :   unsigned steps = floor_log2 (vect_nunits_for_cost (vectype));
    9171        23935 :   record_stmt_cost (cost_vec, steps, vector_stmt, instance->root_stmts[0],
    9172              :                     vectype, 0, vect_body);
    9173        23935 :   record_stmt_cost (cost_vec, steps, vec_perm, instance->root_stmts[0],
    9174              :                     vectype, 0, vect_body);
    9175        23935 :   record_stmt_cost (cost_vec, 1, vec_to_scalar, instance->root_stmts[0],
    9176              :                     vectype, 0, vect_body);
    9177              : 
    9178              :   /* Since we replace all stmts of a possibly longer scalar reduction
    9179              :      chain account for the extra scalar stmts for that.  */
    9180        23935 :   if (!instance->remain_defs.is_empty ())
    9181        19100 :     record_stmt_cost (cost_vec, instance->remain_defs.length (), scalar_stmt,
    9182         9550 :                       instance->root_stmts[0], 0, vect_body);
    9183              :   return true;
    9184              : }
    9185              : 
    9186              : /* Prune from ROOTS all stmts that are computed as part of lanes of NODE
    9187              :    and recurse to children.
                      :
                      :    Only nodes whose def type is vect_internal_def are walked.  VISITED
                      :    receives every node that is walked; a node for which VISITED.add
                      :    returns true is skipped (presumably because it was already in the
                      :    set), so subtrees reachable via several paths are handled once.
                      :    Each non-NULL scalar stmt of NODE is mapped through vect_orig_stmt
                      :    before it is removed from ROOTS.  NULL children are ignored.  */
    9188              : 
    9189              : static void
    9190       166296 : vect_slp_prune_covered_roots (slp_tree node, hash_set<stmt_vec_info> &roots,
    9191              :                               hash_set<slp_tree> &visited)
    9192              : {
    9193       166296 :   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
    9194       166296 :       || visited.add (node))
    9195        72731 :     return;
    9196              : 
    9197              :   stmt_vec_info stmt;
    9198              :   unsigned i;
    9199       321756 :   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    9200       228191 :     if (stmt)
    9201       233512 :       roots.remove (vect_orig_stmt (stmt));
    9202              : 
    9203              :   slp_tree child;
    9204       199761 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    9205       106196 :     if (child)
    9206       104790 :       vect_slp_prune_covered_roots (child, roots, visited);
    9207              : }
    9208              : 
    9209              : /* Hand over COST_VEC to the target COSTS grouped by SLP node.
                      :
                      :    COST_VEC is scanned front to back.  Each maximal run of adjacent
                      :    entries that share the same node is passed to COSTS->add_slp_cost
                      :    as a single array_slice.  Entries for the same node that are not
                      :    adjacent in COST_VEC therefore end up in separate calls.  COST_VEC
                      :    itself is not modified here.  */
    9210              : 
    9211              : static void
    9212       941325 : add_slp_costs (vector_costs *costs, stmt_vector_for_cost& cost_vec)
    9213              : {
    9214      3561388 :   for (unsigned start = 0; start < cost_vec.length ();)
    9215              :     {
    9216      2620063 :       unsigned end = start + 1;  /* One past the run beginning at START.  */
    9217      3199102 :       while (end < cost_vec.length ()
    9218      5465435 :              && cost_vec[start].node == cost_vec[end].node)
    9219       579039 :         end++;
    9220      2620063 :       costs->add_slp_cost (cost_vec[start].node,
    9221      2620063 :                            array_slice<stmt_info_for_cost>
    9222      2620063 :                              (cost_vec.begin () + start, end - start));
    9223      2620063 :       start = end;  /* Continue with the next run.  */
    9224              :     }
    9225       941325 : }
    9226              : 
    9227              : /* Analyze statements in SLP instances of VINFO.  Return true if the
    9228              :    operations are supported.
                      :
                      :    Each instance is analyzed with vect_slp_analyze_node_operations
                      :    followed by the kind-specific checks for CTOR, BB reduction and
                      :    gcond instances.  When an instance fails and VINFO is a
                      :    loop_vec_info the function returns false right away.  Otherwise
                      :    (BB vectorization) every node left in visited_vec is reset to
                      :    undef_vec_info_type, has its data deleted and is dropped from
                      :    VISITED, and the instance is freed and removed from
                      :    VINFO->slp_instances.  For an instance that passes, the collected
                      :    costs are handed to the loop's vector_costs or, for BB
                      :    vectorization, kept in instance->cost_vec.
                      :
                      :    Afterwards instances whose first root stmt also shows up (after
                      :    vect_orig_stmt) among the scalar stmts of a vect_internal_def node
                      :    in one of the remaining SLP trees are freed and removed as well.
                      :    The result is true iff VINFO->slp_instances is non-empty at the
                      :    end.  */
    9229              : 
    9230              : bool
    9231       660238 : vect_slp_analyze_operations (vec_info *vinfo)
    9232              : {
    9233       660238 :   slp_instance instance;
    9234       660238 :   int i;
    9235              : 
    9236       660238 :   DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
    9237              : 
    9238       660238 :   hash_set<slp_tree> visited;
    9239      1722068 :   for (i = 0; vinfo->slp_instances.iterate (i, &instance); )  /* I only advances when the instance is kept.  */
    9240              :     {
    9241      1302148 :       auto_vec<slp_tree> visited_vec;
    9242      1302148 :       stmt_vector_for_cost cost_vec;
    9243      1302148 :       cost_vec.create (2);
    9244      1302148 :       if (is_a <bb_vec_info> (vinfo))
    9245       780206 :         vect_location = instance->location ();
    9246      1302148 :       if (!vect_slp_analyze_node_operations (vinfo,
    9247              :                                              SLP_INSTANCE_TREE (instance),
    9248              :                                              instance, visited, visited_vec,
    9249              :                                              &cost_vec)
    9250              :           /* CTOR instances require vectorized defs for the SLP tree root.  */
    9251      1071751 :           || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor
    9252         5698 :               && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
    9253              :                   != vect_internal_def
    9254              :                   /* Make sure we vectorized with the expected type.  */
    9255         5698 :                   || !useless_type_conversion_p
    9256         5698 :                         (TREE_TYPE (TREE_TYPE (gimple_assign_rhs1
    9257              :                                               (instance->root_stmts[0]->stmt))),
    9258         5698 :                          TREE_TYPE (SLP_TREE_VECTYPE
    9259              :                                             (SLP_INSTANCE_TREE (instance))))))
    9260              :           /* Check we can vectorize the reduction.  */
    9261      1071736 :           || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc
    9262        73941 :               && !vectorizable_bb_reduc_epilogue (instance, &cost_vec))
    9263              :           /* Check we can vectorize the gcond.  */
    9264      2323878 :           || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_gcond
    9265        61309 :               && !vectorizable_early_exit (as_a <loop_vec_info> (vinfo),
    9266        61309 :                                            SLP_INSTANCE_ROOT_STMTS (instance)[0],
    9267              :                                            NULL,
    9268              :                                            SLP_INSTANCE_TREE (instance),
    9269              :                                            &cost_vec)))
    9270              :         {
    9271       339123 :           cost_vec.release ();  /* Costs of a failed instance are discarded.  */
    9272       339123 :           slp_tree node = SLP_INSTANCE_TREE (instance);
    9273       339123 :           stmt_vec_info stmt_info;  /* Stmt used to identify the instance in the dump below.  */
    9274       339123 :           if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
    9275       256210 :             stmt_info = SLP_INSTANCE_ROOT_STMTS (instance)[0];
    9276        82913 :           else if (!SLP_TREE_SCALAR_STMTS (node).is_empty ()
    9277        82913 :                    && SLP_TREE_SCALAR_STMTS (node)[0])
    9278              :             stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
    9279              :           else
    9280            0 :             stmt_info = SLP_TREE_REPRESENTATIVE (node);
    9281       339123 :           if (is_a <loop_vec_info> (vinfo))
    9282              :             {
    9283       240318 :               if (dump_enabled_p ())
    9284         6493 :                 dump_printf_loc (MSG_NOTE, vect_location,
    9285              :                                  "unsupported SLP instance starting from: %G",
    9286              :                                  stmt_info->stmt);
    9287       240318 :               return false;  /* For loops a single failing instance fails the whole analysis.  */
    9288              :             }
    9289        98805 :           if (dump_enabled_p ())
    9290          363 :             dump_printf_loc (MSG_NOTE, vect_location,
    9291              :                              "removing SLP instance operations starting from: %G",
    9292              :                              stmt_info->stmt);
    9293       522806 :           while (!visited_vec.is_empty ())  /* Reset every node left in visited_vec.  */
    9294              :             {
    9295       424001 :               slp_tree node = visited_vec.pop ();
    9296       424001 :               SLP_TREE_TYPE (node) = undef_vec_info_type;
    9297       424001 :               if (node->data)
    9298              :                 {
    9299         9969 :                   delete node->data;
    9300         9969 :                   node->data = nullptr;
    9301              :                 }
    9302       424001 :               visited.remove (node);
    9303              :             }
    9304        98805 :           vect_free_slp_instance (instance);
    9305        98805 :           vinfo->slp_instances.ordered_remove (i);  /* I is deliberately not advanced here.  */
    9306              :         }
    9307              :       else
    9308              :         {
    9309       963025 :           i++;
    9310       963025 :           if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
    9311              :             {
    9312       281624 :               add_slp_costs (loop_vinfo->vector_costs, cost_vec);
    9313       281624 :               cost_vec.release ();
    9314              :             }
    9315              :           else
    9316              :             /* For BB vectorization remember the SLP graph entry
    9317              :                cost for later.  */
    9318       681401 :             instance->cost_vec = cost_vec;
    9319              :         }
    9320      1302148 :     }
    9321              : 
    9322              :   /* Now look for SLP instances with a root that are covered by other
    9323              :      instances and remove them.  Only the first root stmt of each
                      :      instance takes part in this check.  */
    9324       419920 :   hash_set<stmt_vec_info> roots;
    9325      1734427 :   for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
    9326       926797 :     if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
    9327        32210 :       roots.add (SLP_INSTANCE_ROOT_STMTS (instance)[0]);
    9328       419920 :   if (!roots.is_empty ())
    9329              :     {
    9330        13079 :       visited.empty ();  /* NOTE(review): presumably clears VISITED for reuse below (result unused) -- confirm.  */
    9331        74585 :       for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
    9332        61506 :         vect_slp_prune_covered_roots (SLP_INSTANCE_TREE (instance), roots,
    9333              :                                       visited);
    9334        74585 :       for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
    9335        61506 :         if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()
    9336        32210 :             && !roots.contains (SLP_INSTANCE_ROOT_STMTS (instance)[0]))  /* Root was pruned, i.e. it is covered.  */
    9337              :           {
    9338          852 :             stmt_vec_info root = SLP_INSTANCE_ROOT_STMTS (instance)[0];
    9339          852 :             if (dump_enabled_p ())
    9340           17 :               dump_printf_loc (MSG_NOTE, vect_location,
    9341              :                                "removing SLP instance operations starting "
    9342              :                                "from: %G", root->stmt);
    9343          852 :             vect_free_slp_instance (instance);
    9344          852 :             vinfo->slp_instances.ordered_remove (i);
    9345              :           }
    9346              :         else
    9347        60654 :           ++i;
    9348              :     }
    9349              : 
    9350       839840 :   return !vinfo->slp_instances.is_empty ();
    9351      1080158 : }
    9352              : 
    9353              : /* Get the SLP instance leader from INSTANCE_LEADER thereby transitively
    9354              :    closing the eventual chain.
                      :
                      :    Starting at INSTANCE the mapping is followed until an instance that
                      :    maps to itself is reached; that instance is returned.  Every map
                      :    slot visited on the way is then overwritten to refer directly to
                      :    the returned leader.  The result of INSTANCE_LEADER.get is
                      :    dereferenced without a check, so every instance on the chain must
                      :    have an entry in INSTANCE_LEADER.  */
    9355              : 
    9356              : static slp_instance
    9357       750343 : get_ultimate_leader (slp_instance instance,
    9358              :                      hash_map<slp_instance, slp_instance> &instance_leader)
    9359              : {
    9360       750343 :   auto_vec<slp_instance *, 8> chain;  /* Map slots seen while walking to the leader.  */
    9361       750343 :   slp_instance *tem;
    9362       834494 :   while (*(tem = instance_leader.get (instance)) != instance)
    9363              :     {
    9364        84151 :       chain.safe_push (tem);
    9365        84151 :       instance = *tem;
    9366              :     }
    9367       834494 :   while (!chain.is_empty ())
    9368        84151 :     *chain.pop () = instance;  /* Point the slot straight at the final leader.  */
    9369       750343 :   return instance;
    9370       750343 : }
    9371              : 
    9372              : namespace {
    9373              : /* Subroutine of vect_bb_partition_graph_r.  Map KEY to INSTANCE in
    9374              :    KEY_TO_INSTANCE, making INSTANCE the leader of any previous mapping
    9375              :    for KEY.  Return true if KEY was already in KEY_TO_INSTANCE.
    9376              : 
                      :    When KEY was previously mapped to a different instance, the ultimate
                      :    leader of that instance is looked up and, unless it already is
                      :    INSTANCE, is redirected to INSTANCE in INSTANCE_LEADER.  In all
                      :    cases KEY maps to INSTANCE on return.
                      :
    9377              :    INSTANCE_LEADER is as for get_ultimate_leader.  */
    9378              : 
    9379              : template<typename T>
    9380              : bool
    9381      3286873 : vect_map_to_instance (slp_instance instance, T key,
    9382              :                       hash_map<T, slp_instance> &key_to_instance,
    9383              :                       hash_map<slp_instance, slp_instance> &instance_leader)
    9384              : {
    9385              :   bool existed_p;
    9386      3286873 :   slp_instance &key_instance = key_to_instance.get_or_insert (key, &existed_p);
    9387      3286873 :   if (!existed_p)
    9388              :     ;  /* KEY is new: there is no earlier mapping to merge with.  */
    9389       173598 :   else if (key_instance != instance)
    9390              :     {
    9391              :       /* If we're running into a previously marked key make us the
    9392              :          leader of the current ultimate leader.  This keeps the
    9393              :          leader chain acyclic and works even when the current instance
    9394              :          connects two previously independent graph parts.  */
    9395        69794 :       slp_instance key_leader
    9396        69794 :         = get_ultimate_leader (key_instance, instance_leader);
    9397        69794 :       if (key_leader != instance)
    9398        21107 :         instance_leader.put (key_leader, instance);
    9399              :     }
    9400      3286873 :   key_instance = instance;  /* Stores through the reference into KEY_TO_INSTANCE.  */
    9401      3286873 :   return existed_p;
    9402              : }
    9403              : }
    9404              : 
    9405              : /* Worker of vect_bb_partition_graph, recurse on NODE.
                      :
                      :    Every non-NULL scalar stmt of NODE is mapped to INSTANCE in
                      :    STMT_TO_INSTANCE, then NODE itself is mapped in NODE_TO_INSTANCE,
                      :    both via vect_map_to_instance which also maintains
                      :    INSTANCE_LEADER.  If NODE already had a mapping the walk stops
                      :    here; otherwise it continues into the non-NULL children whose def
                      :    type is vect_internal_def.  BB_VINFO is only passed on to the
                      :    recursive calls.  */
    9406              : 
    9407              : static void
    9408       909127 : vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
    9409              :                            slp_instance instance, slp_tree node,
    9410              :                            hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
    9411              :                            hash_map<slp_tree, slp_instance> &node_to_instance,
    9412              :                            hash_map<slp_instance, slp_instance> &instance_leader)
    9413              : {
    9414       909127 :   stmt_vec_info stmt_info;
    9415       909127 :   unsigned i;
    9416              : 
    9417      3286873 :   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    9418      2377746 :     if (stmt_info)
    9419      2377746 :       vect_map_to_instance (instance, stmt_info, stmt_to_instance,
    9420              :                             instance_leader);
    9421              : 
    9422       909127 :   if (vect_map_to_instance (instance, node, node_to_instance,
    9423              :                             instance_leader))
    9424       909127 :     return;  /* NODE was mapped before; do not descend again.  */
    9425              : 
    9426              :   slp_tree child;
    9427      1739177 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    9428       872499 :     if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
    9429       228578 :       vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
    9430              :                                  node_to_instance, instance_leader);
    9431              : }
    9432              : 
    9433              : /* Partition the SLP graph into pieces that can be costed independently.
                      :
                      :    On return every instance in BB_VINFO->slp_instances has been pushed
                      :    onto the subgraph_entries vector of its ultimate leader as computed
                      :    by get_ultimate_leader; an instance that is its own leader is pushed
                      :    onto its own vector.  */
    9434              : 
    9435              : static void
    9436       234430 : vect_bb_partition_graph (bb_vec_info bb_vinfo)
    9437              : {
    9438       234430 :   DUMP_VECT_SCOPE ("vect_bb_partition_graph");
    9439              : 
    9440              :   /* First walk the SLP graph assigning each involved scalar stmt a
    9441              :      corresponding SLP graph entry and upon visiting a previously
    9442              :      marked stmt, make the stmt's leader the current SLP graph entry.  */
    9443       234430 :   hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
    9444       234430 :   hash_map<slp_tree, slp_instance> node_to_instance;
    9445       234430 :   hash_map<slp_instance, slp_instance> instance_leader;
    9446       234430 :   slp_instance instance;
    9447       914979 :   for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
    9448              :     {
    9449       680549 :       instance_leader.put (instance, instance);  /* Each instance starts out as its own leader.  */
    9450       680549 :       vect_bb_partition_graph_r (bb_vinfo,
    9451              :                                  instance, SLP_INSTANCE_TREE (instance),
    9452              :                                  stmt_to_instance, node_to_instance,
    9453              :                                  instance_leader);
    9454              :     }
    9455              : 
    9456              :   /* Then collect entries to each independent subgraph.  */
    9457      1149409 :   for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
    9458              :     {
    9459       680549 :       slp_instance leader = get_ultimate_leader (instance, instance_leader);
    9460       680549 :       leader->subgraph_entries.safe_push (instance);
    9461       680549 :       if (dump_enabled_p ()
    9462       680549 :           && leader != instance)
    9463           71 :         dump_printf_loc (MSG_NOTE, vect_location,
    9464              :                          "instance %p is leader of %p\n",
    9465              :                          (void *) leader, (void *) instance);
    9466              :     }
    9467       234430 : }
    9468              : 
    9469              : /* Compute the scalar cost of the stmts on WORKLIST and of the defs
    9470              :    feeding them, recording one entry per costed stmt in COST_VEC.
                      :
                      :    WORKLIST is drained.  Stmts that are not PURE_SLP_STMT are skipped.
                      :    A stmt that is not marked live and that has a def with a non-debug
                      :    use in a stmt unknown to VINFO or not pure SLP is considered live;
                      :    it is neither costed nor are its operands followed.  Nop
                      :    conversions and single-argument PHIs are not costed but their
                      :    operands are still walked.  Every costed stmt is flagged with
                      :    gimple_set_visited.  The defs of the SLP operands of each processed
                      :    stmt are pushed on WORKLIST unless VISITED.add reports them as
    9471              :    already seen.  Nothing is returned.  */
    9472              : 
    9473              : static void
    9474       677059 : vect_bb_slp_scalar_cost (bb_vec_info vinfo,
    9475              :                          vec<stmt_vec_info> &worklist,
    9476              :                          stmt_vector_for_cost *cost_vec,
    9477              :                          hash_set<stmt_vec_info> &visited)
    9478              : {
    9479      3132315 :   while (!worklist.is_empty ())
    9480              :     {
    9481      2455256 :       stmt_vec_info stmt = worklist.pop ();
    9482      2740030 :       if (!PURE_SLP_STMT (stmt))
    9483       300321 :         continue;
    9484              : 
    9485              :       /* When the stmt is live but not actually vectorized we have
    9486              :          to keep the feeding scalar defs.  */
    9487      2173369 :       if (!STMT_VINFO_LIVE_P (vect_stmt_to_vectorize (stmt)))
    9488              :         {
    9489      2107370 :           bool live_p = false;
    9490      2107370 :           ssa_op_iter op_iter;
    9491      2107370 :           def_operand_p def_p;
    9492      4607660 :           FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt->stmt, op_iter, SSA_OP_DEF)
    9493              :             {
    9494       392920 :               imm_use_iterator use_iter;
    9495       392920 :               gimple *use_stmt;
    9496      1422610 :               FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
    9497       636770 :                 if (!is_gimple_debug (use_stmt))
    9498              :                   {
    9499       469269 :                     stmt_vec_info use_stmt_info = vinfo->lookup_stmt (use_stmt);
    9500       469269 :                     if (!use_stmt_info || !PURE_SLP_STMT (use_stmt_info))
    9501              :                       {
    9502        24218 :                         if (dump_enabled_p ())
    9503              :                           {
    9504           67 :                             dump_printf_loc (MSG_NOTE, vect_location,
    9505              :                                              "stmt considered live: %G",
    9506              :                                              stmt->stmt);
    9507           67 :                             dump_printf_loc (MSG_NOTE, vect_location,
    9508              :                                              "because of use in: %G",
    9509              :                                              use_stmt);
    9510              :                           }
    9511              :                         live_p = true;  /* The scan continues; remaining uses are still visited.  */
    9512              :                       }
    9513       392920 :                   }
    9514              :             }
    9515      2107370 :           if (live_p)
    9516        15547 :             continue;
    9517              :         }
    9518              : 
    9519              :       /* The following assert verifies that vect_bb_partition_graph
    9520              :          partitions the SLP graph in a way that each scalar stmt of
    9521              :          the coverage of the SLP graph belongs to exactly one subgraph.
    9522              :          ???  This is currently not guaranteed since the function
    9523              :          works purely on SLP_TREE_SCALAR_STMTS, resulting in the assert
    9524              :          tripping or scalar stmts costed multiple times, making vectorization
    9525              :          more profitable than it really is.  */
    9526              :       /* gcc_checking_assert (!gimple_visited_p (stmt->stmt)); */
    9527              : 
    9528      2154935 :       if (vect_nop_conversion_p (stmt))
    9529              :         ;  /* Nop conversions are not costed.  */
    9530              :       /* For single-argument PHIs assume coalescing which means zero
    9531              :          cost for the scalar and the vector PHIs.  This avoids
    9532              :          artificially favoring the vector path (but may pessimize it
    9533              :          in some cases).  */
    9534      2133796 :       else if (is_a <gphi *> (stmt->stmt)
    9535      2133796 :                && gimple_phi_num_args (as_a <gphi *> (stmt->stmt)) == 1)
    9536              :         ;
    9537              :       else
    9538              :         {
    9539      2124963 :           vect_cost_for_stmt kind;
    9540      2124963 :           if (STMT_VINFO_DATA_REF (stmt))
    9541              :             {
    9542      1956637 :               data_reference_p dr = STMT_VINFO_DATA_REF (stmt);
    9543      1956637 :               tree base = get_base_address (DR_REF (dr));
    9544              :               /* When the scalar access is to a non-global not
    9545              :                  address-taken decl that is not BLKmode assume we can
    9546              :                  access it with a single non-load/store instruction.  */
    9547      1956637 :               if (DECL_P (base)
    9548      1509433 :                   && !is_global_var (base)
    9549      1434830 :                   && !TREE_ADDRESSABLE (base)
    9550      2505237 :                   && DECL_MODE (base) != BLKmode)
    9551              :                 kind = scalar_stmt;
    9552      1813524 :               else if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt)))
    9553              :                 kind = scalar_load;
    9554              :               else
    9555      1587462 :                 kind = scalar_store;
    9556              :             }
    9557              :           else
    9558              :             kind = scalar_stmt;  /* No data reference: cost as a plain scalar stmt.  */
    9559              :           /* Cost each scalar stmt only once.  */
    9560      2124963 :           gimple_set_visited (stmt->stmt, true);
    9561      2124963 :           record_stmt_cost (cost_vec, 1, kind, stmt, NULL_TREE, 0, vect_body);
    9562              :         }
    9563              : 
    9564              :       /* Now walk relevant parts of the SSA use-def graph.  */
    9565      2154935 :       slp_oprnds child_ops (stmt);
    9566      4515580 :       for (unsigned i = 0; i < child_ops.num_slp_children; ++i)
    9567              :         {
    9568      2360645 :           tree op = child_ops.get_op_for_slp_child (stmt, i);
    9569      2360645 :           stmt_vec_info def = vinfo->lookup_def (op);
    9570      2360645 :           if (def && !visited.add (def))  /* Queue only defs known to VINFO that are new to VISITED.  */
    9571       683518 :             worklist.safe_push (def);
    9572              :         }
    9573              :     }
    9574       677059 : }
    9575              : 
    9576              : 
    9577              : /* Comparator for the loop-index sorted cost vectors.
                      :
                      :    A_ and B_ are treated as pointers to
                      :    std::pair<unsigned, stmt_info_for_cost *>.  Only the first member of
                      :    each pair is compared: the result is -1, 0 or 1 when A's first
                      :    member is smaller than, equal to or larger than B's.  The third
                      :    argument is unused.  */
    9578              : 
    9579              : static int
    9580     16819469 : li_cost_vec_cmp (const void *a_, const void *b_, void *)
    9581              : {
    9582     16819469 :   auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
    9583     16819469 :   auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
    9584     16819469 :   if (a->first < b->first)
    9585              :     return -1;
    9586     16005914 :   else if (a->first == b->first)
    9587     15327298 :     return 0;
    9588              :   return 1;
    9589              : }
    9590              : 
    9591              : /* Check if vectorization of the basic block is profitable for the
    9592              :    subgraph denoted by SLP_INSTANCES.  */
    9593              : 
    9594              : static bool
    9595       656098 : vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
    9596              :                                     vec<slp_instance> slp_instances,
    9597              :                                     loop_p orig_loop)
    9598              : {
    9599       656098 :   slp_instance instance;
    9600       656098 :   int i;
    9601       656098 :   unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
    9602       656098 :   unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
    9603              : 
    9604       656098 :   if (dump_enabled_p ())
    9605              :     {
    9606          105 :       dump_printf_loc (MSG_NOTE, vect_location, "Costing subgraph: \n");
    9607          105 :       hash_set<slp_tree> visited;
    9608          425 :       FOR_EACH_VEC_ELT (slp_instances, i, instance)
    9609          110 :         vect_print_slp_graph (MSG_NOTE, vect_location,
    9610              :                               SLP_INSTANCE_TREE (instance), visited);
    9611          105 :     }
    9612              : 
    9613              :   /* Then DFS walk scalar stmts, performing costing and handling
    9614              :      still live scalar stmts via the previously computed vector coverage.  */
    9615       656098 :   stmt_vector_for_cost scalar_costs = vNULL;
    9616       656098 :   stmt_vector_for_cost vector_costs = vNULL;
    9617       656098 :   hash_set<slp_tree> visited;
    9618       656098 :   hash_set<stmt_vec_info> svisited;
    9619      1333157 :   FOR_EACH_VEC_ELT (slp_instances, i, instance)
    9620              :     {
    9621       677059 :       auto_vec<stmt_vec_info> worklist;
    9622       677059 :       if (SLP_INSTANCE_ROOT_STMTS (instance).exists ())
    9623        56994 :         record_stmt_cost (&scalar_costs,
    9624        28497 :                           SLP_INSTANCE_ROOT_STMTS (instance).length (),
    9625              :                           scalar_stmt,
    9626        28497 :                           SLP_INSTANCE_ROOT_STMTS (instance)[0], 0, vect_body);
    9627      3811609 :       for (auto stmt : SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance)))
    9628              :         {
    9629      1780432 :           stmt = vect_orig_stmt (stmt);
    9630      1780432 :           if (!svisited.add (stmt))
    9631      1771738 :             worklist.safe_push (stmt);
    9632              :         }
    9633       677059 :       vect_bb_slp_scalar_cost (bb_vinfo, worklist, &scalar_costs, svisited);
    9634       677059 :       vector_costs.safe_splice (instance->cost_vec);
    9635       677059 :       instance->cost_vec.release ();
    9636       677059 :     }
    9637              : 
    9638       656098 :   if (dump_enabled_p ())
    9639          105 :     dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
    9640              : 
    9641              :   /* When costing non-loop vectorization we need to consider each covered
    9642              :      loop independently and make sure vectorization is profitable.  For
    9643              :      now we assume a loop may be not entered or executed an arbitrary
    9644              :      number of iterations (???  static information can provide more
    9645              :      precise info here) which means we can simply cost each containing
    9646              :      loops stmts separately.  */
    9647              : 
    9648              :   /* First produce cost vectors sorted by loop index.  */
    9649       656098 :   auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
    9650       656098 :     li_scalar_costs (scalar_costs.length ());
    9651       656098 :   auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
    9652       656098 :     li_vector_costs (vector_costs.length ());
    9653       656098 :   stmt_info_for_cost *cost;
    9654      2809558 :   FOR_EACH_VEC_ELT (scalar_costs, i, cost)
    9655              :     {
    9656      2153460 :       unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
    9657      2153460 :       li_scalar_costs.quick_push (std::make_pair (l, cost));
    9658              :     }
    9659              :   /* Use a random used loop as fallback in case the first vector_costs
    9660              :      entry does not have a stmt_info associated with it.  */
    9661       656098 :   unsigned l = li_scalar_costs[0].first;
    9662      2389563 :   FOR_EACH_VEC_ELT (vector_costs, i, cost)
    9663              :     {
    9664              :       /* We inherit from the previous COST, invariants, externals and
    9665              :          extracts immediately follow the cost for the related stmt.  */
    9666      1733465 :       if (cost->stmt_info)
    9667      1017261 :         l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
    9668      1733465 :       li_vector_costs.quick_push (std::make_pair (l, cost));
    9669              :     }
    9670       656098 :   li_scalar_costs.stablesort (li_cost_vec_cmp, NULL);
    9671       656098 :   li_vector_costs.stablesort (li_cost_vec_cmp, NULL);
    9672              : 
    9673              :   /* Now cost the portions individually.  */
    9674              :   unsigned vi = 0;
    9675              :   unsigned si = 0;
    9676      1139297 :   bool profitable = true;
    9677      1139297 :   while (si < li_scalar_costs.length ()
    9678      1799960 :          && vi < li_vector_costs.length ())
    9679              :     {
    9680       660651 :       unsigned sl = li_scalar_costs[si].first;
    9681       660651 :       unsigned vl = li_vector_costs[vi].first;
    9682       660651 :       if (sl != vl)
    9683              :         {
    9684          950 :           if (dump_enabled_p ())
    9685            0 :             dump_printf_loc (MSG_NOTE, vect_location,
    9686              :                              "Scalar %d and vector %d loop part do not "
    9687              :                              "match up, skipping scalar part\n", sl, vl);
    9688              :           /* Skip the scalar part, assuming zero cost on the vector side.  */
    9689         1590 :           do
    9690              :             {
    9691         1590 :               si++;
    9692              :             }
    9693         1590 :           while (si < li_scalar_costs.length ()
    9694         3251 :                  && li_scalar_costs[si].first == sl);
    9695          950 :           continue;
    9696              :         }
    9697              : 
    9698       659701 :       class vector_costs *scalar_target_cost_data = init_cost (bb_vinfo, true);
    9699      2135281 :       do
    9700              :         {
    9701      2135281 :           add_stmt_cost (scalar_target_cost_data, li_scalar_costs[si].second);
    9702      2135281 :           si++;
    9703              :         }
    9704      2135281 :       while (si < li_scalar_costs.length ()
    9705      4278070 :              && li_scalar_costs[si].first == sl);
    9706       659701 :       scalar_target_cost_data->finish_cost (nullptr);
    9707       659701 :       scalar_cost = scalar_target_cost_data->body_cost ();
    9708              : 
    9709              :       /* Complete the target-specific vector cost calculation.  */
    9710       659701 :       class vector_costs *vect_target_cost_data = init_cost (bb_vinfo, false);
    9711       659701 :       auto_vec<stmt_info_for_cost> tem;
    9712      1707395 :       do
    9713              :         {
    9714      1707395 :           tem.safe_push (*li_vector_costs[vi].second);
    9715      1707395 :           vi++;
    9716              :         }
    9717      1707395 :       while (vi < li_vector_costs.length ()
    9718      3423453 :              && li_vector_costs[vi].first == vl);
    9719       659701 :       add_slp_costs (vect_target_cost_data, tem);
    9720       659701 :       vect_target_cost_data->finish_cost (scalar_target_cost_data);
    9721       659701 :       vec_prologue_cost = vect_target_cost_data->prologue_cost ();
    9722       659701 :       vec_inside_cost = vect_target_cost_data->body_cost ();
    9723       659701 :       vec_epilogue_cost = vect_target_cost_data->epilogue_cost ();
    9724       659701 :       delete scalar_target_cost_data;
    9725       659701 :       delete vect_target_cost_data;
    9726              : 
    9727       659701 :       vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
    9728              : 
    9729       659701 :       if (dump_enabled_p ())
    9730              :         {
    9731          112 :           dump_printf_loc (MSG_NOTE, vect_location,
    9732              :                            "Cost model analysis for part in loop %d:\n", sl);
    9733          112 :           dump_printf (MSG_NOTE, "  Vector cost: %d\n",
    9734              :                        vec_inside_cost + vec_outside_cost);
    9735          112 :           dump_printf (MSG_NOTE, "  Scalar cost: %d\n", scalar_cost);
    9736              :         }
    9737              : 
    9738              :       /* Vectorization is profitable if its cost is not more than the cost of
    9739              :          the scalar version.  Note that we err on the vector side for equal
    9740              :          cost because the cost estimate is otherwise quite pessimistic
    9741              :          (constant uses are free on the scalar side but cost a load on the
    9742              :          vector side for example).  */
    9743       659701 :       if (vec_outside_cost + vec_inside_cost > scalar_cost)
    9744              :         {
    9745       177452 :           profitable = false;
    9746       177452 :           break;
    9747              :         }
    9748       482249 :     }
    9749       656098 :   if (profitable && vi < li_vector_costs.length ())
    9750              :     {
    9751         1062 :       if (dump_enabled_p ())
    9752            0 :         dump_printf_loc (MSG_NOTE, vect_location,
    9753              :                          "Excess vector cost for part in loop %d:\n",
    9754            0 :                          li_vector_costs[vi].first);
    9755              :       profitable = false;
    9756              :     }
    9757              : 
    9758              :   /* Unset visited flag.  This is delayed when the subgraph is profitable
    9759              :      and we process the loop for remaining unvectorized if-converted code.  */
    9760       656098 :   if (!orig_loop || !profitable)
    9761      2808139 :     FOR_EACH_VEC_ELT (scalar_costs, i, cost)
    9762      2152144 :       gimple_set_visited  (cost->stmt_info->stmt, false);
    9763              : 
    9764       656098 :   scalar_costs.release ();
    9765       656098 :   vector_costs.release ();
    9766              : 
    9767       656098 :   return profitable;
    9768       656098 : }
    9769              : 
    9770              : /* qsort comparator for lane defs.  */
    9771              : 
    9772              : static int
    9773           40 : vld_cmp (const void *a_, const void *b_)
    9774              : {
    9775           40 :   auto *a = (const std::pair<unsigned, tree> *)a_;
    9776           40 :   auto *b = (const std::pair<unsigned, tree> *)b_;
    9777           40 :   return a->first - b->first;
    9778              : }
    9779              : 
    9780              : /* Return true if USE_STMT is a vector lane insert into VEC and set
    9781              :    *THIS_LANE to the lane number that is set.  */
    9782              : 
    9783              : static bool
    9784          248 : vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
    9785              : {
    9786          248 :   gassign *use_ass = dyn_cast <gassign *> (use_stmt);
    9787           91 :   if (!use_ass
    9788           91 :       || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
    9789           22 :       || (vec
    9790           22 :           ? gimple_assign_rhs1 (use_ass) != vec
    9791           24 :           : ((vec = gimple_assign_rhs1 (use_ass)), false))
    9792           46 :       || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
    9793           46 :                                      TREE_TYPE (gimple_assign_rhs2 (use_ass)))
    9794           46 :       || !constant_multiple_p
    9795           46 :             (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
    9796           92 :              tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
    9797              :              this_lane))
    9798          202 :     return false;
    9799              :   return true;
    9800              : }
    9801              : 
    9802              : /* Find any vectorizable constructors, lane-insert chains and association
    9803              :    chain ends and record them in the roots array of BB_VINFO.  */
    9804              : 
    9805              : static void
    9806      2183721 : vect_slp_check_for_roots (bb_vec_info bb_vinfo)
    9807              : {
    9808     17491773 :   for (unsigned i = 0; i < bb_vinfo->nbbs; ++i)
    9809     30616104 :     for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
    9810    134643934 :          !gsi_end_p (gsi); gsi_next (&gsi))
    9811              :     {
    9812    119335882 :       gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
    9813              :       /* This can be used to start SLP discovery for BB early breaks
    9814              :          when we get that far.  */
    9815    119335882 :       if (!assign)
    9816    178883907 :         continue;
    9817              : 
    9818     30810641 :       tree rhs = gimple_assign_rhs1 (assign);
    9819     30810641 :       enum tree_code code = gimple_assign_rhs_code (assign);
    9820     30810641 :       use_operand_p use_p;
    9821     30810641 :       gimple *use_stmt;
    9822     30810641 :       if (code == CONSTRUCTOR)
    9823              :         {
    9824      1563583 :           if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
    9825        64308 :               || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
    9826        93668 :                            CONSTRUCTOR_NELTS (rhs))
    9827        43274 :               || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
    9828      1606853 :               || uniform_vector_p (rhs))
    9829      1550488 :             continue;
    9830              : 
    9831              :           unsigned j;
    9832              :           tree val;
    9833        64272 :           FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
    9834        51177 :             if (TREE_CODE (val) != SSA_NAME
    9835        51177 :                 || !bb_vinfo->lookup_def (val))
    9836              :               break;
    9837        32262 :           if (j != CONSTRUCTOR_NELTS (rhs))
    9838         3036 :             continue;
    9839              : 
    9840        13095 :           vec<stmt_vec_info> roots = vNULL;
    9841        13095 :           roots.safe_push (bb_vinfo->lookup_stmt (assign));
    9842        13095 :           vec<stmt_vec_info> stmts;
    9843        13095 :           stmts.create (CONSTRUCTOR_NELTS (rhs));
    9844        72670 :           FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
    9845        46480 :             stmts.quick_push
    9846        46480 :               (vect_stmt_to_vectorize (bb_vinfo->lookup_def (val)));
    9847        13095 :           bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
    9848        13095 :                                                stmts, roots));
    9849              :         }
    9850     29247058 :       else if (code == BIT_INSERT_EXPR
    9851          927 :                && VECTOR_TYPE_P (TREE_TYPE (rhs))
    9852          605 :                && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
    9853          605 :                && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
    9854          602 :                && integer_zerop (gimple_assign_rhs3 (assign))
    9855          336 :                && useless_type_conversion_p
    9856          336 :                     (TREE_TYPE (TREE_TYPE (rhs)),
    9857          336 :                      TREE_TYPE (gimple_assign_rhs2 (assign)))
    9858     29247670 :                && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
    9859              :         {
    9860              :           /* We start to match on insert to lane zero but since the
    9861              :              inserts need not be ordered we'd have to search both
    9862              :              the def and the use chains.  */
    9863          215 :           tree vectype = TREE_TYPE (rhs);
    9864          215 :           unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
    9865          215 :           auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
    9866          215 :           auto_sbitmap lanes (nlanes);
    9867          215 :           bitmap_clear (lanes);
    9868          215 :           bitmap_set_bit (lanes, 0);
    9869          215 :           tree def = gimple_assign_lhs (assign);
    9870          215 :           lane_defs.quick_push
    9871          215 :                       (std::make_pair (0, gimple_assign_rhs2 (assign)));
    9872          215 :           unsigned lanes_found = 1;
    9873              :           /* Start with the use chains, the last stmt will be the root.  */
    9874          215 :           stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
    9875          215 :           vec<stmt_vec_info> roots = vNULL;
    9876          215 :           roots.safe_push (last);
    9877          217 :           do
    9878              :             {
    9879          217 :               use_operand_p use_p;
    9880          217 :               gimple *use_stmt;
    9881          217 :               if (!single_imm_use (def, &use_p, &use_stmt))
    9882              :                 break;
    9883          211 :               unsigned this_lane;
    9884          211 :               if (!bb_vinfo->lookup_stmt (use_stmt)
    9885          211 :                   || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
    9886          233 :                   || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
    9887              :                 break;
    9888           22 :               if (bitmap_bit_p (lanes, this_lane))
    9889              :                 break;
    9890            2 :               lanes_found++;
    9891            2 :               bitmap_set_bit (lanes, this_lane);
    9892            2 :               gassign *use_ass = as_a <gassign *> (use_stmt);
    9893            2 :               lane_defs.quick_push (std::make_pair
    9894            2 :                                      (this_lane, gimple_assign_rhs2 (use_ass)));
    9895            2 :               last = bb_vinfo->lookup_stmt (use_ass);
    9896            2 :               roots.safe_push (last);
    9897            2 :               def = gimple_assign_lhs (use_ass);
    9898              :             }
    9899            2 :           while (lanes_found < nlanes);
    9900          215 :           if (roots.length () > 1)
    9901            2 :             std::swap(roots[0], roots[roots.length () - 1]);
    9902          215 :           if (lanes_found < nlanes)
    9903              :             {
    9904              :               /* Now search the def chain.  */
    9905          215 :               def = gimple_assign_rhs1 (assign);
    9906          217 :               do
    9907              :                 {
    9908          217 :                   if (TREE_CODE (def) != SSA_NAME
    9909          217 :                       || !has_single_use (def))
    9910              :                     break;
    9911           56 :                   gimple *def_stmt = SSA_NAME_DEF_STMT (def);
    9912           56 :                   unsigned this_lane;
    9913           56 :                   if (!bb_vinfo->lookup_stmt (def_stmt)
    9914           37 :                       || !vect_slp_is_lane_insert (def_stmt,
    9915              :                                                    NULL_TREE, &this_lane)
    9916           80 :                       || !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
    9917              :                     break;
    9918           24 :                   if (bitmap_bit_p (lanes, this_lane))
    9919              :                     break;
    9920            4 :                   lanes_found++;
    9921            4 :                   bitmap_set_bit (lanes, this_lane);
    9922            8 :                   lane_defs.quick_push (std::make_pair
    9923            4 :                                           (this_lane,
    9924            4 :                                            gimple_assign_rhs2 (def_stmt)));
    9925            4 :                   roots.safe_push (bb_vinfo->lookup_stmt (def_stmt));
    9926            4 :                   def = gimple_assign_rhs1 (def_stmt);
    9927              :                 }
    9928            4 :               while (lanes_found < nlanes);
    9929              :             }
    9930          215 :           if (lanes_found == nlanes)
    9931              :             {
    9932              :               /* Sort lane_defs by lane index and register the root.  */
    9933            2 :               lane_defs.qsort (vld_cmp);
    9934            2 :               vec<stmt_vec_info> stmts;
    9935            2 :               stmts.create (nlanes);
    9936           10 :               for (unsigned i = 0; i < nlanes; ++i)
    9937            8 :                 stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
    9938            2 :               bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
    9939            2 :                                                    stmts, roots));
    9940              :             }
    9941              :           else
    9942          213 :             roots.release ();
    9943          215 :         }
    9944     29246843 :       else if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
    9945     28257903 :                && (associative_tree_code (code) || code == MINUS_EXPR)
    9946              :                /* ???  This pessimizes a two-element reduction.  PR54400.
    9947              :                   ???  In-order reduction could be handled if we only
    9948              :                   traverse one operand chain in vect_slp_linearize_chain.  */
    9949     33172418 :                && !needs_fold_left_reduction_p (TREE_TYPE (rhs), code)
    9950              :                /* Ops with constants at the tail can be stripped here.  */
    9951      5809934 :                && TREE_CODE (rhs) == SSA_NAME
    9952      5743939 :                && TREE_CODE (gimple_assign_rhs2 (assign)) == SSA_NAME
    9953              :                /* Should be the chain end.  */
    9954     31540326 :                && (!single_imm_use (gimple_assign_lhs (assign),
    9955              :                                     &use_p, &use_stmt)
    9956      1766607 :                    || !is_gimple_assign (use_stmt)
    9957      1212710 :                    || (gimple_assign_rhs_code (use_stmt) != code
    9958       902842 :                        && ((code != PLUS_EXPR && code != MINUS_EXPR)
    9959       500780 :                            || (gimple_assign_rhs_code (use_stmt)
    9960       500780 :                                != (code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR))))))
    9961              :         {
    9962              :           /* We start the match at the end of a possible association
    9963              :              chain.  */
    9964      1884359 :           auto_vec<chain_op_t> chain;
    9965      1884359 :           auto_vec<std::pair<tree_code, gimple *> > worklist;
    9966      1884359 :           auto_vec<gimple *> chain_stmts;
    9967      1884359 :           gimple *code_stmt = NULL, *alt_code_stmt = NULL;
    9968      1884359 :           if (code == MINUS_EXPR)
    9969       304477 :             code = PLUS_EXPR;
    9970      1884359 :           internal_fn reduc_fn;
    9971      2167296 :           if (!reduction_fn_for_scalar_code (code, &reduc_fn)
    9972      1884359 :               || reduc_fn == IFN_LAST)
    9973       282937 :             continue;
    9974      1601422 :           vect_slp_linearize_chain (bb_vinfo, worklist, chain, code, assign,
    9975              :                                     /* ??? */
    9976              :                                     code_stmt, alt_code_stmt, &chain_stmts,
    9977              :                                     false);
    9978      3202844 :           if (chain.length () > 1)
    9979              :             {
    9980              :               /* Sort the chain according to def_type and operation.  */
    9981      1601422 :               chain.sort (dt_sort_cmp, bb_vinfo);
    9982              :               /* ???  Now we'd want to strip externals and constants
    9983              :                  but record those to be handled in the epilogue.  */
    9984              :               /* ???  For now do not allow mixing ops or externs/constants.  */
    9985      1601422 :               bool invalid = false;
    9986      1601422 :               unsigned remain_cnt = 0;
    9987      1601422 :               unsigned last_idx = 0;
    9988      4834683 :               for (unsigned i = 0; i < chain.length (); ++i)
    9989              :                 {
    9990      3537738 :                   if (chain[i].code != code)
    9991              :                     {
    9992              :                       invalid = true;
    9993              :                       break;
    9994              :                     }
    9995      3233261 :                   if (chain[i].dt != vect_internal_def
    9996              :                       /* Avoid stmts where the def is not the LHS, like
    9997              :                          ASMs.  */
    9998      6254473 :                       || (gimple_get_lhs (bb_vinfo->lookup_def
    9999      3021212 :                                                       (chain[i].op)->stmt)
   10000      3021212 :                           != chain[i].op))
   10001       214993 :                     remain_cnt++;
   10002              :                   else
   10003              :                     last_idx = i;
   10004              :                 }
   10005              :               /* Make sure to have an even number of lanes as we later do
   10006              :                  all-or-nothing discovery, not trying to split further.  */
   10007      1601422 :               if ((chain.length () - remain_cnt) & 1)
   10008       169260 :                 remain_cnt++;
   10009      1601422 :               if (!invalid && chain.length () - remain_cnt > 1)
   10010              :                 {
   10011      1231960 :                   vec<stmt_vec_info> stmts;
   10012      1231960 :                   vec<tree> remain = vNULL;
   10013      1231960 :                   stmts.create (chain.length ());
   10014      1231960 :                   if (remain_cnt > 0)
   10015       114810 :                     remain.create (remain_cnt);
   10016      3956281 :                   for (unsigned i = 0; i < chain.length (); ++i)
   10017              :                     {
   10018      2724321 :                       stmt_vec_info stmt_info;
   10019      2724321 :                       if (chain[i].dt == vect_internal_def
   10020      2684366 :                           && ((stmt_info = bb_vinfo->lookup_def (chain[i].op)),
   10021      2684366 :                               gimple_get_lhs (stmt_info->stmt) == chain[i].op)
   10022      5408603 :                           && (i != last_idx
   10023      1231960 :                               || (stmts.length () & 1)))
   10024      2598176 :                         stmts.quick_push (stmt_info);
   10025              :                       else
   10026       126145 :                         remain.quick_push (chain[i].op);
   10027              :                     }
   10028      1231960 :                   vec<stmt_vec_info> roots;
   10029      1231960 :                   roots.create (chain_stmts.length ());
   10030      2724321 :                   for (unsigned i = 0; i < chain_stmts.length (); ++i)
   10031      1492361 :                     roots.quick_push (bb_vinfo->lookup_stmt (chain_stmts[i]));
   10032      1231960 :                   bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_bb_reduc,
   10033      1231960 :                                                        stmts, roots, remain));
   10034              :                 }
   10035              :             }
   10036      1884359 :         }
   10037              :     }
   10038      2183721 : }
   10039              : 
   10040              : /* Walk the grouped store chains and replace entries with their
   10041              :    pattern variant if any.  */
   10042              : 
   10043              : static void
   10044       613233 : vect_fixup_store_groups_with_patterns (vec_info *vinfo)
   10045              : {
   10046       613233 :   stmt_vec_info first_element;
   10047       613233 :   unsigned i;
   10048              : 
   10049      1504702 :   FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
   10050              :     {
   10051              :       /* We also have CTORs in this array.  */
   10052       891469 :       if (!STMT_VINFO_GROUPED_ACCESS (first_element))
   10053            0 :         continue;
   10054       891469 :       if (STMT_VINFO_IN_PATTERN_P (first_element))
   10055              :         {
   10056          252 :           stmt_vec_info orig = first_element;
   10057          252 :           first_element = STMT_VINFO_RELATED_STMT (first_element);
   10058          252 :           DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
   10059          252 :           DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
   10060          252 :           DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
   10061          252 :           DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
   10062          252 :           vinfo->grouped_stores[i] = first_element;
   10063              :         }
   10064       891469 :       stmt_vec_info prev = first_element;
   10065      2503675 :       while (DR_GROUP_NEXT_ELEMENT (prev))
   10066              :         {
   10067      1612206 :           stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
   10068      1612206 :           if (STMT_VINFO_IN_PATTERN_P (elt))
   10069              :             {
   10070          849 :               stmt_vec_info orig = elt;
   10071          849 :               elt = STMT_VINFO_RELATED_STMT (elt);
   10072          849 :               DR_GROUP_NEXT_ELEMENT (prev) = elt;
   10073          849 :               DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
   10074          849 :               DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
   10075              :             }
   10076      1612206 :           DR_GROUP_FIRST_ELEMENT (elt) = first_element;
   10077      1612206 :           prev = elt;
   10078              :         }
   10079              :     }
   10080       613233 : }
   10081              : 
   10082              : /* Check if the region described by BB_VINFO can be vectorized, returning
   10083              :    true if so.  When returning false, set FATAL to true if the same failure
   10084              :    would prevent vectorization at other vector sizes, false if it is still
   10085              :    worth trying other sizes.  N_STMTS is the number of statements in the
   10086              :    region and DATAREF_GROUPS is passed to vect_analyze_data_ref_accesses.  */
   10087              : 
   10088              : static bool
   10089      2183721 : vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
   10090              :                        vec<int> *dataref_groups)
   10091              : {
   10092      2183721 :   DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
   10093              : 
   10094      2183721 :   slp_instance instance;
   10095      2183721 :   int i;
   10096              : 
   10097              :   /* The first group of checks is independent of the vector size.  */
   10098      2183721 :   fatal = true;
   10099              : 
   10100              :   /* Analyze the data references.  */
   10101              : 
   10102      2183721 :   if (!vect_analyze_data_refs (bb_vinfo, NULL))
   10103              :     {
   10104            0 :       if (dump_enabled_p ())
   10105            0 :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   10106              :                          "not vectorized: unhandled data-ref in basic "
   10107              :                          "block.\n");
   10108            0 :       return false;
   10109              :     }
   10110              : 
   10111      2183721 :   if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
   10112              :     {
   10113            0 :      if (dump_enabled_p ())
   10114            0 :        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   10115              :                         "not vectorized: unhandled data access in "
   10116              :                         "basic block.\n");
   10117            0 :       return false;
   10118              :     }
   10119              : 
   10120      2183721 :   vect_slp_check_for_roots (bb_vinfo);
   10121              : 
   10122              :   /* If there are no grouped stores and no constructors in the region
   10123              :      there is no need to continue with pattern recog as vect_analyze_slp
   10124              :      will fail anyway.  */
   10125      2183721 :   if (bb_vinfo->grouped_stores.is_empty ()
   10126      1841762 :       && bb_vinfo->roots.is_empty ())
   10127              :     {
   10128      1570488 :       if (dump_enabled_p ())
   10129         1024 :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   10130              :                          "not vectorized: no grouped stores in "
   10131              :                          "basic block.\n");
   10132      1570488 :       return false;
   10133              :     }
   10134              : 
   10135              :   /* The rest of the analysis below depends on the vector size in some way.  */
   10136       613233 :   fatal = false;
   10137              : 
   10138       613233 :   vect_pattern_recog (bb_vinfo);
   10139              : 
   10140              :   /* Update store groups from pattern processing.  */
   10141       613233 :   vect_fixup_store_groups_with_patterns (bb_vinfo);
   10142              : 
   10143              :   /* Check the SLP opportunities in the basic block, analyze and build SLP
   10144              :      trees.  */
   10145       613233 :   if (!vect_analyze_slp (bb_vinfo, n_stmts, false))
   10146              :     {
   10147            0 :       if (dump_enabled_p ())
   10148              :         {
   10149            0 :           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   10150              :                            "Failed to SLP the basic block.\n");
   10151            0 :           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   10152              :                            "not vectorized: failed to find SLP opportunities "
   10153              :                            "in basic block.\n");
   10154              :         }
   10155            0 :       return false;
   10156              :     }
   10157              : 
   10158              :   /* Optimize permutations.  */
   10159       613233 :   vect_optimize_slp (bb_vinfo);
   10160              : 
   10161              :   /* Gather the loads reachable from the SLP graph entries.  */
   10162       613233 :   vect_gather_slp_loads (bb_vinfo);
   10163              : 
   10164       613233 :   vect_record_base_alignments (bb_vinfo);
   10165              : 
   10166              :   /* Analyze and verify the alignment of data references and the
   10167              :      dependence in the SLP instances.  */
   10168      1401917 :   for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
   10169              :     {
   10170       788684 :       vect_location = instance->location ();
   10171       788684 :       if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
   10172       788684 :           || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
   10173              :         {
   10174         8478 :           slp_tree node = SLP_INSTANCE_TREE (instance);
   10175         8478 :           stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
   10176         8478 :           if (dump_enabled_p ())
   10177            4 :             dump_printf_loc (MSG_NOTE, vect_location,
   10178              :                              "removing SLP instance operations starting from: %G",
   10179              :                              stmt_info->stmt);
   10180         8478 :           vect_free_slp_instance (instance);
   10181         8478 :           BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
   10182         8478 :           continue;
   10183         8478 :         }
   10184              : 
   10185              :       /* Mark all the statements that we want to vectorize as relevant.  */
   10186       780206 :       vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
   10187              : 
   10188       780206 :       i++;
   10189              :     }
   10190      2213942 :   if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
   10191              :     return false;
   10192              : 
   10193       264651 :   if (!vect_slp_analyze_operations (bb_vinfo))
   10194              :     {
   10195        30221 :       if (dump_enabled_p ())
   10196           87 :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   10197              :                          "not vectorized: bad operation in basic block.\n");
   10198        30221 :       return false;
   10199              :     }
   10200              : 
   10201              :   /* Mark all the statements that we vectorize.  */
   10202       234430 :   vect_bb_slp_mark_stmts_vectorized (bb_vinfo);
   10203              : 
   10204              :   /* Compute vectorizable live stmts.  */
   10205       234430 :   vect_bb_slp_mark_live_stmts (bb_vinfo);
   10206              : 
   10207       234430 :   vect_bb_partition_graph (bb_vinfo);
   10208              : 
   10209       234430 :   return true;
   10210              : }
   10211              : 
   10212              : /* Subroutine of vect_slp_bbs.  Try to vectorize the statements for all
   10213              :    basic blocks in BBS, returning true on success.
   10214              :    The region has N_STMTS statements and has the datarefs given by DATAREFS.
                      : 
                      :    N_STMTS and DATAREF_GROUPS are passed on to vect_slp_analyze_bb_1.
                      :    ORIG_LOOP is passed on to the profitability check.  When it is
                      :    non-null and some subgraph got past the cost check, the region is
                      :    asserted to be a single block and all profitable subgraphs are
                      :    dropped if that block still contains a COND_EXPR assignment that
                      :    does not have the visited flag set (this check is skipped with
                      :    the unlimited cost model).
                      : 
                      :    The analysis is repeated with a fresh bb_vec_info for the candidate
                      :    modes returned by the autovectorize_vector_modes target hook until
                      :    a subgraph has been scheduled, the candidates are exhausted, no
                      :    vector mode was autodetected or the analysis reported a fatal
                      :    failure.  "Success" means at least one SLP subgraph was
                      :    scheduled.  */
   10215              : 
   10216              : static bool
   10217      1862158 : vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
   10218              :                  vec<int> *dataref_groups, unsigned int n_stmts,
   10219              :                  loop_p orig_loop)
   10220              : {
   10221      1862158 :   bb_vec_info bb_vinfo;
   10222      1862158 :   auto_vector_modes vector_modes;
   10223              : 
   10224              :   /* Autodetect first vector size we try.  The first analysis runs with
                      :      VOIDmode; the mode found in bb_vinfo->vector_mode after that
                      :      analysis is remembered as AUTODETECTED_VECTOR_MODE below.  */
   10225      1862158 :   machine_mode next_vector_mode = VOIDmode;
   10226      1862158 :   targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
   10227      1862158 :   unsigned int mode_i = 0;
   10228              : 
   10229      1862158 :   vec_info_shared shared;
   10230              : 
   10231      1862158 :   machine_mode autodetected_vector_mode = VOIDmode;
   10232      2505284 :   while (1)
   10233              :     {
   10234      2183721 :       bool vectorized = false;
   10235      2183721 :       bool fatal = false;
   10236      2183721 :       bb_vinfo = new _bb_vec_info (bbs, &shared);
   10237              : 
   10238      2183721 :       bool first_time_p = shared.datarefs.is_empty ();
   10239      2183721 :       BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
   10240      2183721 :       if (first_time_p)
   10241      1885156 :         bb_vinfo->shared->save_datarefs ();
   10242              :       else
   10243       298565 :         bb_vinfo->shared->check_datarefs ();
   10244      2183721 :       bb_vinfo->vector_mode = next_vector_mode;
   10245              : 
   10246      2183721 :       if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
   10247              :         {
   10248       234430 :           if (dump_enabled_p ())
   10249              :             {
   10250         1522 :               dump_printf_loc (MSG_NOTE, vect_location,
   10251              :                                "***** Analysis succeeded with vector mode"
   10252          761 :                                " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
   10253          761 :               dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
   10254              :             }
   10255              : 
   10256       234430 :           bb_vinfo->shared->check_datarefs ();
   10257              : 
   10258       234430 :           bool force_clear = false;
   10259       234430 :           auto_vec<slp_instance> profitable_subgraphs;
   10260      1383839 :           for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo))
   10261              :             {
   10262       680549 :               if (instance->subgraph_entries.is_empty ())
   10263       220728 :                 continue;
   10264              : 
   10265       659442 :               dump_user_location_t saved_vect_location = vect_location;
   10266       659442 :               vect_location = instance->location ();
   10267       659442 :               if (!unlimited_cost_model (NULL)
   10268       656103 :                   && !param_vect_allow_possibly_not_worthwhile_vectorizations
   10269      1315540 :                   && !vect_bb_vectorization_profitable_p
   10270       656098 :                         (bb_vinfo, instance->subgraph_entries, orig_loop))
   10271              :                 {
   10272       178514 :                   if (dump_enabled_p ())
   10273           32 :                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   10274              :                                      "not vectorized: vectorization is not "
   10275              :                                      "profitable.\n");
   10276       178514 :                   vect_location = saved_vect_location;
   10277       178514 :                   continue;
   10278              :                 }
   10279              : 
   10280       480928 :               vect_location = saved_vect_location;
   10281       480928 :               if (!dbg_cnt (vect_slp))
   10282              :                 {
   10283            0 :                   force_clear = true;
   10284            0 :                   continue;
   10285              :                 }
   10286              : 
   10287       480928 :               profitable_subgraphs.safe_push (instance);
   10288              :             }
   10289              : 
   10290              :           /* When we're vectorizing an if-converted loop body make sure
   10291              :              we vectorized all if-converted code.  FORCE_CLEAR is set when
                      :              a subgraph got past the cost check but was rejected by
                      :              dbg_cnt, so the walk below still runs in that case.  */
   10292       392904 :           if ((!profitable_subgraphs.is_empty () || force_clear) && orig_loop)
   10293              :             {
   10294          106 :               gcc_assert (bb_vinfo->nbbs == 1);
   10295          212 :               for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]);
   10296         4390 :                    !gsi_end_p (gsi); gsi_next (&gsi))
   10297              :                 {
   10298              :                   /* The costing above left us with DCEable vectorized scalar
   10299              :                      stmts having the visited flag set on profitable
   10300              :                      subgraphs.  Do the delayed clearing of the flag here.  */
   10301         4284 :                   if (gimple_visited_p (gsi_stmt (gsi)))
   10302              :                     {
   10303         1260 :                       gimple_set_visited (gsi_stmt (gsi), false);
   10304         1260 :                       continue;
   10305              :                     }
   10306         3024 :                   if (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED)
   10307          813 :                     continue;
   10308              : 
   10309         6338 :                   if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi)))
   10310         2670 :                     if (gimple_assign_rhs_code (ass) == COND_EXPR)
   10311              :                       {
   10312           69 :                         if (!profitable_subgraphs.is_empty ()
   10313           31 :                             && dump_enabled_p ())
   10314            0 :                           dump_printf_loc (MSG_NOTE, vect_location,
   10315              :                                            "not profitable because of "
   10316              :                                            "unprofitable if-converted scalar "
   10317              :                                            "code\n");
   10318           38 :                         profitable_subgraphs.truncate (0);
   10319              :                       }
   10320              :                 }
   10321              :             }
   10322              : 
   10323              :           /* Finally schedule the profitable subgraphs.  */
   10324      1032260 :           for (slp_instance instance : profitable_subgraphs)
   10325              :             {
   10326       480882 :               if (!vectorized && dump_enabled_p ())
   10327          735 :                 dump_printf_loc (MSG_NOTE, vect_location,
   10328              :                                  "Basic block will be vectorized "
   10329              :                                  "using SLP\n");
   10330       480882 :               vectorized = true;
   10331              : 
   10332              :               /* Dump before scheduling as store vectorization will remove
   10333              :                  the original stores and mess with the instance tree
   10334              :                  so querying its location will eventually ICE.  */
   10335       480882 :               if (flag_checking)
   10336      1934682 :                 for (slp_instance sub : instance->subgraph_entries)
   10337       492036 :                   gcc_assert (SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub)));
   10338       480882 :               unsigned HOST_WIDE_INT bytes;
   10339       480882 :               if (dump_enabled_p ())
   10340         3493 :                 for (slp_instance sub : instance->subgraph_entries)
   10341              :                   {
   10342          925 :                     tree vtype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub));
   10343         1850 :                     if (GET_MODE_SIZE (TYPE_MODE (vtype)).is_constant (&bytes))
   10344          925 :                       dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
   10345          925 :                                        sub->location (),
   10346              :                                        "basic block part vectorized using %wu "
   10347              :                                        "byte vectors\n", bytes);
   10348              :                     else
   10349              :                       dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
   10350              :                                        sub->location (),
   10351              :                                        "basic block part vectorized using "
   10352              :                                        "variable length vectors\n");
   10353              :                   }
   10354              : 
   10355       480882 :               dump_user_location_t saved_vect_location = vect_location;
   10356       480882 :               vect_location = instance->location ();
   10357              : 
   10358       480882 :               vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
   10359              : 
   10360       480882 :               vect_location = saved_vect_location;
   10361              :             }
   10362              : 
   10363              : 
   10364              :           /* Generate the invariant statements.  */
   10365       234430 :           if (!gimple_seq_empty_p (bb_vinfo->inv_pattern_def_seq))
   10366              :             {
   10367           23 :               if (dump_enabled_p ())
   10368            0 :                 dump_printf_loc (MSG_NOTE, vect_location,
   10369              :                          "------>generating invariant statements\n");
   10370              : 
   10371           23 :               bb_vinfo->insert_seq_on_entry (NULL,
   10372              :                                              bb_vinfo->inv_pattern_def_seq);
   10373              :             }
   10374       234430 :         }
   10375              :       else
   10376              :         {
   10377      1949291 :           if (dump_enabled_p ())
   10378         1316 :             dump_printf_loc (MSG_NOTE, vect_location,
   10379              :                              "***** Analysis failed with vector mode %s\n",
   10380         1316 :                              GET_MODE_NAME (bb_vinfo->vector_mode));
   10381              :         }
   10382              : 
   10383      2183721 :       if (mode_i == 0)
   10384      1862158 :         autodetected_vector_mode = bb_vinfo->vector_mode;
   10385              : 
   10386      2183721 :       if (!fatal)
   10387      3131092 :         while (mode_i < vector_modes.length ()
   10388      1761470 :                && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
   10389              :           {
   10390       334138 :             if (dump_enabled_p ())
   10391         1672 :               dump_printf_loc (MSG_NOTE, vect_location,
   10392              :                                "***** The result for vector mode %s would"
   10393              :                                " be the same\n",
   10394          836 :                                GET_MODE_NAME (vector_modes[mode_i]));
   10395       334138 :             mode_i += 1;
   10396              :           }
   10397              : 
   10398      2183721 :       delete bb_vinfo;
   10399              : 
   10400      2183721 :       if (mode_i < vector_modes.length ()
   10401      2005968 :           && VECTOR_MODE_P (autodetected_vector_mode)
   10402      1987878 :           && (related_vector_mode (vector_modes[mode_i],
   10403              :                                    GET_MODE_INNER (autodetected_vector_mode))
   10404       993939 :               == autodetected_vector_mode)
   10405      4189689 :           && (related_vector_mode (autodetected_vector_mode,
   10406       513530 :                                    GET_MODE_INNER (vector_modes[mode_i]))
   10407      1027060 :               == vector_modes[mode_i]))
   10408              :         {
   10409       513530 :           if (dump_enabled_p ())
   10410          205 :             dump_printf_loc (MSG_NOTE, vect_location,
   10411              :                              "***** Skipping vector mode %s, which would"
   10412              :                              " repeat the analysis for %s\n",
   10413          205 :                              GET_MODE_NAME (vector_modes[mode_i]),
   10414          205 :                              GET_MODE_NAME (autodetected_vector_mode));
   10415       513530 :           mode_i += 1;
   10416              :         }
   10417              : 
   10418      2183721 :       if (vectorized
   10419      2025278 :           || mode_i == vector_modes.length ()
   10420      1847570 :           || autodetected_vector_mode == VOIDmode
   10421              :           /* If vect_slp_analyze_bb_1 signaled that analysis for all
   10422              :              vector sizes will fail do not bother iterating.  */
   10423      3019262 :           || fatal)
   10424      3724316 :         return vectorized;
   10425              : 
   10426              :       /* Try the next biggest vector size.  */
   10427       321563 :       next_vector_mode = vector_modes[mode_i++];
   10428       321563 :       if (dump_enabled_p ())
   10429          219 :         dump_printf_loc (MSG_NOTE, vect_location,
   10430              :                          "***** Re-trying analysis with vector mode %s\n",
   10431          219 :                          GET_MODE_NAME (next_vector_mode));
   10432       321563 :     }
   10433      1862158 : }
   10434              : 
   10435              : 
   10436              : /* Main entry for the BB vectorizer.  Analyze and transform BBS, returns
   10437              :    true if anything in the basic-block was vectorized.
                      : 
                      :    This walks the non-debug statements after the labels of each block
                      :    in BBS, counting them and collecting their data references with
                      :    vect_find_stmt_data_reference.  A new dataref group is started
                      :    whenever that function fails for a statement and at every block
                      :    boundary.  The result is handed to vect_slp_region together with
                      :    ORIG_LOOP.  As a side effect vect_location is set to each
                      :    statement that has a known location.
                      : 
                      :    NOTE(review): DATAREFS is a plain vec that is not released here;
                      :    presumably ownership moves to the vec_info via BB_VINFO_DATAREFS
                      :    in vect_slp_region -- confirm.  */
   10438              : 
   10439              : static bool
   10440      1862158 : vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop)
   10441              : {
   10442      1862158 :   vec<data_reference_p> datarefs = vNULL;
   10443      1862158 :   auto_vec<int> dataref_groups;
   10444      1862158 :   int insns = 0;
   10445      1862158 :   int current_group = 0;
   10446              : 
   10447     12338273 :   for (unsigned i = 0; i < bbs.length (); i++)
   10448              :     {
   10449     10476115 :       basic_block bb = bbs[i];
   10450     87740178 :       for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
   10451     77264063 :            gsi_next (&gsi))
   10452              :         {
   10453     77264063 :           gimple *stmt = gsi_stmt (gsi);
   10454     77264063 :           if (is_gimple_debug (stmt))
   10455     47780501 :             continue;
   10456              : 
   10457     29483562 :           insns++;
   10458              : 
   10459     29483562 :           if (gimple_location (stmt) != UNKNOWN_LOCATION)
   10460     26448219 :             vect_location = stmt;
   10461              : 
   10462     29483562 :           if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
   10463              :                                               &dataref_groups, current_group))
   10464      5061225 :             ++current_group;
   10465              :         }
   10466              :       /* New BBs always start a new DR group.  */
   10467     10476115 :       ++current_group;
   10468              :     }
   10469              : 
   10470      1862158 :   return vect_slp_region (bbs, datarefs, &dataref_groups, insns, orig_loop);
   10471      1862158 : }
   10472              : 
   10473              : /* Special entry for the BB vectorizer.  Analyze and transform a single
   10474              :    if-converted BB with ORIG_LOOPs body being the not if-converted
   10475              :    representation.  Returns true if anything in the basic-block was
   10476              :    vectorized.  BB is wrapped in a one-element region and handed to
                      :    vect_slp_bbs together with ORIG_LOOP.  */
   10477              : 
   10478              : bool
   10479        19359 : vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop)
   10480              : {
   10481        19359 :   auto_vec<basic_block> bbs;
   10482        19359 :   bbs.safe_push (bb);
   10483        19359 :   return vect_slp_bbs (bbs, orig_loop);
   10484        19359 : }
   10485              : 
   10486              : /* Main entry for the BB vectorizer.  Analyze and transform FUN, returns
   10487              :    true if anything in the function was vectorized.
                      : 
                      :    The blocks of FUN are visited in the order computed by
                      :    rev_post_order_and_mark_dfs_back_seme and accumulated into regions,
                      :    each of which is passed to vect_slp_bbs with a null ORIG_LOOP.  The
                      :    current region is closed before a block that is not dominated by
                      :    the region's first block, that exits the loop of the first block,
                      :    or that is the header of a loop marked dont_vectorize, and after a
                      :    block whose last statement both defines a value and alters control
                      :    flow.  A block does not start a region if its first statement is a
                      :    returns-twice call or if its loop is marked dont_vectorize.  */
   10488              : 
   10489              : bool
   10490       905907 : vect_slp_function (function *fun)
   10491              : {
   10492       905907 :   bool r = false;
   10493       905907 :   int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
   10494       905907 :   auto_bitmap exit_bbs;
   10495       905907 :   bitmap_set_bit (exit_bbs, EXIT_BLOCK);
   10496       905907 :   edge entry = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (fun));
   10497       905907 :   unsigned n = rev_post_order_and_mark_dfs_back_seme (fun, entry, exit_bbs,
   10498       905907 :                                                       true, rpo, NULL);
   10499              : 
   10500              :   /* For the moment split the function into pieces to avoid making
   10501              :      the iteration on the vector mode moot.  Split at points we know
   10502              :      to not handle well which is CFG merges (SLP discovery doesn't
   10503              :      handle non-loop-header PHIs) and loop exits.  Since pattern
   10504              :      recog requires reverse iteration to visit uses before defs
   10505              :      simply chop RPO into pieces.  */
   10506       905907 :   auto_vec<basic_block> bbs;
   10507     11393593 :   for (unsigned i = 0; i < n; i++)
   10508              :     {
   10509     10487686 :       basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
   10510     10487686 :       bool split = false;
   10511              : 
   10512              :       /* Split when a BB is not dominated by the first block.  */
   10513     19767128 :       if (!bbs.is_empty ()
   10514      9279442 :           && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
   10515              :         {
   10516       654390 :           if (dump_enabled_p ())
   10517          146 :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   10518              :                              "splitting region at dominance boundary bb%d\n",
   10519              :                              bb->index);
   10520              :           split = true;
   10521              :         }
   10522              :       /* Split when the loop determined by the first block
   10523              :          is exited.  This is because we eventually insert
   10524              :          invariants at region begin.  */
   10525     18458348 :       else if (!bbs.is_empty ()
   10526      8625052 :                && bbs[0]->loop_father != bb->loop_father
   10527      2267412 :                && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
   10528              :         {
   10529         3827 :           if (dump_enabled_p ())
   10530            6 :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   10531              :                              "splitting region at loop %d exit at bb%d\n",
   10532            3 :                              bbs[0]->loop_father->num, bb->index);
   10533              :           split = true;
   10534              :         }
   10535      9829469 :       else if (!bbs.is_empty ()
   10536      8621225 :                && bb->loop_father->header == bb
   10537       468143 :                && bb->loop_father->dont_vectorize)
   10538              :         {
   10539         7268 :           if (dump_enabled_p ())
   10540           72 :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   10541              :                              "splitting region at dont-vectorize loop %d "
   10542              :                              "entry at bb%d\n",
   10543              :                              bb->loop_father->num, bb->index);
   10544              :           split = true;
   10545              :         }
   10546              : 
   10547     11153171 :       if (split && !bbs.is_empty ())
   10548              :         {
   10549       665485 :           r |= vect_slp_bbs (bbs, NULL);
   10550       665485 :           bbs.truncate (0);
   10551              :         }
   10552              : 
   10553     10487686 :       if (bbs.is_empty ())
   10554              :         {
   10555              :           /* We need to be able to insert at the head of the region which
   10556              :              we cannot for region starting with a returns-twice call.  */
   10557      1873729 :           if (gcall *first = safe_dyn_cast <gcall *> (first_stmt (bb)))
   10558       398869 :             if (gimple_call_flags (first) & ECF_RETURNS_TWICE)
   10559              :               {
   10560          306 :                 if (dump_enabled_p ())
   10561            2 :                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   10562              :                                    "skipping bb%d as start of region as it "
   10563              :                                    "starts with returns-twice call\n",
   10564              :                                    bb->index);
   10565        30930 :                 continue;
   10566              :               }
   10567              :           /* If the loop this BB belongs to is marked as not to be vectorized
   10568              :              honor that also for BB vectorization.  */
   10569      1873423 :           if (bb->loop_father->dont_vectorize)
   10570        30624 :             continue;
   10571              :         }
   10572              : 
   10573     10456756 :       bbs.safe_push (bb);
   10574              : 
   10575              :       /* When we have a stmt ending this block and defining a
   10576              :          value we have to insert on edges when inserting after it for
   10577              :          a vector containing its definition.  Avoid this for now.  */
   10578     20913512 :       if (gimple *last = *gsi_last_bb (bb))
   10579      8475831 :         if (gimple_get_lhs (last)
   10580      8475831 :             && is_ctrl_altering_stmt (last))
   10581              :           {
   10582       271414 :             if (dump_enabled_p ())
   10583            2 :               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   10584              :                                "splitting region at control altering "
   10585              :                                "definition %G", last);
   10586       271414 :             r |= vect_slp_bbs (bbs, NULL);
   10587       271414 :             bbs.truncate (0);
   10588              :           }
   10589              :     }
   10590              : 
   10591       905907 :   if (!bbs.is_empty ())
   10592       905900 :     r |= vect_slp_bbs (bbs, NULL);
   10593              : 
   10594       905907 :   free (rpo);
   10595              : 
   10596       905907 :   return r;
   10597       905907 : }
   10598              : 
   10599              : /* Build a variable-length vector in which the elements in ELTS are repeated
   10600              :    to fill NRESULTS vectors of type VECTOR_TYPE.  Store the vectors in
   10601              :    RESULTS and add any new instructions to SEQ.
   10602              : 
   10603              :    The approach we use is:
   10604              : 
   10605              :    (1) Find a vector mode VM with integer elements of mode IM.
   10606              : 
   10607              :    (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
   10608              :        ELTS' has mode IM.  This involves creating NELTS' VIEW_CONVERT_EXPRs
   10609              :        from small vectors to IM.
   10610              : 
   10611              :    (3) Duplicate each ELTS'[I] into a vector of mode VM.
   10612              : 
   10613              :    (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
   10614              :        correct byte contents.
   10615              : 
   10616              :    (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
   10617              : 
   10618              :    We try to find the largest IM for which this sequence works, in order
   10619              :    to cut down on the number of interleaves.
                      : 
                      :    VINFO is only passed on to can_duplicate_and_interleave_p, which must
                      :    succeed for the number of elements in ELTS and the element type of
                      :    VECTOR_TYPE; otherwise this function hits gcc_unreachable.  If
                      :    NRESULTS is larger than the number of distinct vectors built, the
                      :    remaining entries of RESULTS repeat the earlier ones in order.  */
   10620              : 
   10621              : void
   10622            0 : duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
   10623              :                           const vec<tree> &elts, unsigned int nresults,
   10624              :                           vec<tree> &results)
   10625              : {
   10626            0 :   unsigned int nelts = elts.length ();
   10627            0 :   tree element_type = TREE_TYPE (vector_type);
   10628              : 
   10629              :   /* (1) Find a vector mode VM with integer elements of mode IM.  */
   10630            0 :   unsigned int nvectors = 1;
   10631            0 :   tree new_vector_type;
   10632            0 :   tree permutes[2];
   10633            0 :   if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
   10634              :                                        &nvectors, &new_vector_type,
   10635              :                                        permutes))
   10636            0 :     gcc_unreachable ();
   10637              : 
   10638              :   /* Get a vector type that holds ELTS[0:NELTS/NELTS'].  */
   10639            0 :   unsigned int partial_nelts = nelts / nvectors;
   10640            0 :   tree partial_vector_type = build_vector_type (element_type, partial_nelts);
   10641              : 
   10642            0 :   tree_vector_builder partial_elts;
   10643            0 :   auto_vec<tree, 32> pieces (nvectors * 2);
   10644            0 :   pieces.quick_grow_cleared (nvectors * 2);
   10645            0 :   for (unsigned int i = 0; i < nvectors; ++i)
   10646              :     {
   10647              :       /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
   10648              :              ELTS' has mode IM.  */
   10649            0 :       partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
   10650            0 :       for (unsigned int j = 0; j < partial_nelts; ++j)
   10651            0 :         partial_elts.quick_push (elts[i * partial_nelts + j]);
   10652            0 :       tree t = gimple_build_vector (seq, &partial_elts);
   10653            0 :       t = gimple_build (seq, VIEW_CONVERT_EXPR,
   10654            0 :                         TREE_TYPE (new_vector_type), t);
   10655              : 
   10656              :       /* (3) Duplicate each ELTS'[I] into a vector of mode VM.  */
   10657            0 :       pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
   10658              :     }
   10659              : 
   10660              :   /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
   10661              :          correct byte contents.
   10662              : 
   10663              :      Conceptually, we need to repeat the following operation log2(nvectors)
   10664              :      times, where hi_start = nvectors / 2:
   10665              : 
   10666              :         out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
   10667              :         out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
   10668              : 
   10669              :      However, if each input repeats every N elements and the VF is
   10670              :      a multiple of N * 2, the HI result is the same as the LO result.
   10671              :      This will be true for the first N1 iterations of the outer loop,
   10672              :      followed by N2 iterations for which both the LO and HI results
   10673              :      are needed.  I.e.:
   10674              : 
   10675              :         N1 + N2 = log2(nvectors)
   10676              : 
   10677              :      Each "N1 iteration" doubles the number of redundant vectors and the
   10678              :      effect of the process as a whole is to have a sequence of nvectors/2**N1
   10679              :      vectors that repeats 2**N1 times.  Rather than generate these redundant
   10680              :      vectors, we halve the number of vectors for each N1 iteration.
                      : 
                      :      PIECES holds two banks of NVECTORS entries: each round reads its
                      :      inputs from the bank at IN_START and writes its outputs to the
                      :      bank at OUT_START, after which the two banks are swapped.  */
   10681              :   unsigned int in_start = 0;
   10682              :   unsigned int out_start = nvectors;
   10683              :   unsigned int new_nvectors = nvectors;
   10684            0 :   for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
   10685              :     {
   10686            0 :       unsigned int hi_start = new_nvectors / 2;
   10687            0 :       unsigned int out_i = 0;
   10688            0 :       for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i)
   10689              :         {
   10690            0 :           if ((in_i & 1) != 0
   10691            0 :               && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
   10692              :                              2 * in_repeat))
   10693            0 :             continue;
   10694              : 
   10695            0 :           tree output = make_ssa_name (new_vector_type);
   10696            0 :           tree input1 = pieces[in_start + (in_i / 2)];
   10697            0 :           tree input2 = pieces[in_start + (in_i / 2) + hi_start];
   10698            0 :           gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
   10699              :                                                input1, input2,
   10700              :                                                permutes[in_i & 1]);
   10701            0 :           gimple_seq_add_stmt (seq, stmt);
   10702            0 :           pieces[out_start + out_i] = output;
   10703            0 :           out_i += 1;
   10704              :         }
   10705            0 :       std::swap (in_start, out_start);
   10706            0 :       new_nvectors = out_i;
   10707              :     }
   10708              : 
   10709              :   /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type.  */
   10710            0 :   results.reserve (nresults);
   10711            0 :   for (unsigned int i = 0; i < nresults; ++i)
   10712            0 :     if (i < new_nvectors)
   10713            0 :       results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
   10714            0 :                                         pieces[in_start + i]));
   10715              :     else
   10716            0 :       results.quick_push (results[i - new_nvectors]);
   10717            0 : }
   10718              : 
   10719              : 
   10720              : /* For constant and loop invariant defs in OP_NODE this function creates
   10721              :    vector defs that will be used in the vectorized stmts and stores them
   10722              :    to SLP_TREE_VEC_DEFS of OP_NODE.  */
   10723              : 
   10724              : static void
   10725       490404 : vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
   10726              : {
   10727       490404 :   unsigned HOST_WIDE_INT nunits;
   10728       490404 :   tree vec_cst;
   10729       490404 :   unsigned j, number_of_places_left_in_vector;
   10730       490404 :   tree vector_type;
   10731       490404 :   tree vop;
   10732       490404 :   int group_size = op_node->ops.length ();
   10733       490404 :   unsigned int vec_num, i;
   10734       490404 :   unsigned number_of_copies = 1;
   10735       490404 :   bool constant_p;
   10736       490404 :   gimple_seq ctor_seq = NULL;
   10737       490404 :   auto_vec<tree, 16> permute_results;
   10738              : 
   10739              :   /* We always want SLP_TREE_VECTYPE (op_node) here correctly set.  */
   10740       490404 :   vector_type = SLP_TREE_VECTYPE (op_node);
   10741              : 
   10742       490404 :   unsigned int number_of_vectors = vect_get_num_copies (vinfo, op_node);
   10743       490404 :   SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
   10744       490404 :   auto_vec<tree> voprnds (number_of_vectors);
   10745              : 
   10746              :   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
   10747              :      created vectors. It is greater than 1 if unrolling is performed.
   10748              : 
   10749              :      For example, we have two scalar operands, s1 and s2 (e.g., group of
   10750              :      strided accesses of size two), while NUNITS is four (i.e., four scalars
   10751              :      of this type can be packed in a vector).  The output vector will contain
   10752              :      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
   10753              :      will be 2).
   10754              : 
   10755              :      If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
   10756              :      containing the operands.
   10757              : 
   10758              :      For example, NUNITS is four as before, and the group size is 8
   10759              :      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
   10760              :      {s5, s6, s7, s8}.  */
   10761              : 
   10762              :   /* When using duplicate_and_interleave, we just need one element for
   10763              :      each scalar statement.  */
   10764       490404 :   if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
   10765              :     nunits = group_size;
   10766              : 
   10767       490404 :   number_of_copies = nunits * number_of_vectors / group_size;
   10768              : 
   10769       490404 :   number_of_places_left_in_vector = nunits;
   10770       490404 :   constant_p = true;
   10771       490404 :   tree uniform_elt = NULL_TREE;
   10772       490404 :   tree_vector_builder elts (vector_type, nunits, 1);
   10773       490404 :   elts.quick_grow (nunits);
   10774       490404 :   stmt_vec_info insert_after = NULL;
   10775      1463553 :   for (j = 0; j < number_of_copies; j++)
   10776              :     {
   10777       973149 :       tree op;
   10778      3731000 :       for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
   10779              :         {
   10780              :           /* Create 'vect_ = {op0,op1,...,opn}'.  */
   10781      1784702 :           tree orig_op = op;
   10782      1784702 :           if (number_of_places_left_in_vector == nunits)
   10783              :             uniform_elt = op;
   10784      1164880 :           else if (uniform_elt && operand_equal_p (uniform_elt, op))
   10785       741174 :             op = elts[number_of_places_left_in_vector];
   10786              :           else
   10787              :             uniform_elt = NULL_TREE;
   10788      1784702 :           number_of_places_left_in_vector--;
   10789      1784702 :           if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
   10790              :             {
   10791       276022 :               if (CONSTANT_CLASS_P (op))
   10792              :                 {
   10793       100629 :                   if (VECTOR_BOOLEAN_TYPE_P (vector_type))
   10794              :                     {
   10795              :                       /* Can't use VIEW_CONVERT_EXPR for booleans because
   10796              :                          of possibly different sizes of scalar value and
   10797              :                          vector element.  */
   10798           51 :                       if (integer_zerop (op))
   10799           51 :                         op = build_int_cst (TREE_TYPE (vector_type), 0);
   10800            0 :                       else if (integer_onep (op))
   10801            0 :                         op = build_all_ones_cst (TREE_TYPE (vector_type));
   10802              :                       else
   10803            0 :                         gcc_unreachable ();
   10804              :                     }
   10805              :                   else
   10806       100578 :                     op = fold_unary (VIEW_CONVERT_EXPR,
   10807              :                                      TREE_TYPE (vector_type), op);
   10808       100629 :                   gcc_assert (op && CONSTANT_CLASS_P (op));
   10809              :                 }
   10810              :               else
   10811              :                 {
   10812       175393 :                   tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
   10813       175393 :                   gimple *init_stmt;
   10814       175393 :                   if (VECTOR_BOOLEAN_TYPE_P (vector_type))
   10815              :                     {
   10816          418 :                       tree true_val
   10817          418 :                         = build_all_ones_cst (TREE_TYPE (vector_type));
   10818          418 :                       tree false_val
   10819          418 :                         = build_zero_cst (TREE_TYPE (vector_type));
   10820          418 :                       gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
   10821          418 :                       init_stmt = gimple_build_assign (new_temp, COND_EXPR,
   10822              :                                                        op, true_val,
   10823              :                                                        false_val);
   10824              :                     }
   10825              :                   else
   10826              :                     {
   10827       174975 :                       op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
   10828              :                                    op);
   10829       174975 :                       init_stmt
   10830       174975 :                         = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
   10831              :                                                op);
   10832              :                     }
   10833       175393 :                   gimple_seq_add_stmt (&ctor_seq, init_stmt);
   10834       175393 :                   op = new_temp;
   10835              :                 }
   10836              :             }
   10837      1784702 :           elts[number_of_places_left_in_vector] = op;
   10838      1784702 :           if (!CONSTANT_CLASS_P (op))
   10839       314767 :             constant_p = false;
   10840              :           /* For BB vectorization we have to compute an insert location
   10841              :              when a def is inside the analyzed region since we cannot
   10842              :              simply insert at the BB start in this case.  */
   10843      1784702 :           stmt_vec_info opdef;
   10844      1784702 :           if (TREE_CODE (orig_op) == SSA_NAME
   10845       179637 :               && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
   10846       159703 :               && is_a <bb_vec_info> (vinfo)
   10847      1886772 :               && (opdef = vinfo->lookup_def (orig_op)))
   10848              :             {
   10849        83065 :               if (!insert_after)
   10850              :                 insert_after = opdef;
   10851              :               else
   10852        45681 :                 insert_after = get_later_stmt (insert_after, opdef);
   10853              :             }
   10854              : 
   10855      1784702 :           if (number_of_places_left_in_vector == 0)
   10856              :             {
   10857       619822 :               auto type_nunits = TYPE_VECTOR_SUBPARTS (vector_type);
   10858       619822 :               if (uniform_elt)
   10859       646322 :                 vec_cst = gimple_build_vector_from_val (&ctor_seq, vector_type,
   10860       323161 :                                                         elts[0]);
   10861       593322 :               else if (constant_p
   10862       593322 :                        ? multiple_p (type_nunits, nunits)
   10863       108627 :                        : known_eq (type_nunits, nunits))
   10864       296661 :                 vec_cst = gimple_build_vector (&ctor_seq, &elts);
   10865              :               else
   10866              :                 {
   10867            0 :                   if (permute_results.is_empty ())
   10868            0 :                     duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
   10869              :                                               elts, number_of_vectors,
   10870              :                                               permute_results);
   10871            0 :                   vec_cst = permute_results[number_of_vectors - j - 1];
   10872              :                 }
   10873       619822 :               if (!gimple_seq_empty_p (ctor_seq))
   10874              :                 {
   10875       135922 :                   if (insert_after)
   10876              :                     {
   10877        37384 :                       gimple_stmt_iterator gsi;
   10878        37384 :                       if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
   10879              :                         {
   10880          624 :                           gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
   10881          624 :                           gsi_insert_seq_before (&gsi, ctor_seq,
   10882              :                                                  GSI_CONTINUE_LINKING);
   10883              :                         }
   10884        36760 :                       else if (!stmt_ends_bb_p (insert_after->stmt))
   10885              :                         {
   10886        36760 :                           gsi = gsi_for_stmt (insert_after->stmt);
   10887        36760 :                           gsi_insert_seq_after (&gsi, ctor_seq,
   10888              :                                                 GSI_CONTINUE_LINKING);
   10889              :                         }
   10890              :                       else
   10891              :                         {
   10892              :                           /* When we want to insert after a def where the
   10893              :                              defining stmt throws then insert on the fallthru
   10894              :                              edge.  */
   10895            0 :                           edge e = find_fallthru_edge
   10896            0 :                                      (gimple_bb (insert_after->stmt)->succs);
   10897            0 :                           basic_block new_bb
   10898            0 :                             = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
   10899            0 :                           gcc_assert (!new_bb);
   10900              :                         }
   10901              :                     }
   10902              :                   else
   10903        98538 :                     vinfo->insert_seq_on_entry (NULL, ctor_seq);
   10904       135922 :                   ctor_seq = NULL;
   10905              :                 }
   10906       619822 :               voprnds.quick_push (vec_cst);
   10907       619822 :               insert_after = NULL;
   10908       619822 :               number_of_places_left_in_vector = nunits;
   10909       619822 :               constant_p = true;
   10910       619822 :               elts.new_vector (vector_type, nunits, 1);
   10911       619822 :               elts.quick_grow (nunits);
   10912              :             }
   10913              :         }
   10914              :     }
   10915              : 
   10916              :   /* Since the vectors are created in the reverse order, we should invert
   10917              :      them.  */
   10918       490404 :   vec_num = voprnds.length ();
   10919      1110226 :   for (j = vec_num; j != 0; j--)
   10920              :     {
   10921       619822 :       vop = voprnds[j - 1];
   10922       619822 :       SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
   10923              :     }
   10924              : 
   10925              :   /* In case that VF is greater than the unrolling factor needed for the SLP
   10926              :      group of stmts, NUMBER_OF_VECTORS to be created is greater than
   10927              :      NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
   10928              :      to replicate the vectors.  */
   10929       490404 :   while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
   10930       490404 :     for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
   10931              :          i++)
   10932            0 :       SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
   10933       490404 : }
   10934              : 
   10935              : /* Get the scalar definition of the Nth lane from SLP_NODE or NULL_TREE
   10936              :    if there is no definition for it in the scalar IL or it is not known.  */
   10937              : 
   10938              : tree
   10939         2665 : vect_get_slp_scalar_def (slp_tree slp_node, unsigned n)
   10940              : {
   10941         2665 :   if (SLP_TREE_DEF_TYPE (slp_node) == vect_internal_def)
   10942              :     {
   10943         2653 :       if (!SLP_TREE_SCALAR_STMTS (slp_node).exists ())
   10944              :         return NULL_TREE;
   10945         2653 :       stmt_vec_info def = SLP_TREE_SCALAR_STMTS (slp_node)[n];
   10946         2653 :       if (!def)
   10947              :         return NULL_TREE;
   10948         2653 :       return gimple_get_lhs (STMT_VINFO_STMT (def));
   10949              :     }
   10950              :   else
   10951           12 :     return SLP_TREE_SCALAR_OPS (slp_node)[n];
   10952              : }
   10953              : 
   10954              : /* Get the Ith vectorized definition from SLP_NODE.  */
   10955              : 
   10956              : tree
   10957       146086 : vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
   10958              : {
   10959       146086 :   return SLP_TREE_VEC_DEFS (slp_node)[i];
   10960              : }
   10961              : 
   10962              : /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS.  */
   10963              : 
   10964              : void
   10965       931394 : vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
   10966              : {
   10967      1862788 :   vec_defs->create (SLP_TREE_VEC_DEFS (slp_node).length ());
   10968       931394 :   vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
   10969       931394 : }
   10970              : 
   10971              : /* Get N vectorized definitions for SLP_NODE.  */
   10972              : 
   10973              : void
   10974         2943 : vect_get_slp_defs (vec_info *,
   10975              :                    slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
   10976              : {
   10977         2943 :   if (n == -1U)
   10978         2943 :     n = SLP_TREE_CHILDREN (slp_node).length ();
   10979              : 
   10980        10619 :   for (unsigned i = 0; i < n; ++i)
   10981              :     {
   10982         7676 :       slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
   10983         7676 :       vec<tree> vec_defs = vNULL;
   10984         7676 :       vect_get_slp_defs (child, &vec_defs);
   10985         7676 :       vec_oprnds->quick_push (vec_defs);
   10986              :     }
   10987         2943 : }
   10988              : 
   10989              : /* A subroutine of vect_transform_slp_perm_load with two extra arguments:
   10990              :    - PERM gives the permutation that the caller wants to use for NODE,
   10991              :      which might be different from SLP_LOAD_PERMUTATION.
   10992              :    - DUMP_P controls whether the function dumps information.  */
   10993              : 
   10994              : static bool
   10995       121916 : vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node,
   10996              :                                 load_permutation_t &perm,
   10997              :                                 const vec<tree> &dr_chain,
   10998              :                                 gimple_stmt_iterator *gsi, poly_uint64 vf,
   10999              :                                 bool analyze_only, bool dump_p,
   11000              :                                 unsigned *n_perms, unsigned int *n_loads,
   11001              :                                 bool dce_chain)
   11002              : {
   11003       121916 :   stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
   11004       121916 :   int vec_index = 0;
   11005       121916 :   tree vectype = SLP_TREE_VECTYPE (node);
   11006       121916 :   unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
   11007       121916 :   unsigned int mask_element;
   11008       121916 :   unsigned dr_group_size;
   11009       121916 :   machine_mode mode;
   11010              : 
   11011       121916 :   if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
   11012              :     {
   11013              :       /* We have both splats of the same non-grouped load and groups
   11014              :          of distinct invariant loads entering here.  */
   11015         1483 :       unsigned max_idx = 0;
   11016         8219 :       for (auto idx : perm)
   11017         3770 :         max_idx = idx > max_idx ? idx : max_idx;
   11018         1483 :       dr_group_size = max_idx + 1;
   11019              :     }
   11020              :   else
   11021              :     {
   11022       120433 :       stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
   11023       120433 :       dr_group_size = DR_GROUP_SIZE (stmt_info);
   11024              :     }
   11025              : 
   11026       121916 :   mode = TYPE_MODE (vectype);
   11027       121916 :   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
   11028       121916 :   unsigned int nstmts = vect_get_num_copies (vinfo, node);
   11029              : 
   11030              :   /* Initialize the vect stmts of NODE to properly insert the generated
   11031              :      stmts later.  */
   11032       121916 :   if (! analyze_only)
   11033        58108 :     for (unsigned i = SLP_TREE_VEC_DEFS (node).length (); i < nstmts; i++)
   11034        22374 :       SLP_TREE_VEC_DEFS (node).quick_push (NULL_TREE);
   11035              : 
   11036              :   /* Generate permutation masks for every NODE. Number of masks for each NODE
   11037              :      is equal to GROUP_SIZE.
   11038              :      E.g., we have a group of three nodes with three loads from the same
   11039              :      location in each node, and the vector size is 4. I.e., we have a
   11040              :      a0b0c0a1b1c1... sequence and we need to create the following vectors:
   11041              :      for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
   11042              :      for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
   11043              :      ...
   11044              : 
   11045              :      The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
   11046              :      The last mask is illegal since we assume two operands for permute
   11047              :      operation, and the mask element values can't be outside that range.
   11048              :      Hence, the last mask must be converted into {2,5,5,5}.
   11049              :      For the first two permutations we need the first and the second input
   11050              :      vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
   11051              :      we need the second and the third vectors: {b1,c1,a2,b2} and
   11052              :      {c2,a3,b3,c3}.  */
   11053              : 
   11054       121916 :   int vect_stmts_counter = 0;
   11055       121916 :   unsigned int index = 0;
   11056       121916 :   int first_vec_index = -1;
   11057       121916 :   int second_vec_index = -1;
   11058       121916 :   bool noop_p = true;
   11059       121916 :   *n_perms = 0;
   11060              : 
   11061       121916 :   vec_perm_builder mask;
   11062       121916 :   unsigned int nelts_to_build;
   11063       121916 :   unsigned int nvectors_per_build;
   11064       121916 :   unsigned int in_nlanes;
   11065       121916 :   bool repeating_p = (group_size == dr_group_size
   11066       154682 :                       && multiple_p (nunits, group_size));
   11067       121916 :   if (repeating_p)
   11068              :     {
   11069              :       /* A single vector contains a whole number of copies of the node, so:
   11070              :          (a) all permutes can use the same mask; and
   11071              :          (b) the permutes only need a single vector input.  */
   11072        30533 :       mask.new_vector (nunits, group_size, 3);
   11073        30533 :       nelts_to_build = mask.encoded_nelts ();
   11074              :       /* It's possible to obtain zero nstmts during analyze_only, so make
   11075              :          it at least one to ensure the later computation for n_perms
   11076              :          proceed.  */
   11077        30533 :       nvectors_per_build = nstmts > 0 ? nstmts : 1;
   11078        30533 :       in_nlanes = dr_group_size * 3;
   11079              :     }
   11080              :   else
   11081              :     {
   11082              :       /* We need to construct a separate mask for each vector statement.  */
   11083        91383 :       unsigned HOST_WIDE_INT const_nunits, const_vf;
   11084        91383 :       if (!nunits.is_constant (&const_nunits)
   11085        91383 :           || !vf.is_constant (&const_vf))
   11086              :         return false;
   11087        91383 :       mask.new_vector (const_nunits, const_nunits, 1);
   11088        91383 :       nelts_to_build = const_vf * group_size;
   11089        91383 :       nvectors_per_build = 1;
   11090        91383 :       in_nlanes = const_vf * dr_group_size;
   11091              :     }
   11092       121916 :   auto_sbitmap used_in_lanes (in_nlanes);
   11093       121916 :   bitmap_clear (used_in_lanes);
   11094       121916 :   auto_bitmap used_defs;
   11095              : 
   11096       121916 :   unsigned int count = mask.encoded_nelts ();
   11097       121916 :   mask.quick_grow (count);
   11098       121916 :   vec_perm_indices indices;
   11099              : 
   11100       658613 :   for (unsigned int j = 0; j < nelts_to_build; j++)
   11101              :     {
   11102       546341 :       unsigned int iter_num = j / group_size;
   11103       546341 :       unsigned int stmt_num = j % group_size;
   11104       546341 :       unsigned int i = (iter_num * dr_group_size + perm[stmt_num]);
   11105       546341 :       bitmap_set_bit (used_in_lanes, i);
   11106       546341 :       if (repeating_p)
   11107              :         {
   11108              :           first_vec_index = 0;
   11109              :           mask_element = i;
   11110              :         }
   11111              :       else
   11112              :         {
   11113              :           /* Enforced before the loop when !repeating_p.  */
   11114       348647 :           unsigned int const_nunits = nunits.to_constant ();
   11115       348647 :           vec_index = i / const_nunits;
   11116       348647 :           mask_element = i % const_nunits;
   11117       348647 :           if (vec_index == first_vec_index
   11118       348647 :               || first_vec_index == -1)
   11119              :             {
   11120              :               first_vec_index = vec_index;
   11121              :             }
   11122       140107 :           else if (vec_index == second_vec_index
   11123       140107 :                    || second_vec_index == -1)
   11124              :             {
   11125       134004 :               second_vec_index = vec_index;
   11126       134004 :               mask_element += const_nunits;
   11127              :             }
   11128              :           else
   11129              :             {
   11130         6103 :               if (dump_p)
   11131          280 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   11132              :                                  "permutation requires at "
   11133              :                                  "least three vectors %G",
   11134              :                                  stmt_info->stmt);
   11135         6103 :               gcc_assert (analyze_only);
   11136              :               return false;
   11137              :             }
   11138              : 
   11139       342544 :           gcc_assert (mask_element < 2 * const_nunits);
   11140              :         }
   11141              : 
   11142       540238 :       if (mask_element != index)
   11143       351299 :         noop_p = false;
   11144       540238 :       mask[index++] = mask_element;
   11145              : 
   11146       540238 :       if (index == count)
   11147              :         {
   11148       145237 :           if (!noop_p)
   11149              :             {
   11150       199782 :               indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
   11151       117990 :               if (!can_vec_perm_const_p (mode, mode, indices))
   11152              :                 {
   11153         3541 :                   if (dump_p)
   11154              :                     {
   11155           79 :                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   11156              :                                        "unsupported vect permute { ");
   11157          669 :                       for (i = 0; i < count; ++i)
   11158              :                         {
   11159          590 :                           dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
   11160          590 :                           dump_printf (MSG_MISSED_OPTIMIZATION, " ");
   11161              :                         }
   11162           79 :                       dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
   11163              :                     }
   11164         3541 :                   gcc_assert (analyze_only);
   11165              :                   return false;
   11166              :                 }
   11167              : 
   11168       114449 :               tree mask_vec = NULL_TREE;
   11169       114449 :               if (!analyze_only)
   11170        20684 :                 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
   11171              : 
   11172       114449 :               if (second_vec_index == -1)
   11173        34248 :                 second_vec_index = first_vec_index;
   11174              : 
   11175       231772 :               for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
   11176              :                 {
   11177       117323 :                   ++*n_perms;
   11178       117323 :                   if (analyze_only)
   11179        96356 :                     continue;
   11180              :                   /* Generate the permute statement if necessary.  */
   11181        20967 :                   tree first_vec = dr_chain[first_vec_index + ri];
   11182        20967 :                   tree second_vec = dr_chain[second_vec_index + ri];
   11183        20967 :                   gassign *stmt = as_a<gassign *> (stmt_info->stmt);
   11184        20967 :                   tree perm_dest
   11185        20967 :                     = vect_create_destination_var (gimple_assign_lhs (stmt),
   11186              :                                                    vectype);
   11187        20967 :                   perm_dest = make_ssa_name (perm_dest);
   11188        20967 :                   gimple *perm_stmt
   11189        20967 :                     = gimple_build_assign (perm_dest, VEC_PERM_EXPR, first_vec,
   11190              :                                            second_vec, mask_vec);
   11191        20967 :                   vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
   11192              :                                                gsi);
   11193        20967 :                   if (dce_chain)
   11194              :                     {
   11195        20044 :                       bitmap_set_bit (used_defs, first_vec_index + ri);
   11196        20044 :                       bitmap_set_bit (used_defs, second_vec_index + ri);
   11197              :                     }
   11198              : 
   11199              :                   /* Store the vector statement in NODE.  */
   11200        20967 :                   SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = perm_dest;
   11201              :                 }
   11202              :             }
   11203        27247 :           else if (!analyze_only)
   11204              :             {
   11205         2814 :               for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
   11206              :                 {
   11207         1407 :                   tree first_vec = dr_chain[first_vec_index + ri];
   11208              :                   /* If mask was NULL_TREE generate the requested
   11209              :                      identity transform.  */
   11210         1407 :                   if (dce_chain)
   11211         1400 :                     bitmap_set_bit (used_defs, first_vec_index + ri);
   11212              : 
   11213              :                   /* Store the vector statement in NODE.  */
   11214         1407 :                   SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = first_vec;
   11215              :                 }
   11216              :             }
   11217              : 
   11218              :           index = 0;
   11219              :           first_vec_index = -1;
   11220              :           second_vec_index = -1;
   11221              :           noop_p = true;
   11222              :         }
   11223              :     }
   11224              : 
   11225       112272 :   if (n_loads)
   11226              :     {
   11227        80455 :       if (repeating_p)
   11228        10468 :         *n_loads = nstmts;
   11229              :       else
   11230              :         {
   11231              :           /* Enforced above when !repeating_p.  */
   11232        69987 :           unsigned int const_nunits = nunits.to_constant ();
   11233        69987 :           *n_loads = 0;
   11234        69987 :           bool load_seen = false;
   11235       979763 :           for (unsigned i = 0; i < in_nlanes; ++i)
   11236              :             {
   11237       909776 :               if (i % const_nunits == 0)
   11238              :                 {
   11239       383863 :                   if (load_seen)
   11240       109879 :                     *n_loads += 1;
   11241              :                   load_seen = false;
   11242              :                 }
   11243       909776 :               if (bitmap_bit_p (used_in_lanes, i))
   11244       252311 :                 load_seen = true;
   11245              :             }
   11246        69987 :           if (load_seen)
   11247        48312 :             *n_loads += 1;
   11248              :         }
   11249              :     }
   11250              : 
   11251       112272 :   if (dce_chain)
   11252       212493 :     for (unsigned i = 0; i < dr_chain.length (); ++i)
   11253        73503 :       if (!bitmap_bit_p (used_defs, i))
   11254              :         {
   11255        40635 :           tree def = dr_chain[i];
   11256        41018 :           do
   11257              :             {
   11258        41018 :               gimple *stmt = SSA_NAME_DEF_STMT (def);
   11259        41018 :               if (is_gimple_assign (stmt)
   11260        41018 :                   && (gimple_assign_rhs_code (stmt) == VIEW_CONVERT_EXPR
   11261        41018 :                       || gimple_assign_rhs_code (stmt) == CONSTRUCTOR))
   11262         4952 :                 def = single_ssa_tree_operand (stmt, SSA_OP_USE);
   11263              :               else
   11264              :                 def = NULL;
   11265        41018 :               gimple_stmt_iterator rgsi = gsi_for_stmt (stmt);
   11266        41018 :               gsi_remove (&rgsi, true);
   11267        41018 :               release_defs (stmt);
   11268              :             }
   11269        41018 :           while (def);
   11270              :         }
   11271              : 
   11272              :   return true;
   11273       121916 : }
   11274              : 
   11275              : /* Generate vector permute statements from a list of loads in DR_CHAIN.
   11276              :    If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
   11277              :    permute statements for the SLP node NODE.  Store the number of vector
   11278              :    permute instructions in *N_PERMS and the number of vector load
   11279              :    instructions in *N_LOADS.  If DCE_CHAIN is true, remove all definitions
   11280              :    that were not needed.
                      : 
                      :    This is a thin wrapper around vect_transform_slp_perm_load_1.  It
                      :    supplies SLP_TREE_LOAD_PERMUTATION (NODE) as the permutation to
                      :    apply and dump_enabled_p () as the dump flag; VINFO, DR_CHAIN, GSI,
                      :    VF and the remaining arguments are forwarded unchanged, and the
                      :    helper's result is returned as is.
                      : 
                      :    N_LOADS may be null, in which case the helper skips computing the
                      :    load count.  With DCE_CHAIN the helper removes the defining
                      :    statement of every DR_CHAIN entry it did not use, following
                      :    VIEW_CONVERT_EXPR and CONSTRUCTOR definitions back through their
                      :    single SSA operand and removing those statements as well.  */
   11281              : 
   11282              : bool
   11283        89389 : vect_transform_slp_perm_load (vec_info *vinfo,
   11284              :                               slp_tree node, const vec<tree> &dr_chain,
   11285              :                               gimple_stmt_iterator *gsi, poly_uint64 vf,
   11286              :                               bool analyze_only, unsigned *n_perms,
   11287              :                               unsigned int *n_loads, bool dce_chain)
   11288              : {
   11289        89389 :   return vect_transform_slp_perm_load_1 (vinfo, node,
   11290        89389 :                                          SLP_TREE_LOAD_PERMUTATION (node),
   11291              :                                          dr_chain, gsi, vf, analyze_only,
   11292              :                                          dump_enabled_p (), n_perms, n_loads,
   11293        89389 :                                          dce_chain);
   11294              : }
   11295              : 
   11296              : /* Produce the next vector result for SLP permutation NODE by adding a vector
   11297              :    statement at GSI.  If MASK_VEC is nonnull, add:
   11298              : 
   11299              :       <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC>
   11300              : 
   11301              :    otherwise add:
   11302              : 
   11303              :       <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF,
   11304              :                                       { N, N+1, N+2, ... }>
   11305              : 
   11306              :    where N == IDENTITY_OFFSET which is either zero or equal to the
   11307              :    number of elements of the result.
                      : 
                      :    The identity case is not emitted as a literal VEC_PERM_EXPR.
                      :    IDENTITY_OFFSET is divided by the number of elements of FIRST_DEF's
                      :    type with can_div_trunc_p, giving a vector number and an element
                      :    number; an odd vector number selects SECOND_DEF, an even one
                      :    FIRST_DEF.  The statement that is added is then one of:
                      : 
                      :      - a plain copy of the selected def, when its type is compatible
                      :        with the vector type of NODE;
                      :      - a BIT_FIELD_REF extracting from the selected def, when the
                      :        result has no more elements than the def;
                      :      - a CONSTRUCTOR of FIRST_DEF and SECOND_DEF, when the result has
                      :        exactly twice as many elements as the def.
                      : 
                      :    Any other combination hits gcc_unreachable.
                      : 
                      :    Every statement is inserted through vect_finish_stmt_generation
                      :    with a null stmt_info, and the final statement is recorded in NODE
                      :    with push_vec_def.  */
   11308              : 
   11309              : static void
   11310        31191 : vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
   11311              :                           slp_tree node, tree first_def, tree second_def,
   11312              :                           tree mask_vec, poly_uint64 identity_offset)
   11313              : {
   11314        31191 :   tree vectype = SLP_TREE_VECTYPE (node);
   11315              : 
   11316              :   /* ???  We SLP match existing vector element extracts but
   11317              :      allow punning which we need to re-instantiate at uses
   11318              :      but have no good way of explicitly representing.
                      : 
                      :      Concretely: when FIRST_DEF has the same TYPE_SIZE as VECTYPE but
                      :      an incompatible type, a VIEW_CONVERT_EXPR to VECTYPE is emitted
                      :      and FIRST_DEF is replaced by its result.  SECOND_DEF gets the same
                      :      treatment further down, but only when MASK_VEC is nonnull.
                      : 
                      :      NOTE(review): the SECOND_DEF check below tests the size of
                      :      FIRST_DEF's type rather than SECOND_DEF's.  Presumably this is
                      :      fine because both inputs are expected to share one vector type
                      :      -- confirm.  */
   11319        31191 :   if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)), TYPE_SIZE (vectype))
   11320        31191 :       && !types_compatible_p (TREE_TYPE (first_def), vectype))
   11321              :     {
   11322           14 :       gassign *conv_stmt
   11323           14 :         = gimple_build_assign (make_ssa_name (vectype),
   11324              :                                build1 (VIEW_CONVERT_EXPR, vectype, first_def));
   11325           14 :       vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
   11326           14 :       first_def = gimple_assign_lhs (conv_stmt);
   11327              :     }
   11328        31191 :   gassign *perm_stmt;
   11329        31191 :   tree perm_dest = make_ssa_name (vectype);
   11330        31191 :   if (mask_vec)
   11331              :     {
   11332        27975 :       if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)),
   11333        27975 :                            TYPE_SIZE (vectype))
   11334        27975 :           && !types_compatible_p (TREE_TYPE (second_def), vectype))
   11335              :         {
   11336            8 :           gassign *conv_stmt
   11337            8 :             = gimple_build_assign (make_ssa_name (vectype),
   11338              :                                    build1 (VIEW_CONVERT_EXPR,
   11339              :                                            vectype, second_def));
   11340            8 :           vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
   11341            8 :           second_def = gimple_assign_lhs (conv_stmt);
   11342              :         }
   11343        27975 :       perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
   11344              :                                        first_def, second_def,
   11345              :                                        mask_vec);
   11346              :     }
   11347              :   else
   11348              :     {
   11349         3216 :       auto def_nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (first_def));
   11350         3216 :       unsigned HOST_WIDE_INT vecno;
   11351         3216 :       poly_uint64 eltno;
   11352         3216 :       if (!can_div_trunc_p (poly_uint64 (identity_offset), def_nunits,
   11353              :                             &vecno, &eltno))
   11354              :         gcc_unreachable ();
   11355         3216 :       tree def = vecno & 1 ? second_def : first_def;
   11356         3216 :       if (!types_compatible_p (TREE_TYPE (def), vectype))
   11357              :         {
   11358              :           /* For identity permutes we still need to handle the case
   11359              :              of offsetted extracts or concats.
                      : 
                      :              An extract is a BIT_FIELD_REF of TYPE_SIZE (VECTYPE) bits
                      :              starting at bit ELTNO * ELSZ of DEF, where ELSZ is the
                      :              element size of DEF's type in bits.  A concat is only
                      :              handled for a result with exactly twice the elements of
                      :              the inputs, and asserts that IDENTITY_OFFSET is zero.  */
   11360          219 :           unsigned HOST_WIDE_INT c;
   11361          219 :           if (known_le (TYPE_VECTOR_SUBPARTS (vectype), def_nunits))
   11362              :             {
   11363          215 :               unsigned HOST_WIDE_INT elsz
   11364          215 :                 = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (def))));
   11365          430 :               tree lowpart = build3 (BIT_FIELD_REF, vectype, def,
   11366          215 :                                      TYPE_SIZE (vectype),
   11367          215 :                                      bitsize_int (eltno * elsz));
   11368          215 :               perm_stmt = gimple_build_assign (perm_dest, lowpart);
   11369              :             }
   11370            4 :           else if (constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype),
   11371            4 :                                         def_nunits, &c) && c == 2)
   11372              :             {
   11373            4 :               gcc_assert (known_eq (identity_offset, 0U));
   11374            4 :               tree ctor = build_constructor_va (vectype, 2,
   11375              :                                                 NULL_TREE, first_def,
   11376              :                                                 NULL_TREE, second_def);
   11377            4 :               perm_stmt = gimple_build_assign (perm_dest, ctor);
   11378              :             }
   11379              :           else
   11380            0 :             gcc_unreachable ();
   11381              :         }
   11382              :       else
   11383              :         {
   11384              :           /* We need a copy here in case the def was external.
                      :              The assert records that no element offset is expected
                      :              when DEF already has a type compatible with VECTYPE.  */
   11385         2997 :           gcc_assert (known_eq (eltno, 0U));
   11386         2997 :           perm_stmt = gimple_build_assign (perm_dest, def);
   11387              :         }
   11388              :     }
   11389        31191 :   vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
   11390              :   /* Store the vector statement in NODE.  */
   11391        31191 :   node->push_vec_def (perm_stmt);
   11392        31191 : }
   11393              : 
   11394              : /* Subroutine of vectorizable_slp_permutation.  Check whether the target
   11395              :    can perform permutation PERM on the (1 or 2) input nodes in CHILDREN.
   11396              :    If GSI is nonnull, emit the permutation there.
   11397              : 
   11398              :    When GSI is null, the only purpose of NODE is to give properties
   11399              :    of the result, such as the vector type and number of SLP lanes.
   11400              :    The node does not need to be a VEC_PERM_EXPR.
   11401              : 
   11402              :    If the target supports the operation, return the number of individual
   11403              :    VEC_PERM_EXPRs needed, otherwise return -1.  Print information to the
   11404              :    dump file if DUMP_P is true.  */
   11405              : 
   11406              : static int
   11407       430332 : vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
   11408              :                                 slp_tree node, lane_permutation_t &perm,
   11409              :                                 vec<slp_tree> &children, bool dump_p)
   11410              : {
   11411       430332 :   tree vectype = SLP_TREE_VECTYPE (node);
   11412              : 
   11413              :   /* ???  We currently only support all same vector input types
   11414              :      while the SLP IL should really do a concat + select and thus accept
   11415              :      arbitrary mismatches.  */
   11416       430332 :   slp_tree child;
   11417       430332 :   unsigned i;
   11418       430332 :   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
   11419       430332 :   bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node));
   11420              :   /* True if we're permuting a single input of 2N vectors down
   11421              :      to N vectors.  This case doesn't generalize beyond 2 since
   11422              :      VEC_PERM_EXPR only takes 2 inputs.  */
   11423       430332 :   bool pack_p = false;
   11424              :   /* If we're permuting inputs of N vectors each into X*N outputs,
   11425              :      this is the value of X, otherwise it is 1.  */
   11426       430332 :   unsigned int unpack_factor = 1;
   11427       430332 :   tree op_vectype = NULL_TREE;
   11428       431890 :   FOR_EACH_VEC_ELT (children, i, child)
   11429       431818 :     if (SLP_TREE_VECTYPE (child))
   11430              :       {
   11431              :         op_vectype = SLP_TREE_VECTYPE (child);
   11432              :         break;
   11433              :       }
   11434       430332 :   if (!op_vectype)
   11435           72 :     op_vectype = vectype;
   11436       943470 :   FOR_EACH_VEC_ELT (children, i, child)
   11437              :     {
   11438       513138 :       if ((SLP_TREE_DEF_TYPE (child) != vect_internal_def
   11439         9430 :            && !vect_maybe_update_slp_op_vectype (child, op_vectype))
   11440       513138 :           || !types_compatible_p (SLP_TREE_VECTYPE (child), op_vectype)
   11441      1026276 :           || !types_compatible_p (TREE_TYPE (vectype), TREE_TYPE (op_vectype)))
   11442              :         {
   11443            0 :           if (dump_p)
   11444            0 :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   11445              :                              "Unsupported vector types in lane permutation\n");
   11446            0 :           return -1;
   11447              :         }
   11448       513138 :       auto op_nunits = TYPE_VECTOR_SUBPARTS (op_vectype);
   11449       513138 :       unsigned int this_unpack_factor;
   11450              :       /* Detect permutations of external, pre-existing vectors.  The external
   11451              :          node's SLP_TREE_LANES stores the total number of units in the vector,
   11452              :          or zero if the vector has variable length.
   11453              : 
   11454              :          We are expected to keep the original VEC_PERM_EXPR for such cases.
   11455              :          There is no repetition to model.  */
   11456       513138 :       if (SLP_TREE_DEF_TYPE (child) == vect_external_def
   11457       513138 :           && SLP_TREE_SCALAR_OPS (child).is_empty ())
   11458              :         repeating_p = false;
   11459              :       /* Check whether the input has twice as many lanes per vector.  */
   11460       506256 :       else if (children.length () == 1
   11461       506256 :                && known_eq (SLP_TREE_LANES (child) * nunits,
   11462              :                             SLP_TREE_LANES (node) * op_nunits * 2))
   11463              :         pack_p = true;
   11464              :       /* Check whether the output has N times as many lanes per vector.  */
   11465       513138 :       else if (constant_multiple_p (SLP_TREE_LANES (node) * op_nunits,
   11466       462397 :                                     SLP_TREE_LANES (child) * nunits,
   11467              :                                     &this_unpack_factor)
   11468       427407 :                && (i == 0 || unpack_factor == this_unpack_factor))
   11469              :         unpack_factor = this_unpack_factor;
   11470              :       else
   11471              :         repeating_p = false;
   11472              :     }
   11473              : 
   11474       860664 :   gcc_assert (perm.length () == SLP_TREE_LANES (node));
   11475              : 
   11476              :   /* Load-lanes permute.  This permute only acts as a forwarder to
   11477              :      select the correct vector def of the load-lanes load which
   11478              :      has the permuted vectors in its vector defs like
   11479              :      { v0, w0, r0, v1, w1, r1 ... } for a ld3.  All costs are
   11480              :      accounted for in the costing for the actual load so we
   11481              :      return zero here.  */
   11482       430332 :   if (node->ldst_lanes)
   11483              :     {
   11484            0 :       gcc_assert (children.length () == 1);
   11485            0 :       if (!gsi)
   11486              :         /* This is a trivial op always supported.  */
   11487              :         return 0;
   11488            0 :       slp_tree child = children[0];
   11489            0 :       unsigned vec_idx = (SLP_TREE_LANE_PERMUTATION (node)[0].second
   11490            0 :                           / SLP_TREE_LANES (node));
   11491            0 :       unsigned vec_num = SLP_TREE_LANES (child) / SLP_TREE_LANES (node);
   11492            0 :       unsigned nvectors = vect_get_num_copies (vinfo, node);
   11493            0 :       for (unsigned i = 0; i < nvectors; ++i)
   11494              :         {
   11495            0 :           tree def = SLP_TREE_VEC_DEFS (child)[i * vec_num  + vec_idx];
   11496            0 :           node->push_vec_def (def);
   11497              :         }
   11498              :       return 0;
   11499              :     }
   11500              : 
   11501              :   /* Set REPEATING_P to true if the permutations are cyclical wrt UNPACK_FACTOR
   11502              :      and if we can generate the vectors in a vector-length agnostic way.
   11503              :      This requires UNPACK_STEP == NUNITS / UNPACK_FACTOR to be known at
   11504              :      compile time.
   11505              : 
   11506              :      The significance of UNPACK_STEP is that, when PACK_P is false,
   11507              :      output vector I operates on a window of UNPACK_STEP elements from each
   11508              :      input, starting at lane UNPACK_STEP * (I % UNPACK_FACTOR).  For example,
   11509              :      when UNPACK_FACTOR is 2, the first output vector operates on lanes
   11510              :      [0, NUNITS / 2 - 1] of each input vector and the second output vector
   11511              :      operates on lanes [NUNITS / 2, NUNITS - 1] of each input vector.
   11512              : 
   11513              :      When REPEATING_P is true, NOUTPUTS holds the total number of outputs
   11514              :      that we actually need to generate.  */
   11515       430332 :   uint64_t noutputs = 0;
   11516       430332 :   poly_uint64 unpack_step = 0;
   11517       430332 :   loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo);
   11518       183103 :   if (!linfo
   11519       469515 :       || !multiple_p (nunits, unpack_factor, &unpack_step)
   11520       182161 :       || !constant_multiple_p (LOOP_VINFO_VECT_FACTOR (linfo)
   11521       182161 :                                * SLP_TREE_LANES (node), nunits, &noutputs))
   11522              :     repeating_p = false;
   11523              : 
   11524              :   /* We can handle the conditions described for REPEATING_P above for
   11525              :      both variable- and constant-length vectors.  The fallback requires
   11526              :      us to generate every element of every permute vector explicitly,
   11527              :      which is only possible for constant-length permute vectors.
   11528              : 
   11529              :      Set:
   11530              : 
   11531              :      - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
   11532              :        mask vectors that we want to build.
   11533              : 
   11534              :      - NCOPIES to the number of copies of PERM that we need in order
   11535              :        to build the necessary permute mask vectors.  */
   11536       182161 :   uint64_t npatterns;
   11537       182161 :   unsigned nelts_per_pattern;
   11538       182161 :   uint64_t ncopies;
   11539       182161 :   if (repeating_p)
   11540              :     {
   11541              :       /* We need permute mask vectors that have the form:
   11542              : 
   11543              :            { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }
   11544              : 
   11545              :          In other words, the original n-element permute in PERM is
   11546              :          "unrolled" to fill a full vector.  The stepped vector encoding
   11547              :          that we use for permutes requires 3n elements.  */
   11548       142978 :       npatterns = SLP_TREE_LANES (node);
   11549       142978 :       nelts_per_pattern = ncopies = 3;
   11550              :     }
   11551              :   else
   11552              :     {
   11553              :       /* Calculate every element of every permute mask vector explicitly,
   11554              :          instead of relying on the pattern described above.  */
   11555       287354 :       if (!nunits.is_constant (&npatterns)
   11556       287354 :           || !TYPE_VECTOR_SUBPARTS (op_vectype).is_constant ())
   11557              :         {
   11558              :           if (dump_p)
   11559              :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   11560              :                              "unsupported permutation %p on variable-length"
   11561              :                              " vectors\n", (void *) node);
   11562              :           return -1;
   11563              :         }
   11564       287354 :       nelts_per_pattern = ncopies = 1;
   11565       287354 :       if (linfo && !LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
   11566              :         {
   11567              :           if (dump_p)
   11568              :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   11569              :                              "unsupported permutation %p for variable VF\n",
   11570              :                              (void *) node);
   11571              :           return -1;
   11572              :         }
   11573              :       pack_p = false;
   11574              :       unpack_factor = 1;
   11575              :     }
   11576       430332 :   unsigned olanes = unpack_factor * ncopies * SLP_TREE_LANES (node);
   11577       430332 :   gcc_assert (repeating_p || multiple_p (olanes, nunits));
   11578              : 
   11579              :   /* Compute the { { SLP operand, vector index}, lane } permutation sequence
   11580              :      from the { SLP operand, scalar lane } permutation as recorded in the
   11581              :      SLP node as intermediate step.  This part should already work
   11582              :      with SLP children with arbitrary number of lanes.  */
   11583       430332 :   auto_vec<std::pair<std::pair<unsigned, unsigned>, poly_uint64>> vperm;
   11584       430332 :   auto_vec<poly_uint64> active_lane;
   11585       430332 :   vperm.create (olanes);
   11586       430332 :   active_lane.safe_grow_cleared (children.length (), true);
   11587       868922 :   for (unsigned int ui = 0; ui < unpack_factor; ++ui)
   11588              :     {
   11589      1936440 :       for (unsigned j = 0; j < children.length (); ++j)
   11590       529630 :         active_lane[j] = ui * unpack_step;
   11591      1279748 :       for (unsigned i = 0; i < ncopies; ++i)
   11592              :         {
   11593      5251236 :           for (unsigned pi = 0; pi < perm.length (); ++pi)
   11594              :             {
   11595      1784460 :               std::pair<unsigned, unsigned> p = perm[pi];
   11596      1784460 :               tree vtype = SLP_TREE_VECTYPE (children[p.first]);
   11597      1784460 :               if (repeating_p)
   11598       833508 :                 vperm.quick_push ({{p.first, 0},
   11599       833508 :                                    p.second + active_lane[p.first]});
   11600              :               else
   11601              :                 {
   11602              :                   /* We checked above that the vectors are constant-length.  */
   11603       950952 :                   unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype)
   11604       950952 :                     .to_constant ();
   11605       950952 :                   unsigned lane = active_lane[p.first].to_constant ();
   11606       950952 :                   unsigned vi = (lane + p.second) / vnunits;
   11607       950952 :                   unsigned vl = (lane + p.second) % vnunits;
   11608       950952 :                   vperm.quick_push ({{p.first, vi}, vl});
   11609              :                 }
   11610              :             }
   11611              :           /* Advance to the next group.  */
   11612      1837700 :           for (unsigned j = 0; j < children.length (); ++j)
   11613       996542 :             active_lane[j] += SLP_TREE_LANES (children[j]);
   11614              :         }
   11615              :     }
   11616              : 
   11617       430332 :   if (dump_p)
   11618              :     {
   11619         8975 :       dump_printf_loc (MSG_NOTE, vect_location,
   11620              :                        "vectorizing permutation %p", (void *)node);
   11621        32494 :       for (unsigned i = 0; i < perm.length (); ++i)
   11622        23519 :         dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
   11623         8975 :       if (repeating_p)
   11624         7574 :         dump_printf (MSG_NOTE, " (repeat %d)", SLP_TREE_LANES (node));
   11625         8975 :       dump_printf (MSG_NOTE, "\n");
   11626         8975 :       dump_printf_loc (MSG_NOTE, vect_location, "as");
   11627        90432 :       for (unsigned i = 0; i < vperm.length (); ++i)
   11628              :         {
   11629        81457 :           if (i != 0
   11630        81457 :               && (repeating_p
   11631        55237 :                   ? multiple_p (i, npatterns)
   11632        60615 :                   : multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype))))
   11633        24347 :             dump_printf (MSG_NOTE, ",");
   11634        81457 :           dump_printf (MSG_NOTE, " vops%u[%u][",
   11635        81457 :                        vperm[i].first.first, vperm[i].first.second);
   11636        81457 :           dump_dec (MSG_NOTE, vperm[i].second);
   11637        81457 :           dump_printf (MSG_NOTE, "]");
   11638              :         }
   11639         8975 :       dump_printf (MSG_NOTE, "\n");
   11640              :     }
   11641              : 
   11642              :   /* We can only handle two-vector permutes, everything else should
   11643              :      be lowered on the SLP level.  The following is closely inspired
   11644              :      by vect_transform_slp_perm_load and is supposed to eventually
   11645              :      replace it.
   11646              :      ???   As intermediate step do code-gen in the SLP tree representation
   11647              :      somehow?  */
   11648       430332 :   std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
   11649       430332 :   std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
   11650       430332 :   unsigned int index = 0;
   11651       430332 :   poly_uint64 mask_element;
   11652       430332 :   vec_perm_builder mask;
   11653       430332 :   mask.new_vector (nunits, npatterns, nelts_per_pattern);
   11654       430332 :   unsigned int count = mask.encoded_nelts ();
   11655       430332 :   mask.quick_grow (count);
   11656       430332 :   vec_perm_indices indices;
   11657       430332 :   unsigned nperms = 0;
   11658              :   /* When REPEATING_P is true, we only have UNPACK_FACTOR unique permute
   11659              :      vectors to check during analysis, but we need to generate NOUTPUTS
   11660              :      vectors during transformation.  */
   11661       430332 :   unsigned total_nelts = olanes;
   11662       430332 :   unsigned process_nelts = olanes;
   11663       430332 :   if (repeating_p)
   11664              :     {
   11665       142978 :       total_nelts = (total_nelts / unpack_factor) * noutputs;
   11666       142978 :       if (gsi)
   11667         9808 :         process_nelts = total_nelts;
   11668              :     }
   11669       430332 :   unsigned last_ei = (total_nelts - 1) % process_nelts;
   11670      2224049 :   for (unsigned i = 0; i < process_nelts; ++i)
   11671              :     {
   11672              :       /* VI is the input vector index when generating code for REPEATING_P.  */
   11673      1801090 :       unsigned vi = i / olanes * (pack_p ? 2 : 1);
   11674      1801090 :       unsigned ei = i % olanes;
   11675      1801090 :       mask_element = vperm[ei].second;
   11676      1801090 :       if (pack_p)
   11677              :         {
   11678              :           /* In this case, we have N outputs and the single child provides 2N
   11679              :              inputs.  Output X permutes inputs 2X and 2X+1.
   11680              : 
   11681              :              The mask indices are taken directly from the SLP permutation node.
   11682              :              Index X selects from the first vector if (X / NUNITS) % 2 == 0;
   11683              :              X selects from the second vector otherwise.  These conditions
   11684              :              are only known at compile time for constant-length vectors.  */
   11685              :           first_vec = std::make_pair (0, 0);
   11686              :           second_vec = std::make_pair (0, 1);
   11687              :         }
   11688      1632019 :       else if (first_vec.first == -1U
   11689      1632019 :                || first_vec == vperm[ei].first)
   11690      1400333 :         first_vec = vperm[ei].first;
   11691       231686 :       else if (second_vec.first == -1U
   11692       231686 :                || second_vec == vperm[ei].first)
   11693              :         {
   11694       231289 :           second_vec = vperm[ei].first;
   11695       231289 :           mask_element += nunits;
   11696              :         }
   11697              :       else
   11698              :         {
   11699          397 :           if (dump_p)
   11700            7 :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   11701              :                              "permutation requires at "
   11702              :                              "least three vectors\n");
   11703          397 :           gcc_assert (!gsi);
   11704              :           return -1;
   11705              :         }
   11706              : 
   11707      1800693 :       mask[index++] = mask_element;
   11708              : 
   11709      1800693 :       if (index == count)
   11710              :         {
   11711       746085 :           indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
   11712              :                               TYPE_VECTOR_SUBPARTS (op_vectype));
   11713       573002 :           bool identity_p = (indices.series_p (0, 1, mask[0], 1)
   11714       850417 :                              && constant_multiple_p (mask[0], nunits));
   11715       573002 :           machine_mode vmode = TYPE_MODE (vectype);
   11716       573002 :           machine_mode op_vmode = TYPE_MODE (op_vectype);
   11717       573002 :           unsigned HOST_WIDE_INT c;
   11718       573002 :           if ((!identity_p
   11719       530339 :                && !can_vec_perm_const_p (vmode, op_vmode, indices))
   11720       573002 :               || (identity_p
   11721        42663 :                   && !known_le (nunits,
   11722              :                                 TYPE_VECTOR_SUBPARTS (op_vectype))
   11723         6984 :                   && (!constant_multiple_p (nunits,
   11724            8 :                                             TYPE_VECTOR_SUBPARTS (op_vectype),
   11725            8 :                                             &c) || c != 2)))
   11726              :             {
   11727         6976 :               if (dump_p)
   11728              :                 {
   11729          152 :                   dump_printf_loc (MSG_MISSED_OPTIMIZATION,
   11730              :                                    vect_location,
   11731              :                                    "unsupported vect permute { ");
   11732         1586 :                   for (i = 0; i < count; ++i)
   11733              :                     {
   11734         1434 :                       dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
   11735         1434 :                       dump_printf (MSG_MISSED_OPTIMIZATION, " ");
   11736              :                     }
   11737          152 :                   dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
   11738              :                 }
   11739         6976 :               gcc_assert (!gsi);
   11740         7373 :               return -1;
   11741              :             }
   11742              : 
   11743       566026 :           if (!identity_p)
   11744       523363 :             nperms += CEIL (total_nelts, process_nelts) - (ei > last_ei);
   11745       566026 :           if (gsi)
   11746              :             {
   11747        31191 :               if (second_vec.first == -1U)
   11748         6973 :                 second_vec = first_vec;
   11749              : 
   11750        31191 :               slp_tree
   11751        31191 :                 first_node = children[first_vec.first],
   11752        31191 :                 second_node = children[second_vec.first];
   11753              : 
   11754        31191 :               tree mask_vec = NULL_TREE;
   11755        31191 :               if (!identity_p)
   11756        27975 :                 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
   11757              : 
   11758        31191 :               tree first_def
   11759        31191 :                 = vect_get_slp_vect_def (first_node, first_vec.second + vi);
   11760        31191 :               tree second_def
   11761        31191 :                 = vect_get_slp_vect_def (second_node, second_vec.second + vi);
   11762        31191 :               vect_add_slp_permutation (vinfo, gsi, node, first_def,
   11763        31191 :                                         second_def, mask_vec, mask[0]);
   11764              :             }
   11765              : 
   11766              :           index = 0;
   11767              :           first_vec = std::make_pair (-1U, -1U);
   11768              :           second_vec = std::make_pair (-1U, -1U);
   11769              :         }
   11770              :     }
   11771              : 
   11772       422959 :   return nperms;
   11773       430332 : }
   11774              : 
   11775              : /* Vectorize the SLP permutations in NODE as specified
   11776              :    in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
   11777              :    child number and lane number.
   11778              :    Interleaving of two two-lane two-child SLP subtrees (not supported):
   11779              :      [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
   11780              :    A blend of two four-lane two-child SLP subtrees:
   11781              :      [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
   11782              :    Highpart of a four-lane one-child SLP subtree (not supported):
   11783              :      [ { 0, 2 }, { 0, 3 } ]
   11784              :    Only a subset of these is currently supported by the code generation
                      :    below.
                      :
                      :    The work is delegated to vectorizable_slp_permutation_1, which is
                      :    passed GSI, the lane permutation and the children of NODE.  Return
                      :    false if it reports a negative permute count and true otherwise.
                      :    When GSI is NULL and the count is nonzero, that many vec_perm
                      :    operations are recorded in COST_VEC for the vector body.  */
   11785              : 
   11786              : bool
   11787       137559 : vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
   11788              :                               slp_tree node, stmt_vector_for_cost *cost_vec)
   11789              : {
   11790       137559 :   tree vectype = SLP_TREE_VECTYPE (node);
   11791       137559 :   lane_permutation_t &perm = SLP_TREE_LANE_PERMUTATION (node);
   11792       137559 :   int nperms = vectorizable_slp_permutation_1 (vinfo, gsi, node, perm,
   11793       137559 :                                                SLP_TREE_CHILDREN (node),
   11794              :                                                dump_enabled_p ());
                          /* A negative count signals failure in the helper.  */
   11795       137559 :   if (nperms < 0)
   11796              :     return false;
   11797              : 
                          /* Only record costs when no insertion iterator was supplied.  */
   11798       136272 :   if (!gsi && nperms != 0)
   11799       114552 :     record_stmt_cost (cost_vec, nperms, vec_perm, node, vectype, 0, vect_body);
   11800              : 
   11801              :   return true;
   11802              : }
   11803              : 
   11804              : /* Vectorize SLP NODE.
                      :
                      :    Constant and external nodes only get their vector defs created
                      :    (unless they have no vector type or the defs already exist).
                      :    For every other node an insertion iterator is chosen and the node
                      :    is handed to vect_transform_stmt together with INSTANCE:
                      :      - non-permute nodes with a data reference are positioned at the
                      :        first scalar stmt of the node for reads and at the last scalar
                      :        stmt for writes;
                      :      - non-permute cycle PHI, induction and PHI nodes get an empty
                      :        iterator;
                      :      - everything else is positioned after the latest def of its
                      :        children, subject to the adjustments made below.  */
   11805              : 
   11806              : static void
   11807      1471918 : vect_schedule_slp_node (vec_info *vinfo,
   11808              :                         slp_tree node, slp_instance instance)
   11809              : {
   11810      1471918 :   gimple_stmt_iterator si;
   11811      1471918 :   int i;
   11812      1471918 :   slp_tree child;
   11813              : 
   11814              :   /* Vectorize externals and constants.  */
   11815      1471918 :   if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
   11816      1471918 :       || SLP_TREE_DEF_TYPE (node) == vect_external_def)
   11817              :     {
   11818              :       /* ???  vectorizable_shift can end up using a scalar operand which is
   11819              :          currently denoted as !SLP_TREE_VECTYPE.  No need to vectorize the
   11820              :          node in this case.  */
   11821       498121 :       if (!SLP_TREE_VECTYPE (node))
   11822       498121 :         return;
   11823              : 
   11824              :       /* There are two reasons vector defs might already exist.  The first
   11825              :          is that we are vectorizing an existing vector def.  The second is
   11826              :          when performing BB vectorization shared constant/external nodes
   11827              :          are not split apart during partitioning so during the code-gen
   11828              :          DFS walk we can end up visiting them twice.  */
   11829       491087 :       if (! SLP_TREE_VEC_DEFS (node).exists ())
   11830       490404 :         vect_create_constant_vectors (vinfo, node);
   11831       491087 :       return;
   11832              :     }
   11833              : 
   11834       973797 :   stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
   11835              : 
   11836       973797 :   gcc_assert (SLP_TREE_VEC_DEFS (node).is_empty ());
   11837       973797 :   if (SLP_TREE_VECTYPE (node))
   11838       973791 :     SLP_TREE_VEC_DEFS (node).create (vect_get_num_copies (vinfo, node));
   11839              : 
   11840       973797 :   if (!SLP_TREE_PERMUTE_P (node) && STMT_VINFO_DATA_REF (stmt_info))
   11841              :     {
   11842              :       /* Vectorized loads go before the first scalar load to make it
   11843              :          ready early, vectorized stores go before the last scalar
   11844              :          stmt which is where all uses are ready.  */
   11845       712399 :       stmt_vec_info last_stmt_info = NULL;
   11846       712399 :       if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
   11847       166677 :         last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
   11848              :       else /* DR_IS_WRITE */
   11849       545722 :         last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
   11850       712399 :       si = gsi_for_stmt (last_stmt_info->stmt);
   11851       712399 :     }
   11852       261398 :   else if (!SLP_TREE_PERMUTE_P (node)
   11853       245065 :            && (SLP_TREE_TYPE (node) == cycle_phi_info_type
   11854              :                || SLP_TREE_TYPE (node) == induc_vec_info_type
   11855              :                || SLP_TREE_TYPE (node) == phi_info_type))
   11856              :     {
   11857              :       /* For PHI node vectorization we do not use the insertion iterator.  */
   11858        54184 :       si = gsi_none ();
   11859              :     }
   11860              :   else
   11861              :     {
   11862              :       /* Emit other stmts after the children vectorized defs which is
   11863              :          earliest possible.  */
   11864              :       gimple *last_stmt = NULL;
   11865              :       bool seen_vector_def = false;
   11866       576424 :       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
   11867       369210 :         if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
   11868              :           {
   11869              :             /* For fold-left reductions we are retaining the scalar
   11870              :                reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
   11871              :                set so the representation isn't perfect.  Resort to the
   11872              :                last scalar def here.  */
   11873       296213 :             if (SLP_TREE_VEC_DEFS (child).is_empty ())
   11874              :               {
   11875          925 :                 gcc_assert (SLP_TREE_TYPE (child) == cycle_phi_info_type);
   11876          925 :                 gphi *phi = as_a <gphi *>
   11877          925 :                               (vect_find_last_scalar_stmt_in_slp (child)->stmt);
   11878          925 :                 if (!last_stmt)
   11879              :                   last_stmt = phi;
   11880          705 :                 else if (vect_stmt_dominates_stmt_p (last_stmt, phi))
   11881              :                   last_stmt = phi;
   11882          694 :                 else if (vect_stmt_dominates_stmt_p (phi, last_stmt))
   11883              :                   ;
   11884              :                 else
   11885            0 :                   gcc_unreachable ();
   11886              :               }
   11887              :             /* We are emitting all vectorized stmts in the same place and
   11888              :                the last one is the last.
   11889              :                ???  Unless we have a load permutation applied and that
   11890              :                figures to re-use an earlier generated load.  */
   11891              :             unsigned j;
   11892              :             tree vdef;
   11893       700342 :             FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
   11894              :               {
   11895       404129 :                 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
   11896       404129 :                 if (!last_stmt)
   11897              :                   last_stmt = vstmt;
   11898       207488 :                 else if (vect_stmt_dominates_stmt_p (last_stmt, vstmt))
   11899              :                   last_stmt = vstmt;
   11900        45656 :                 else if (vect_stmt_dominates_stmt_p (vstmt, last_stmt))
   11901              :                   ;
   11902              :                 else
   11903            0 :                   gcc_unreachable ();
   11904              :               }
   11905              :           }
   11906        72997 :         else if (!SLP_TREE_VECTYPE (child))
   11907              :           {
   11908              :             /* For children without a vector type look at all scalar defs.  */
   11909              :             unsigned j;
   11910              :             tree def;
   11911        14941 :             FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
   11912         8543 :               if (TREE_CODE (def) == SSA_NAME
   11913         8543 :                   && !SSA_NAME_IS_DEFAULT_DEF (def))
   11914              :                 {
   11915          295 :                   gimple *stmt = SSA_NAME_DEF_STMT (def);
   11916          295 :                   if (gimple_uid (stmt) == -1u)
   11917              :                     /* If the stmt is not inside the region do not
   11918              :                        use it as possible insertion point.  */
   11919              :                     ;
   11920          285 :                   else if (!last_stmt)
   11921              :                     last_stmt = stmt;
   11922          261 :                   else if (vect_stmt_dominates_stmt_p (last_stmt, stmt))
   11923              :                     last_stmt = stmt;
   11924          159 :                   else if (vect_stmt_dominates_stmt_p (stmt, last_stmt))
   11925              :                     ;
   11926              :                   else
   11927            0 :                     gcc_unreachable ();
   11928              :                 }
   11929              :           }
   11930              :         else
   11931              :           {
   11932              :             /* For externals we have to look at all defs since their
   11933              :                insertion place is decided per vector.  But beware
   11934              :                of pre-existing vectors where we need to make sure
   11935              :                we do not insert before the region boundary.  */
   11936        66599 :             if (SLP_TREE_SCALAR_OPS (child).is_empty ()
   11937          512 :                 && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
   11938              :               seen_vector_def = true;
   11939              :             else
   11940              :               {
   11941              :                 unsigned j;
   11942              :                 tree vdef;
   11943       530091 :                 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
   11944        94399 :                   if (TREE_CODE (vdef) == SSA_NAME
   11945        94399 :                       && !SSA_NAME_IS_DEFAULT_DEF (vdef))
   11946              :                     {
   11947        19452 :                       gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
   11948        19452 :                       if (!last_stmt)
   11949              :                         last_stmt = vstmt;
   11950        10846 :                       else if (vect_stmt_dominates_stmt_p (last_stmt, vstmt))
   11951              :                         last_stmt = vstmt;
   11952         8721 :                       else if (vect_stmt_dominates_stmt_p (vstmt, last_stmt))
   11953              :                         ;
   11954              :                       else
   11955            0 :                         gcc_unreachable ();
   11956              :                     }
   11957              :               }
   11958              :           }
   11959              :       /* This can happen when all children are pre-existing vectors or
   11960              :          constants.  */
   11961       207214 :       if (!last_stmt)
   11962         1723 :         last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
   11963         1723 :       if (!last_stmt)
   11964              :         {
   11965            0 :           gcc_assert (seen_vector_def);
   11966            0 :           si = gsi_after_labels (vinfo->bbs[0]);
   11967              :         }
   11968       207214 :       else if (is_ctrl_altering_stmt (last_stmt))
   11969              :         {
   11970              :           /* We split regions to vectorize at control altering stmts
   11971              :              with a definition so this must be an external which
   11972              :              we can insert at the start of the region.  */
   11973            0 :           si = gsi_after_labels (vinfo->bbs[0]);
   11974              :         }
   11975       207214 :       else if (is_a <bb_vec_info> (vinfo)
   11976        17733 :                && !SLP_TREE_PERMUTE_P (node)
   11977        16394 :                && gimple_bb (last_stmt) != gimple_bb (stmt_info->stmt)
   11978       208568 :                && gimple_could_trap_p (stmt_info->stmt))
   11979              :         {
   11980              :           /* We've constrained possibly trapping operations to all come
   11981              :              from the same basic-block, if vectorized defs would allow earlier
   11982              :              scheduling still force vectorized stmts to the original block.
   11983              :              This is only necessary for BB vectorization since for loop vect
   11984              :              all operations are in a single BB and scalar stmt based
   11985              :              placement doesn't play well with epilogue vectorization.  */
   11986           54 :           gcc_assert (dominated_by_p (CDI_DOMINATORS,
   11987              :                                       gimple_bb (stmt_info->stmt),
   11988              :                                       gimple_bb (last_stmt)));
   11989           54 :           si = gsi_after_labels (gimple_bb (stmt_info->stmt));
   11990              :         }
   11991       207160 :       else if (is_a <gphi *> (last_stmt))
   11992        14496 :         si = gsi_after_labels (gimple_bb (last_stmt));
   11993              :       else
   11994              :         {
   11995       192664 :           si = gsi_for_stmt (last_stmt);
   11996       192664 :           gsi_next (&si);
   11997              : 
   11998       192664 :           if (auto loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
   11999              :             {
   12000              :               /* Avoid scheduling stmts to random places in the CFG, any
   12001              :                  stmt dominance check we performed is possibly wrong as UIDs
   12002              :                  are not initialized for all of the function for loop
   12003              :                  vectorization.  Instead append to the loop preheader.  */
   12004       175201 :               if ((LOOP_VINFO_LOOP (loop_vinfo)->header
   12005       175201 :                    != gimple_bb (last_stmt))
   12006       178416 :                   && dominated_by_p (CDI_DOMINATORS,
   12007              :                                      LOOP_VINFO_LOOP (loop_vinfo)->header,
   12008         3215 :                                      gimple_bb (last_stmt)))
   12009         1402 :                 si = gsi_end_bb (loop_preheader_edge
   12010          701 :                                    (LOOP_VINFO_LOOP (loop_vinfo))->src);
   12011              :               /* Avoid scheduling internal defs outside of the loop when
   12012              :                  we might have only implicitly tracked loop mask/len defs.  */
   12013           74 :               if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
   12014       175201 :                   || LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
   12015              :                 {
   12016           74 :                   gimple_stmt_iterator si2
   12017           74 :                     = gsi_after_labels (LOOP_VINFO_LOOP (loop_vinfo)->header);
   12018           74 :                   if ((gsi_end_p (si2)
   12019            0 :                        && (LOOP_VINFO_LOOP (loop_vinfo)->header
   12020            0 :                            != gimple_bb (last_stmt))
   12021            0 :                        && dominated_by_p (CDI_DOMINATORS,
   12022              :                                           LOOP_VINFO_LOOP (loop_vinfo)->header,
   12023            0 :                                           gimple_bb (last_stmt)))
   12024           74 :                       || (!gsi_end_p (si2)
   12025           74 :                           && last_stmt != *si2
   12026           72 :                           && vect_stmt_dominates_stmt_p (last_stmt, *si2)))
   12027            3 :                     si = si2;
   12028              :                 }
   12029              :             }
   12030              :         }
   12031              :     }
   12032              : 
   12033       973797 :   if (dump_enabled_p ())
   12034              :     {
   12035        71697 :       if (stmt_info)
   12036        71644 :         dump_printf_loc (MSG_NOTE, vect_location,
   12037              :                          "------>vectorizing SLP node starting from: %G",
   12038              :                          stmt_info->stmt);
   12039              :       else
   12040              :         {
   12041           53 :           dump_printf_loc (MSG_NOTE, vect_location,
   12042              :                            "------>vectorizing SLP node:\n");
   12043           53 :           vect_print_slp_tree (MSG_NOTE, vect_location, node);
   12044              :         }
   12045              :     }
   12046       973797 :   vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
   12047              : }
   12048              : 
   12049              : /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
   12050              :    For loop vectorization this is done in vectorizable_call, but for SLP
   12051              :    it needs to be deferred until end of vect_schedule_slp, because multiple
   12052              :    SLP instances may refer to the same scalar stmt.
                      :
                      :    Calls without a lhs are replaced with a GIMPLE nop instead.  Scalar
                      :    stmts that are not calls, or calls that have no basic block, are
                      :    left alone.  The walk recurses into the children of NODE first;
                      :    NULL nodes and nodes that are not internal defs are ignored, and
                      :    VISITED records the nodes already walked so that each node is
                      :    processed only once.  */
   12053              : 
   12054              : static void
   12055       602294 : vect_remove_slp_scalar_calls (vec_info *vinfo,
   12056              :                               slp_tree node, hash_set<slp_tree> &visited)
   12057              : {
   12058       602294 :   gimple *new_stmt;
   12059       602294 :   gimple_stmt_iterator gsi;
   12060       602294 :   int i;
   12061       602294 :   slp_tree child;
   12062       602294 :   tree lhs;
   12063       602294 :   stmt_vec_info stmt_info;
   12064              : 
   12065       602294 :   if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
   12066       188573 :     return;
   12067              : 
   12068       457325 :   if (visited.add (node))
   12069              :     return;
   12070              : 
   12071       925930 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
   12072       512209 :     vect_remove_slp_scalar_calls (vinfo, child, visited);
   12073              : 
   12074      1309696 :   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
   12075              :     {
   12076       486387 :       if (!stmt_info)
   12077         3976 :         continue;
   12078       482411 :       stmt_info = vect_orig_stmt (stmt_info);
   12079       482411 :       gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
   12080         5237 :       if (!stmt || gimple_bb (stmt) == NULL)
   12081       477218 :         continue;
   12082         5193 :       lhs = gimple_call_lhs (stmt);
   12083         5193 :       if (lhs)
   12084         4585 :         new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
   12085              :       else
   12086          608 :         new_stmt = gimple_build_nop ();
   12087         5193 :       unlink_stmt_vdef (stmt_info->stmt);
   12088         5193 :       gsi = gsi_for_stmt (stmt);
   12089         5193 :       vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
   12090         5193 :       if (lhs)
   12091         4585 :         SSA_NAME_DEF_STMT (lhs) = new_stmt;
   12092              :     }
   12093              : }
   12094              : 
   12095              : static void
   12096        90085 : vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
   12097              : {
                          /* Start the recursive walk from NODE with a fresh, empty visited
                             set.  */
   12098        90085 :   hash_set<slp_tree> visited;
   12099        90085 :   vect_remove_slp_scalar_calls (vinfo, node, visited);
   12100        90085 : }
   12101              : 
   12102              : /* Vectorize the instance root.
                      :
                      :    NODE supplies the vector defs and INSTANCE the root stmt(s) and the
                      :    instance kind, which selects what is done:
                      :      - slp_inst_kind_ctor: the first root stmt is replaced by an
                      :        assignment to its lhs of either the single vector def of NODE
                      :        (wrapped in a VIEW_CONVERT_EXPR if the types require a
                      :        conversion) or a CONSTRUCTOR built from all vector defs;
                      :      - slp_inst_kind_bb_reduc: an epilogue sequence is built that
                      :        combines the vector defs, reduces them to a scalar with the
                      :        internal function for the reduction code, folds in the
                      :        instance's remaining scalar defs, and becomes the new rhs of the
                      :        first root stmt;
                      :      - slp_inst_kind_gcond: the single root stmt is handed to
                      :        vectorizable_early_exit, which is asserted to succeed.
                      :    Any other kind is unreachable.  */
   12103              : 
   12104              : void
   12105        10978 : vectorize_slp_instance_root_stmt (vec_info *vinfo, slp_tree node, slp_instance instance)
   12106              : {
   12107        10978 :   gassign *rstmt = NULL;
   12108              : 
   12109        10978 :   if (instance->kind == slp_inst_kind_ctor)
   12110              :     {
   12111         5295 :       if (SLP_TREE_VEC_DEFS (node).length () == 1)
   12112              :         {
   12113         5256 :           tree vect_lhs = SLP_TREE_VEC_DEFS (node)[0];
   12114         5256 :           tree root_lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
   12115         5256 :           if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
   12116         5256 :                                           TREE_TYPE (vect_lhs)))
   12117            0 :             vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
   12118              :                                vect_lhs);
   12119         5256 :           rstmt = gimple_build_assign (root_lhs, vect_lhs);
   12120              :         }
   12121              :       else
   12122              :         {
   12123           39 :           gcc_assert (SLP_TREE_VEC_DEFS (node).length () > 1);
   12124           39 :           tree child_def;
   12125           39 :           int j;
   12126           39 :           vec<constructor_elt, va_gc> *v;
   12127           39 :           vec_alloc (v, SLP_TREE_VEC_DEFS (node).length ());
   12128              : 
   12129              :           /* A CTOR can handle V16HI composition from VNx8HI so we
   12130              :              do not need to convert vector elements if the types
   12131              :              do not match.  */
   12132          117 :           FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (node), j, child_def)
   12133           78 :             CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, child_def);
   12134           39 :           tree lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
   12135           39 :           tree rtype
   12136           39 :             = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmts[0]->stmt));
   12137           39 :           tree r_constructor = build_constructor (rtype, v);
   12138           39 :           rstmt = gimple_build_assign (lhs, r_constructor);
   12139              :         }
   12140              :     }
   12141         5683 :   else if (instance->kind == slp_inst_kind_bb_reduc)
   12142              :     {
   12143              :       /* Largely inspired by reduction chain epilogue handling in
   12144              :          vect_create_epilog_for_reduction.  */
   12145         4113 :       vec<tree> vec_defs = vNULL;
   12146         4113 :       vect_get_slp_defs (node, &vec_defs);
   12147         4113 :       enum tree_code reduc_code
   12148         4113 :         = gimple_assign_rhs_code (instance->root_stmts[0]->stmt);
   12149              :       /* ???  We actually have to reflect signs somewhere.  */
   12150         4113 :       if (reduc_code == MINUS_EXPR)
   12151            0 :         reduc_code = PLUS_EXPR;
   12152         4113 :       gimple_seq epilogue = NULL;
   12153              :       /* We may end up with more than one vector result, reduce them
   12154              :          to one vector.  */
   12155         4113 :       tree vec_def = vec_defs[0];
   12156         4113 :       tree vectype = TREE_TYPE (vec_def);
   12157         4113 :       tree compute_vectype = vectype;
   12158         4113 :       bool pun_for_overflow_p = (ANY_INTEGRAL_TYPE_P (vectype)
   12159         3918 :                                  && TYPE_OVERFLOW_UNDEFINED (vectype)
   12160         6874 :                                  && operation_can_overflow (reduc_code));
   12161         2618 :       if (pun_for_overflow_p)
   12162              :         {
   12163         2618 :           compute_vectype = unsigned_type_for (vectype);
   12164         2618 :           vec_def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
   12165              :                                   compute_vectype, vec_def);
   12166              :         }
   12167         6491 :       for (unsigned i = 1; i < vec_defs.length (); ++i)
   12168              :         {
   12169         2378 :           tree def = vec_defs[i];
   12170         2378 :           if (pun_for_overflow_p)
   12171         2275 :             def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
   12172              :                                 compute_vectype, def);
   12173         2378 :           vec_def = gimple_build (&epilogue, reduc_code, compute_vectype,
   12174              :                                   vec_def, def);
   12175              :         }
   12176         4113 :       vec_defs.release ();
   12177              :       /* ???  Support other schemes than direct internal fn.  */
   12178         4113 :       internal_fn reduc_fn;
   12179         4113 :       if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
   12180         4113 :           || reduc_fn == IFN_LAST)
   12181            0 :         gcc_unreachable ();
   12182         4113 :       tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
   12183         4113 :                                       TREE_TYPE (compute_vectype), vec_def);
   12184         4113 :       if (!SLP_INSTANCE_REMAIN_DEFS (instance).is_empty ())
   12185              :         {
   12186         2557 :           tree rem_def = NULL_TREE;
   12187        11891 :           for (auto def : SLP_INSTANCE_REMAIN_DEFS (instance))
   12188              :             {
   12189         9334 :               def = gimple_convert (&epilogue, TREE_TYPE (scalar_def), def);
   12190         9334 :               if (!rem_def)
   12191              :                 rem_def = def;
   12192              :               else
   12193         6777 :                 rem_def = gimple_build (&epilogue, reduc_code,
   12194         6777 :                                         TREE_TYPE (scalar_def),
   12195              :                                         rem_def, def);
   12196              :             }
   12197         2557 :           scalar_def = gimple_build (&epilogue, reduc_code,
   12198         2557 :                                      TREE_TYPE (scalar_def),
   12199              :                                      scalar_def, rem_def);
   12200              :         }
   12201         4113 :       scalar_def = gimple_convert (&epilogue,
   12202         4113 :                                    TREE_TYPE (vectype), scalar_def);
   12203         4113 :       gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
   12204         4113 :       gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT);
   12205         4113 :       gimple_assign_set_rhs_from_tree (&rgsi, scalar_def);
   12206         4113 :       update_stmt (gsi_stmt (rgsi));
   12207         4113 :       return;
   12208              :     }
   12209         1570 :   else if (instance->kind == slp_inst_kind_gcond)
   12210              :     {
   12211              :       /* Only support a single root for now as we can't codegen CFG yet and so we
   12212              :          can't support lane > 1 at this time.  */
   12213         1570 :       gcc_assert (instance->root_stmts.length () == 1);
   12214         1570 :       auto root_stmt_info = instance->root_stmts[0];
   12215         1570 :       auto last_stmt = STMT_VINFO_STMT (vect_orig_stmt (root_stmt_info));
   12216         1570 :       gimple_stmt_iterator rgsi = gsi_for_stmt (last_stmt);
   12217         1570 :       gcc_assert (!SLP_TREE_VEC_DEFS (node).is_empty ());
   12218         1570 :       bool res = vectorizable_early_exit (as_a <loop_vec_info> (vinfo),
   12219              :                                           root_stmt_info, &rgsi, node, NULL);
   12220         1570 :       gcc_assert (res);
   12221         1570 :       return;
   12222              :     }
   12223              :   else
   12224            0 :     gcc_unreachable ();
   12225              : 
   12226         5295 :   gcc_assert (rstmt);
   12227              : 
   12228         5295 :   gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
   12229         5295 :   gsi_replace (&rgsi, rstmt, true);
   12230              : }
   12231              : 
   12232              : struct slp_scc_info  /* Per-node state of the SCC walk in vect_schedule_scc.  */
   12233              : {
   12234              :   bool on_stack;  /* True while the node sits on the walk's stack.  */
   12235              :   int dfs;  /* DFS number handed out when the node is first visited.  */
   12236              :   int lowlink;  /* Minimum of DFS and the lowlink/DFS values merged from children.  */
   12237              : };
   12238              : 
   12239              : /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs.
                      :    NODE is the SLP node visited by this invocation.  SCC_INFO maps each
                      :    visited node to its DFS number, lowlink and on-stack flag, MAXDFS is
                      :    the next DFS number to hand out and STACK holds the internal-def
                      :    nodes that were visited but not yet scheduled.  Nodes that are not
                      :    internal defs are scheduled immediately; the others are scheduled
                      :    once the SCC they belong to is complete, after which the backedge
                      :    arguments of the vectorized PHIs of that SCC are filled in.  */
   12240              : 
   12241              : static void
   12242      1471918 : vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
   12243              :                    hash_map<slp_tree, slp_scc_info> &scc_info,
   12244              :                    int &maxdfs, vec<slp_tree> &stack)
   12245              : {
   12246      1471918 :   bool existed_p;
   12247      1471918 :   slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
   12248      1471918 :   gcc_assert (!existed_p);
   12249      1471918 :   info->dfs = maxdfs;
   12250      1471918 :   info->lowlink = maxdfs;
   12251      1471918 :   maxdfs++;
   12252              : 
   12253              :   /* Leaf.  Nodes that are not internal defs are scheduled right away and
                      :      never enter the stack.  */
   12254      1471918 :   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
   12255              :     {
   12256       498121 :       info->on_stack = false;
   12257       498121 :       vect_schedule_slp_node (vinfo, node, instance);
   12258      1028005 :       return;
   12259              :     }
   12260              : 
   12261       973797 :   info->on_stack = true;
   12262       973797 :   stack.safe_push (node);
   12263              : 
   12264       973797 :   unsigned i;
   12265       973797 :   slp_tree child;
   12266              :   /* DFS recurse, skipping NULL children.  Unvisited children are walked
                      :      and their lowlink is merged; children that are still on the stack
                      :      contribute their DFS number instead.  */
   12267      2009345 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
   12268              :     {
   12269      1035548 :       if (!child)
   12270        55377 :         continue;
   12271       980171 :       slp_scc_info *child_info = scc_info.get (child);
   12272       980171 :       if (!child_info)
   12273              :         {
   12274       889908 :           vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
   12275              :           /* Recursion might have re-allocated the node, so fetch both
                      :              entries from SCC_INFO again before using them.  */
   12276       889908 :           info = scc_info.get (node);
   12277       889908 :           child_info = scc_info.get (child);
   12278       889908 :           info->lowlink = MIN (info->lowlink, child_info->lowlink);
   12279              :         }
   12280        90263 :       else if (child_info->on_stack)
   12281        25529 :         info->lowlink = MIN (info->lowlink, child_info->dfs);
   12282              :     }
   12283       973797 :   if (info->lowlink != info->dfs)
   12284              :     return;  /* NODE stays on STACK and is scheduled by a caller.  */
   12285              : 
   12286       942034 :   auto_vec<slp_tree, 4> phis_to_fixup;
   12287              : 
   12288              :   /* Singleton.  NODE is on top of the stack, so it is scheduled alone.  */
   12289       942034 :   if (stack.last () == node)
   12290              :     {
   12291       918179 :       stack.pop ();
   12292       918179 :       info->on_stack = false;
   12293       918179 :       vect_schedule_slp_node (vinfo, node, instance);
   12294       918179 :       if (!SLP_TREE_PERMUTE_P (node)
   12295       918179 :           && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
   12296        30458 :         phis_to_fixup.quick_push (node);
   12297              :     }
   12298              :   else
   12299              :     {
   12300              :       /* SCC.  Locate NODE on the stack; the entries from there up to the
                      :          top are the ones scheduled and popped below.  */
   12301        23855 :       int last_idx = stack.length () - 1;
   12302        55618 :       while (stack[last_idx] != node)
   12303        31763 :         last_idx--;
   12304              :       /* We can break the cycle at PHIs which have at least one child
   12305              :          code generated.  Then we could re-start the DFS walk until
   12306              :          all nodes in the SCC are covered (we might have new entries
   12307              :          for only back-reachable nodes).  But it's simpler to just
   12308              :          iterate and schedule those that are ready.  A PHI is ready
                      :          once one of its children is NULL or no longer on the stack;
                      :          any other node is ready once none of its children is still
                      :          on the stack.  TODO counts the entries left to schedule.  */
   12309        23855 :       unsigned todo = stack.length () - last_idx;
   12310        24194 :       do
   12311              :         {
   12312       105790 :           for (int idx = stack.length () - 1; idx >= last_idx; --idx)
   12313              :             {
   12314        57402 :               slp_tree entry = stack[idx];
   12315        57402 :               if (!entry)  /* Already scheduled in an earlier iteration.  */
   12316          956 :                 continue;
   12317        56446 :               bool phi = (!SLP_TREE_PERMUTE_P (entry)
   12318        56446 :                           && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
   12319        56446 :               bool ready = !phi;
   12320       142866 :               FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
   12321       111519 :                   if (!child)
   12322              :                     {
   12323        22979 :                       gcc_assert (phi);
   12324              :                       ready = true;
   12325              :                       break;
   12326              :                     }
   12327        88540 :                   else if (scc_info.get (child)->on_stack)
   12328              :                     {
   12329        24055 :                       if (!phi)
   12330              :                         {
   12331              :                           ready = false;
   12332              :                           break;
   12333              :                         }
   12334              :                     }
   12335              :                   else
   12336              :                     {
   12337        64485 :                       if (phi)
   12338              :                         {
   12339              :                           ready = true;
   12340              :                           break;
   12341              :                         }
   12342              :                     }
   12343        33467 :               if (ready)
   12344              :                 {
   12345        55618 :                   vect_schedule_slp_node (vinfo, entry, instance);
   12346        55618 :                   scc_info.get (entry)->on_stack = false;
   12347        55618 :                   stack[idx] = NULL;
   12348        55618 :                   todo--;
   12349        55618 :                   if (phi)
   12350        24301 :                     phis_to_fixup.safe_push (entry);
   12351              :                 }
   12352              :             }
   12353              :         }
   12354        24194 :       while (todo != 0);
   12355              : 
   12356              :       /* Pop the SCC.  */
   12357        23855 :       stack.truncate (last_idx);
   12358              :     }
   12359              : 
   12360              :   /* Now fix up the backedge def of the vectorized PHIs in this SCC.  Only
                      :      PHI arguments whose child is an internal def are added here.  */
   12361              :   slp_tree phi_node;
   12362      1938827 :   FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
   12363              :     {
   12364        54759 :       gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
   12365        54759 :       edge_iterator ei;
   12366        54759 :       edge e;
   12367       172943 :       FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
   12368              :         {
   12369       118184 :           unsigned dest_idx = e->dest_idx;
   12370       118184 :           child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
   12371       118184 :           if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
   12372        66423 :             continue;
   12373        51761 :           unsigned n = SLP_TREE_VEC_DEFS (phi_node).length ();
   12374              :           /* Simply fill all args.  */
   12375        51761 :           if (STMT_VINFO_DEF_TYPE (SLP_TREE_REPRESENTATIVE (phi_node))
   12376              :               != vect_first_order_recurrence)
   12377       111276 :             for (unsigned i = 0; i < n; ++i)
   12378              :               {
   12379        59560 :                 tree phidef = SLP_TREE_VEC_DEFS (phi_node)[i];
   12380        59560 :                 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
   12381        59560 :                 add_phi_arg (phi, vect_get_slp_vect_def (child, i),
   12382              :                              e, gimple_phi_arg_location (phi, dest_idx));
   12383              :               }
   12384              :           else
   12385              :             {
   12386              :               /* Unless it is a first order recurrence which needs
   12387              :                  args filled in for both the PHI node and the permutes.
                      :                  The PHI feeding the first permute receives the last
                      :                  vector def of CHILD; each permute gets def I as its
                      :                  second operand and, except for the first one, def
                      :                  I - 1 as its first operand.  */
   12388           45 :               gimple *perm
   12389           45 :                 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[0]);
   12390           45 :               gimple *rphi = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (perm));
   12391           45 :               add_phi_arg (as_a <gphi *> (rphi),
   12392              :                            vect_get_slp_vect_def (child, n - 1),
   12393              :                            e, gimple_phi_arg_location (phi, dest_idx));
   12394          127 :               for (unsigned i = 0; i < n; ++i)
   12395              :                 {
   12396           82 :                   gimple *perm
   12397           82 :                     = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[i]);
   12398           82 :                   if (i > 0)
   12399           37 :                     gimple_assign_set_rhs1 (perm,
   12400              :                                             vect_get_slp_vect_def (child, i - 1));
   12401           82 :                   gimple_assign_set_rhs2 (perm,
   12402              :                                           vect_get_slp_vect_def (child, i));
   12403           82 :                   update_stmt (perm);
   12404              :                 }
   12405              :             }
   12406              :         }
   12407              :     }
   12408       942034 : }
   12409              : 
   12410              : /* Generate vector code for SLP_INSTANCES in the loop/basic block.
                      :    The tree of each instance is scheduled through vect_schedule_scc,
                      :    with one SCC_INFO map and DFS counter shared by all instances so
                      :    that nodes already visited for an earlier instance are not walked
                      :    again; instance root stmts, if any, are vectorized right after
                      :    their tree.  A second walk over the instances then removes scalar
                      :    calls (loop vectorization only) and the original scalar stmts of
                      :    the stores that were vectorized.  */
   12411              : 
   12412              : void
   12413       542685 : vect_schedule_slp (vec_info *vinfo, const vec<slp_instance> &slp_instances)
   12414              : {
   12415       542685 :   slp_instance instance;
   12416       542685 :   unsigned int i;
   12417              : 
   12418       542685 :   hash_map<slp_tree, slp_scc_info> scc_info;
   12419       542685 :   int maxdfs = 0;
   12420      1124806 :   FOR_EACH_VEC_ELT (slp_instances, i, instance)
   12421              :     {
   12422       582121 :       slp_tree node = SLP_INSTANCE_TREE (instance);
   12423       582121 :       if (dump_enabled_p ())
   12424              :         {
   12425        16071 :           dump_printf_loc (MSG_NOTE, vect_location,
   12426              :                            "Vectorizing SLP tree:\n");
   12427              :           /* ???  Dump all?  Only the first root stmt is printed.  */
   12428        16071 :           if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
   12429          467 :             dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
   12430          467 :                          SLP_INSTANCE_ROOT_STMTS (instance)[0]->stmt);
   12431        16071 :           vect_print_slp_graph (MSG_NOTE, vect_location,
   12432              :                                 SLP_INSTANCE_TREE (instance));
   12433              :         }
   12434              :       /* Schedule the tree of INSTANCE, scheduling SCCs in a way to
   12435              :          have a PHI be the node breaking the cycle.  The walk is skipped
                      :          when the root node already has an entry in SCC_INFO.  */
   12436       582121 :       auto_vec<slp_tree> stack;
   12437       582121 :       if (!scc_info.get (node))
   12438       582010 :         vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
   12439              : 
   12440       582121 :       if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
   12441        10978 :         vectorize_slp_instance_root_stmt (vinfo, node, instance);
   12442              : 
   12443       582121 :       if (dump_enabled_p ())
   12444        16071 :         dump_printf_loc (MSG_NOTE, vect_location,
   12445              :                          "vectorizing stmts using SLP.\n");
   12446       582121 :     }
   12447              : 
   12448      1667491 :   FOR_EACH_VEC_ELT (slp_instances, i, instance)
   12449              :     {
   12450       582121 :       slp_tree root = SLP_INSTANCE_TREE (instance);
   12451       582121 :       stmt_vec_info store_info;
   12452       582121 :       unsigned int j;
   12453              : 
   12454              :       /* Remove scalar call stmts.  Do not do this for basic-block
   12455              :          vectorization as not all uses may be vectorized.
   12456              :          ???  Why should this be necessary?  DCE should be able to
   12457              :          remove the stmts itself.
   12458              :          ???  For BB vectorization we can as well remove scalar
   12459              :          stmts starting from the SLP tree root if they have no
   12460              :          uses.  */
   12461       582121 :       if (is_a <loop_vec_info> (vinfo))
   12462        90085 :         vect_remove_slp_scalar_calls (vinfo, root);
   12463              : 
   12464              :       /* Remove the original scalar stmts of vectorized stores.  The walk
                      :          over the root's scalar stmts stops at the first entry that is
                      :          NULL, has no data reference or is not a write.  */
   12465      2598092 :       for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
   12466              :         {
   12467      1470249 :           if (!store_info
   12468      1470235 :               || !STMT_VINFO_DATA_REF (store_info)
   12469      1442514 :               || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
   12470              :             break;
   12471              : 
   12472      1433850 :           store_info = vect_orig_stmt (store_info);
   12473              :           /* Free the attached stmt_vec_info and remove the stmt.  */
   12474      1433850 :           vinfo->remove_stmt (store_info);
   12475              : 
   12476              :           /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it
   12477              :              to not crash in vect_free_slp_tree later.  */
   12478      1433850 :           if (SLP_TREE_REPRESENTATIVE (root) == store_info)
   12479       545391 :             SLP_TREE_REPRESENTATIVE (root) = NULL;
   12480              :         }
   12481              :     }
   12482       542685 : }
        

Generated by: LCOV version 2.4-beta

The LCOV profile is generated on an x86_64 machine using the following configure options: configure --disable-bootstrap --enable-coverage=opt --enable-languages=c,c++,fortran,go,jit,lto,rust,m2 --enable-host-shared. The GCC test suite is run with the built compiler.