LCOV - code coverage report
Current view: top level - gcc - tree-vect-slp.cc (source / functions) Coverage Total Hit
Test: gcc.info Lines: 92.4 % 5904 5455
Test Date: 2026-05-11 19:44:49 Functions: 95.0 % 181 172
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /* SLP - Basic Block Vectorization
       2              :    Copyright (C) 2007-2026 Free Software Foundation, Inc.
       3              :    Contributed by Dorit Naishlos <dorit@il.ibm.com>
       4              :    and Ira Rosen <irar@il.ibm.com>
       5              : 
       6              : This file is part of GCC.
       7              : 
       8              : GCC is free software; you can redistribute it and/or modify it under
       9              : the terms of the GNU General Public License as published by the Free
      10              : Software Foundation; either version 3, or (at your option) any later
      11              : version.
      12              : 
      13              : GCC is distributed in the hope that it will be useful, but WITHOUT ANY
      14              : WARRANTY; without even the implied warranty of MERCHANTABILITY or
      15              : FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
      16              : for more details.
      17              : 
      18              : You should have received a copy of the GNU General Public License
      19              : along with GCC; see the file COPYING3.  If not see
      20              : <http://www.gnu.org/licenses/>.  */
      21              : 
      22              : #include "config.h"
      23              : #define INCLUDE_ALGORITHM
      24              : #include "system.h"
      25              : #include "coretypes.h"
      26              : #include "backend.h"
      27              : #include "target.h"
      28              : #include "rtl.h"
      29              : #include "tree.h"
      30              : #include "gimple.h"
      31              : #include "tree-pass.h"
      32              : #include "ssa.h"
      33              : #include "optabs-tree.h"
      34              : #include "insn-config.h"
      35              : #include "recog.h"            /* FIXME: for insn_data */
      36              : #include "fold-const.h"
      37              : #include "stor-layout.h"
      38              : #include "gimple-iterator.h"
      39              : #include "cfgloop.h"
      40              : #include "tree-vectorizer.h"
      41              : #include "langhooks.h"
      42              : #include "gimple-walk.h"
      43              : #include "dbgcnt.h"
      44              : #include "tree-vector-builder.h"
      45              : #include "vec-perm-indices.h"
      46              : #include "gimple-fold.h"
      47              : #include "internal-fn.h"
      48              : #include "dump-context.h"
      49              : #include "cfganal.h"
      50              : #include "tree-eh.h"
      51              : #include "tree-cfg.h"
      52              : #include "alloc-pool.h"
      53              : #include "sreal.h"
      54              : #include "predict.h"
      55              : 
      56              : #define REDUC_GROUP_FIRST_ELEMENT(S) \
      57              :   (gcc_checking_assert (!(S)->dr_aux.dr), (S)->first_element)
      58              : 
      59              : static bool vect_transform_slp_perm_load_1 (vec_info *, slp_tree,
      60              :                                             load_permutation_t &,
      61              :                                             const vec<tree> &,
      62              :                                             gimple_stmt_iterator *,
      63              :                                             poly_uint64, bool, bool,
      64              :                                             unsigned *,
      65              :                                             unsigned * = nullptr,
      66              :                                             bool = false);
      67              : static int vectorizable_slp_permutation_1 (vec_info *, gimple_stmt_iterator *,
      68              :                                            slp_tree, lane_permutation_t &,
      69              :                                            vec<slp_tree> &, bool);
      70              : static void vect_print_slp_tree (dump_flags_t, dump_location_t, slp_tree);
      71              : static bool vect_slp_can_convert_to_external (const vec<stmt_vec_info> &);
      72              : 
      73              : static object_allocator<_slp_tree> *slp_tree_pool;
      74              : static slp_tree slp_first_node;
      75              : 
      76              : void
      77      1113436 : vect_slp_init (void)
      78              : {
      79      1113436 :   slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
      80      1113436 : }
      81              : 
      82              : void
      83      1113436 : vect_slp_fini (void)
      84              : {
      85      1774056 :   while (slp_first_node)
      86       660620 :     delete slp_first_node;
      87      2226872 :   delete slp_tree_pool;
      88      1113436 :   slp_tree_pool = NULL;
      89      1113436 : }
      90              : 
      91              : void *
      92      7657062 : _slp_tree::operator new (size_t n)
      93              : {
      94      7657062 :   gcc_assert (n == sizeof (_slp_tree));
      95      7657062 :   return slp_tree_pool->allocate_raw ();
      96              : }
      97              : 
      98              : void
      99      7657062 : _slp_tree::operator delete (void *node, size_t n)
     100              : {
     101      7657062 :   gcc_assert (n == sizeof (_slp_tree));
     102      7657062 :   slp_tree_pool->remove_raw (node);
     103      7657062 : }
     104              : 
     105              : 
     106              : /* Initialize a SLP node.  */
     107              : 
     108      7657062 : _slp_tree::_slp_tree ()
     109              : {
     110      7657062 :   this->prev_node = NULL;
     111      7657062 :   if (slp_first_node)
     112      6701228 :     slp_first_node->prev_node = this;
     113      7657062 :   this->next_node = slp_first_node;
     114      7657062 :   slp_first_node = this;
     115      7657062 :   SLP_TREE_SCALAR_STMTS (this) = vNULL;
     116      7657062 :   SLP_TREE_SCALAR_OPS (this) = vNULL;
     117      7657062 :   SLP_TREE_LIVE_LANES (this) = vNULL;
     118      7657062 :   SLP_TREE_VEC_DEFS (this) = vNULL;
     119      7657062 :   SLP_TREE_CHILDREN (this) = vNULL;
     120      7657062 :   SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
     121      7657062 :   SLP_TREE_LANE_PERMUTATION (this) = vNULL;
     122      7657062 :   SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
     123      7657062 :   SLP_TREE_CODE (this) = ERROR_MARK;
     124      7657062 :   SLP_TREE_GS_SCALE (this) = 0;
     125      7657062 :   SLP_TREE_GS_BASE (this) = NULL_TREE;
     126      7657062 :   this->ldst_lanes = false;
     127      7657062 :   this->avoid_stlf_fail = false;
     128      7657062 :   SLP_TREE_VECTYPE (this) = NULL_TREE;
     129      7657062 :   SLP_TREE_REPRESENTATIVE (this) = NULL;
     130      7657062 :   this->cycle_info.id = -1;
     131      7657062 :   this->cycle_info.reduc_idx = -1;
     132      7657062 :   SLP_TREE_REF_COUNT (this) = 1;
     133      7657062 :   this->failed = NULL;
     134      7657062 :   this->max_nunits = 1;
     135      7657062 :   this->lanes = 0;
     136      7657062 :   SLP_TREE_TYPE (this) = undef_vec_info_type;
     137      7657062 :   this->data = NULL;
     138      7657062 : }
     139              : 
     140              : /* Tear down a SLP node.  */
     141              : 
     142      7657062 : _slp_tree::~_slp_tree ()
     143              : {
     144      7657062 :   if (this->prev_node)
     145      4627808 :     this->prev_node->next_node = this->next_node;
     146              :   else
     147      3029254 :     slp_first_node = this->next_node;
     148      7657062 :   if (this->next_node)
     149      5779109 :     this->next_node->prev_node = this->prev_node;
     150      7657062 :   SLP_TREE_CHILDREN (this).release ();
     151      7657062 :   SLP_TREE_SCALAR_STMTS (this).release ();
     152      7657062 :   SLP_TREE_SCALAR_OPS (this).release ();
     153      7657062 :   SLP_TREE_LIVE_LANES (this).release ();
     154      7657062 :   SLP_TREE_VEC_DEFS (this).release ();
     155      7657062 :   SLP_TREE_LOAD_PERMUTATION (this).release ();
     156      7657062 :   SLP_TREE_LANE_PERMUTATION (this).release ();
     157      7657062 :   if (this->failed)
     158      1982177 :     free (failed);
     159      7657062 :   if (this->data)
     160      1234415 :     delete this->data;
     161      7657062 : }
     162              : 
     163              : /* Push the single SSA definition in DEF to the vector of vector defs.  */
     164              : 
     165              : void
     166       525123 : _slp_tree::push_vec_def (gimple *def)
     167              : {
     168       525123 :   if (gphi *phi = dyn_cast <gphi *> (def))
     169        58537 :     vec_defs.quick_push (gimple_phi_result (phi));
     170              :   else
     171              :     {
     172       466586 :       def_operand_p defop = single_ssa_def_operand (def, SSA_OP_ALL_DEFS);
     173       466586 :       vec_defs.quick_push (get_def_from_ptr (defop));
     174              :     }
     175       525123 : }
     176              : 
     177              : /* Recursively free the memory allocated for the SLP tree rooted at NODE.  */
     178              : 
     179              : void
     180     14557464 : vect_free_slp_tree (slp_tree node)
     181              : {
     182     14557464 :   int i;
     183     14557464 :   slp_tree child;
     184              : 
     185     14557464 :   if (--SLP_TREE_REF_COUNT (node) != 0)
     186     14557464 :     return;
     187              : 
     188     10884059 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
     189      3887617 :     if (child)
     190      3534081 :       vect_free_slp_tree (child);
     191              : 
     192      6996442 :   delete node;
     193              : }
     194              : 
     195              : /* Return a location suitable for dumpings related to the SLP instance.  */
     196              : 
     197              : dump_user_location_t
     198      3376322 : _slp_instance::location () const
     199              : {
     200      3376322 :   if (!root_stmts.is_empty ())
     201       316656 :     return root_stmts[0]->stmt;
     202              :   else
     203      3059666 :     return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
     204              : }
     205              : 
     206              : 
     207              : /* Free the memory allocated for the SLP instance.  */
     208              : 
     209              : void
     210      1542846 : vect_free_slp_instance (slp_instance instance)
     211              : {
     212      1542846 :   vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
     213      1542846 :   SLP_INSTANCE_LOADS (instance).release ();
     214      1542846 :   SLP_INSTANCE_ROOT_STMTS (instance).release ();
     215      1542846 :   SLP_INSTANCE_REMAIN_DEFS (instance).release ();
     216      1542846 :   instance->subgraph_entries.release ();
     217      1542846 :   instance->cost_vec.release ();
     218      1542846 :   free (instance);
     219      1542846 : }
     220              : 
     221              : 
     222              : /* Create a SLP node with NOPS children with CODE, either VEC_PERM_EXPR
     223              :    for a permute node or else ERROR_MARK.  */
     224              : 
     225              : slp_tree
     226        95058 : vect_create_new_slp_node (unsigned nops, tree_code code)
     227              : {
     228        95058 :   gcc_assert (code == ERROR_MARK || code == VEC_PERM_EXPR);
     229        95058 :   slp_tree node = new _slp_tree;
     230        95058 :   SLP_TREE_SCALAR_STMTS (node) = vNULL;
     231        95058 :   SLP_TREE_CHILDREN (node).create (nops);
     232        95058 :   SLP_TREE_DEF_TYPE (node) = vect_internal_def;
     233        95058 :   SLP_TREE_CODE (node) = code;
     234        95058 :   return node;
     235              : }
     236              : 
     237              : /* Create a SLP node inplace at NODE for SCALAR_STMTS and NOPS children.  */
     238              : 
     239              : static slp_tree
     240      3741791 : vect_create_new_slp_node (slp_tree node,
     241              :                           vec<stmt_vec_info> scalar_stmts, unsigned nops)
     242              : {
     243      3741791 :   SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
     244      3741791 :   SLP_TREE_CHILDREN (node).create (nops);
     245      3741791 :   SLP_TREE_DEF_TYPE (node) = vect_internal_def;
     246      3741791 :   SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
     247      3741791 :   SLP_TREE_LANES (node) = scalar_stmts.length ();
     248      3741791 :   return node;
     249              : }
     250              : 
     251              : /* Create an SLP node for SCALAR_STMTS and NOPS children.  */
     252              : 
     253              : static slp_tree
     254         7835 : vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
     255              : {
     256         7835 :   return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
     257              : }
     258              : 
     259              : /* Create a vect_external_def SLP node inplace at NODE for scalar
     260              :    operands OPS.  */
     261              : 
     262              : static slp_tree
     263      1827382 : vect_create_new_slp_node (slp_tree node, vec<tree> ops)
     264              : {
     265      1827382 :   SLP_TREE_SCALAR_OPS (node) = ops;
     266      1827382 :   SLP_TREE_DEF_TYPE (node) = vect_external_def;
     267            0 :   SLP_TREE_LANES (node) = ops.length ();
     268      1827382 :   return node;
     269              : }
     270              : 
     271              : /* Create a vect_external_def SLP node for scalar operands OPS.  */
     272              : 
     273              : static slp_tree
     274      1827382 : vect_create_new_slp_node (vec<tree> ops)
     275              : {
     276      1827382 :   return vect_create_new_slp_node (new _slp_tree, ops);
     277              : }
     278              : 
     279              : 
     280              : /* This structure is used in creation of an SLP tree.  Each instance
     281              :    corresponds to the same operand in a group of scalar stmts in an SLP
     282              :    node.  */
     283              : typedef struct _slp_oprnd_info
     284              : {
     285              :   /* Def-stmts for the operands.  */
     286              :   vec<stmt_vec_info> def_stmts;
     287              :   /* Operands.  */
     288              :   vec<tree> ops;
     289              :   /* Information about the first statement, its vector def-type, type, the
     290              :      operand itself in case it's constant, and an indication if it's a pattern
     291              :      stmt and gather/scatter info.  */
     292              :   tree first_op_type;
     293              :   enum vect_def_type first_dt;
     294              :   bool any_pattern;
     295              :   bool first_gs_p;
     296              :   gather_scatter_info first_gs_info;
     297              : } *slp_oprnd_info;
     298              : 
     299              : 
     300              : /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
     301              :    operand.  */
     302              : static vec<slp_oprnd_info>
     303      3313099 : vect_create_oprnd_info (int nops, int group_size)
     304              : {
     305      3313099 :   int i;
     306      3313099 :   slp_oprnd_info oprnd_info;
     307      3313099 :   vec<slp_oprnd_info> oprnds_info;
     308              : 
     309      3313099 :   oprnds_info.create (nops);
     310     11887184 :   for (i = 0; i < nops; i++)
     311              :     {
     312      5260986 :       oprnd_info = XNEW (struct _slp_oprnd_info);
     313      5260986 :       oprnd_info->def_stmts.create (group_size);
     314      5260986 :       oprnd_info->ops.create (group_size);
     315      5260986 :       oprnd_info->first_dt = vect_uninitialized_def;
     316      5260986 :       oprnd_info->first_op_type = NULL_TREE;
     317      5260986 :       oprnd_info->any_pattern = false;
     318      5260986 :       oprnd_info->first_gs_p = false;
     319      5260986 :       oprnds_info.quick_push (oprnd_info);
     320              :     }
     321              : 
     322      3313099 :   return oprnds_info;
     323              : }
     324              : 
     325              : 
     326              : /* Free operands info.  */
     327              : 
     328              : static void
     329      3313099 : vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
     330              : {
     331      3313099 :   int i;
     332      3313099 :   slp_oprnd_info oprnd_info;
     333              : 
     334      8574085 :   FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
     335              :     {
     336      5260986 :       oprnd_info->def_stmts.release ();
     337      5260986 :       oprnd_info->ops.release ();
     338      5260986 :       XDELETE (oprnd_info);
     339              :     }
     340              : 
     341      3313099 :   oprnds_info.release ();
     342      3313099 : }
     343              : 
     344              : /* Return the execution frequency of NODE (so that a higher value indicates
     345              :    a "more important" node when optimizing for speed).  */
     346              : 
     347              : static sreal
     348      3467835 : vect_slp_node_weight (slp_tree node)
     349              : {
     350      3467835 :   stmt_vec_info stmt_info = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (node));
     351      3467835 :   basic_block bb = gimple_bb (stmt_info->stmt);
     352      3467835 :   return bb->count.to_sreal_scale (ENTRY_BLOCK_PTR_FOR_FN (cfun)->count);
     353              : }
     354              : 
     355              : /* Return true if STMTS contains a pattern statement.  */
     356              : 
     357              : static bool
     358        22190 : vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
     359              : {
     360        22190 :   stmt_vec_info stmt_info;
     361        22190 :   unsigned int i;
     362        71916 :   FOR_EACH_VEC_ELT (stmts, i, stmt_info)
     363        51903 :     if (stmt_info && is_pattern_stmt_p (stmt_info))
     364              :       return true;
     365              :   return false;
     366              : }
     367              : 
     368              : /* Return true when all lanes in the external or constant NODE have
     369              :    the same value.  */
     370              : 
     371              : static bool
     372       589312 : vect_slp_tree_uniform_p (slp_tree node)
     373              : {
     374       589312 :   gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
     375              :               || SLP_TREE_DEF_TYPE (node) == vect_external_def);
     376              : 
     377              :   /* Pre-existing vectors.  */
     378      1037324 :   if (SLP_TREE_SCALAR_OPS (node).is_empty ())
     379              :     return false;
     380              : 
     381              :   unsigned i;
     382              :   tree op, first = NULL_TREE;
     383      1349759 :   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
     384      1208459 :     if (!first)
     385              :       first = op;
     386       619147 :     else if (!operand_equal_p (first, op, 0))
     387              :       return false;
     388              : 
     389              :   return true;
     390              : }
     391              : 
     392              : /* Find the place of the data-ref in STMT_INFO in the interleaving chain
     393              :    that starts from FIRST_STMT_INFO.  Return -1 if the data-ref is not a part
     394              :    of the chain.  */
     395              : 
     396              : int
     397       698766 : vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
     398              :                                       stmt_vec_info first_stmt_info)
     399              : {
     400       698766 :   stmt_vec_info next_stmt_info = first_stmt_info;
     401       698766 :   int result = 0;
     402              : 
     403       698766 :   if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
     404              :     return -1;
     405              : 
     406      1747627 :   do
     407              :     {
     408      1747627 :       if (next_stmt_info == stmt_info)
     409              :         return result;
     410      1048861 :       next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
     411      1048861 :       if (next_stmt_info)
     412      1048861 :         result += DR_GROUP_GAP (next_stmt_info);
     413              :     }
     414      1048861 :   while (next_stmt_info);
     415              : 
     416              :   return -1;
     417              : }
     418              : 
     419              : /* Check whether it is possible to load COUNT elements of type ELT_TYPE
     420              :    using the method implemented by duplicate_and_interleave.  Return true
     421              :    if so, returning the number of intermediate vectors in *NVECTORS_OUT
     422              :    (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
     423              :    (if nonnull).  */
     424              : 
     425              : bool
     426            0 : can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
     427              :                                 tree elt_type, unsigned int *nvectors_out,
     428              :                                 tree *vector_type_out,
     429              :                                 tree *permutes)
     430              : {
     431            0 :   tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
     432            0 :   if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
     433            0 :     return false;
     434              : 
     435            0 :   machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
     436            0 :   poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
     437            0 :   unsigned int nvectors = 1;
     438            0 :   for (;;)
     439              :     {
     440            0 :       scalar_int_mode int_mode;
     441            0 :       poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
     442            0 :       if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
     443              :         {
     444              :           /* Get the natural vector type for this SLP group size.  */
     445            0 :           tree int_type = build_nonstandard_integer_type
     446            0 :             (GET_MODE_BITSIZE (int_mode), 1);
     447            0 :           tree vector_type
     448            0 :             = get_vectype_for_scalar_type (vinfo, int_type, count);
     449            0 :           poly_int64 half_nelts;
     450            0 :           if (vector_type
     451            0 :               && VECTOR_MODE_P (TYPE_MODE (vector_type))
     452            0 :               && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
     453              :                            GET_MODE_SIZE (base_vector_mode))
     454            0 :               && multiple_p (GET_MODE_NUNITS (TYPE_MODE (vector_type)),
     455              :                              2, &half_nelts))
     456              :             {
     457              :               /* Try fusing consecutive sequences of COUNT / NVECTORS elements
     458              :                  together into elements of type INT_TYPE and using the result
     459              :                  to build NVECTORS vectors.  */
     460            0 :               poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
     461            0 :               vec_perm_builder sel1 (nelts, 2, 3);
     462            0 :               vec_perm_builder sel2 (nelts, 2, 3);
     463              : 
     464            0 :               for (unsigned int i = 0; i < 3; ++i)
     465              :                 {
     466            0 :                   sel1.quick_push (i);
     467            0 :                   sel1.quick_push (i + nelts);
     468            0 :                   sel2.quick_push (half_nelts + i);
     469            0 :                   sel2.quick_push (half_nelts + i + nelts);
     470              :                 }
     471            0 :               vec_perm_indices indices1 (sel1, 2, nelts);
     472            0 :               vec_perm_indices indices2 (sel2, 2, nelts);
     473            0 :               machine_mode vmode = TYPE_MODE (vector_type);
     474            0 :               if (can_vec_perm_const_p (vmode, vmode, indices1)
     475            0 :                   && can_vec_perm_const_p (vmode, vmode, indices2))
     476              :                 {
     477            0 :                   if (nvectors_out)
     478            0 :                     *nvectors_out = nvectors;
     479            0 :                   if (vector_type_out)
     480            0 :                     *vector_type_out = vector_type;
     481            0 :                   if (permutes)
     482              :                     {
     483            0 :                       permutes[0] = vect_gen_perm_mask_checked (vector_type,
     484              :                                                                 indices1);
     485            0 :                       permutes[1] = vect_gen_perm_mask_checked (vector_type,
     486              :                                                                 indices2);
     487              :                     }
     488            0 :                   return true;
     489              :                 }
     490            0 :             }
     491              :         }
     492            0 :       if (!multiple_p (elt_bytes, 2, &elt_bytes))
     493              :         return false;
     494            0 :       nvectors *= 2;
     495              :       /* We need to be able to fuse COUNT / NVECTORS elements together.  */
     496            0 :       if (!multiple_p (count, nvectors))
     497              :         return false;
     498              :     }
     499              : }
     500              : 
     501              : /* Return true if DTA and DTB match.  */
     502              : 
     503              : static bool
     504     16983229 : vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
     505              : {
     506     16983229 :   return (dta == dtb
     507       347959 :           || ((dta == vect_external_def || dta == vect_constant_def)
     508       215860 :               && (dtb == vect_external_def || dtb == vect_constant_def)));
     509              : }
     510              : 
     511              : #define GATHER_SCATTER_OFFSET (-3)
     512              : 
     513              : /* For most SLP statements, there is a one-to-one mapping between
     514              :    gimple arguments and child nodes.  If that is not true for STMT,
     515              :    return an array that contains:
     516              : 
     517              :    - the number of child nodes, followed by
     518              :    - for each child node, the index of the argument associated with that node.
     519              :      The special index -1 is the first operand of an embedded comparison and
     520              :      the special index -2 is the second operand of an embedded comparison.
     521              :      The special index -3 is the offset of a gather as analyzed by
     522              :      vect_check_gather_scatter.
     523              : 
     524              :    SWAP is as for vect_get_and_check_slp_defs.  */
     525              : 
     526              : static const int *
     527     24140575 : vect_get_operand_map (const gimple *stmt, bool gather_scatter_p,
     528              :                       unsigned char swap)
     529              : {
     530     24140575 :   static const int no_arg_map[] = { 0 };
     531     24140575 :   static const int arg0_map[] = { 1, 0 };
     532     24140575 :   static const int arg2_map[] = { 1, 2 };
     533     24140575 :   static const int arg2_arg3_map[] = { 2, 2, 3 };
     534     24140575 :   static const int arg2_arg4_map[] = { 2, 2, 4 };
     535     24140575 :   static const int arg2_arg5_arg6_map[] = { 3, 2, 5, 6 };
     536     24140575 :   static const int arg2_arg4_arg5_map[] = { 3, 2, 4, 5 };
     537     24140575 :   static const int arg3_arg2_map[] = { 2, 3, 2 };
     538     24140575 :   static const int op00_map[] = { 1, -1 };
     539     24140575 :   static const int op1_op0_map[] = { 2, 1, 0 };
     540     24140575 :   static const int off_map[] = { 1, GATHER_SCATTER_OFFSET };
     541     24140575 :   static const int off_op0_map[] = { 2, GATHER_SCATTER_OFFSET, 0 };
     542     24140575 :   static const int off_arg2_arg3_map[] = { 3, GATHER_SCATTER_OFFSET, 2, 3 };
     543     24140575 :   static const int off_arg3_arg2_map[] = { 3, GATHER_SCATTER_OFFSET, 3, 2 };
     544     24140575 :   static const int mask_call_maps[6][7] = {
     545              :         { 1, 1, },
     546              :         { 2, 1, 2, },
     547              :         { 3, 1, 2, 3, },
     548              :         { 4, 1, 2, 3, 4, },
     549              :         { 5, 1, 2, 3, 4, 5, },
     550              :         { 6, 1, 2, 3, 4, 5, 6 },
     551              :   };
     552              : 
     553     24140575 :   gcc_checking_assert (!swap
     554              :                        || !is_gimple_assign (stmt)
     555              :                        || TREE_CODE_CLASS
     556              :                             (gimple_assign_rhs_code (stmt)) == tcc_comparison
     557              :                        || commutative_tree_code
     558              :                             (gimple_assign_rhs_code (stmt)));
     559              : 
     560     24140575 :   if (auto assign = dyn_cast<const gassign *> (stmt))
     561              :     {
     562     22696952 :       tree_code code = gimple_assign_rhs_code (assign);
     563     22696952 :       if (code == COND_EXPR
     564     22696952 :           && COMPARISON_CLASS_P (gimple_assign_rhs1 (assign)))
     565            0 :         gcc_unreachable ();
     566     22696952 :       else if ((TREE_CODE_CLASS (code) == tcc_comparison
     567     21360865 :                 || commutative_tree_code (code))
     568     31623680 :                && swap)
     569              :         return op1_op0_map;
     570     22656262 :       else if (code == VIEW_CONVERT_EXPR)
     571              :         return op00_map;
     572     22648111 :       else if (gather_scatter_p)
     573        43313 :         return (TREE_CODE (gimple_assign_lhs (assign)) != SSA_NAME
     574        43313 :                 ? off_op0_map : off_map);
     575              :     }
     576      1443623 :   else if (auto call = dyn_cast<const gcall *> (stmt))
     577              :     {
     578       160784 :       if (gimple_call_internal_p (call))
     579        92028 :         switch (gimple_call_internal_fn (call))
     580              :           {
     581        15940 :           case IFN_MASK_LOAD:
     582        27186 :             return gather_scatter_p ? off_arg2_arg3_map : arg2_arg3_map;
     583              : 
     584              :           case IFN_GATHER_LOAD:
     585              :             return arg2_map;
     586              : 
     587            0 :           case IFN_MASK_GATHER_LOAD:
     588            0 :           case IFN_MASK_LEN_GATHER_LOAD:
     589            0 :             return arg2_arg5_arg6_map;
     590              : 
     591            0 :           case IFN_SCATTER_STORE:
     592            0 :             return arg2_arg4_map;
     593              : 
     594            0 :           case IFN_MASK_SCATTER_STORE:
     595            0 :           case IFN_MASK_LEN_SCATTER_STORE:
     596            0 :             return arg2_arg4_arg5_map;
     597              : 
     598         9481 :           case IFN_MASK_STORE:
     599        17540 :             return gather_scatter_p ? off_arg3_arg2_map : arg3_arg2_map;
     600              : 
     601          988 :           case IFN_MASK_CALL:
     602          988 :             {
     603          988 :               unsigned nargs = gimple_call_num_args (call);
     604          988 :               if (nargs >= 2 && nargs <= 7)
     605          988 :                 return mask_call_maps[nargs-2];
     606              :               else
     607              :                 return nullptr;
     608              :             }
     609              : 
     610          278 :           case IFN_CLZ:
     611          278 :           case IFN_CTZ:
     612          278 :             return arg0_map;
     613              : 
     614         6306 :           case IFN_GOMP_SIMD_LANE:
     615         6306 :             return no_arg_map;
     616              : 
     617              :           default:
     618              :             break;
     619              :           }
     620              :     }
     621              :   return nullptr;
     622              : }
     623              : 
                        /* Overload taking a stmt_vec_info: forwards the wrapped GIMPLE stmt,
                           its STMT_VINFO_GATHER_SCATTER_P flag and SWAP to the overload
                           defined above.  */
     624              : static const int *
     625     24124649 : vect_get_operand_map (const stmt_vec_info stmt, unsigned char swap = 0)
     626              : {
     627            0 :   return vect_get_operand_map (stmt->stmt, STMT_VINFO_GATHER_SCATTER_P (stmt),
     628            0 :                                swap);
     629              : }
     630              : 
     631              : /* Return the SLP node child index for operand OP of STMT.
                           When STMT has no operand map the child index is OP itself.
                           Otherwise it is the zero-based position of OP among the map's
                           operand entries; OP is expected to be listed in the map.  */
     632              : 
     633              : int
     634      1365551 : vect_slp_child_index_for_operand (const stmt_vec_info stmt, int op)
     635              : {
     636      1365551 :   const int *opmap = vect_get_operand_map (stmt);
     637      1365551 :   if (!opmap)
     638              :     return op;
                          /* opmap[0] holds the number of entries, the entries themselves
                             start at index 1.  */
     639        21863 :   for (int i = 1; i < 1 + opmap[0]; ++i)
     640        21863 :     if (opmap[i] == op)
     641        12246 :       return i - 1;
                          /* OP does not appear in the map.  */
     642            0 :   gcc_unreachable ();
     643              : }
     644              : 
     645              : /* Helper class for mapping of GIMPLE operands to SLP children.  */
     646              : /* ???  Add vect_slp_child_index_for_operand here and amend opmaps
     647              :    with the full reverse mapping and indicating the position of the
     648              :    first commutative operand index, eliding the swap_p argument from
     649              :    vect_get_operand_map.  Adjust all consumers.  */
     650              : 
     651              : struct slp_oprnds {
     652              :   slp_oprnds (stmt_vec_info);
     653              :   tree get_op_for_slp_child (stmt_vec_info, unsigned);
                          /* Operand map as returned by vect_get_operand_map, or NULL when
                             SLP children correspond directly to the GIMPLE arguments.
                             Must stay declared before NUM_SLP_CHILDREN: the constructor
                             initializes that member from this one and members are
                             initialized in declaration order.  */
     654              :   const int *opmap;
                          /* Number of SLP children: the map's entry count, or the number of
                             GIMPLE arguments when there is no map.  */
     655              :   const unsigned int num_slp_children;
     656              : };
     657              : 
                        /* Look up the operand map for STMT_INFO (without swapping) and derive
                           the number of SLP children from it: the map's leading count when a
                           map exists, otherwise the number of GIMPLE arguments of the stmt.  */
     658      4373915 : slp_oprnds::slp_oprnds (stmt_vec_info stmt_info)
     659      4373915 :   : opmap (vect_get_operand_map (stmt_info)),
     660      4373915 :     num_slp_children (opmap ? opmap[0] : gimple_num_args (stmt_info->stmt))
     661              : {
     662      4373915 : }
     663              : 
     664              : /* For SLP child number N get the corresponding tree operand from GIMPLE
     665              :    statement described by STMT_INFO.  N must be smaller than the number
                           of SLP children.  Gather/scatter offset children are not handled
                           here yet.  */
     666              : 
     667              : tree
     668      4818223 : slp_oprnds::get_op_for_slp_child (stmt_vec_info stmt_info, unsigned n)
     669              : {
     670      4818223 :   gcc_assert (n < num_slp_children);
                          /* Map entries are offset by one because opmap[0] is the count.  */
     671      4818223 :   int opno = opmap ? opmap[n + 1] : (int) n;
     672      4818223 :   if (opno == GATHER_SCATTER_OFFSET)
     673            0 :     gcc_unreachable (); // TODO
                          /* A negative entry -1 - K selects operand K of the tree that is
                             GIMPLE argument 0.  */
     674      4818223 :   else if (opno < 0)
     675         1934 :     return TREE_OPERAND (gimple_arg (stmt_info->stmt, 0), -1 - opno);
     676              :   else
     677      4816289 :     return gimple_arg (stmt_info->stmt, opno);
     678              : }
     679              : 
     680              : /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
     681              :    they are of a valid type and that they match the defs of the first stmt of
     682              :    the SLP group (stored in OPRNDS_INFO).  This function tries to match stmts
     683              :    by swapping operands of STMTS[STMT_NUM] when possible.  Non-zero SWAP
     684              :    indicates swap is required for cond_expr stmts.  Specifically, SWAP
     685              :    is 1 if STMT is cond and operands of comparison need to be swapped;
     686              :    SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
     687              : 
     688              :    If there was a fatal error return -1; if the error could be corrected by
     689              :    swapping operands of father node of this one, return 1; if everything is
     690              :    ok return 0.

                           VINFO selects between loop and basic-block specific handling below.
                           VECTYPE is only passed on to vect_check_gather_scatter.  When
                           SKIP_ARGS[I] is set, operand I is not collected and NULL placeholders
                           are pushed for it instead.  A NULL STMTS[STMT_NUM] pushes NULL
                           placeholders for every operand and returns 0.

                           NOTE(review): SWAP is simply forwarded to vect_get_operand_map, whose
                           checking assert only permits it for comparisons and commutative
                           codes, so the cond_expr wording above may be stale - confirm.  */
     691              : static int
     692     12672511 : vect_get_and_check_slp_defs (vec_info *vinfo, tree vectype, unsigned char swap,
     693              :                              bool *skip_args,
     694              :                              vec<stmt_vec_info> stmts, unsigned stmt_num,
     695              :                              vec<slp_oprnd_info> *oprnds_info)
     696              : {
     697     12672511 :   stmt_vec_info stmt_info = stmts[stmt_num];
     698     12672511 :   tree oprnd;
     699     12672511 :   unsigned int i, number_of_oprnds;
     700     12672511 :   enum vect_def_type dt = vect_uninitialized_def;
     701     12672511 :   slp_oprnd_info oprnd_info;
     702     12672511 :   gather_scatter_info gs_info;
                          /* -1u means "no gather/scatter operand" resp. "no commutative
                             operand" for this stmt.  */
     703     12672511 :   unsigned int gs_op = -1u;
     704     12672511 :   unsigned int commutative_op = -1U;
     705     12672511 :   bool first = stmt_num == 0;
     706              : 
     707     12672511 :   if (!stmt_info)
     708              :     {
     709            0 :       for (auto oi : *oprnds_info)
     710              :         {
     711            0 :           oi->def_stmts.quick_push (NULL);
     712            0 :           oi->ops.quick_push (NULL_TREE);
     713              :         }
     714              :       return 0;
     715              :     }
     716              : 
                          /* Only calls, assignments and PHIs are handled.  */
     717     12672511 :   if (!is_a<gcall *> (stmt_info->stmt)
     718              :       && !is_a<gassign *> (stmt_info->stmt)
     719              :       && !is_a<gphi *> (stmt_info->stmt))
     720              :     return -1;
     721              : 
     722     12672511 :   number_of_oprnds = gimple_num_args (stmt_info->stmt);
     723     12672511 :   const int *map = vect_get_operand_map (stmt_info, swap);
                          /* The first map entry is the operand count; afterwards MAP points
                             at the operand entries themselves.  */
     724     12672511 :   if (map)
     725        75842 :     number_of_oprnds = *map++;
     726     12672511 :   if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
     727              :     {
     728        49213 :       if (gimple_call_internal_p (stmt))
     729              :         {
     730        32558 :           internal_fn ifn = gimple_call_internal_fn (stmt);
     731        32558 :           commutative_op = first_commutative_argument (ifn);
     732        32558 :           if (internal_gather_scatter_fn_p (ifn))
     733              :             {
     734            0 :               vect_describe_gather_scatter_call
     735            0 :                 (stmt_info,
     736            0 :                  first ? &(*oprnds_info)[0]->first_gs_info : &gs_info);
     737            0 :               if (first)
     738            0 :                 (*oprnds_info)[0]->first_gs_p = true;
     739              :               gs_op = 0;
     740              :             }
     741              :         }
     742              :     }
     743     12623298 :   else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
     744              :     {
     745     14725049 :       if (commutative_tree_code (gimple_assign_rhs_code (stmt)))
     746      8365245 :         commutative_op = 0;
     747              :     }
     748              : 
     749     12672511 :   bool swapped = (swap != 0);
     750     12672511 :   bool backedge = false;
                          /* Def types of this stmt's operands, kept for the matching loop
                             further down.  */
     751     12672511 :   enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
     752     35068579 :   for (i = 0; i < number_of_oprnds; i++)
     753              :     {
     754     22397247 :       oprnd_info = (*oprnds_info)[i];
     755     22397247 :       int opno = map ? map[i] : int (i);
     756     22397247 :       if (opno == GATHER_SCATTER_OFFSET)
     757              :         {
     758        22734 :           gcc_assert (STMT_VINFO_GATHER_SCATTER_P (stmt_info));
     759        22734 :           if (!is_a <loop_vec_info> (vinfo)
     760        22734 :               || !vect_check_gather_scatter (stmt_info, vectype,
     761              :                                              as_a <loop_vec_info> (vinfo),
     762              :                                              first ? &oprnd_info->first_gs_info
     763              :                                              : &gs_info))
     764         1179 :             return -1;
     765              : 
     766        22734 :           if (first)
     767              :             {
     768        22483 :               oprnd_info->first_gs_p = true;
     769        22483 :               oprnd = oprnd_info->first_gs_info.offset;
     770              :             }
     771              :           else
     772              :             {
     773          251 :               gs_op = i;
     774          251 :               oprnd = gs_info.offset;
     775              :             }
     776              :         }
                              /* A negative entry -1 - K selects operand K of the tree that is
                                 GIMPLE argument 0.  */
     777     22374513 :       else if (opno < 0)
     778         2842 :         oprnd = TREE_OPERAND (gimple_arg (stmt_info->stmt, 0), -1 - opno);
     779              :       else
     780              :         {
     781     22371671 :           oprnd = gimple_arg (stmt_info->stmt, opno);
     782     22371671 :           if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
     783              :             {
     784      1206443 :               edge e = gimple_phi_arg_edge (stmt, opno);
     785      2412886 :               backedge = (is_a <bb_vec_info> (vinfo)
     786      1863146 :                           ? e->flags & EDGE_DFS_BACK
     787       656703 :                           : dominated_by_p (CDI_DOMINATORS, e->src,
     788       656703 :                                             gimple_bb (stmt_info->stmt)));
     789              :             }
     790              :         }
     791              : 
     792     22397247 :       stmt_vec_info def_stmt_info;
     793     22397247 :       if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
     794              :         {
     795          994 :           if (dump_enabled_p ())
     796            0 :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
     797              :                              "Build SLP failed: can't analyze def for %T\n",
     798              :                              oprnd);
     799              : 
     800          994 :           return -1;
     801              :         }
     802              : 
                              /* Skipped operands still get a slot for this stmt so that the
                                 per-operand vectors stay indexable by stmt number.  */
     803     22396253 :       if (skip_args[i])
     804              :         {
     805       522657 :           oprnd_info->def_stmts.quick_push (NULL);
     806       522657 :           oprnd_info->ops.quick_push (NULL_TREE);
     807       522657 :           oprnd_info->first_dt = vect_uninitialized_def;
     808       522657 :           continue;
     809              :         }
     810              : 
     811     21873596 :       oprnd_info->def_stmts.quick_push (def_stmt_info);
     812     21873596 :       oprnd_info->ops.quick_push (oprnd);
     813              : 
     814     21873596 :       if (def_stmt_info
     815     21873596 :           && is_pattern_stmt_p (def_stmt_info))
     816              :         {
     817       393810 :           if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
     818              :               != def_stmt_info)
     819       278193 :             oprnd_info->any_pattern = true;
     820              :           else
     821              :             /* If we promote this to external use the original stmt def.  */
     822       115617 :             oprnd_info->ops.last ()
     823       231234 :               = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
     824              :         }
     825              : 
     826              :       /* If there's an extern def on a backedge make sure we can
     827              :          code-generate at the region start.
     828              :          ???  This is another case that could be fixed by adjusting
     829              :          how we split the function but at the moment we'd have conflicting
     830              :          goals there.  */
     831     21873596 :       if (backedge
     832       166850 :           && dts[i] == vect_external_def
     833          206 :           && is_a <bb_vec_info> (vinfo)
     834          206 :           && TREE_CODE (oprnd) == SSA_NAME
     835          185 :           && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
     836     21873781 :           && !dominated_by_p (CDI_DOMINATORS, vinfo->bbs[0],
     837          185 :                               gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
     838              :         {
     839          185 :           if (dump_enabled_p ())
     840            0 :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
     841              :                              "Build SLP failed: extern def %T only defined "
     842              :                              "on backedge\n", oprnd);
     843          185 :           return -1;
     844              :         }
     845              : 
     846     21873411 :       if (first)
     847              :         {
     848      4776118 :           tree type = TREE_TYPE (oprnd);
     849      4776118 :           dt = dts[i];
     850              : 
     851              :           /* For the swapping logic below force vect_reduction_def
     852              :              for the reduction op in a SLP reduction group.  */
     853      4776118 :           if (!STMT_VINFO_DATA_REF (stmt_info)
     854      3615225 :               && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
     855         5210 :               && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
     856      4778695 :               && def_stmt_info)
     857         2577 :             dts[i] = dt = vect_reduction_def;
     858              : 
     859              :           /* Check the types of the definition.  */
     860      4776118 :           switch (dt)
     861              :             {
     862      4776118 :             case vect_external_def:
     863      4776118 :             case vect_constant_def:
     864      4776118 :             case vect_internal_def:
     865      4776118 :             case vect_reduction_def:
     866      4776118 :             case vect_double_reduction_def:
     867      4776118 :             case vect_induction_def:
     868      4776118 :             case vect_nested_cycle:
     869      4776118 :             case vect_first_order_recurrence:
     870      4776118 :               break;
     871              : 
     872            0 :             default:
     873              :               /* FORNOW: Not supported.  */
     874            0 :               if (dump_enabled_p ())
     875            0 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
     876              :                                  "Build SLP failed: illegal type of def %T\n",
     877              :                                  oprnd);
     878            0 :               return -1;
     879              :             }
     880              : 
     881      4776118 :           oprnd_info->first_dt = dt;
     882      4776118 :           oprnd_info->first_op_type = type;
     883              :         }
     884              :     }
                          /* The first stmt only records the reference def types and operand
                             types; there is nothing to match it against.  */
     885     12671332 :   if (first)
     886              :     return 0;
     887              : 
     888              :   /* Now match the operand definition types to that of the first stmt.  */
                          /* I is advanced explicitly: after swapping operands the same index
                             is examined again.  */
     889     26202422 :   for (i = 0; i < number_of_oprnds;)
     890              :     {
     891     17109451 :       if (skip_args[i])
     892              :         {
     893        43200 :           ++i;
     894        43200 :           continue;
     895              :         }
     896              : 
     897     17066251 :       oprnd_info = (*oprnds_info)[i];
     898     17066251 :       dt = dts[i];
     899     17066251 :       stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
     900     17066251 :       oprnd = oprnd_info->ops[stmt_num];
     901     17066251 :       tree type = TREE_TYPE (oprnd);
     902              : 
     903     17066251 :       if (!types_compatible_p (oprnd_info->first_op_type, type))
     904              :         {
     905        88818 :           if (dump_enabled_p ())
     906          109 :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
     907              :                              "Build SLP failed: different operand types\n");
     908        88818 :           return 1;
     909              :         }
     910              : 
                              /* The operand must be a gather/scatter offset in this stmt
                                 exactly when it was one in the first stmt.  */
     911     16977433 :       if ((gs_op == i) != oprnd_info->first_gs_p)
     912              :         {
     913            0 :           if (dump_enabled_p ())
     914            0 :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
     915              :                              "Build SLP failed: mixed gather and non-gather\n");
     916            0 :           return 1;
     917              :         }
     918     16977433 :       else if (gs_op == i)
     919              :         {
     920          221 :           if (!operand_equal_p (oprnd_info->first_gs_info.base,
     921          221 :                                 gs_info.base))
     922              :             {
     923           16 :               if (dump_enabled_p ())
     924            6 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
     925              :                                  "Build SLP failed: different gather base\n");
     926           16 :               return 1;
     927              :             }
     928          205 :           if (oprnd_info->first_gs_info.scale != gs_info.scale)
     929              :             {
     930            8 :               if (dump_enabled_p ())
     931            2 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
     932              :                                  "Build SLP failed: different gather scale\n");
     933            8 :               return 1;
     934              :             }
     935              :         }
     936              : 
     937              :       /* Not first stmt of the group, check that the def-stmt/s match
     938              :          the def-stmt/s of the first stmt.  Allow different definition
     939              :          types for reduction chains: the first stmt must be a
     940              :          vect_reduction_def (a phi node), and the rest
     941              :          end in the reduction chain.  */
     942     16977409 :       if ((!vect_def_types_match (oprnd_info->first_dt, dt)
     943       291820 :            && !(oprnd_info->first_dt == vect_reduction_def
     944         4535 :                 && !STMT_VINFO_DATA_REF (stmt_info)
     945         4535 :                 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
     946         4509 :                 && def_stmt_info
     947         4509 :                 && !STMT_VINFO_DATA_REF (def_stmt_info)
     948         4509 :                 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
     949              :                     == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
     950     16690098 :           || (!STMT_VINFO_DATA_REF (stmt_info)
     951     15389718 :               && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
     952         9386 :               && ((!def_stmt_info
     953         9217 :                    || STMT_VINFO_DATA_REF (def_stmt_info)
     954        16906 :                    || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
     955              :                        != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
     956         9386 :                   != (oprnd_info->first_dt != vect_reduction_def))))
     957              :         {
     958              :           /* Try swapping operands if we got a mismatch.  For BB
     959              :              vectorization only in case it will clearly improve things.  */
     960       289730 :           if (i == commutative_op && !swapped
     961       287311 :               && (!is_a <bb_vec_info> (vinfo)
     962         4576 :                   || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
     963         4576 :                                              dts[i+1])
     964         1094 :                       && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
     965              :                           || vect_def_types_match
     966          150 :                                ((*oprnds_info)[i+1]->first_dt, dts[i])))))
     967              :             {
     968         2419 :               if (dump_enabled_p ())
     969          144 :                 dump_printf_loc (MSG_NOTE, vect_location,
     970              :                                  "trying swapped operands\n");
     971         2419 :               std::swap (dts[i], dts[i+1]);
     972         2419 :               std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
     973         2419 :                          (*oprnds_info)[i+1]->def_stmts[stmt_num]);
     974         2419 :               std::swap ((*oprnds_info)[i]->ops[stmt_num],
     975         2419 :                          (*oprnds_info)[i+1]->ops[stmt_num]);
     976              :               /* After swapping some operands we lost track whether an
     977              :                  operand has any pattern defs so be conservative here.  */
     978         2419 :               if ((*oprnds_info)[i]->any_pattern
     979         2419 :                   || (*oprnds_info)[i+1]->any_pattern)
     980           36 :                 (*oprnds_info)[i]->any_pattern
     981           18 :                   = (*oprnds_info)[i+1]->any_pattern = true;
                                  /* SWAPPED being set prevents a second swap; retry operand I
                                     with the exchanged defs.  */
     982         2419 :               swapped = true;
     983         2419 :               continue;
     984              :             }
     985              : 
     986       284892 :           if (is_a <bb_vec_info> (vinfo)
     987       269494 :               && !oprnd_info->any_pattern
     988       554148 :               && number_of_oprnds > 1)
     989              :             {
     990              :               /* Now for commutative ops we should see whether we can
     991              :                  make the other operand matching.  */
     992       103439 :               if (dump_enabled_p ())
     993          203 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
     994              :                                  "treating operand as external\n");
     995       103439 :               oprnd_info->first_dt = dt = vect_external_def;
     996              :             }
     997              :           else
     998              :             {
     999       181453 :               if (dump_enabled_p ())
    1000          407 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1001              :                                  "Build SLP failed: different types\n");
    1002       181453 :               return 1;
    1003              :             }
    1004              :         }
    1005              : 
    1006              :       /* Make sure to demote the overall operand to external.  */
    1007     16793537 :       if (dt == vect_external_def)
    1008       331942 :         oprnd_info->first_dt = vect_external_def;
    1009              :       /* For a SLP reduction chain we want to duplicate the reduction to
    1010              :          each of the chain members.  That gets us a sane SLP graph (still
    1011              :          the stmts are not 100% correct wrt the initial values).  */
    1012     16461595 :       else if ((dt == vect_internal_def
    1013     16461595 :                 || dt == vect_reduction_def)
    1014     15541210 :                && oprnd_info->first_dt == vect_reduction_def
    1015       100842 :                && !STMT_VINFO_DATA_REF (stmt_info)
    1016       100842 :                && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
    1017         4509 :                && !STMT_VINFO_DATA_REF (def_stmt_info)
    1018     16466104 :                && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
    1019              :                    == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
    1020              :         {
    1021         4509 :           oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
    1022         4509 :           oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
    1023              :         }
    1024              : 
    1025     16793537 :       ++i;
    1026              :     }
    1027              : 
    1028              :   /* Report swapped operands.  SWAPPED is set either from SWAP or by the
                             retry in the loop above; nothing is exchanged here, the collected
                             operands already reflect the swap.  */
    1029      9092971 :   if (swapped)
    1030              :     {
    1031        40748 :       if (dump_enabled_p ())
    1032          430 :         dump_printf_loc (MSG_NOTE, vect_location,
    1033              :                          "swapped operands to match def types in %G",
    1034              :                          stmt_info->stmt);
    1035              :     }
    1036              : 
    1037              :   return 0;
    1038              : }
    1039              : 
    1040              : /* Return true if call statements CALL1 and CALL2 are similar enough
    1041              :    to be combined into the same SLP group.  If ALLOW_TWO_OPERATORS,
                           the two calls may differ in their combined function as long as each
                           of them is CFN_FMA or CFN_FMS.  */
    1042              : 
    1043              : bool
    1044        21106 : compatible_calls_p (gcall *call1, gcall *call2, bool allow_two_operators)
    1045              : {
    1046        21106 :   unsigned int nargs = gimple_call_num_args (call1);
    1047        21106 :   if (nargs != gimple_call_num_args (call2))
    1048              :     return false;
    1049              : 
    1050        19170 :   auto cfn1 = gimple_call_combined_fn (call1);
    1051        19170 :   auto cfn2 = gimple_call_combined_fn (call2);
    1052        19170 :   if (cfn1 != cfn2
    1053            2 :       && (!allow_two_operators
    1054            2 :           || !((cfn1 == CFN_FMA || cfn1 == CFN_FMS)
    1055            2 :                && (cfn2 == CFN_FMA || cfn2 == CFN_FMS))))
    1056              :     return false;
    1057              : 
                          /* Internal function calls have to agree on the result type and on
                             the type of every argument.  */
    1058        19170 :   if (gimple_call_internal_p (call1))
    1059              :     {
    1060         6997 :       if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
    1061         6997 :                                TREE_TYPE (gimple_call_lhs (call2))))
    1062              :         return false;
    1063        14372 :       for (unsigned int i = 0; i < nargs; ++i)
    1064         7375 :         if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
    1065         7375 :                                  TREE_TYPE (gimple_call_arg (call2, i))))
    1066              :           return false;
    1067              :     }
                          /* Other calls have to call the same function with the same
                             function type.  */
    1068              :   else
    1069              :     {
    1070        12173 :       if (!operand_equal_p (gimple_call_fn (call1),
    1071        12173 :                             gimple_call_fn (call2), 0))
    1072              :         return false;
    1073              : 
    1074        26787 :       if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
    1075              :         return false;
    1076              :     }
    1077              : 
    1078              :   /* Check that any unvectorized arguments are equal.  */
                          /* Arguments listed in the operand map are skipped, every other
                             argument has to be equal in both calls.
                             NOTE(review): MAPI only advances when the next map entry equals
                             the current argument index, so this walk appears to rely on the
                             map listing argument indices in ascending order - confirm for
                             the maps that can reach this point.  */
    1079        15926 :   if (const int *map = vect_get_operand_map (call1, false, false))
    1080              :     {
    1081           15 :       unsigned int nkept = *map++;
    1082           15 :       unsigned int mapi = 0;
    1083           57 :       for (unsigned int i = 0; i < nargs; ++i)
    1084           42 :         if (mapi < nkept && map[mapi] == int (i))
    1085           27 :           mapi += 1;
    1086           15 :         else if (!operand_equal_p (gimple_call_arg (call1, i),
    1087           15 :                                    gimple_call_arg (call2, i)))
    1088              :           return false;
    1089              :     }
    1090              : 
    1091              :   return true;
    1092              : }
    1093              : 
    1094              : /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
    1095              :    caller's attempt to find the vector type in STMT_INFO with the narrowest
    1096              :    element type.  Return true if VECTYPE is nonnull and if it is valid
    1097              :    for STMT_INFO.  When returning true, update MAX_NUNITS to reflect the
    1098              :    number of units in VECTYPE.  GROUP_SIZE and MAX_NUNITS are as for
    1099              :    vect_build_slp_tree.  STMT_INFO itself is only used for dumping.
                           For basic-block vectorization (VINFO is a bb_vec_info) VECTYPE is
                           rejected when GROUP_SIZE is not a multiple of its number of units;
                           *MAX_NUNITS is left untouched whenever false is returned.  */
    1100              : 
    1101              : static bool
    1102      5457928 : vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
    1103              :                         unsigned int group_size,
    1104              :                         tree vectype, poly_uint64 *max_nunits)
    1105              : {
    1106      5457928 :   if (!vectype)
    1107              :     {
    1108         3874 :       if (dump_enabled_p ())
    1109            7 :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1110              :                          "Build SLP failed: unsupported data-type in %G\n",
    1111              :                          stmt_info->stmt);
    1112              :       /* Fatal mismatch.  */
    1113         3874 :       return false;
    1114              :     }
    1115              : 
    1116              :   /* If populating the vector type requires unrolling then fail
    1117              :      before adjusting *max_nunits for basic-block vectorization.  */
    1118      5454054 :   if (is_a <bb_vec_info> (vinfo)
    1119      5454054 :       && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
    1120              :     {
    1121       140850 :       if (dump_enabled_p ())
    1122           34 :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1123              :                          "Build SLP failed: unrolling required "
    1124              :                          "in basic block SLP\n");
    1125              :       /* Fatal mismatch.  */
    1126       140850 :       return false;
    1127              :     }
    1128              : 
    1129              :   /* In case of multiple types we need to detect the smallest type.  */
    1130      5313204 :   vect_update_max_nunits (max_nunits, vectype);
    1131      5313204 :   return true;
    1132              : }
    1133              : 
    1134              : /* Verify if the scalar stmts STMTS are isomorphic, require data
    1135              :    permutation or are of unsupported types of operation.  Return
    1136              :    true if they are, otherwise return false and indicate in *MATCHES
    1137              :    which stmts are not isomorphic to the first one.  If MATCHES[0]
    1138              :    is false then this indicates the comparison could not be
    1139              :    carried out or the stmts will never be vectorized by SLP.
    1140              : 
    1141              :    Note COND_EXPR is possibly isomorphic to another one after swapping its
    1142              :    operands.  Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
    1143              :    the first stmt by swapping the two operands of comparison; set SWAP[i]
    1144              :    to 2 if stmt I is isomorphic to the first stmt by inverting the code
    1145              :    of comparison.  Take A1 >= B1 ? X1 : Y1 as an example, it can be swapped
    1146              :    to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1.  */
    1147              : 
    1148              : static bool
    1149      5708792 : vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
    1150              :                        vec<stmt_vec_info> stmts, unsigned int group_size,
    1151              :                        poly_uint64 *max_nunits, bool *matches,
    1152              :                        bool *two_operators, tree *node_vectype)
    1153              : {
    1154      5708792 :   unsigned int i;
    1155      5708792 :   stmt_vec_info first_stmt_info = stmts[0];
    1156      5708792 :   code_helper first_stmt_code = ERROR_MARK;
    1157      5708792 :   code_helper alt_stmt_code = ERROR_MARK;
    1158      5708792 :   code_helper first_cond_code = ERROR_MARK;
    1159      5708792 :   bool need_same_oprnds = false;
    1160      5708792 :   tree first_lhs = NULL_TREE;
    1161      5708792 :   tree first_op1 = NULL_TREE;
    1162      5708792 :   stmt_vec_info first_load = NULL, prev_first_load = NULL;
    1163      5708792 :   bool first_stmt_ldst_p = false, first_stmt_ldst_masklen_p = false;
    1164      5708792 :   bool first_stmt_phi_p = false;
    1165      5708792 :   int first_reduc_idx = -1;
    1166      5708792 :   bool maybe_soft_fail = false;
    1167      5708792 :   tree soft_fail_nunits_vectype = NULL_TREE;
    1168              : 
    1169      5708792 :   tree vectype, nunits_vectype;
    1170      5708792 :   if (!vect_get_vector_types_for_stmt (vinfo, first_stmt_info, &vectype,
    1171              :                                        &nunits_vectype, group_size))
    1172              :     {
    1173              :       /* Fatal mismatch.  */
    1174       199563 :       matches[0] = false;
    1175       199563 :       return false;
    1176              :     }
    1177      5509229 :   if (is_a <bb_vec_info> (vinfo)
    1178      5509229 :       && known_le (TYPE_VECTOR_SUBPARTS (vectype), 1U))
    1179              :     {
    1180       349296 :       if (dump_enabled_p ())
    1181          296 :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1182              :                          "Build SLP failed: not using single lane "
    1183              :                          "vector type %T\n", vectype);
    1184       349296 :       matches[0] = false;
    1185       349296 :       return false;
    1186              :     }
    1187              :   /* Record nunits required but continue analysis, producing matches[]
    1188              :      as if nunits was not an issue.  This allows splitting of groups
    1189              :      to happen.  */
    1190      5159933 :   if (nunits_vectype
    1191      5159933 :       && !vect_record_max_nunits (vinfo, first_stmt_info, group_size,
    1192              :                                   nunits_vectype, max_nunits))
    1193              :     {
    1194       140850 :       gcc_assert (is_a <bb_vec_info> (vinfo));
    1195       140850 :       maybe_soft_fail = true;
    1196       140850 :       soft_fail_nunits_vectype = nunits_vectype;
    1197              :     }
    1198              : 
    1199      5159933 :   gcc_assert (vectype || !gimple_get_lhs (first_stmt_info->stmt));
    1200      5159933 :   *node_vectype = vectype;
    1201              : 
    1202              :   /* For every stmt in NODE find its def stmt/s.  */
    1203      5159933 :   stmt_vec_info stmt_info;
    1204     22071911 :   FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    1205              :     {
    1206     17074432 :       bool ldst_p = false;
    1207     17074432 :       bool ldst_masklen_p = false;
    1208     17074432 :       bool phi_p = false;
    1209     17074432 :       code_helper rhs_code = ERROR_MARK;
    1210              : 
    1211     17074432 :       swap[i] = 0;
    1212     17074432 :       matches[i] = false;
    1213     17074432 :       if (!stmt_info)
    1214              :         {
    1215        40264 :           matches[i] = true;
    1216     16952242 :           continue;
    1217              :         }
    1218              : 
    1219     17034168 :       gimple *stmt = stmt_info->stmt;
    1220     17034168 :       if (dump_enabled_p ())
    1221       218212 :         dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
    1222              : 
    1223              :       /* Fail to vectorize statements marked as unvectorizable, throw
    1224              :          or are volatile.  */
    1225     17034168 :       if (!STMT_VINFO_VECTORIZABLE (stmt_info)
    1226     16845033 :           || stmt_can_throw_internal (cfun, stmt)
    1227     33095661 :           || gimple_has_volatile_ops (stmt))
    1228              :         {
    1229       194630 :           if (dump_enabled_p ())
    1230          199 :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1231              :                              "Build SLP failed: unvectorizable statement %G",
    1232              :                              stmt);
    1233              :           /* ???  For BB vectorization we want to commutate operands in a way
    1234              :              to shuffle all unvectorizable defs into one operand and have
    1235              :              the other still vectorized.  The following doesn't reliably
    1236              :              work for this though but it's the easiest we can do here.  */
    1237       194630 :           if (is_a <bb_vec_info> (vinfo) && i != 0)
    1238        63820 :             continue;
    1239              :           /* Fatal mismatch.  */
    1240       130810 :           matches[0] = false;
    1241       130810 :           return false;
    1242              :         }
    1243              : 
    1244     16839538 :       gcall *call_stmt = dyn_cast <gcall *> (stmt);
    1245     16839538 :       tree lhs = gimple_get_lhs (stmt);
    1246     16839538 :       if (lhs == NULL_TREE && !call_stmt)
    1247              :         {
    1248           36 :           if (dump_enabled_p ())
    1249            0 :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1250              :                              "Build SLP failed: not GIMPLE_ASSIGN nor "
    1251              :                              "GIMPLE_CALL %G", stmt);
    1252           36 :           if (is_a <bb_vec_info> (vinfo) && i != 0)
    1253           36 :             continue;
    1254              :           /* Fatal mismatch.  */
    1255            0 :           matches[0] = false;
    1256            0 :           return false;
    1257              :         }
    1258              : 
    1259     16839502 :       if (call_stmt)
    1260              :         {
    1261       102339 :           combined_fn cfn = gimple_call_combined_fn (call_stmt);
    1262       102339 :           if (cfn != CFN_LAST && cfn != CFN_MASK_CALL)
    1263        58546 :             rhs_code = cfn;
    1264              :           else
    1265              :             rhs_code = CALL_EXPR;
    1266              : 
    1267       102339 :           if (cfn == CFN_GATHER_LOAD
    1268       102339 :               || cfn == CFN_SCATTER_STORE)
    1269              :             ldst_p = true;
    1270              :           else if (cfn == CFN_MASK_LOAD
    1271              :                    || cfn == CFN_MASK_GATHER_LOAD
    1272              :                    || cfn == CFN_MASK_LEN_GATHER_LOAD
    1273              :                    || cfn == CFN_MASK_SCATTER_STORE
    1274              :                    || cfn == CFN_MASK_LEN_SCATTER_STORE)
    1275              :             {
    1276              :               ldst_p = true;
    1277              :               ldst_masklen_p = true;
    1278              :             }
    1279              :           else if (cfn == CFN_MASK_STORE)
    1280              :             {
    1281              :               ldst_p = true;
    1282              :               ldst_masklen_p = true;
    1283              :               rhs_code = CFN_MASK_STORE;
    1284              :             }
    1285              :           else if (cfn == CFN_GOMP_SIMD_LANE)
    1286              :             ;
    1287        90805 :           else if ((cfn != CFN_LAST
    1288              :                     && cfn != CFN_MASK_CALL
    1289        47012 :                     && internal_fn_p (cfn)
    1290        36867 :                     && !vectorizable_internal_fn_p (as_internal_fn (cfn)))
    1291        90730 :                    || gimple_call_tail_p (call_stmt)
    1292        90730 :                    || gimple_call_noreturn_p (call_stmt)
    1293       181535 :                    || gimple_call_chain (call_stmt))
    1294              :             {
    1295          424 :               if (dump_enabled_p ())
    1296           13 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1297              :                                  "Build SLP failed: unsupported call type %G",
    1298              :                                  (gimple *) call_stmt);
    1299          424 :               if (is_a <bb_vec_info> (vinfo) && i != 0)
    1300           64 :                 continue;
    1301              :               /* Fatal mismatch.  */
    1302          360 :               matches[0] = false;
    1303          360 :               return false;
    1304              :             }
    1305              :         }
    1306     16737163 :       else if (gimple_code (stmt) == GIMPLE_PHI)
    1307              :         {
    1308              :           rhs_code = ERROR_MARK;
    1309              :           phi_p = true;
    1310              :         }
    1311              :       else
    1312              :         {
    1313     15953623 :           rhs_code = gimple_assign_rhs_code (stmt);
    1314     15953623 :           ldst_p = STMT_VINFO_DATA_REF (stmt_info) != nullptr;
    1315              :         }
    1316              : 
    1317              :       /* Check the operation.  */
    1318     16839078 :       if (i == 0)
    1319              :         {
    1320      5028763 :           first_lhs = lhs;
    1321      5028763 :           first_stmt_code = rhs_code;
    1322      5028763 :           first_stmt_ldst_p = ldst_p;
    1323      5028763 :           first_stmt_ldst_masklen_p = ldst_masklen_p;
    1324      5028763 :           first_stmt_phi_p = phi_p;
    1325      5028763 :           first_reduc_idx = STMT_VINFO_REDUC_IDX (stmt_info);
    1326              : 
    1327              :           /* Shift arguments should be equal in all the packed stmts for a
    1328              :              vector shift with scalar shift operand.  */
    1329      5028763 :           if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
    1330      4893717 :               || rhs_code == LROTATE_EXPR
    1331      9922408 :               || rhs_code == RROTATE_EXPR)
    1332              :             {
    1333              :               /* First see if we have a vector/vector shift.  */
    1334       135501 :               if (!directly_supported_p (rhs_code, vectype, optab_vector))
    1335              :                 {
    1336              :                   /* No vector/vector shift, try for a vector/scalar shift.  */
    1337       123467 :                   if (!directly_supported_p (rhs_code, vectype, optab_scalar))
    1338              :                     {
    1339        11988 :                       if (dump_enabled_p ())
    1340          386 :                         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1341              :                                          "Build SLP failed: "
    1342              :                                          "op not supported by target.\n");
    1343        11988 :                       if (is_a <bb_vec_info> (vinfo) && i != 0)
    1344              :                         continue;
    1345              :                       /* Fatal mismatch.  */
    1346        11988 :                       matches[0] = false;
    1347        11988 :                       return false;
    1348              :                     }
    1349       111479 :                   need_same_oprnds = true;
    1350       111479 :                   first_op1 = gimple_assign_rhs2 (stmt);
    1351              :                 }
    1352              :             }
    1353      4893262 :           else if (rhs_code == WIDEN_LSHIFT_EXPR)
    1354              :             {
    1355            0 :               need_same_oprnds = true;
    1356            0 :               first_op1 = gimple_assign_rhs2 (stmt);
    1357              :             }
    1358      4893262 :           else if (!ldst_p
    1359      4893262 :                    && rhs_code == BIT_FIELD_REF)
    1360              :             {
    1361         5776 :               tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
    1362         5776 :               if (!is_a <bb_vec_info> (vinfo)
    1363         5650 :                   || TREE_CODE (vec) != SSA_NAME
    1364              :                   /* When the element types are not compatible we pun the
    1365              :                      source to the target vectype which requires equal size.  */
    1366        11414 :                   || ((!VECTOR_TYPE_P (TREE_TYPE (vec))
    1367         4915 :                        || !types_compatible_p (TREE_TYPE (vectype),
    1368         4915 :                                                TREE_TYPE (TREE_TYPE (vec))))
    1369         1039 :                       && !operand_equal_p (TYPE_SIZE (vectype),
    1370         1039 :                                            TYPE_SIZE (TREE_TYPE (vec)))))
    1371              :                 {
    1372          781 :                   if (dump_enabled_p ())
    1373            0 :                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1374              :                                      "Build SLP failed: "
    1375              :                                      "BIT_FIELD_REF not supported\n");
    1376              :                   /* Fatal mismatch.  */
    1377          781 :                   matches[0] = false;
    1378          781 :                   return false;
    1379              :                 }
    1380              :             }
    1381      4887486 :           else if (rhs_code == CFN_DIV_POW2)
    1382              :             {
    1383            0 :               need_same_oprnds = true;
    1384            0 :               first_op1 = gimple_call_arg (call_stmt, 1);
    1385              :             }
    1386      4887486 :           else if (rhs_code == CFN_GOMP_SIMD_LANE)
    1387              :             {
    1388         3153 :               need_same_oprnds = true;
    1389         3153 :               first_op1 = gimple_call_arg (call_stmt, 1);
    1390              :             }
    1391              :         }
    1392              :       else
    1393              :         {
    1394     11810668 :           if (first_reduc_idx != STMT_VINFO_REDUC_IDX (stmt_info)
    1395              :               /* For SLP reduction groups the index isn't necessarily
    1396              :                  uniform but only that of the first stmt matters.  */
    1397         2161 :               && !(first_reduc_idx != -1
    1398         2161 :                    && STMT_VINFO_REDUC_IDX (stmt_info) != -1
    1399         2161 :                    && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
    1400     11810315 :               && !(first_reduc_idx != -1
    1401          974 :                    && STMT_VINFO_REDUC_IDX (stmt_info) != -1
    1402          974 :                    && rhs_code.is_tree_code ()
    1403          974 :                    && commutative_tree_code (tree_code (rhs_code))
    1404          765 :                    && first_reduc_idx == 1 - STMT_VINFO_REDUC_IDX (stmt_info)))
    1405              :             {
    1406          353 :               if (dump_enabled_p ())
    1407              :                 {
    1408           12 :                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1409              :                                    "Build SLP failed: different reduc_idx "
    1410              :                                    "%d instead of %d in %G",
    1411              :                                    STMT_VINFO_REDUC_IDX (stmt_info),
    1412              :                                    first_reduc_idx, stmt);
    1413              :                 }
    1414              :               /* Mismatch.  */
    1415          353 :               continue;
    1416              :             }
    1417     11809962 :           if (!ldst_p
    1418      9278256 :               && first_stmt_code != rhs_code
    1419     13208949 :               && alt_stmt_code == ERROR_MARK)
    1420              :             alt_stmt_code = rhs_code;
    1421     13183158 :           if ((!ldst_p
    1422      9278256 :                && first_stmt_code != rhs_code
    1423      1398987 :                && (first_stmt_code != IMAGPART_EXPR
    1424          127 :                    || rhs_code != REALPART_EXPR)
    1425      1398967 :                && (first_stmt_code != REALPART_EXPR
    1426          524 :                    || rhs_code != IMAGPART_EXPR)
    1427              :                /* Handle mismatches in plus/minus by computing both
    1428              :                   and merging the results.  */
    1429      1398956 :                && !((((first_stmt_code == PLUS_EXPR
    1430      1296073 :                        || first_stmt_code == MINUS_EXPR)
    1431       126313 :                       && (alt_stmt_code == PLUS_EXPR
    1432       117234 :                           || alt_stmt_code == MINUS_EXPR))
    1433      1369707 :                      || ((first_stmt_code == CFN_FMA
    1434      1369705 :                           || first_stmt_code == CFN_FMS)
    1435            2 :                          && (alt_stmt_code == CFN_FMA
    1436            2 :                              || alt_stmt_code == CFN_FMS)))
    1437        29251 :                     && rhs_code == alt_stmt_code)
    1438      1409515 :                && !(first_stmt_code.is_tree_code ()
    1439      1293668 :                     && rhs_code.is_tree_code ()
    1440      1200600 :                     && (TREE_CODE_CLASS (tree_code (first_stmt_code))
    1441              :                         == tcc_comparison)
    1442       126946 :                     && (swap_tree_comparison (tree_code (first_stmt_code))
    1443       126946 :                         == tree_code (rhs_code))
    1444              :                     && (first_reduc_idx == -1
    1445            0 :                         || REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
    1446              :               || (ldst_p
    1447      5063412 :                   && (STMT_VINFO_GROUPED_ACCESS (stmt_info)
    1448      2531706 :                       != STMT_VINFO_GROUPED_ACCESS (first_stmt_info)))
    1449              :               || (ldst_p
    1450      2488941 :                   && (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
    1451      2488941 :                       != STMT_VINFO_GATHER_SCATTER_P (first_stmt_info)))
    1452     10436913 :               || first_stmt_ldst_p != ldst_p
    1453     10436774 :               || (ldst_p && first_stmt_ldst_masklen_p != ldst_masklen_p)
    1454     22246728 :               || first_stmt_phi_p != phi_p)
    1455              :             {
    1456      1373196 :               if (dump_enabled_p ())
    1457              :                 {
    1458         2929 :                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1459              :                                    "Build SLP failed: different operation "
    1460              :                                    "in stmt %G", stmt);
    1461         2929 :                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1462              :                                    "original stmt %G", first_stmt_info->stmt);
    1463              :                 }
    1464              :               /* Mismatch.  */
    1465      1373196 :               continue;
    1466              :             }
    1467              : 
    1468     10439103 :           if (!ldst_p
    1469      7947960 :               && first_stmt_code == BIT_FIELD_REF
    1470     10442499 :               && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
    1471         5733 :                   != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
    1472              :             {
    1473         2337 :               if (dump_enabled_p ())
    1474           40 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1475              :                                  "Build SLP failed: different BIT_FIELD_REF "
    1476              :                                  "arguments in %G", stmt);
    1477              :               /* Mismatch.  */
    1478         2337 :               continue;
    1479              :             }
    1480              : 
    1481     10434429 :           if (call_stmt
    1482        21938 :               && first_stmt_code != CFN_MASK_LOAD
    1483     10455881 :               && first_stmt_code != CFN_MASK_STORE)
    1484              :             {
    1485        21106 :               if (!is_a <gcall *> (stmts[0]->stmt)
    1486        21106 :                   || !compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
    1487              :                                           call_stmt, true))
    1488              :                 {
    1489         5180 :                   if (dump_enabled_p ())
    1490            0 :                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1491              :                                      "Build SLP failed: different calls in %G",
    1492              :                                      stmt);
    1493              :                   /* Mismatch.  */
    1494         5180 :                   continue;
    1495              :                 }
    1496              :             }
    1497              : 
    1498     10245661 :           if ((phi_p || gimple_could_trap_p (stmt_info->stmt))
    1499     11225866 :               && (gimple_bb (first_stmt_info->stmt)
    1500       980205 :                   != gimple_bb (stmt_info->stmt)))
    1501              :             {
    1502        27021 :               if (dump_enabled_p ())
    1503            8 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1504              :                                  "Build SLP failed: different BB for PHI "
    1505              :                                  "or possibly trapping operation in %G", stmt);
    1506              :               /* Mismatch.  */
    1507        27021 :               continue;
    1508              :             }
    1509              : 
    1510     10402228 :           if (need_same_oprnds)
    1511              :             {
    1512        55016 :               tree other_op1 = gimple_arg (stmt, 1);
    1513        55016 :               if (!operand_equal_p (first_op1, other_op1, 0))
    1514              :                 {
    1515         7506 :                   if (dump_enabled_p ())
    1516          123 :                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1517              :                                      "Build SLP failed: different shift "
    1518              :                                      "arguments in %G", stmt);
    1519              :                   /* Mismatch.  */
    1520         7506 :                   continue;
    1521              :                 }
    1522              :             }
    1523              : 
    1524     10395459 :           if (first_lhs
    1525     10394722 :               && lhs
    1526     10394722 :               && !types_compatible_p (TREE_TYPE (lhs), TREE_TYPE (first_lhs)))
    1527              :             {
    1528          737 :               if (dump_enabled_p ())
    1529            6 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1530              :                                  "Build SLP failed: different vector type "
    1531              :                                  "in %G", stmt);
    1532              :               /* Mismatch.  */
    1533          737 :               continue;
    1534              :             }
    1535              :         }
    1536              : 
    1537              :       /* Grouped store or load.  */
    1538     15409979 :       if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    1539              :         {
    1540      3842284 :           gcc_assert (ldst_p);
    1541      3842284 :           if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
    1542              :             {
    1543              :               /* Store.  */
    1544      3010046 :               gcc_assert (rhs_code == CFN_MASK_STORE
    1545              :                           || REFERENCE_CLASS_P (lhs)
    1546              :                           || DECL_P (lhs));
    1547              :             }
    1548              :           else
    1549              :             {
    1550              :               /* Load.  */
    1551       832238 :               first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
    1552       832238 :               if (prev_first_load)
    1553              :                 {
    1554              :                   /* Check that there are no loads from different interleaving
    1555              :                      chains in the same node.  */
    1556       379636 :                   if (prev_first_load != first_load)
    1557              :                     {
    1558        54358 :                       if (dump_enabled_p ())
    1559         1994 :                         dump_printf_loc (MSG_MISSED_OPTIMIZATION,
    1560              :                                          vect_location,
    1561              :                                          "Build SLP failed: different "
    1562              :                                          "interleaving chains in one node %G",
    1563              :                                          stmt);
    1564              :                       /* Mismatch.  */
    1565        54358 :                       continue;
    1566              :                     }
    1567              :                 }
    1568              :               else
    1569              :                 prev_first_load = first_load;
    1570              :            }
    1571              :         }
    1572              :       /* Non-grouped store or load.  */
    1573     11567695 :       else if (ldst_p)
    1574              :         {
    1575       882169 :           if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))
    1576       613070 :               && rhs_code != CFN_GATHER_LOAD
    1577              :               && rhs_code != CFN_MASK_GATHER_LOAD
    1578              :               && rhs_code != CFN_MASK_LEN_GATHER_LOAD
    1579              :               && rhs_code != CFN_SCATTER_STORE
    1580              :               && rhs_code != CFN_MASK_SCATTER_STORE
    1581              :               && rhs_code != CFN_MASK_LEN_SCATTER_STORE
    1582       613070 :               && !STMT_VINFO_GATHER_SCATTER_P (stmt_info)
    1583              :               /* Not grouped loads are handled as externals for BB
    1584              :                  vectorization.  For loop vectorization we can handle
    1585              :                  splats the same we handle single element interleaving.
    1586              :                  Likewise we can handle a collection of invariant refs.  */
    1587      1476406 :               && (is_a <bb_vec_info> (vinfo)
    1588       594237 :                   || (stmt_info != first_stmt_info
    1589        67872 :                   && !(integer_zerop (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)))
    1590          241 :                       && integer_zerop (DR_STEP (STMT_VINFO_DATA_REF
    1591              :                                                          (first_stmt_info)))))))
    1592              :             {
    1593              :               /* Not grouped load.  */
    1594        67390 :               if (dump_enabled_p ())
    1595          133 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1596              :                                  "Build SLP failed: not grouped load %G", stmt);
    1597              : 
    1598        67390 :               if (i != 0)
    1599        67390 :                 continue;
    1600              :               /* Fatal mismatch.  */
    1601            0 :               matches[0] = false;
    1602            0 :               return false;
    1603              :             }
    1604              :         }
    1605              :       /* Not memory operation.  */
    1606              :       else
    1607              :         {
    1608     10685526 :           if (!phi_p
    1609     10024262 :               && rhs_code.is_tree_code ()
    1610      9975722 :               && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_binary
    1611      1516058 :               && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_unary
    1612       939599 :               && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_expression
    1613       877672 :               && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_comparison
    1614        65008 :               && rhs_code != VIEW_CONVERT_EXPR
    1615              :               && rhs_code != CALL_EXPR
    1616              :               && rhs_code != BIT_FIELD_REF
    1617     10685526 :               && rhs_code != SSA_NAME)
    1618              :             {
    1619        18515 :               if (dump_enabled_p ())
    1620            7 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1621              :                                  "Build SLP failed: operation unsupported %G",
    1622              :                                  stmt);
    1623        18515 :               if (is_a <bb_vec_info> (vinfo) && i != 0)
    1624            0 :                 continue;
    1625              :               /* Fatal mismatch.  */
    1626        18515 :               matches[0] = false;
    1627        18515 :               return false;
    1628              :             }
    1629              : 
    1630     10667011 :           if (rhs_code == COND_EXPR)
    1631              :             {
    1632        59046 :               tree cond_expr = gimple_assign_rhs1 (stmt);
    1633        59046 :               enum tree_code cond_code = TREE_CODE (cond_expr);
    1634        59046 :               enum tree_code swap_code = ERROR_MARK;
    1635        59046 :               enum tree_code invert_code = ERROR_MARK;
    1636              : 
    1637        59046 :               if (i == 0)
    1638        49815 :                 first_cond_code = TREE_CODE (cond_expr);
    1639         9231 :               else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
    1640              :                 {
    1641            0 :                   bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
    1642            0 :                   swap_code = swap_tree_comparison (cond_code);
    1643            0 :                   invert_code = invert_tree_comparison (cond_code, honor_nans);
    1644              :                 }
    1645              : 
    1646        59046 :               if (first_cond_code == cond_code)
    1647              :                 ;
    1648              :               /* Isomorphic can be achieved by swapping.  */
    1649            0 :               else if (first_cond_code == swap_code)
    1650            0 :                 swap[i] = 1;
    1651              :               /* Isomorphic can be achieved by inverting.  */
    1652            0 :               else if (first_cond_code == invert_code)
    1653            0 :                 swap[i] = 2;
    1654              :               else
    1655              :                 {
    1656            0 :                   if (dump_enabled_p ())
    1657            0 :                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1658              :                                      "Build SLP failed: different"
    1659              :                                      " operation %G", stmt);
    1660              :                   /* Mismatch.  */
    1661            0 :                   continue;
    1662              :                 }
    1663              :             }
    1664              : 
    1665     10667011 :           if (i != 0
    1666      7906363 :               && first_stmt_code != rhs_code
    1667        68691 :               && first_stmt_code.is_tree_code ()
    1668        68689 :               && rhs_code.is_tree_code ()
    1669        68689 :               && TREE_CODE_CLASS ((tree_code)first_stmt_code) == tcc_comparison
    1670     10706636 :               && (swap_tree_comparison ((tree_code)first_stmt_code)
    1671        39625 :                   == (tree_code)rhs_code))
    1672        39625 :             swap[i] = 1;
    1673              : 
    1674     10667011 :           if (i != 0
    1675      7906363 :               && first_reduc_idx != STMT_VINFO_REDUC_IDX (stmt_info)
    1676         1566 :               && first_reduc_idx != -1
    1677         1566 :               && STMT_VINFO_REDUC_IDX (stmt_info) != -1
    1678         1566 :               && rhs_code.is_tree_code ()
    1679         1566 :               && commutative_tree_code (tree_code (rhs_code))
    1680     10668577 :               && first_reduc_idx == 1 - STMT_VINFO_REDUC_IDX (stmt_info))
    1681         1566 :             swap[i] = 1;
    1682              :         }
    1683              : 
    1684     15269716 :       matches[i] = true;
    1685              :     }
    1686              : 
    1687     20276853 :   for (i = 0; i < group_size; ++i)
    1688     15950345 :     if (!matches[i])
    1689              :       return false;
    1690              : 
    1691              :   /* If we allowed a two-operation SLP node verify the target can cope
    1692              :      with the permute we are going to use.  */
    1693      4326508 :   if (alt_stmt_code != ERROR_MARK
    1694      4326508 :       && (!alt_stmt_code.is_tree_code ()
    1695        53525 :           || (TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_reference
    1696        53525 :               && TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_comparison)))
    1697              :     {
    1698        14473 :       *two_operators = true;
    1699              :     }
    1700              : 
    1701      4326508 :   if (maybe_soft_fail)
    1702              :     {
    1703       140440 :       unsigned HOST_WIDE_INT const_nunits;
    1704       140440 :       if (!TYPE_VECTOR_SUBPARTS
    1705       140440 :             (soft_fail_nunits_vectype).is_constant (&const_nunits)
    1706       140440 :           || const_nunits > group_size)
    1707            0 :         matches[0] = false;
    1708              :       else
    1709              :         {
    1710              :           /* With constant vector elements simulate a mismatch at the
    1711              :              point we need to split.  */
    1712       140440 :           unsigned tail = group_size & (const_nunits - 1);
    1713       140440 :           memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
    1714              :         }
    1715       140440 :       return false;
    1716              :     }
    1717              : 
    1718              :   return true;
    1719              : }
    1720              : 
    1721              : /* Hash traits keyed on a vector of scalar stmts (one stmt_vec_info per
    1722              :    lane), used by the map recording SLP builds for a stmt set.  Note we
    1723              :    never remove apart from at destruction time so we do not need a special value for deleted that differs from empty.  */
    1724              : struct bst_traits
    1725              : {
    1726              :   typedef vec <stmt_vec_info> value_type;
    1727              :   typedef vec <stmt_vec_info> compare_type;
    1728              :   static inline hashval_t hash (value_type);
    1729              :   static inline bool equal (value_type existing, value_type candidate);
    1730    476459549 :   static inline bool is_empty (value_type x) { return !x.exists (); }  /* Empty slot: exists () is false.  */
    1731    106787001 :   static inline bool is_deleted (value_type x) { return !x.exists (); }  /* Deliberately the same test as is_empty.  */
    1732              :   static const bool empty_zero_p = true;
    1733            0 :   static inline void mark_empty (value_type &x) { x.release (); }  /* Emptying, deleting and removing all just release the vec.  */
    1734              :   static inline void mark_deleted (value_type &x) { x.release (); }
    1735      9155919 :   static inline void remove (value_type &x) { x.release (); }
    1736              : };
    1737              : inline hashval_t  /* Hash X by mixing in one value per lane, in lane order.  */
    1738     93009311 : bst_traits::hash (value_type x)
    1739              : {
    1740     93009311 :   inchash::hash h;
    1741    422222311 :   for (unsigned i = 0; i < x.length (); ++i)
    1742    329213000 :     h.add_int (x[i] ? gimple_uid (x[i]->stmt) : -1);  /* The stmt's gimple uid, or -1 for a NULL lane.  */
    1743     93009311 :   return h.end ();
    1744              : }
    1745              : inline bool  /* Return true iff EXISTING and CANDIDATE have the same length and the same stmt_vec_info in every lane.  */
    1746     81408377 : bst_traits::equal (value_type existing, value_type candidate)
    1747              : {
    1748    244225131 :   if (existing.length () != candidate.length ())
    1749              :     return false;
    1750     82862564 :   for (unsigned i = 0; i < existing.length (); ++i)
    1751     78536704 :     if (existing[i] != candidate[i])  /* Pointer identity of the stmt infos, lane by lane.  */
    1752              :       return false;
    1753              :   return true;
    1754              : }
    1755              : 
    1756              : typedef hash_map <vec <stmt_vec_info>, slp_tree,
    1757              :                   simple_hashmap_traits <bst_traits, slp_tree> >
    1758              :   scalar_stmts_to_slp_tree_map_t;  /* Maps a vector of scalar stmts (hashed and compared via bst_traits) to an SLP node.  */
    1759              : 
    1760              : /* Release BST_MAP: free every SLP node recorded in it, then delete the map itself.  */
    1761              : 
    1762              : static void
    1763      1782071 : release_scalar_stmts_to_slp_tree_map (scalar_stmts_to_slp_tree_map_t *bst_map)
    1764              : {
    1765              :   /* The map keeps a reference on SLP nodes built, release that.  Entries without a node are skipped.  */
    1766     10937990 :   for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
    1767     20093909 :        it != bst_map->end (); ++it)
    1768      9155919 :     if ((*it).second)
    1769      9155919 :       vect_free_slp_tree ((*it).second);
    1770      1782071 :   delete bst_map;
    1771      1782071 : }
    1772              : 
    1773              : /* One operand of a linearized associatable chain: the operand OP, its def type DT and the operation CODE recorded for it.
    1774              :    ???  This was std::pair<std::pair<tree_code, vect_def_type>, tree> but then vec::insert does memmove and that's not compatible with
    1775              :    std::pair.  */
    1776              : struct chain_op_t
    1777              : {
    1778      3627176 :   chain_op_t (tree_code code_, vect_def_type dt_, tree op_)
    1779      3627176 :       : code (code_), dt (dt_), op (op_) {}
    1780              :   tree_code code;  /* Operation recorded for this operand.  */
    1781              :   vect_def_type dt;  /* Def type of OP.  */
    1782              :   tree op;  /* The operand itself.  */
    1783              : };
    1784              : 
    1785              : /* Comparator for sorting associatable chains: orders chain_op_t entries by def type first and by operation code second.  The third argument is unused.  */
    1786              : 
    1787              : static int
    1788      8142393 : dt_sort_cmp (const void *op1_, const void *op2_, void *)
    1789              : {
    1790      8142393 :   auto *op1 = (const chain_op_t *) op1_;
    1791      8142393 :   auto *op2 = (const chain_op_t *) op2_;
    1792      8142393 :   if (op1->dt != op2->dt)
    1793       936800 :     return (int)op1->dt - (int)op2->dt;  /* Primary key: def type.  */
    1794      7205593 :   return (int)op1->code - (int)op2->code;  /* Secondary key: operation code.  */
    1795              : }
    1796              : 
    1797              : /* Linearize the associatable expression chain at START with the
    1798              :    associatable operation CODE (where PLUS_EXPR also allows MINUS_EXPR),
    1799              :    filling CHAIN with the leaf operands and using WORKLIST as intermediate storage.  Each CHAIN entry records the operand, its def type and a sign-adjusted operation code.
    1800              :    CODE_STMT and ALT_CODE_STMT, when still NULL, are set to the first visited stmt using CODE
    1801              :    respectively MINUS_EXPR.  *CHAIN_STMTS if not NULL is filled with all computation
    1802              :    stmts, starting with START.  When ALLOW_ALT_CODE is false, do not
    1803              :    follow into MINUS_EXPR when building a PLUS chain (treat MINUS as leaf).  */
    1804              : 
    1805              : static void
    1806      1639303 : vect_slp_linearize_chain (vec_info *vinfo,
    1807              :                           vec<std::pair<tree_code, gimple *> > &worklist,
    1808              :                           vec<chain_op_t> &chain,
    1809              :                           enum tree_code code, gimple *start,
    1810              :                           gimple *&code_stmt, gimple *&alt_code_stmt,
    1811              :                           vec<gimple *> *chain_stmts,
    1812              :                           bool allow_alt_code = true)
    1813              : {
    1814              :   /* For each lane linearize the addition/subtraction (or other
    1815              :      uniform associatable operation) expression tree.  WORKLIST entries pair a stmt with the code it is reached with.  */
    1816      1639303 :   worklist.safe_push (std::make_pair (code, start));
    1817      3627176 :   while (!worklist.is_empty ())
    1818              :     {
    1819      1987873 :       auto entry = worklist.pop ();
    1820      1987873 :       gassign *stmt = as_a <gassign *> (entry.second);
    1821      1987873 :       enum tree_code in_code = entry.first;
    1822      3975746 :       enum tree_code this_code = gimple_assign_rhs_code (stmt);
    1823              :       /* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE; values already set are kept.  */
    1824      1987873 :       if (!code_stmt
    1825      1987873 :           && gimple_assign_rhs_code (stmt) == code)
    1826      1393258 :         code_stmt = stmt;
    1827       594615 :       else if (!alt_code_stmt
    1828       594615 :                && gimple_assign_rhs_code (stmt) == MINUS_EXPR)
    1829       298798 :         alt_code_stmt = stmt;
    1830      1987873 :       if (chain_stmts)
    1831      1915076 :         chain_stmts->safe_push (stmt);
    1832      5963619 :       for (unsigned opnum = 1; opnum <= 2; ++opnum)  /* Visit gimple operands 1 and 2 of STMT.  */
    1833              :         {
    1834      3975746 :           tree op = gimple_op (stmt, opnum);
    1835      3975746 :           vect_def_type dt;
    1836      3975746 :           stmt_vec_info def_stmt_info;
    1837      3975746 :           bool res = vect_is_simple_use (op, vinfo, &dt, &def_stmt_info);
    1838      3975746 :           gcc_assert (res);
    1839      3975746 :           if (dt == vect_internal_def
    1840      3975746 :               && is_pattern_stmt_p (def_stmt_info))
    1841         8665 :             op = gimple_get_lhs (def_stmt_info->stmt);  /* Continue with the LHS of the pattern stmt.  */
    1842      3975746 :           gimple *use_stmt;
    1843      3975746 :           use_operand_p use_p;
    1844      3975746 :           if (dt == vect_internal_def
    1845      3694598 :               && single_imm_use (op, &use_p, &use_stmt)
    1846      2282959 :               && is_gimple_assign (def_stmt_info->stmt)
    1847      6080211 :               && (gimple_assign_rhs_code (def_stmt_info->stmt) == code
    1848      1756207 :                   || (allow_alt_code
    1849        49379 :                       && code == PLUS_EXPR
    1850        28840 :                       && (gimple_assign_rhs_code (def_stmt_info->stmt)
    1851              :                           == MINUS_EXPR))))
    1852              :             {  /* Single-use internal def computed by CODE (or by MINUS_EXPR in a PLUS chain): queue its def stmt.  */
    1853       348570 :               tree_code op_def_code = this_code;
    1854       348570 :               if (op_def_code == MINUS_EXPR && opnum == 1)
    1855        51026 :                 op_def_code = PLUS_EXPR;  /* Operand 1 of a subtraction is added.  */
    1856       348570 :               if (in_code == MINUS_EXPR)
    1857          135 :                 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;  /* STMT itself was reached as subtracted: flip the sign.  */
    1858       348570 :               worklist.safe_push (std::make_pair (op_def_code,
    1859       348570 :                                                   def_stmt_info->stmt));
    1860              :             }
    1861              :           else
    1862              :             {  /* Leaf: record the operand in CHAIN with the same sign adjustment.  */
    1863      3627176 :               tree_code op_def_code = this_code;
    1864      3627176 :               if (op_def_code == MINUS_EXPR && opnum == 1)
    1865       247889 :                 op_def_code = PLUS_EXPR;
    1866      3627176 :               if (in_code == MINUS_EXPR)
    1867         4051 :                 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
    1868      3627176 :               chain.safe_push (chain_op_t (op_def_code, dt, op));
    1869              :             }
    1870              :         }
    1871              :     }
    1872      1639303 : }
    1873              : 
    1874              : static slp_tree
    1875              : vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
    1876              :                        vec<stmt_vec_info> stmts, unsigned int group_size,
    1877              :                        poly_uint64 *max_nunits,
    1878              :                        bool *matches, unsigned *limit, unsigned *tree_size,
    1879              :                        scalar_stmts_to_slp_tree_map_t *bst_map);  /* Forward declaration: defined further down, called by vect_build_slp_tree.  */
    1880              : /* Build, or re-use via BST_MAP, the SLP node for STMTS.  Return NULL on failure, with MATCHES holding the per-lane match flags; multi-lane discovery consumes *LIMIT.  */
    1881              : static slp_tree
    1882      6192511 : vect_build_slp_tree (vec_info *vinfo,
    1883              :                      vec<stmt_vec_info> stmts, unsigned int group_size,
    1884              :                      poly_uint64 *max_nunits,
    1885              :                      bool *matches, unsigned *limit, unsigned *tree_size,
    1886              :                      scalar_stmts_to_slp_tree_map_t *bst_map)
    1887              : {
    1888      6192511 :   if (slp_tree *leader = bst_map->get (stmts))  /* A node for these stmts was already recorded.  */
    1889              :     {
    1890       478533 :       if (dump_enabled_p ())
    1891        17108 :         dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
    1892        17108 :                          !(*leader)->failed ? "" : "failed ",
    1893              :                          (void *) *leader);
    1894       478533 :       if (!(*leader)->failed)
    1895              :         {
    1896       431669 :           SLP_TREE_REF_COUNT (*leader)++;  /* The caller gets its own reference.  */
    1897       431669 :           vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
    1898       431669 :           stmts.release ();  /* The re-used node does not take STMTS.  */
    1899       431669 :           return *leader;
    1900              :         }
    1901        46864 :       memcpy (matches, (*leader)->failed, sizeof (bool) * group_size);  /* Replay the match flags saved by the failed build.  */
    1902        46864 :       return NULL;
    1903              :     }
    1904              : 
    1905              :   /* Single-lane SLP doesn't have the chance of run-away, do not account
    1906              :      it to the limit.  Multi-lane discovery fails once *LIMIT reaches zero.  */
    1907      5713978 :   if (stmts.length () > 1)
    1908              :     {
    1909      3141938 :       if (*limit == 0)
    1910              :         {
    1911         1235 :           if (dump_enabled_p ())
    1912           12 :             dump_printf_loc (MSG_NOTE, vect_location,
    1913              :                              "SLP discovery limit exceeded\n");
    1914         1235 :           memset (matches, 0, sizeof (bool) * group_size);  /* Report every lane as mismatching.  */
    1915         1235 :           return NULL;
    1916              :         }
    1917      3140703 :       --*limit;
    1918              :     }
    1919              : 
    1920              :   /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
    1921              :      so we can pick up backedge destinations during discovery.  The map key is a copy of STMTS.  */
    1922      5712743 :   slp_tree res = new _slp_tree;
    1923      5712743 :   SLP_TREE_DEF_TYPE (res) = vect_internal_def;
    1924      5712743 :   SLP_TREE_SCALAR_STMTS (res) = stmts;
    1925      5712743 :   bst_map->put (stmts.copy (), res);
    1926              : 
    1927      5712743 :   if (dump_enabled_p ())
    1928       145814 :     dump_printf_loc (MSG_NOTE, vect_location,
    1929              :                      "starting SLP discovery for node %p\n", (void *) res);
    1930              : 
    1931      5712743 :   poly_uint64 this_max_nunits = 1;
    1932      5712743 :   slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
    1933              :                                         &this_max_nunits,
    1934              :                                         matches, limit, tree_size, bst_map);
    1935      5712743 :   if (!res_)
    1936              :     {
    1937      1982177 :       if (dump_enabled_p ())
    1938         8285 :         dump_printf_loc (MSG_NOTE, vect_location,
    1939              :                          "SLP discovery for node %p failed\n", (void *) res);
    1940              :       /* Mark the node invalid so we can detect those when still in use
    1941              :          as backedge destinations.  The stub stays in BST_MAP carrying the match flags in its failed array.  */
    1942      1982177 :       SLP_TREE_SCALAR_STMTS (res) = vNULL;
    1943      1982177 :       SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
    1944      1982177 :       res->failed = XNEWVEC (bool, group_size);
    1945      1982177 :       if (flag_checking)  /* A failed build must have flagged at least one lane as mismatching.  */
    1946              :         {
    1947              :           unsigned i;
    1948      3520626 :           for (i = 0; i < group_size; ++i)
    1949      3520626 :             if (!matches[i])
    1950              :               break;
    1951      1982177 :           gcc_assert (i < group_size);
    1952              :         }
    1953      1982177 :       memcpy (res->failed, matches, sizeof (bool) * group_size);
    1954              :     }
    1955              :   else
    1956              :     {
    1957      3730566 :       if (dump_enabled_p ())
    1958       137529 :         dump_printf_loc (MSG_NOTE, vect_location,
    1959              :                          "SLP discovery for node %p succeeded\n",
    1960              :                          (void *) res);
    1961      3730566 :       gcc_assert (res_ == res);  /* The stub itself must have been filled in.  */
    1962      3730566 :       res->max_nunits = this_max_nunits;
    1963      3730566 :       vect_update_max_nunits (max_nunits, this_max_nunits);
    1964              :       /* Keep a reference for the bst_map use, in addition to the one returned to the caller.  */
    1965      3730566 :       SLP_TREE_REF_COUNT (res)++;
    1966              :     }
    1967              :   return res_;
    1968              : }
    1969              : 
    1970              : /* Helper for building an associated SLP node chain: make PERM a VEC_PERM_EXPR node with lane permutation LPERM over two new internal-def nodes, both with children OP0 and OP1 and represented by OPER1 and OPER2 respectively.  All three nodes get VECTYPE and the lane count of OP1.  */
    1971              : 
    1972              : static void
    1973          178 : vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype,
    1974              :                                    slp_tree op0, slp_tree op1,
    1975              :                                    stmt_vec_info oper1, stmt_vec_info oper2,
    1976              :                                    vec<std::pair<unsigned, unsigned> > lperm)
    1977              : {
    1978          178 :   unsigned group_size = SLP_TREE_LANES (op1);
    1979              : 
    1980          178 :   slp_tree child1 = new _slp_tree;  /* First operation node, represented by OPER1.  */
    1981          178 :   SLP_TREE_DEF_TYPE (child1) = vect_internal_def;
    1982          178 :   SLP_TREE_VECTYPE (child1) = vectype;
    1983          178 :   SLP_TREE_LANES (child1) = group_size;
    1984          178 :   SLP_TREE_CHILDREN (child1).create (2);
    1985          178 :   SLP_TREE_CHILDREN (child1).quick_push (op0);
    1986          178 :   SLP_TREE_CHILDREN (child1).quick_push (op1);
    1987          178 :   SLP_TREE_REPRESENTATIVE (child1) = oper1;
    1988              : 
    1989          178 :   slp_tree child2 = new _slp_tree;  /* Second operation node, represented by OPER2.  */
    1990          178 :   SLP_TREE_DEF_TYPE (child2) = vect_internal_def;
    1991          178 :   SLP_TREE_VECTYPE (child2) = vectype;
    1992          178 :   SLP_TREE_LANES (child2) = group_size;
    1993          178 :   SLP_TREE_CHILDREN (child2).create (2);
    1994          178 :   SLP_TREE_CHILDREN (child2).quick_push (op0);
    1995          178 :   SLP_TREE_REF_COUNT (op0)++;  /* OP0 is now also a child of child2.  */
    1996          178 :   SLP_TREE_CHILDREN (child2).quick_push (op1);
    1997          178 :   SLP_TREE_REF_COUNT (op1)++;  /* Likewise OP1.  */
    1998          178 :   SLP_TREE_REPRESENTATIVE (child2) = oper2;
    1999              : 
    2000          178 :   SLP_TREE_DEF_TYPE (perm) = vect_internal_def;
    2001          178 :   SLP_TREE_CODE (perm) = VEC_PERM_EXPR;
    2002          178 :   SLP_TREE_VECTYPE (perm) = vectype;
    2003          178 :   SLP_TREE_LANES (perm) = group_size;
    2004              :   /* ???  We should set this NULL but that's not expected.  */
    2005          178 :   SLP_TREE_REPRESENTATIVE (perm) = oper1;
    2006          178 :   SLP_TREE_LANE_PERMUTATION (perm) = lperm;
    2007          178 :   SLP_TREE_CHILDREN (perm).quick_push (child1);  /* NOTE(review): no create () here; presumably the caller reserved room for two children - confirm.  */
    2008          178 :   SLP_TREE_CHILDREN (perm).quick_push (child2);
    2009          178 : }
    2010              : 
    2011              : /* Recursively build an SLP tree starting from NODE.
    2012              :    Fail (and return NULL) if def-stmts are not
    2013              :    isomorphic, require data permutation or are of unsupported types of
    2014              :    operation.  Otherwise, return the built SLP node, which the caller
    2015              :    expects to be NODE itself.  MATCHES receives the per-lane match
    2016              :    flags; MATCHES[0] is cleared on entry.  */
    2017              : 
    2018              : static slp_tree
    2019      5712743 : vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
    2020              :                        vec<stmt_vec_info> stmts, unsigned int group_size,
    2021              :                        poly_uint64 *max_nunits,
    2022              :                        bool *matches, unsigned *limit, unsigned *tree_size,
    2023              :                        scalar_stmts_to_slp_tree_map_t *bst_map)
    2024              : {
    2025      5712743 :   unsigned nops, i, this_tree_size = 0;
    2026      5712743 :   poly_uint64 this_max_nunits = *max_nunits;
    2027              : 
    2028      5712743 :   matches[0] = false;
    2029              : 
    2030      5712743 :   stmt_vec_info stmt_info = stmts[0];
    2031      5712743 :   if (!is_a<gcall *> (stmt_info->stmt)
    2032              :       && !is_a<gassign *> (stmt_info->stmt)
    2033              :       && !is_a<gphi *> (stmt_info->stmt))
    2034              :     return NULL;
    2035              : 
    2036      5712672 :   nops = gimple_num_args (stmt_info->stmt);
    2037      5712672 :   if (const int *map = vect_get_operand_map (stmt_info))
    2038        35090 :     nops = map[0];
    2039              : 
    2040              :   /* If the SLP node is a PHI (induction or reduction), terminate
    2041              :      the recursion.  */
    2042      5712672 :   bool *skip_args = XALLOCAVEC (bool, nops);
    2043      5712672 :   memset (skip_args, 0, sizeof (bool) * nops);
    2044      5712672 :   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
    2045      2768061 :     if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
    2046              :       {
    2047       298015 :         tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
    2048       298015 :         tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
    2049              :                                                     group_size);
    2050       298015 :         if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
    2051              :                                      max_nunits))
    2052              :           return NULL;
    2053              : 
    2054       294141 :         vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
    2055       294141 :         if (def_type == vect_induction_def)
    2056              :           {
    2057              :             /* Induction PHIs are not cycles but walk the initial
    2058              :                value.  Only for inner loops through, for outer loops
    2059              :                we need to pick up the value from the actual PHIs
    2060              :                to more easily support peeling and epilogue vectorization.  */
    2061       188288 :             class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
    2062       188288 :             if (!nested_in_vect_loop_p (loop, stmt_info))
    2063       187464 :               skip_args[loop_preheader_edge (loop)->dest_idx] = true;
    2064              :             else
    2065              :               loop = loop->inner;
    2066       188288 :             skip_args[loop_latch_edge (loop)->dest_idx] = true;
    2067              :           }
    2068       105853 :         else if (def_type == vect_reduction_def
    2069              :                  || def_type == vect_double_reduction_def
    2070              :                  || def_type == vect_nested_cycle
    2071       105853 :                  || def_type == vect_first_order_recurrence)
    2072              :           {
    2073              :             /* Else def types have to match.  */
    2074              :             stmt_vec_info other_info;
    2075              :             bool all_same = true;
    2076       239614 :             FOR_EACH_VEC_ELT (stmts, i, other_info)
    2077              :               {
    2078       135075 :                 if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
    2079      1739083 :                   return NULL;
    2080       135069 :                 if (other_info != stmt_info)
    2081        26183 :                   all_same = false;
    2082              :               }
    2083       104539 :             class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
    2084              :             /* Reduction initial values are not explicitly represented.  */
    2085       104539 :             if (def_type != vect_first_order_recurrence
    2086       104539 :                 && gimple_bb (stmt_info->stmt) == loop->header)
    2087       101394 :               skip_args[loop_preheader_edge (loop)->dest_idx] = true;
    2088              :             /* Reduction chain backedge defs are filled manually.
    2089              :                ???  Need a better way to identify a SLP reduction chain PHI.
    2090              :                Or a better overall way to SLP match those.  */
    2091       104539 :             if (stmts.length () > 1
    2092       104539 :                 && all_same && def_type == vect_reduction_def)
    2093         2311 :               skip_args[loop_latch_edge (loop)->dest_idx] = true;
    2094              :           }
    2095         1308 :         else if (def_type != vect_internal_def)
    2096              :           return NULL;
    2097              :       }
    2098              : 
    2099              : 
    2100      5708792 :   bool two_operators = false;
    2101      5708792 :   unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
    2102      5708792 :   tree vectype = NULL_TREE;
    2103      5708792 :   if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
    2104              :                               &this_max_nunits, matches, &two_operators,
    2105              :                               &vectype))
    2106              :     return NULL;
    2107              : 
    2108              :   /* If the SLP node is a load, terminate the recursion unless masked.  */
    2109      4186068 :   if (STMT_VINFO_DATA_REF (stmt_info)
    2110      2022333 :       && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
    2111              :     {
    2112       895949 :       if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
    2113              :         gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
    2114              :       else
    2115              :         {
    2116       877367 :           *max_nunits = this_max_nunits;
    2117       877367 :           (*tree_size)++;
    2118       877367 :           node = vect_create_new_slp_node (node, stmts, 0);
    2119       877367 :           SLP_TREE_VECTYPE (node) = vectype;
    2120              :           /* And compute the load permutation.  Whether it is actually
    2121              :              a permutation depends on the unrolling factor which is
    2122              :              decided later.  */
    2123       877367 :           vec<unsigned> load_permutation;
    2124       877367 :           int j;
    2125       877367 :           stmt_vec_info load_info;
    2126       877367 :           load_permutation.create (group_size);
    2127       877367 :           stmt_vec_info first_stmt_info
    2128       877367 :             = STMT_VINFO_GROUPED_ACCESS (stmt_info)
    2129       877367 :               ? DR_GROUP_FIRST_ELEMENT (stmt_info) : stmt_info;
    2130       877367 :           bool any_permute = false;
    2131      2114986 :           FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
    2132              :             {
    2133      1237619 :               int load_place;
    2134      1237619 :               if (! load_info)
    2135              :                 {
    2136        39944 :                   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    2137              :                     load_place = j;
    2138              :                   else
    2139              :                     load_place = 0;
    2140              :                 }
    2141      1197675 :               else if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    2142       698766 :                 load_place = vect_get_place_in_interleaving_chain
    2143       698766 :                     (load_info, first_stmt_info);
    2144              :               else
    2145              :                 /* Recognize the splat case as { 0, 0, ... } but make
    2146              :                    sure to use the appropriate refs for collections
    2147              :                    of invariant refs.  */
    2148       498909 :                 load_place = (load_info == stmt_info) ? 0 : j;
    2149       738951 :               gcc_assert (load_place != -1);
    2150      1237619 :               any_permute |= load_place != j;
    2151      1237619 :               load_permutation.quick_push (load_place);
    2152              :             }
    2153              : 
    2154       877367 :           if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    2155              :             {
    2156         3406 :               gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD));
    2157         3406 :               bool has_gaps = false;
    2158         3406 :               if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    2159          209 :                 for (stmt_vec_info si = DR_GROUP_NEXT_ELEMENT (first_stmt_info);
    2160         1346 :                      si; si = DR_GROUP_NEXT_ELEMENT (si))
    2161         1137 :                   if (DR_GROUP_GAP (si) != 1)
    2162          160 :                     has_gaps = true;
    2163              :               /* We cannot handle permuted masked loads directly, see
    2164              :                  PR114375.  We cannot handle strided masked loads or masked
    2165              :                  loads with gaps unless the mask is uniform.  */
    2166         3406 :               if ((STMT_VINFO_GROUPED_ACCESS (stmt_info)
    2167          209 :                    && (DR_GROUP_GAP (first_stmt_info) != 0
    2168          149 :                        || (has_gaps
    2169           55 :                            && STMT_VINFO_SLP_VECT_ONLY (first_stmt_info))))
    2170         6717 :                   || STMT_VINFO_STRIDED_P (stmt_info))
    2171              :                 {
    2172          108 :                   load_permutation.release ();
    2173          108 :                   matches[0] = false;
    2174       874113 :                   return NULL;
    2175              :                 }
    2176              : 
    2177              :               /* For permuted masked loads do an unpermuted masked load of
    2178              :                  the whole group followed by a SLP permute node.  */
    2179         3298 :               if (any_permute
    2180         3298 :                   || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
    2181           84 :                       && DR_GROUP_SIZE (first_stmt_info) != group_size))
    2182              :                 {
    2183              :                   /* Discover the whole unpermuted load.  */
    2184           44 :                   vec<stmt_vec_info> stmts2;
    2185           44 :                   unsigned dr_group_size = STMT_VINFO_GROUPED_ACCESS (stmt_info)
    2186           78 :                       ? DR_GROUP_SIZE (first_stmt_info) : 1;
    2187           44 :                   stmts2.create (dr_group_size);
    2188           44 :                   stmts2.quick_grow_cleared (dr_group_size);
    2189           44 :                   unsigned i = 0;
    2190           44 :                   for (stmt_vec_info si = first_stmt_info;
    2191          594 :                        si; si = DR_GROUP_NEXT_ELEMENT (si))
    2192              :                     {
    2193          550 :                       if (si != first_stmt_info)
    2194         2106 :                         for (unsigned k = 1; k < DR_GROUP_GAP (si); ++k)
    2195         1600 :                           stmts2[i++] = NULL;
    2196          550 :                       stmts2[i++] = si;
    2197              :                     }
    2198           44 :                   bool *matches2 = XALLOCAVEC (bool, dr_group_size);
    2199           44 :                   slp_tree unperm_load
    2200           44 :                     = vect_build_slp_tree (vinfo, stmts2, dr_group_size,
    2201              :                                            &this_max_nunits, matches2, limit,
    2202           44 :                                            &this_tree_size, bst_map);
    2203              :                   /* When we are able to do the full masked load emit that
    2204              :                      followed by 'node' being the desired final permutation.  */
    2205           44 :                   if (unperm_load)
    2206              :                     {
    2207           16 :                       gcc_assert
    2208              :                         (!SLP_TREE_LOAD_PERMUTATION (unperm_load).exists ());
    2209           16 :                       lane_permutation_t lperm;
    2210           16 :                       lperm.create (group_size);
    2211           56 :                       for (unsigned j = 0; j < load_permutation.length (); ++j)
    2212           40 :                         lperm.quick_push
    2213           40 :                           (std::make_pair (0, load_permutation[j]));
    2214           16 :                       SLP_TREE_CODE (node) = VEC_PERM_EXPR;
    2215           16 :                       SLP_TREE_CHILDREN (node).safe_push (unperm_load);
    2216           16 :                       SLP_TREE_LANE_PERMUTATION (node) = lperm;
    2217           16 :                       load_permutation.release ();
    2218           16 :                       return node;
    2219              :                     }
    2220           28 :                   stmts2.release ();
    2221           28 :                   load_permutation.release ();
    2222           28 :                   matches[0] = false;
    2223           28 :                   return NULL;
    2224              :                 }
    2225         3254 :               load_permutation.release ();
    2226              :             }
    2227              :           else
    2228              :             {
    2229       873961 :               if (!any_permute
    2230       761472 :                   && STMT_VINFO_GROUPED_ACCESS (stmt_info)
    2231      1162510 :                   && group_size == DR_GROUP_SIZE (first_stmt_info))
    2232       126151 :                 load_permutation.release ();
    2233       873961 :               SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
    2234       873961 :               return node;
    2235              :             }
    2236              :         }
    2237              :     }
    2238      3290119 :   else if (gimple_assign_single_p (stmt_info->stmt)
    2239      2257228 :            && !gimple_vuse (stmt_info->stmt)
    2240      3297915 :            && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
    2241              :     {
    2242              :       /* vect_build_slp_tree_2 determined all BIT_FIELD_REFs reference
    2243              :          the same SSA name vector of a compatible type to vectype.  */
    2244         2367 :       vec<std::pair<unsigned, unsigned> > lperm = vNULL;
    2245         2367 :       tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
    2246         2367 :       stmt_vec_info estmt_info;
    2247         7443 :       FOR_EACH_VEC_ELT (stmts, i, estmt_info)
    2248              :         {
    2249         5223 :           gassign *estmt = as_a <gassign *> (estmt_info->stmt);
    2250         5223 :           tree bfref = gimple_assign_rhs1 (estmt);
    2251         5223 :           HOST_WIDE_INT lane;
    2252         5223 :           if (!known_eq (bit_field_size (bfref),
    2253              :                          tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
    2254        10299 :               || !constant_multiple_p (bit_field_offset (bfref),
    2255         5076 :                                        bit_field_size (bfref), &lane))
    2256              :             {
    2257          147 :               lperm.release ();
    2258          147 :               matches[0] = false;
    2259          147 :               return NULL;
    2260              :             }
    2261         5076 :           lperm.safe_push (std::make_pair (0, (unsigned)lane));
    2262              :         }
    2263         2220 :       slp_tree vnode = vect_create_new_slp_node (vNULL);
    2264         2220 :       if (operand_equal_p (TYPE_SIZE (vectype), TYPE_SIZE (TREE_TYPE (vec))))
    2265              :         /* ???  We record vectype here but we hide eventually necessary
    2266              :            punning and instead rely on code generation to materialize
    2267              :            VIEW_CONVERT_EXPRs as necessary.  We instead should make
    2268              :            this explicit somehow.  */
    2269          704 :         SLP_TREE_VECTYPE (vnode) = vectype;
    2270              :       else
    2271              :         {
    2272              :           /* For different size but compatible elements we can still
    2273              :              use VEC_PERM_EXPR without punning.  */
    2274         1516 :           gcc_assert (VECTOR_TYPE_P (TREE_TYPE (vec))
    2275              :                       && types_compatible_p (TREE_TYPE (vectype),
    2276              :                                              TREE_TYPE (TREE_TYPE (vec))));
    2277         1516 :           SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
    2278              :         }
    2279         2220 :       auto nunits = TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (vnode));
    2280         2220 :       unsigned HOST_WIDE_INT const_nunits;
    2281         2220 :       if (nunits.is_constant (&const_nunits))
    2282         2220 :         SLP_TREE_LANES (vnode) = const_nunits;
    2283         2220 :       SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
    2284              :       /* We are always building a permutation node even if it is an identity
    2285              :          permute to shield the rest of the vectorizer from the odd node
    2286              :          representing an actual vector without any scalar ops.
    2287              :          ???  We could hide it completely with making the permute node
    2288              :          external?  */
    2289         2220 :       node = vect_create_new_slp_node (node, stmts, 1);
    2290         2220 :       SLP_TREE_CODE (node) = VEC_PERM_EXPR;
    2291         2220 :       SLP_TREE_LANE_PERMUTATION (node) = lperm;
    2292         2220 :       SLP_TREE_VECTYPE (node) = vectype;
    2293         2220 :       SLP_TREE_CHILDREN (node).quick_push (vnode);
    2294         2220 :       return node;
    2295              :     }
    2296              :   /* When discovery reaches an associatable operation see whether we can
    2297              :      improve that to match up lanes in a way superior to the operand
    2298              :      swapping code which at most looks at two defs.
    2299              :      ???  For BB vectorization we cannot do the brute-force search
    2300              :      for matching as we can succeed by means of builds from scalars
    2301              :      and have no good way to "cost" one build against another.  */
    2302      3287752 :   else if (is_a <loop_vec_info> (vinfo)
    2303              :            /* Do not bother for single-lane SLP.  */
    2304      1955914 :            && group_size > 1
    2305              :            /* ???  We don't handle !vect_internal_def defs below.  */
    2306       111410 :            && STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
    2307              :            /* ???  Do not associate a reduction, this will wreck REDUC_IDX
    2308              :               mapping as long as that exists on the stmt_info level.  */
    2309        86051 :            && STMT_VINFO_REDUC_IDX (stmt_info) == -1
    2310        77556 :            && is_gimple_assign (stmt_info->stmt)
    2311        77242 :            && (associative_tree_code (gimple_assign_rhs_code (stmt_info->stmt))
    2312        50678 :                || gimple_assign_rhs_code (stmt_info->stmt) == MINUS_EXPR)
    2313      3316224 :            && ((FLOAT_TYPE_P (vectype) && flag_associative_math)
    2314        16258 :                || (INTEGRAL_TYPE_P (TREE_TYPE (vectype))
    2315        13736 :                    && TYPE_OVERFLOW_WRAPS (TREE_TYPE (vectype)))))
    2316              :     {
    2317              :       /* See if we have a chain of (mixed) adds or subtracts or other
    2318              :          associatable ops.  */
    2319        21439 :       enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
    2320        21439 :       if (code == MINUS_EXPR)
    2321          796 :         code = PLUS_EXPR;
    2322        21439 :       stmt_vec_info other_op_stmt_info = NULL;
    2323        21439 :       stmt_vec_info op_stmt_info = NULL;
    2324        21439 :       unsigned chain_len = 0;
    2325        21439 :       auto_vec<chain_op_t> chain;
    2326        21439 :       auto_vec<std::pair<tree_code, gimple *> > worklist;
    2327        21439 :       auto_vec<vec<chain_op_t> > chains (group_size);
    2328        21439 :       auto_vec<slp_tree, 4> children;
    2329        21439 :       bool hard_fail = true;
    2330        22506 :       for (unsigned lane = 0; lane < group_size; ++lane)
    2331              :         {
    2332        22150 :           if (!stmts[lane])
    2333              :             {
    2334              :               /* ???  Below we require lane zero is present.  */
    2335            0 :               if (lane == 0)
    2336              :                 {
    2337              :                   hard_fail = false;
    2338        21083 :                   break;
    2339              :                 }
    2340            0 :               chains.quick_push (vNULL);
    2341            0 :               continue;
    2342              :             }
    2343              :           /* For each lane linearize the addition/subtraction (or other
    2344              :              uniform associatable operation) expression tree.  */
    2345        22150 :           gimple *op_stmt = NULL, *other_op_stmt = NULL;
    2346        22150 :           vect_slp_linearize_chain (vinfo, worklist, chain, code,
    2347        22150 :                                     stmts[lane]->stmt, op_stmt, other_op_stmt,
    2348              :                                     NULL);
    2349        22150 :           if (!op_stmt_info && op_stmt)
    2350        20860 :             op_stmt_info = vinfo->lookup_stmt (op_stmt);
    2351        22150 :           if (!other_op_stmt_info && other_op_stmt)
    2352          832 :             other_op_stmt_info = vinfo->lookup_stmt (other_op_stmt);
    2353        22150 :           if (chain.length () == 2)
    2354              :             {
    2355              :               /* In a chain of just two elements resort to the regular
    2356              :                  operand swapping scheme.  Likewise if we run into a
    2357              :                  length mismatch process regularly as well as we did not
    2358              :                  process the other lanes we cannot report a good hint what
    2359              :                  lanes to try swapping in the parent.  */
    2360              :               hard_fail = false;
    2361              :               break;
    2362              :             }
    2363         1070 :           else if (chain_len == 0)
    2364          396 :             chain_len = chain.length ();
    2365         1348 :           else if (chain.length () != chain_len)
    2366              :             {
    2367              :               /* ???  Here we could slip in magic to compensate with
    2368              :                  neutral operands.  */
    2369            3 :               matches[lane] = false;
    2370            3 :               if (lane != group_size - 1)
    2371            3 :                 matches[0] = false;
    2372              :               break;
    2373              :             }
    2374         1067 :           chains.quick_push (chain.copy ());
    2375         1067 :           chain.truncate (0);
    2376              :         }
    2377        42878 :       if (chains.length () == group_size)
    2378              :         {
    2379              :           /* We cannot yet use SLP_TREE_CODE to communicate the operation.  */
    2380          356 :           if (!op_stmt_info)
    2381              :             {
    2382            3 :               hard_fail = false;
    2383            3 :               goto out;
    2384              :             }
    2385              :           /* Now we have a set of chains with the same length.  */
    2386              :           /* 1. pre-sort according to def_type and operation.  */
    2387         1308 :           for (unsigned lane = 0; lane < group_size; ++lane)
    2388         1910 :             chains[lane].stablesort (dt_sort_cmp, vinfo);
    2389          353 :           if (dump_enabled_p ())
    2390              :             {
    2391          157 :               dump_printf_loc (MSG_NOTE, vect_location,
    2392              :                                "pre-sorted chains of %s\n",
    2393              :                                get_tree_code_name (code));
    2394          685 :               for (unsigned lane = 0; lane < group_size; ++lane)
    2395              :                 {
    2396          528 :                   if (!stmts[lane])
    2397            0 :                     dump_printf (MSG_NOTE, "--");
    2398              :                   else
    2399         2422 :                     for (unsigned opnum = 0; opnum < chain_len; ++opnum)
    2400         3788 :                       dump_printf (MSG_NOTE, "%s %T ",
    2401         1894 :                                    get_tree_code_name (chains[lane][opnum].code),
    2402         1894 :                                    chains[lane][opnum].op);
    2403          528 :                   dump_printf (MSG_NOTE, "\n");
    2404              :                 }
    2405              :             }
    2406              :           /* 2. try to build children nodes, associating as necessary.  */
    2407              :           /* 2a. prepare and perform early checks to avoid eating into
    2408              :              discovery limit unnecessarily.  */
    2409          353 :           vect_def_type *dts = XALLOCAVEC (vect_def_type, chain_len);
    2410         1487 :           for (unsigned n = 0; n < chain_len; ++n)
    2411              :             {
    2412         1134 :               vect_def_type dt = chains[0][n].dt;
    2413         1134 :               unsigned lane;
    2414         4357 :               for (lane = 0; lane < group_size; ++lane)
    2415         6446 :                 if (stmts[lane] && chains[lane][n].dt != dt)
    2416              :                   {
    2417            0 :                     if (dt == vect_constant_def
    2418            0 :                         && chains[lane][n].dt == vect_external_def)
    2419              :                       dt = vect_external_def;
    2420            0 :                     else if (dt == vect_external_def
    2421            0 :                              && chains[lane][n].dt == vect_constant_def)
    2422              :                       ;
    2423              :                     else
    2424              :                       break;
    2425              :                   }
    2426         1134 :               if (lane != group_size)
    2427              :                 {
    2428            0 :                   if (dump_enabled_p ())
    2429            0 :                     dump_printf_loc (MSG_NOTE, vect_location,
    2430              :                                      "giving up on chain due to mismatched "
    2431              :                                      "def types\n");
    2432            0 :                   matches[lane] = false;
    2433            0 :                   if (lane != group_size - 1)
    2434            0 :                     matches[0] = false;
    2435            0 :                   goto out;
    2436              :                 }
    2437         1134 :               dts[n] = dt;
    2438         1134 :               if (dt == vect_constant_def
    2439         1134 :                   || dt == vect_external_def)
    2440              :                 {
    2441              :                   /* Check whether we can build the invariant.  If we can't
    2442              :                      we never will be able to.  */
    2443           93 :                   tree type = TREE_TYPE (chains[0][n].op);
    2444         1134 :                   if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
    2445              :                       && (TREE_CODE (type) == BOOLEAN_TYPE
    2446              :                           || !can_duplicate_and_interleave_p (vinfo, group_size,
    2447              :                                                               type)))
    2448              :                     {
    2449              :                       matches[0] = false;
    2450              :                       goto out;
    2451              :                     }
    2452              :                 }
    2453         1041 :               else if (dt != vect_internal_def)
    2454              :                 {
    2455              :                   /* Not sure, we might need sth special.
    2456              :                      gcc.dg/vect/pr96854.c,
    2457              :                      gfortran.dg/vect/fast-math-pr37021.f90
    2458              :                      and gfortran.dg/vect/pr61171.f trigger.  */
    2459              :                   /* Soft-fail for now.  */
    2460            0 :                   hard_fail = false;
    2461            0 :                   goto out;
    2462              :                 }
    2463              :             }
    2464              :           /* 2b. do the actual build.  */
    2465         1429 :           for (unsigned n = 0; n < chain_len; ++n)
    2466              :             {
    2467         1096 :               vect_def_type dt = dts[n];
    2468         1096 :               unsigned lane;
    2469         1096 :               if (dt == vect_constant_def
    2470         1096 :                   || dt == vect_external_def)
    2471              :                 {
    2472           93 :                   vec<tree> ops;
    2473           93 :                   ops.create (group_size);
    2474          461 :                   for (lane = 0; lane < group_size; ++lane)
    2475          275 :                     if (stmts[lane])
    2476          275 :                       ops.quick_push (chains[lane][n].op);
    2477              :                     else
    2478            0 :                       ops.quick_push (NULL_TREE);
    2479           93 :                   slp_tree child = vect_create_new_slp_node (ops);
    2480           93 :                   SLP_TREE_DEF_TYPE (child) = dt;
    2481           93 :                   children.safe_push (child);
    2482              :                 }
    2483              :               else
    2484              :                 {
    2485         1003 :                   vec<stmt_vec_info> op_stmts;
    2486         1003 :                   op_stmts.create (group_size);
    2487         1003 :                   slp_tree child = NULL;
    2488              :                   /* Brute-force our way.  We have to consider a lane
    2489              :                      failing after fixing an earlier fail up in the
    2490              :                      SLP discovery recursion.  So track the current
    2491              :                      permute per lane.  */
    2492         1003 :                   unsigned *perms = XALLOCAVEC (unsigned, group_size);
    2493         1003 :                   memset (perms, 0, sizeof (unsigned) * group_size);
    2494         1097 :                   do
    2495              :                     {
    2496         1097 :                       op_stmts.truncate (0);
    2497         5320 :                       for (lane = 0; lane < group_size; ++lane)
    2498         3126 :                         if (stmts[lane])
    2499         3126 :                           op_stmts.quick_push
    2500         3126 :                             (vinfo->lookup_def (chains[lane][n].op));
    2501              :                         else
    2502            0 :                           op_stmts.quick_push (NULL);
    2503         1097 :                       child = vect_build_slp_tree (vinfo, op_stmts,
    2504              :                                                    group_size, &this_max_nunits,
    2505              :                                                    matches, limit,
    2506              :                                                    &this_tree_size, bst_map);
    2507              :                       /* ???  We're likely getting too many fatal mismatches
    2508              :                          here so maybe we want to ignore them (but then we
    2509              :                          have no idea which lanes fatally mismatched).  */
    2510         1097 :                       if (child || !matches[0])
    2511              :                         break;
    2512              :                       /* Swap another lane we have not yet matched up into
    2513              :                          lanes that did not match.  If we run out of
    2514              :                          permute possibilities for a lane terminate the
    2515              :                          search.  */
    2516          287 :                       bool term = false;
    2517          287 :                       for (lane = 1; lane < group_size; ++lane)
    2518          193 :                         if (!matches[lane])
    2519              :                           {
    2520          165 :                             if (n + perms[lane] + 1 == chain_len)
    2521              :                               {
    2522              :                                 term = true;
    2523              :                                 break;
    2524              :                               }
    2525          146 :                             if (dump_enabled_p ())
    2526          113 :                               dump_printf_loc (MSG_NOTE, vect_location,
    2527              :                                                "swapping operand %d and %d "
    2528              :                                                "of lane %d\n",
    2529              :                                                n, n + perms[lane] + 1, lane);
    2530          292 :                             std::swap (chains[lane][n],
    2531          146 :                                        chains[lane][n + perms[lane] + 1]);
    2532          146 :                             perms[lane]++;
    2533              :                           }
    2534          113 :                       if (term)
    2535              :                         break;
    2536              :                     }
    2537              :                   while (1);
    2538         1003 :                   if (!child)
    2539              :                     {
    2540           20 :                       if (dump_enabled_p ())
    2541           18 :                         dump_printf_loc (MSG_NOTE, vect_location,
    2542              :                                          "failed to match up op %d\n", n);
    2543           20 :                       op_stmts.release ();
    2544           20 :                       if (lane != group_size - 1)
    2545           10 :                         matches[0] = false;
    2546              :                       else
    2547           10 :                         matches[lane] = false;
    2548           20 :                       goto out;
    2549              :                     }
    2550          983 :                   if (dump_enabled_p ())
    2551              :                     {
    2552          421 :                       dump_printf_loc (MSG_NOTE, vect_location,
    2553              :                                        "matched up op %d to\n", n);
    2554          421 :                       vect_print_slp_tree (MSG_NOTE, vect_location, child);
    2555              :                     }
    2556          983 :                   children.safe_push (child);
    2557              :                 }
    2558              :             }
    2559              :           /* 3. build SLP nodes to combine the chain.  */
    2560         1213 :           for (unsigned lane = 0; lane < group_size; ++lane)
    2561         1772 :             if (stmts[lane] && chains[lane][0].code != code)
    2562              :               {
    2563              :                 /* See if there's any alternate all-PLUS entry.  */
    2564              :                 unsigned n;
    2565            6 :                 for (n = 1; n < chain_len; ++n)
    2566              :                   {
    2567           30 :                     for (lane = 0; lane < group_size; ++lane)
    2568           48 :                       if (stmts[lane] && chains[lane][n].code != code)
    2569              :                         break;
    2570            6 :                     if (lane == group_size)
    2571              :                       break;
    2572              :                   }
    2573            6 :                 if (n != chain_len)
    2574              :                   {
    2575              :                     /* Swap that in at first position.  */
    2576            6 :                     std::swap (children[0], children[n]);
    2577           30 :                     for (lane = 0; lane < group_size; ++lane)
    2578           24 :                       if (stmts[lane])
    2579           24 :                         std::swap (chains[lane][0], chains[lane][n]);
    2580              :                   }
    2581              :                 else
    2582              :                   {
    2583              :                     /* ???  When this triggers and we end up with two
    2584              :                        vect_constant/external_def up-front things break (ICE)
    2585              :                        spectacularly finding an insertion place for the
    2586              :                        all-constant op.  We should have a fully
    2587              :                        vect_internal_def operand though(?) so we can swap
    2588              :                        that into first place and then prepend the all-zero
    2589              :                        constant.  */
    2590            0 :                     if (dump_enabled_p ())
    2591            0 :                       dump_printf_loc (MSG_NOTE, vect_location,
    2592              :                                        "inserting constant zero to compensate "
    2593              :                                        "for (partially) negated first "
    2594              :                                        "operand\n");
    2595            0 :                     chain_len++;
    2596            0 :                     for (lane = 0; lane < group_size; ++lane)
    2597            0 :                       if (stmts[lane])
    2598            0 :                         chains[lane].safe_insert
    2599            0 :                           (0, chain_op_t (code, vect_constant_def, NULL_TREE));
    2600            0 :                     vec<tree> zero_ops;
    2601            0 :                     zero_ops.create (group_size);
    2602            0 :                     zero_ops.quick_push (build_zero_cst (TREE_TYPE (vectype)));
    2603            0 :                     for (lane = 1; lane < group_size; ++lane)
    2604            0 :                       if (stmts[lane])
    2605            0 :                         zero_ops.quick_push (zero_ops[0]);
    2606              :                       else
    2607            0 :                         zero_ops.quick_push (NULL_TREE);
    2608            0 :                     slp_tree zero = vect_create_new_slp_node (zero_ops);
    2609            0 :                     SLP_TREE_DEF_TYPE (zero) = vect_constant_def;
    2610            0 :                     children.safe_insert (0, zero);
    2611              :                   }
    2612              :                 break;
    2613              :               }
    2614         1071 :           for (unsigned i = 1; i < children.length (); ++i)
    2615              :             {
    2616          738 :               slp_tree op0 = children[i - 1];
    2617          738 :               slp_tree op1 = children[i];
    2618          738 :               bool this_two_op = false;
    2619         2660 :               for (unsigned lane = 0; lane < group_size; ++lane)
    2620         4200 :                 if (stmts[lane] && chains[lane][i].code != chains[0][i].code)
    2621              :                   {
    2622              :                     this_two_op = true;
    2623              :                     break;
    2624              :                   }
    2625          738 :               slp_tree child;
    2626          738 :               if (i == children.length () - 1)
    2627          333 :                 child = vect_create_new_slp_node (node, stmts, 2);
    2628              :               else
    2629          405 :                 child = vect_create_new_slp_node (2, ERROR_MARK);
    2630          738 :               if (this_two_op)
    2631              :                 {
    2632          178 :                   vec<std::pair<unsigned, unsigned> > lperm;
    2633          178 :                   lperm.create (group_size);
    2634          630 :                   for (unsigned lane = 0; lane < group_size; ++lane)
    2635          904 :                     lperm.quick_push (std::make_pair
    2636          452 :                       (chains[lane][i].code != chains[0][i].code, lane));
    2637          356 :                   vect_slp_build_two_operator_nodes (child, vectype, op0, op1,
    2638          178 :                                                      (chains[0][i].code == code
    2639              :                                                       ? op_stmt_info
    2640              :                                                       : other_op_stmt_info),
    2641          178 :                                                      (chains[0][i].code == code
    2642              :                                                       ? other_op_stmt_info
    2643              :                                                       : op_stmt_info),
    2644              :                                                      lperm);
    2645              :                 }
    2646              :               else
    2647              :                 {
    2648          560 :                   SLP_TREE_DEF_TYPE (child) = vect_internal_def;
    2649          560 :                   SLP_TREE_VECTYPE (child) = vectype;
    2650          560 :                   SLP_TREE_LANES (child) = group_size;
    2651          560 :                   SLP_TREE_CHILDREN (child).quick_push (op0);
    2652          560 :                   SLP_TREE_CHILDREN (child).quick_push (op1);
    2653          560 :                   SLP_TREE_REPRESENTATIVE (child)
    2654         1120 :                     = (chains[0][i].code == code
    2655          560 :                        ? op_stmt_info : other_op_stmt_info);
    2656              :                 }
    2657          738 :               children[i] = child;
    2658              :             }
    2659          333 :           *tree_size += this_tree_size + 1;
    2660          333 :           *max_nunits = this_max_nunits;
    2661         1593 :           while (!chains.is_empty ())
    2662          904 :             chains.pop ().release ();
    2663              :           return node;
    2664              :         }
    2665        21083 : out:
    2666        21106 :       if (dump_enabled_p ())
    2667         2809 :         dump_printf_loc (MSG_NOTE, vect_location,
    2668              :                          "failed to line up SLP graph by re-associating "
    2669              :                          "operations in lanes%s\n",
    2670              :                          !hard_fail ? " trying regular discovery" : "");
    2671        21111 :       while (!children.is_empty ())
    2672            5 :         vect_free_slp_tree (children.pop ());
    2673        21269 :       while (!chains.is_empty ())
    2674          163 :         chains.pop ().release ();
    2675              :       /* Hard-fail, otherwise we might run into quadratic processing of the
    2676              :          chains starting one stmt into the chain again.  */
    2677        21106 :       if (hard_fail)
    2678              :         return NULL;
    2679              :       /* Fall thru to normal processing.  */
    2680        21439 :     }
    2681              : 
    2682              :   /* Get at the operands, verifying they are compatible.  */
    2683      3309232 :   vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
    2684      3309232 :   slp_oprnd_info oprnd_info;
    2685     15980564 :   FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    2686              :     {
    2687     25345022 :       int res = vect_get_and_check_slp_defs (vinfo, vectype,
    2688     12672511 :                                              swap[i], skip_args,
    2689              :                                              stmts, i, &oprnds_info);
    2690     12672511 :       if (res != 0)
    2691       541769 :         matches[(res == -1) ? 0 : i] = false;
    2692     12672511 :       if (!matches[0])
    2693              :         break;
    2694              :     }
    2695     15670595 :   for (i = 0; i < group_size; ++i)
    2696     12573771 :     if (!matches[i])
    2697              :       {
    2698       212408 :         vect_free_oprnd_info (oprnds_info);
    2699       212408 :         return NULL;
    2700              :       }
    2701      9290472 :   swap = NULL;
    2702              : 
    2703      9290472 :   bool has_two_operators_perm = false;
    2704     18580944 :   auto_vec<unsigned> two_op_perm_indices[2];
    2705      3096824 :   vec<stmt_vec_info> two_op_scalar_stmts[2] = {vNULL, vNULL};
    2706              : 
    2707      3111069 :   if (two_operators && oprnds_info.length () == 2 && group_size > 2)
    2708              :     {
    2709         3867 :       unsigned idx = 0;
    2710         3867 :       hash_map<gimple *, unsigned> seen;
    2711         3867 :       vec<slp_oprnd_info> new_oprnds_info
    2712         3867 :         = vect_create_oprnd_info (1, group_size);
    2713         3867 :       bool success = true;
    2714              : 
    2715         3867 :       enum tree_code code = ERROR_MARK;
    2716         3867 :       if (oprnds_info[0]->def_stmts[0]
    2717         3867 :           && is_a<gassign *> (oprnds_info[0]->def_stmts[0]->stmt))
    2718         3809 :         code = gimple_assign_rhs_code (oprnds_info[0]->def_stmts[0]->stmt);
    2719         3867 :       basic_block bb = nullptr;
    2720              : 
    2721         7470 :       for (unsigned j = 0; j < group_size; ++j)
    2722              :         {
    2723        17480 :           FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    2724              :             {
    2725        13877 :               stmt_vec_info stmt_info = oprnd_info->def_stmts[j];
    2726        13877 :               if (!stmt_info
    2727        13654 :                   || !is_a<gassign *> (stmt_info->stmt)
    2728        13651 :                   || gimple_assign_rhs_code (stmt_info->stmt) != code
    2729        24350 :                   || skip_args[i])
    2730              :                 {
    2731              :                   success = false;
    2732         3408 :                   break;
    2733              :                 }
    2734              :               /* Avoid mixing lanes with defs in different basic-blocks.  */
    2735        10473 :               if (!bb)
    2736         3985 :                 bb = gimple_bb (vect_orig_stmt (stmt_info)->stmt);
    2737         8252 :               else if (gimple_bb (vect_orig_stmt (stmt_info)->stmt) != bb)
    2738              :                 {
    2739              :                   success = false;
    2740              :                   break;
    2741              :                 }
    2742              : 
    2743        10469 :               bool exists;
    2744        10469 :               unsigned &stmt_idx
    2745        10469 :                 = seen.get_or_insert (stmt_info->stmt, &exists);
    2746              : 
    2747        10469 :               if (!exists)
    2748              :                 {
    2749         9128 :                   new_oprnds_info[0]->def_stmts.safe_push (stmt_info);
    2750         9128 :                   new_oprnds_info[0]->ops.safe_push (oprnd_info->ops[j]);
    2751         9128 :                   stmt_idx = idx;
    2752         9128 :                   idx++;
    2753              :                 }
    2754              : 
    2755        10469 :               two_op_perm_indices[i].safe_push (stmt_idx);
    2756              :             }
    2757              : 
    2758         7011 :           if (!success)
    2759              :             break;
    2760              :         }
    2761              : 
    2762         3867 :       if (success && idx == group_size)
    2763              :         {
    2764           94 :           if (dump_enabled_p ())
    2765              :             {
    2766            0 :               dump_printf_loc (MSG_NOTE, vect_location,
    2767              :                                "Replace two_operators operands:\n");
    2768              : 
    2769            0 :               FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    2770              :                 {
    2771            0 :                   dump_printf_loc (MSG_NOTE, vect_location,
    2772              :                                    "Operand %u:\n", i);
    2773            0 :                   for (unsigned j = 0; j < group_size; j++)
    2774            0 :                     dump_printf_loc (MSG_NOTE, vect_location, "\tstmt %u %G",
    2775            0 :                                      j, oprnd_info->def_stmts[j]->stmt);
    2776              :                 }
    2777              : 
    2778            0 :               dump_printf_loc (MSG_NOTE, vect_location,
    2779              :                                "With a single operand:\n");
    2780            0 :               for (unsigned j = 0; j < group_size; j++)
    2781            0 :                 dump_printf_loc (MSG_NOTE, vect_location, "\tstmt %u %G",
    2782            0 :                                  j, new_oprnds_info[0]->def_stmts[j]->stmt);
    2783              :             }
    2784              : 
    2785           94 :           two_op_scalar_stmts[0].safe_splice (oprnds_info[0]->def_stmts);
    2786           94 :           two_op_scalar_stmts[1].safe_splice (oprnds_info[1]->def_stmts);
    2787              : 
    2788           94 :           new_oprnds_info[0]->first_op_type = oprnds_info[0]->first_op_type;
    2789           94 :           new_oprnds_info[0]->first_dt = oprnds_info[0]->first_dt;
    2790           94 :           new_oprnds_info[0]->any_pattern = oprnds_info[0]->any_pattern;
    2791           94 :           new_oprnds_info[0]->first_gs_p = oprnds_info[0]->first_gs_p;
    2792           94 :           new_oprnds_info[0]->first_gs_info = oprnds_info[0]->first_gs_info;
    2793              : 
    2794           94 :           vect_free_oprnd_info (oprnds_info);
    2795           94 :           oprnds_info = new_oprnds_info;
    2796           94 :           nops = 1;
    2797           94 :           has_two_operators_perm = true;
    2798              :         }
    2799              :       else
    2800         3773 :         vect_free_oprnd_info (new_oprnds_info);
    2801         3867 :     }
    2802              : 
    2803      6193648 :   auto_vec<slp_tree, 4> children;
    2804              : 
    2805      3096824 :   stmt_info = stmts[0];
    2806              : 
    2807      3096824 :   int reduc_idx = -1;
    2808      3096824 :   int gs_scale = 0;
    2809      3096824 :   tree gs_base = NULL_TREE;
    2810              : 
    2811              :   /* Create SLP_TREE nodes for the definition node/s.  */
    2812      7927696 :   FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    2813              :     {
    2814      4945877 :       slp_tree child = nullptr;
    2815      4945877 :       unsigned int j;
    2816              : 
    2817              :       /* We're skipping certain operands from processing, for example
    2818              :          outer loop reduction initial defs.  */
    2819      4945877 :       if (skip_args[i])
    2820              :         {
    2821       479457 :           children.safe_push (NULL);
    2822      5310329 :           continue;
    2823              :         }
    2824              : 
    2825      4466420 :       if (oprnd_info->first_dt == vect_uninitialized_def)
    2826              :         {
    2827              :           /* COND_EXPR have one too many eventually if the condition
    2828              :              is a SSA name.  */
    2829            0 :           gcc_assert (i == 3 && nops == 4);
    2830            0 :           continue;
    2831              :         }
    2832              : 
    2833      4466420 :       if (oprnd_info->first_gs_p)
    2834              :         {
    2835        22435 :           gs_scale = oprnd_info->first_gs_info.scale;
    2836        22435 :           gs_base = oprnd_info->first_gs_info.base;
    2837              :         }
    2838              : 
    2839      4466420 :       if (is_a <bb_vec_info> (vinfo)
    2840      1563827 :           && oprnd_info->first_dt == vect_internal_def
    2841      5277892 :           && !oprnd_info->any_pattern)
    2842              :         {
    2843              :           /* For BB vectorization, if all defs are the same do not
    2844              :              bother to continue the build along the single-lane
    2845              :              graph but use a splat of the scalar value.  */
    2846       767743 :           stmt_vec_info first_def = oprnd_info->def_stmts[0];
    2847       823550 :           for (j = 1; j < group_size; ++j)
    2848       783512 :             if (oprnd_info->def_stmts[j] != first_def)
    2849              :               break;
    2850       767743 :           if (j == group_size
    2851              :               /* But avoid doing this for loads where we may be
    2852              :                  able to CSE things, unless the stmt is not
    2853              :                  vectorizable.  */
    2854       767743 :               && (!STMT_VINFO_VECTORIZABLE (first_def)
    2855        49294 :                   || !gimple_vuse (first_def->stmt)))
    2856              :             {
    2857        30745 :               if (dump_enabled_p ())
    2858          105 :                 dump_printf_loc (MSG_NOTE, vect_location,
    2859              :                                  "Using a splat of the uniform operand %G",
    2860              :                                  first_def->stmt);
    2861        30745 :               oprnd_info->first_dt = vect_external_def;
    2862              :             }
    2863              :         }
    2864              : 
    2865      4466420 :       if (oprnd_info->first_dt == vect_external_def
    2866      4466420 :           || oprnd_info->first_dt == vect_constant_def)
    2867              :         {
    2868      1463619 :           if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ())
    2869              :             {
    2870              :               tree op0;
    2871              :               tree uniform_val = op0 = oprnd_info->ops[0];
    2872              :               for (j = 1; j < oprnd_info->ops.length (); ++j)
    2873              :                 if (oprnd_info->ops[j]
    2874              :                     && !operand_equal_p (uniform_val, oprnd_info->ops[j]))
    2875              :                   {
    2876              :                     uniform_val = NULL_TREE;
    2877              :                     break;
    2878              :                   }
    2879              :               if (!uniform_val
    2880              :                   && !can_duplicate_and_interleave_p (vinfo,
    2881              :                                                       oprnd_info->ops.length (),
    2882              :                                                       TREE_TYPE (op0)))
    2883              :                 {
    2884              :                   matches[j] = false;
    2885              :                   if (dump_enabled_p ())
    2886              :                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    2887              :                                      "Build SLP failed: invalid type of def "
    2888              :                                      "for variable-length SLP %T\n", op0);
    2889              :                   goto fail;
    2890              :                 }
    2891              :             }
    2892      1463619 :           slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
    2893      1463619 :           SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
    2894      1463619 :           oprnd_info->ops = vNULL;
    2895      1463619 :           children.safe_push (invnode);
    2896      1463619 :           continue;
    2897      1463619 :         }
    2898              : 
    2899              :       /* See which SLP operand a reduction chain continues on.  We want
    2900              :          to chain even PHIs but not backedges.  */
    2901      3002801 :       if (STMT_VINFO_REDUC_DEF (oprnd_info->def_stmts[0])
    2902      3002801 :           || STMT_VINFO_REDUC_IDX (oprnd_info->def_stmts[0]) != -1)
    2903              :         {
    2904       232673 :           if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
    2905              :             {
    2906          756 :               if (oprnd_info->first_dt == vect_double_reduction_def)
    2907          378 :                 reduc_idx = i;
    2908              :             }
    2909       231917 :           else if (is_a <gphi *> (stmt_info->stmt)
    2910       231917 :                    && gimple_phi_num_args
    2911        99466 :                         (as_a <gphi *> (stmt_info->stmt)) != 1)
    2912              :             ;
    2913       132834 :           else if (STMT_VINFO_REDUC_IDX (stmt_info) == -1
    2914          383 :                    && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
    2915              :             ;
    2916       132834 :           else if (reduc_idx == -1)
    2917       124424 :             reduc_idx = i;
    2918              :           else
    2919              :             /* For .COND_* reduction operations the else value can be the
    2920              :                same as one of the operation operands.  The other def
    2921              :                stmts have been moved, so we can't check easily.  Check
    2922              :                it's a call at least.  */
    2923         8410 :             gcc_assert (is_a <gcall *> (stmt_info->stmt));
    2924              :         }
    2925              : 
    2926              :       /* When we have a masked load with uniform mask discover this
    2927              :          as a single-lane mask with a splat permute.  This way we can
    2928              :          recognize this as a masked load-lane by stripping the splat.  */
    2929      3002801 :       if (is_a <gcall *> (STMT_VINFO_STMT (stmt_info))
    2930        57410 :           && gimple_call_internal_p (STMT_VINFO_STMT (stmt_info),
    2931              :                                      IFN_MASK_LOAD)
    2932         6075 :           && STMT_VINFO_GROUPED_ACCESS (stmt_info)
    2933      3002878 :           && ! STMT_VINFO_SLP_VECT_ONLY (DR_GROUP_FIRST_ELEMENT (stmt_info)))
    2934              :         {
    2935           35 :           vec<stmt_vec_info> def_stmts2;
    2936           35 :           def_stmts2.create (1);
    2937           35 :           def_stmts2.quick_push (oprnd_info->def_stmts[0]);
    2938           35 :           child = vect_build_slp_tree (vinfo, def_stmts2, 1,
    2939              :                                        &this_max_nunits,
    2940              :                                        matches, limit,
    2941              :                                        &this_tree_size, bst_map);
    2942           35 :           if (child)
    2943              :             {
    2944           35 :               slp_tree pnode = vect_create_new_slp_node (1, VEC_PERM_EXPR);
    2945           35 :               SLP_TREE_VECTYPE (pnode) = SLP_TREE_VECTYPE (child);
    2946           35 :               SLP_TREE_LANES (pnode) = group_size;
    2947           35 :               SLP_TREE_SCALAR_STMTS (pnode).create (group_size);
    2948           35 :               SLP_TREE_LANE_PERMUTATION (pnode).create (group_size);
    2949          210 :               for (unsigned k = 0; k < group_size; ++k)
    2950              :                 {
    2951          175 :                   SLP_TREE_SCALAR_STMTS (pnode)
    2952          175 :                     .quick_push (oprnd_info->def_stmts[0]);
    2953          175 :                   SLP_TREE_LANE_PERMUTATION (pnode)
    2954          175 :                     .quick_push (std::make_pair (0u, 0u));
    2955              :                 }
    2956           35 :               SLP_TREE_CHILDREN (pnode).quick_push (child);
    2957           35 :               pnode->max_nunits = child->max_nunits;
    2958           35 :               children.safe_push (pnode);
    2959           35 :               oprnd_info->def_stmts = vNULL;
    2960           35 :               continue;
    2961           35 :             }
    2962              :           else
    2963            0 :             def_stmts2.release ();
    2964              :         }
    2965              : 
    2966      3002766 :       if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
    2967              :                                         group_size, &this_max_nunits,
    2968              :                                         matches, limit,
    2969              :                                         &this_tree_size, bst_map)) != NULL)
    2970              :         {
    2971      2521635 :           oprnd_info->def_stmts = vNULL;
    2972      2521635 :           children.safe_push (child);
    2973      2521635 :           continue;
    2974              :         }
    2975              : 
    2976              :       /* If the SLP build for operand zero failed and operand zero
    2977              :          and one can be commuted try that for the scalar stmts
    2978              :          that failed the match.  */
    2979       481131 :       if (i == 0
    2980              :           /* A first scalar stmt mismatch signals a fatal mismatch.  */
    2981       379660 :           && matches[0]
    2982              :           /* ???  For COND_EXPRs we can swap the comparison operands
    2983              :              as well as the arms under some constraints.  */
    2984       179663 :           && (nops == 2 || nops == 3)
    2985       108956 :           && oprnds_info[1]->first_dt == vect_internal_def
    2986        59535 :           && (is_gimple_assign (stmt_info->stmt)
    2987        11509 :               || is_gimple_call (stmt_info->stmt))
    2988              :           /* Swapping operands for reductions breaks assumptions later on.  */
    2989       529170 :           && STMT_VINFO_REDUC_IDX (stmt_info) == -1)
    2990              :         {
    2991              :           /* See whether we can swap the matching or the non-matching
    2992              :              stmt operands.  */
    2993              :           bool swap_not_matching = true;
    2994        51964 :           do
    2995              :             {
    2996      7058485 :               for (j = 0; j < group_size; ++j)
    2997              :                 {
    2998      7020523 :                   if (matches[j] != !swap_not_matching)
    2999        70756 :                     continue;
    3000      6949767 :                   stmt_vec_info stmt_info = stmts[j];
    3001              :                   /* Verify if we can swap operands of this stmt.  */
    3002      6949767 :                   if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    3003              :                     {
    3004      6949741 :                       tree_code code = gimple_assign_rhs_code (stmt);
    3005      6949741 :                       if (! commutative_tree_code (code)
    3006      6949741 :                           && ! commutative_ternary_tree_code (code))
    3007              :                         {
    3008        13978 :                           if (!swap_not_matching)
    3009         6464 :                             goto fail;
    3010              :                           swap_not_matching = false;
    3011              :                           break;
    3012              :                         }
    3013              :                     }
    3014      7006547 :                   else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
    3015              :                     {
    3016           26 :                       internal_fn fn = (gimple_call_internal_p (call)
    3017           26 :                                         ? gimple_call_internal_fn (call)
    3018              :                                         : IFN_LAST);
    3019           26 :                       if ((! commutative_binary_fn_p (fn)
    3020           26 :                            && ! commutative_ternary_fn_p (fn))
    3021           28 :                           || first_commutative_argument (fn) != 0)
    3022              :                         {
    3023           24 :                           if (!swap_not_matching)
    3024           12 :                             goto fail;
    3025              :                           swap_not_matching = false;
    3026              :                           break;
    3027              :                         }
    3028              :                     }
    3029              :                 }
    3030              :             }
    3031        45488 :           while (j != group_size);
    3032              : 
    3033              :           /* Swap mismatched definition stmts.  */
    3034        37962 :           if (dump_enabled_p ())
    3035          351 :             dump_printf_loc (MSG_NOTE, vect_location,
    3036              :                              "Re-trying with swapped operands of stmts ");
    3037      7036092 :           for (j = 0; j < group_size; ++j)
    3038      6998130 :             if (matches[j] == !swap_not_matching)
    3039              :               {
    3040     13871162 :                 std::swap (oprnds_info[0]->def_stmts[j],
    3041      6935581 :                            oprnds_info[1]->def_stmts[j]);
    3042     13871162 :                 std::swap (oprnds_info[0]->ops[j],
    3043      6935581 :                            oprnds_info[1]->ops[j]);
    3044      6935581 :                 if (dump_enabled_p ())
    3045          956 :                   dump_printf (MSG_NOTE, "%d ", j);
    3046              :               }
    3047        37962 :           if (dump_enabled_p ())
    3048          351 :             dump_printf (MSG_NOTE, "\n");
    3049              :           /* After swapping some operands we lost track whether an
    3050              :              operand has any pattern defs so be conservative here.  */
    3051        72640 :           if (oprnds_info[0]->any_pattern || oprnds_info[1]->any_pattern)
    3052         3330 :             oprnds_info[0]->any_pattern = oprnds_info[1]->any_pattern = true;
    3053              :           /* And try again with scratch 'matches' ... */
    3054        37962 :           bool *tem = XALLOCAVEC (bool, group_size);
    3055        37962 :           if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
    3056              :                                             group_size, &this_max_nunits,
    3057              :                                             tem, limit,
    3058              :                                             &this_tree_size, bst_map)) != NULL)
    3059              :             {
    3060         6658 :               oprnd_info->def_stmts = vNULL;
    3061         6658 :               children.safe_push (child);
    3062         6658 :               continue;
    3063              :             }
    3064              :         }
    3065       474473 : fail:
    3066              : 
    3067              :       /* If the SLP build failed and we analyze a basic-block
    3068              :          simply treat nodes we fail to build as externally defined
    3069              :          (and thus build vectors from the scalar defs).
    3070              :          The cost model will reject outright expensive cases.
    3071              :          ???  This doesn't treat cases where permutation ultimately
    3072              :          fails (or we don't try permutation below).  Ideally we'd
    3073              :          even compute a permutation that will end up with the maximum
    3074              :          SLP tree size...  */
    3075       474473 :       if (is_a <bb_vec_info> (vinfo)
    3076              :           /* ???  Rejecting patterns this way doesn't work.  We'd have to
    3077              :              do extra work to cancel the pattern so the uses see the
    3078              :              scalar version.  */
    3079       394179 :           && !is_pattern_stmt_p (stmt_info)
    3080       844423 :           && !oprnd_info->any_pattern)
    3081              :         {
    3082              :           /* But if there's a leading vector sized set of matching stmts
    3083              :              fail here so we can split the group.  This matches the condition
    3084              :              vect_analyze_slp_instance uses.  */
    3085              :           /* ???  We might want to split here and combine the results to support
    3086              :              multiple vector sizes better.  */
    3087       580534 :           for (j = 0; j < group_size; ++j)
    3088       580534 :             if (!matches[j])
    3089              :               break;
    3090       369689 :           if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype))
    3091       369660 :               && vect_slp_can_convert_to_external (oprnd_info->def_stmts))
    3092              :             {
    3093       359468 :               if (dump_enabled_p ())
    3094          555 :                 dump_printf_loc (MSG_NOTE, vect_location,
    3095              :                                  "Building vector operands from scalars\n");
    3096       359468 :               this_tree_size++;
    3097       359468 :               child = vect_create_new_slp_node (oprnd_info->ops);
    3098       359468 :               children.safe_push (child);
    3099       359468 :               oprnd_info->ops = vNULL;
    3100       359468 :               continue;
    3101              :             }
    3102              :         }
    3103              : 
    3104       115005 :       gcc_assert (child == NULL);
    3105       131090 :       FOR_EACH_VEC_ELT (children, j, child)
    3106        16085 :         if (child)
    3107        16085 :           vect_free_slp_tree (child);
    3108       115005 :       vect_free_oprnd_info (oprnds_info);
    3109       115005 :       return NULL;
    3110              :     }
    3111              : 
    3112      2981819 :   vect_free_oprnd_info (oprnds_info);
    3113              : 
    3114              :   /* If we have all children of a child built up from uniform scalars
    3115              :      or does more than one possibly expensive vector construction then
    3116              :      just throw that away, causing it built up from scalars.
    3117              :      The exception is the SLP node for the vector store.  */
    3118      2981819 :   if (is_a <bb_vec_info> (vinfo)
    3119      1090467 :       && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
    3120              :       /* ???  Rejecting patterns this way doesn't work.  We'd have to
    3121              :          do extra work to cancel the pattern so the uses see the
    3122              :          scalar version.  */
    3123      3414983 :       && !is_pattern_stmt_p (stmt_info))
    3124              :     {
    3125              :       slp_tree child;
    3126              :       unsigned j;
    3127              :       bool all_uniform_p = true;
    3128              :       unsigned n_vector_builds = 0;
    3129      1228609 :       FOR_EACH_VEC_ELT (children, j, child)
    3130              :         {
    3131       821293 :           if (!child)
    3132              :             ;
    3133       821293 :           else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
    3134              :             all_uniform_p = false;
    3135       585963 :           else if (!vect_slp_tree_uniform_p (child))
    3136              :             {
    3137       446030 :               all_uniform_p = false;
    3138       446030 :               if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
    3139       411902 :                 n_vector_builds++;
    3140              :             }
    3141              :         }
    3142       407316 :       if (all_uniform_p
    3143       407316 :           || n_vector_builds > 1
    3144       691662 :           || (n_vector_builds == children.length ()
    3145        30145 :               && is_a <gphi *> (stmt_info->stmt)))
    3146              :         {
    3147              :           /* Roll back.  */
    3148       127783 :           matches[0] = false;
    3149       405909 :           FOR_EACH_VEC_ELT (children, j, child)
    3150       278126 :             if (child)
    3151       278126 :               vect_free_slp_tree (child);
    3152              : 
    3153       127783 :           if (dump_enabled_p ())
    3154          177 :             dump_printf_loc (MSG_NOTE, vect_location,
    3155              :                              "Building parent vector operands from "
    3156              :                              "scalars instead\n");
    3157       127783 :           return NULL;
    3158              :         }
    3159              :     }
    3160              : 
    3161      2854036 :   *tree_size += this_tree_size + 1;
    3162      2854036 :   *max_nunits = this_max_nunits;
    3163              : 
    3164      2854036 :   if (two_operators)
    3165              :     {
    3166              :       /* ???  We'd likely want to either cache in bst_map sth like
    3167              :          { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
    3168              :          the true { a+b, a+b, a+b, a+b } ... but there we don't have
    3169              :          explicit stmts to put in so the keying on 'stmts' doesn't
    3170              :          work (but we have the same issue with nodes that use 'ops').  */
    3171              : 
    3172         6844 :       if (has_two_operators_perm)
    3173              :         {
    3174           40 :           slp_tree child = children[0];
    3175           40 :           children.truncate (0);
    3176          120 :           for (i = 0; i < 2; i++)
    3177              :             {
    3178           80 :               slp_tree pnode
    3179           80 :                 = vect_create_new_slp_node (two_op_scalar_stmts[i], 2);
    3180           80 :               SLP_TREE_CODE (pnode) = VEC_PERM_EXPR;
    3181           80 :               SLP_TREE_VECTYPE (pnode) = vectype;
    3182           80 :               SLP_TREE_CHILDREN (pnode).quick_push (child);
    3183           80 :               SLP_TREE_CHILDREN (pnode).quick_push (child);
    3184           80 :               lane_permutation_t& perm = SLP_TREE_LANE_PERMUTATION (pnode);
    3185           80 :               children.safe_push (pnode);
    3186              : 
    3187          656 :               for (unsigned j = 0; j < stmts.length (); j++)
    3188          576 :                 perm.safe_push (std::make_pair (0, two_op_perm_indices[i][j]));
    3189              :             }
    3190              : 
    3191           40 :           SLP_TREE_REF_COUNT (child) += 4;
    3192              :         }
    3193              : 
    3194         6844 :       slp_tree one = new _slp_tree;
    3195         6844 :       slp_tree two = new _slp_tree;
    3196         6844 :       SLP_TREE_DEF_TYPE (one) = vect_internal_def;
    3197         6844 :       SLP_TREE_DEF_TYPE (two) = vect_internal_def;
    3198         6844 :       SLP_TREE_VECTYPE (one) = vectype;
    3199         6844 :       SLP_TREE_VECTYPE (two) = vectype;
    3200         6844 :       SLP_TREE_CHILDREN (one).safe_splice (children);
    3201         6844 :       SLP_TREE_CHILDREN (two).safe_splice (children);
    3202         6844 :       slp_tree child;
    3203        27378 :       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
    3204        13690 :         SLP_TREE_REF_COUNT (child)++;
    3205              : 
    3206              :       /* Here we record the original defs since this
    3207              :          node represents the final lane configuration.  */
    3208         6844 :       node = vect_create_new_slp_node (node, stmts, 2);
    3209         6844 :       SLP_TREE_VECTYPE (node) = vectype;
    3210         6844 :       SLP_TREE_CODE (node) = VEC_PERM_EXPR;
    3211         6844 :       SLP_TREE_CHILDREN (node).quick_push (one);
    3212         6844 :       SLP_TREE_CHILDREN (node).quick_push (two);
    3213         6844 :       enum tree_code code0 = ERROR_MARK;
    3214         6844 :       enum tree_code ocode = ERROR_MARK;
    3215         6844 :       if (gassign *stmt = dyn_cast <gassign *> (stmts[0]->stmt))
    3216         6842 :         code0 = gimple_assign_rhs_code (stmt);
    3217         6844 :       stmt_vec_info ostmt_info;
    3218         6844 :       unsigned j = 0;
    3219        25005 :       FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
    3220              :         {
    3221        18161 :           int op = 0;
    3222        18161 :           if (gassign *ostmt = dyn_cast <gassign *> (ostmt_info->stmt))
    3223              :             {
    3224        18157 :               if (gimple_assign_rhs_code (ostmt) != code0)
    3225              :                 {
    3226         9113 :                   ocode = gimple_assign_rhs_code (ostmt);
    3227              :                   op = 1;
    3228              :                   j = i;
    3229              :                 }
    3230              :             }
    3231              :           else
    3232              :             {
    3233            8 :               if (gimple_call_combined_fn (stmts[0]->stmt)
    3234            4 :                   != gimple_call_combined_fn (ostmt_info->stmt))
    3235              :                 {
    3236            2 :                   op = 1;
    3237            2 :                   j = i;
    3238              :                 }
    3239              :             }
    3240        18161 :           SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (op, i));
    3241              :         }
    3242         6844 :       SLP_TREE_CODE (one) = code0;
    3243         6844 :       SLP_TREE_CODE (two) = ocode;
    3244         6844 :       SLP_TREE_LANES (one) = stmts.length ();
    3245         6844 :       SLP_TREE_LANES (two) = stmts.length ();
    3246         6844 :       SLP_TREE_REPRESENTATIVE (one) = stmts[0];
    3247         6844 :       SLP_TREE_REPRESENTATIVE (two) = stmts[j];
    3248              : 
    3249         6844 :       return node;
    3250              :     }
    3251              : 
    3252      2847192 :   node = vect_create_new_slp_node (node, stmts, nops);
    3253      2847192 :   SLP_TREE_VECTYPE (node) = vectype;
    3254      2847192 :   SLP_TREE_CHILDREN (node).splice (children);
    3255      2847192 :   SLP_TREE_GS_SCALE (node) = gs_scale;
    3256      2847192 :   SLP_TREE_GS_BASE (node) = gs_base;
    3257      2847192 :   if (reduc_idx != -1)
    3258              :     {
    3259       116063 :       gcc_assert (STMT_VINFO_REDUC_IDX (stmt_info) != -1
    3260              :                   || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
    3261              :                   || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def);
    3262       116063 :       SLP_TREE_REDUC_IDX (node) = reduc_idx;
    3263       116063 :       node->cycle_info.id = SLP_TREE_CHILDREN (node)[reduc_idx]->cycle_info.id;
    3264              :     }
    3265              :   /* When reaching the reduction PHI, create a vect_reduc_info.  */
    3266      2731129 :   else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
    3267      2731129 :             || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
    3268      2731129 :            && is_a <gphi *> (STMT_VINFO_STMT (stmt_info)))
    3269              :     {
    3270       101394 :       loop_vec_info loop_vinfo = as_a <loop_vec_info> (vinfo);
    3271       101394 :       gcc_assert (STMT_VINFO_REDUC_IDX (stmt_info) == -1);
    3272       101394 :       node->cycle_info.id = loop_vinfo->reduc_infos.length ();
    3273       101394 :       vect_reduc_info reduc_info = new vect_reduc_info_s ();
    3274       101394 :       loop_vinfo->reduc_infos.safe_push (reduc_info);
    3275       101394 :       stmt_vec_info reduc_phi = stmt_info;
    3276              :       /* ???  For double reductions vect_is_simple_reduction stores the
    3277              :          reduction type and code on the inner loop header PHI.  */
    3278       101394 :       if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
    3279              :         {
    3280          378 :           use_operand_p use_p;
    3281          378 :           gimple *use_stmt;
    3282          378 :           bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
    3283              :                                      &use_p, &use_stmt);
    3284          378 :           gcc_assert (res);
    3285          378 :           reduc_phi = loop_vinfo->lookup_stmt (use_stmt);
    3286              :         }
    3287       101394 :       VECT_REDUC_INFO_DEF_TYPE (reduc_info) = STMT_VINFO_DEF_TYPE (stmt_info);
    3288       101394 :       VECT_REDUC_INFO_TYPE (reduc_info) = STMT_VINFO_REDUC_TYPE (reduc_phi);
    3289       101394 :       VECT_REDUC_INFO_CODE (reduc_info) = STMT_VINFO_REDUC_CODE (reduc_phi);
    3290       101394 :       VECT_REDUC_INFO_FN (reduc_info) = IFN_LAST;
    3291              :     }
    3292              :   return node;
    3293      9290472 : }
    3294              : 
    3295              : /* Dump a single SLP tree NODE at location LOC using the flags in
                           DUMP_KIND.  The output consists of a header line (def-type tag,
                           node address, max_nunits, reference count, vector type and cycle
                           info when set), the operation for internal defs, the lanes as
                           scalar stmts or scalar operands, the load and lane permutations
                           when present and finally the addresses of the children.  Only
                           NODE itself is printed; children are not descended into.  */
    3296              : 
    3297              : static void
    3298       444202 : vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
    3299              :                      slp_tree node)
    3300              : {
    3301       444202 :   unsigned i, j;
    3302       444202 :   slp_tree child;
    3303       444202 :   stmt_vec_info stmt_info;
    3304       444202 :   tree op;
    3305              : 
    3306       444202 :   dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
    3307       444202 :   dump_user_location_t user_loc = loc.get_user_location ();
                          /* Header line: "(external)" / "(constant)" tag for non-internal
                             defs, the node address, the estimated max_nunits and the
                             reference count.  */
    3308       444202 :   dump_printf_loc (metadata, user_loc,
    3309              :                    "node%s %p (max_nunits=" HOST_WIDE_INT_PRINT_UNSIGNED
    3310              :                    ", refcnt=%u)",
    3311       444202 :                    SLP_TREE_DEF_TYPE (node) == vect_external_def
    3312              :                    ? " (external)"
    3313              :                    : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
    3314       428559 :                       ? " (constant)"
    3315              :                       : ""), (void *) node,
    3316       444202 :                    estimated_poly_value (node->max_nunits),
    3317              :                                          SLP_TREE_REF_COUNT (node));
    3318       444202 :   if (SLP_TREE_VECTYPE (node))
    3319       376675 :     dump_printf (metadata, " %T", SLP_TREE_VECTYPE (node));
    3320       444202 :   dump_printf (metadata, "%s",
    3321       444202 :                node->avoid_stlf_fail ? " (avoid-stlf-fail)" : "");
                          /* Cycle info is only printed when either field differs from -1.  */
    3322       444202 :   if (node->cycle_info.id != -1 || node->cycle_info.reduc_idx != -1)
    3323        23821 :     dump_printf (metadata, " cycle %d, link %d", node->cycle_info.id,
    3324              :                  node->cycle_info.reduc_idx);
    3325       444202 :   dump_printf (metadata, "\n");
                          /* For internal defs print the operation: a fixed VEC_PERM_EXPR line
                             for permute nodes, otherwise the representative stmt.  */
    3326       444202 :   if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
    3327              :     {
    3328       361687 :       if (SLP_TREE_PERMUTE_P (node))
    3329        13668 :         dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
    3330              :       else
    3331       348019 :         dump_printf_loc (metadata, user_loc, "op template: %G",
    3332       348019 :                          SLP_TREE_REPRESENTATIVE (node)->stmt);
    3333              :     }
                          /* Lanes: the scalar stmts when that vector exists, otherwise the
                             scalar operands.  A NULL lane prints as "---"; a lane contained
                             in SLP_TREE_LIVE_LANES is tagged "[l*]" and a stmt that is
                             STMT_VINFO_LIVE_P is tagged "[l] ".  */
    3334       444202 :   if (SLP_TREE_SCALAR_STMTS (node).exists ())
    3335       865553 :     FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    3336       511969 :       if (stmt_info)
    3337       506688 :         dump_printf_loc (metadata, user_loc, "\t%sstmt %u %G",
    3338       506688 :                          SLP_TREE_LIVE_LANES (node).contains (i)
    3339       503079 :                          ? "[l*]" : (STMT_VINFO_LIVE_P (stmt_info)
    3340       503079 :                                      ? "[l] " : ""),
    3341              :                          i, stmt_info->stmt);
    3342              :       else
    3343         5281 :         dump_printf_loc (metadata, user_loc, "\tstmt %u ---\n", i);
    3344              :   else
    3345              :     {
    3346        90618 :       dump_printf_loc (metadata, user_loc, "\t{ ");
                              /* Operands are comma separated; the last one gets no comma.  */
    3347       199537 :       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
    3348       108919 :         dump_printf (metadata, "%T%s ", op,
    3349       108919 :                      i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
    3350        90618 :       dump_printf (metadata, "}\n");
    3351              :     }
                          /* NOTE(review): from here on the continuation calls pass DUMP_KIND
                             directly where the calls above pass METADATA -- presumably
                             harmless, but confirm the inconsistency is intended.  */
    3352       444202 :   if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
    3353              :     {
    3354        64658 :       dump_printf_loc (metadata, user_loc, "\tload permutation {");
    3355       147426 :       FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
    3356        82768 :         dump_printf (dump_kind, " %u", j);
    3357        64658 :       dump_printf (dump_kind, " }\n");
    3358              :     }
                          /* Each lane permutation entry prints as FIRST[SECOND] of the pair;
                             a node with ldst_lanes set is additionally tagged "(load-lanes)".  */
    3359       444202 :   if (SLP_TREE_LANE_PERMUTATION (node).exists ())
    3360              :     {
    3361        13676 :       dump_printf_loc (metadata, user_loc, "\tlane permutation {");
    3362        51245 :       for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
    3363        37569 :         dump_printf (dump_kind, " %u[%u]",
    3364        37569 :                      SLP_TREE_LANE_PERMUTATION (node)[i].first,
    3365        37569 :                      SLP_TREE_LANE_PERMUTATION (node)[i].second);
    3366        13676 :       dump_printf (dump_kind, " }%s\n",
    3367        13676 :                    node->ldst_lanes ? " (load-lanes)" : "");
    3368              :     }
                          /* Nodes without children get no "children" line.  */
    3369       444202 :   if (SLP_TREE_CHILDREN (node).is_empty ())
    3370       169327 :     return;
    3371       274875 :   dump_printf_loc (metadata, user_loc, "\tchildren");
    3372       725141 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    3373       450266 :     dump_printf (dump_kind, " %p", (void *)child);
                          /* A node with ldst_lanes set but no lane permutation is tagged
                             "(store-lanes)".  */
    3374       274875 :   dump_printf (dump_kind, "%s\n",
    3375       274875 :                node->ldst_lanes && !SLP_TREE_LANE_PERMUTATION (node).exists ()
    3376              :                ? " (store-lanes)" : "");
    3377              : }
    3378              : 
    3379              : DEBUG_FUNCTION void
    3380            0 : debug (slp_tree node)
    3381              : {
                          /* Debugger entry point: dump the single SLP tree NODE (not its
                             children) as a MSG_NOTE with an unknown source location.  CTX is
                             presumably what makes the dump output visible while it is in
                             scope -- see debug_dump_context.  */
    3382            0 :   debug_dump_context ctx;
    3383            0 :   vect_print_slp_tree (MSG_NOTE,
    3384            0 :                        dump_location_t::from_location_t (UNKNOWN_LOCATION),
    3385              :                        node);
    3386            0 : }
    3387              : 
    3388              : /* Recursive helper for the dot producer below.  Write to F a dot node
                           statement for NODE, keyed by its address and labelled with the
                           vect_print_slp_tree dump of NODE, then one edge per child, then
                           recurse into the non-NULL children.  Nodes already in VISITED are
                           skipped so every node is emitted once.  The label text is produced
                           through the dump machinery rather than written to F directly;
                           presumably the callers' debug_dump_context redirects it into F.  */
    3389              : 
    3390              : static void
    3391            0 : dot_slp_tree (FILE *f, slp_tree node, hash_set<slp_tree> &visited)
    3392              : {
    3393            0 :   if (visited.add (node))
    3394              :     return;
    3395              : 
    3396            0 :   fprintf (f, "\"%p\" [label=\"", (void *)node);
    3397            0 :   vect_print_slp_tree (MSG_NOTE,
    3398            0 :                        dump_location_t::from_location_t (UNKNOWN_LOCATION),
    3399              :                        node);
    3400            0 :   fprintf (f, "\"];\n");
    3401              : 
    3402              : 
                          /* NOTE(review): edges are emitted for NULL children as well (the
                             NULL check only guards the recursion below), so the graph can
                             contain an edge to a node that is never defined -- confirm this
                             is intended.  */
    3403            0 :   for (slp_tree child : SLP_TREE_CHILDREN (node))
    3404            0 :     fprintf (f, "\"%p\" -> \"%p\";", (void *)node, (void *)child);
    3405              : 
    3406            0 :   for (slp_tree child : SLP_TREE_CHILDREN (node))
    3407            0 :     if (child)
    3408            0 :       dot_slp_tree (f, child, visited);
    3409              : }
    3410              : 
    3411              : DEBUG_FUNCTION void
    3412            0 : dot_slp_tree (const char *fname, slp_tree node)
    3413              : {
                          /* Debugger entry point: write the SLP graph reachable from NODE to
                             the file FNAME (opened for writing) as a dot digraph.
                             NOTE(review): the fopen result is used unchecked; if FNAME cannot
                             be opened F is NULL and the fprintf below is invalid.  */
    3414            0 :   FILE *f = fopen (fname, "w");
    3415            0 :   fprintf (f, "digraph {\n");
                          /* Flush around the scoped dump context; presumably this keeps the
                             direct fprintf output and the dump output in order in F.  */
    3416            0 :   fflush (f);
    3417            0 :     {
    3418            0 :       debug_dump_context ctx (f);
    3419            0 :       hash_set<slp_tree> visited;
    3420            0 :       dot_slp_tree (f, node, visited);
    3421            0 :     }
    3422            0 :   fflush (f);
    3423            0 :   fprintf (f, "}\n");
    3424            0 :   fclose (f);
    3425            0 : }
    3426              : 
    3427              : DEBUG_FUNCTION void
    3428            0 : dot_slp_tree (const char *fname, const vec<slp_instance> &slp_instances)
    3429              : {
                          /* Debugger entry point: write the SLP graphs of all SLP_INSTANCES
                             to the file FNAME (opened for writing) as one dot digraph.  A
                             single VISITED set is shared across the instances, so a node
                             reachable from several of them is emitted once.
                             NOTE(review): the fopen result is used unchecked; if FNAME cannot
                             be opened F is NULL and the fprintf below is invalid.  */
    3430            0 :   FILE *f = fopen (fname, "w");
    3431            0 :   fprintf (f, "digraph {\n");
    3432            0 :   fflush (f);
    3433            0 :     {
    3434            0 :       debug_dump_context ctx (f);
    3435            0 :       hash_set<slp_tree> visited;
    3436            0 :       for (auto inst : slp_instances)
    3437            0 :         dot_slp_tree (f, SLP_INSTANCE_TREE (inst), visited);
    3438            0 :     }
    3439            0 :   fflush (f);
    3440            0 :   fprintf (f, "}\n");
    3441            0 :   fclose (f);
    3442            0 : }
    3443              : 
    3444              : /* Dump the SLP tree NODE and, recursively, every node reachable from it
                           through non-NULL children, using the flags specified in DUMP_KIND
                           at location LOC.  Nodes already in VISITED are skipped, so a node
                           shared between several parents is dumped only once.  */
    3445              : 
    3446              : static void
    3447       482917 : vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
    3448              :                       slp_tree node, hash_set<slp_tree> &visited)
    3449              : {
    3450       482917 :   unsigned i;
    3451       482917 :   slp_tree child;
    3452              : 
    3453       482917 :   if (visited.add (node))
    3454       482917 :     return;
    3455              : 
                          /* Print the node itself before descending into its children.  */
    3456       443728 :   vect_print_slp_tree (dump_kind, loc, node);
    3457              : 
    3458      1337208 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    3459       449752 :     if (child)
    3460       407129 :       vect_print_slp_graph (dump_kind, loc, child, visited);
    3461              : }
    3462              : 
    3463              : static void
    3464        46525 : vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
    3465              :                       slp_tree entry)
    3466              : {
                          /* Convenience overload: dump the whole SLP graph reachable from
                             ENTRY, starting with a fresh visited set.  */
    3467        46525 :   hash_set<slp_tree> visited;
    3468        46525 :   vect_print_slp_graph (dump_kind, loc, entry, visited);
    3469        46525 : }
    3470              : 
    3471              : DEBUG_FUNCTION void
    3472            0 : debug (slp_instance instance)
    3473              : {
                          /* Debugger entry point: dump the whole SLP graph rooted at the tree
                             of INSTANCE as MSG_NOTE with an unknown source location.  CTX is
                             presumably what makes the dump output visible while it is in
                             scope -- see debug_dump_context.  */
    3474            0 :   debug_dump_context ctx;
    3475            0 :   vect_print_slp_graph (MSG_NOTE,
    3476            0 :                         dump_location_t::from_location_t (UNKNOWN_LOCATION),
    3477              :                         SLP_INSTANCE_TREE (instance));
    3478            0 : }
    3479              : 
    3480              : 
    3481              : /* Compute the set of scalar stmts participating in external nodes.
                           Walk the SLP graph from NODE, descending through the non-NULL
                           children of vect_internal_def nodes.  For every other node, add to
                           ESTMTS the stmt_vec_info that VINFO->lookup_def finds for each of
                           its scalar operands, if there is one.  Nodes already in VISITED are
                           skipped.  */
    3482              : 
    3483              : static void
    3484      1554795 : vect_slp_gather_extern_scalar_stmts (vec_info *vinfo, slp_tree node,
    3485              :                                      hash_set<slp_tree> &visited,
    3486              :                                      hash_set<stmt_vec_info> &estmts)
    3487              : {
    3488      1554795 :   if (visited.add (node))
    3489              :     return;
    3490              : 
    3491      1511715 :   if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
    3492              :     {
    3493              :       slp_tree child;
    3494              :       int i;
    3495      1745788 :       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    3496       877785 :         if (child)
    3497       877785 :           vect_slp_gather_extern_scalar_stmts (vinfo, child, visited, estmts);
    3498              :     }
    3499              :   else
                            /* Non-internal node: record the defining stmts of its operands.
                               Operands for which lookup_def returns NULL are ignored.  */
    3500      3623780 :     for (tree def : SLP_TREE_SCALAR_OPS (node))
    3501              :       {
    3502      1694188 :         stmt_vec_info def_stmt = vinfo->lookup_def (def);
    3503      1694188 :         if (def_stmt)
    3504       336717 :           estmts.add (def_stmt);
    3505              :       }
    3506              : }
    3507              : 
    3508              : /* Mark the original scalar stmt coverage of the vector SLP graph of VINFO
    3509              :    with STMT_SLP_TYPE == pure_slp.  Stmts recorded as feeding non-internal
                           nodes of the graph are never marked and the walk does not continue
                           through them.  */
    3510              : 
    3511              : static void
    3512       234604 : vect_bb_slp_mark_stmts_vectorized (bb_vec_info vinfo)
    3513              : {
    3514              :   /* Gather the scalar stmt leafs of the SLP graph to stop the below DFS
    3515              :      walk on.  */
    3516       234604 :   hash_set<stmt_vec_info> scalar_stmts_in_externs;
    3517       234604 :   hash_set<slp_tree> visited;
    3518      1380822 :   for (auto instance : BB_VINFO_SLP_INSTANCES (vinfo))
    3519       677010 :     vect_slp_gather_extern_scalar_stmts (vinfo, SLP_INSTANCE_TREE (instance),
    3520              :                                          visited, scalar_stmts_in_externs);
    3521              : 
    3522              :   /* DFS walk scalar stmts to compute the vectorized coverage indicated
    3523              :      by STMT_SLP_TYPE (stmt) == pure_slp on the original scalar (non-pattern)
    3524              :      stmts.  */
    3525      1380822 :   for (auto instance : BB_VINFO_SLP_INSTANCES (vinfo))
    3526              :     {
                              /* Root stmts of the instance are marked directly; they are not
                                 pushed on the worklist.  */
    3527       787343 :       for (auto stmt : SLP_INSTANCE_ROOT_STMTS (instance))
    3528        52639 :         if (!scalar_stmts_in_externs.contains (stmt))
    3529        51941 :           STMT_SLP_TYPE (stmt) = pure_slp;
    3530       677010 :       auto_vec<stmt_vec_info> worklist;
                              /* Seed the worklist with the scalar stmts of the instance's root
                                 node, mapped through vect_orig_stmt.  Marking a stmt before
                                 pushing it ensures each stmt is queued at most once.  */
    3531      3826724 :       for (auto stmt : SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance)))
    3532              :         {
    3533      1795694 :           stmt = vect_orig_stmt (stmt);
    3534      1795694 :           if (!scalar_stmts_in_externs.contains (stmt)
    3535      1795694 :               && STMT_SLP_TYPE (stmt) != pure_slp)
    3536              :             {
    3537      1786625 :               STMT_SLP_TYPE (stmt) = pure_slp;
    3538      1786625 :               worklist.safe_push (stmt);
    3539              :             }
    3540              :         }
    3541      3575003 :       while (!worklist.is_empty ())
    3542              :         {
    3543      2223605 :           stmt_vec_info stmt = worklist.pop ();
    3544              : 
    3545              :           /* Now walk relevant parts of the SSA use-def graph.  */
    3546      2223605 :           slp_oprnds child_ops (stmt);
    3547      4682621 :           for (unsigned i = 0; i < child_ops.num_slp_children; ++i)
    3548              :             {
    3549      2459016 :               tree op = child_ops.get_op_for_slp_child (stmt, i);
    3550      2459016 :               stmt_vec_info def = vinfo->lookup_def (op);
                                      /* Follow the operand only if lookup_def finds a
                                         defining stmt that is neither an extern leaf nor
                                         already marked.  */
    3551      2459016 :               if (def
    3552       851073 :                   && !scalar_stmts_in_externs.contains (def)
    3553      2977281 :                   && STMT_SLP_TYPE (def) != pure_slp)
    3554              :                 {
    3555       436980 :                   STMT_SLP_TYPE (def) = pure_slp;
    3556       436980 :                   worklist.safe_push (def);
    3557              :                 }
    3558              :             }
    3559              :         }
    3560       677010 :     }
    3561       234604 : }
    3562              : 
    3563              : /* Mark the statements of the tree rooted at NODE as relevant
                           (vect_used_in_scope).  Only vect_internal_def nodes are processed;
                           other nodes are ignored.  Nodes already in VISITED are skipped.
                           Every non-NULL scalar stmt must be either not yet relevant or
                           already vect_used_in_scope, which is asserted.  */
    3564              : 
    3565              : static void
    3566      2488726 : vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
    3567              : {
    3568      2488726 :   int i;
    3569      2488726 :   stmt_vec_info stmt_info;
    3570      2488726 :   slp_tree child;
    3571              : 
                          /* The def-type check comes first, so non-internal nodes are never
                             entered into VISITED.  */
    3572      2488726 :   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    3573              :     return;
    3574              : 
    3575      1489676 :   if (visited.add (node))
    3576              :     return;
    3577              : 
    3578      4432615 :   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    3579      3088710 :     if (stmt_info)
    3580              :       {
    3581      3088710 :         gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
    3582              :                     || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
    3583      3088710 :         STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
    3584              :       }
    3585              : 
    3586      3056060 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    3587      1712155 :     if (child)
    3588      1712155 :       vect_mark_slp_stmts_relevant (child, visited);
    3589              : }
    3590              : 
    3591              : static void
    3592       776571 : vect_mark_slp_stmts_relevant (slp_tree node)
    3593              : {
                          /* Convenience overload: mark the whole tree rooted at NODE, starting
                             with a fresh visited set.  */
    3594       776571 :   hash_set<slp_tree> visited;
    3595       776571 :   vect_mark_slp_stmts_relevant (node, visited);
    3596       776571 : }
    3597              : 
    3598              : 
    3599              : /* Gather loads in the SLP graph NODE and populate the LOADS array.
                           NODE may be NULL.  A vect_internal_def node that is not a permute
                           node is pushed to LOADS when its representative stmt has a data
                           reference that is a read.  The children of every internal node are
                           then visited recursively.  Nodes already in VISITED and
                           non-internal nodes are skipped.  */
    3600              : 
    3601              : static void
    3602     10551353 : vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
    3603              :                        hash_set<slp_tree> &visited)
    3604              : {
    3605     10551353 :   if (!node || visited.add (node))
    3606      1736943 :     return;
    3607              : 
    3608      8814410 :   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    3609              :     return;
    3610              : 
                          /* Permute nodes are not inspected for a data reference.  */
    3611      6532066 :   if (!SLP_TREE_PERMUTE_P (node))
    3612              :     {
    3613      6325344 :       stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
    3614      6325344 :       if (STMT_VINFO_DATA_REF (stmt_info)
    3615      2740210 :           && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
    3616      1546854 :         loads.safe_push (node);
    3617              :     }
    3618              : 
    3619              :   unsigned i;
    3620              :   slp_tree child;
                          /* No NULL check is needed here; the callee handles NULL children.  */
    3621     14874948 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    3622      8342882 :     vect_gather_slp_loads (loads, child, visited);
    3623              : }
    3624              : 
    3625              : 
    3626              : /* Find the last scalar stmt in NODE; return NULL if there is none.  */
    3627              : 
    3628              : stmt_vec_info
    3629      2718181 : vect_find_last_scalar_stmt_in_slp (slp_tree node)
    3630              : {
    3631      2718181 :   stmt_vec_info last = NULL;
    3632      2718181 :   stmt_vec_info stmt_vinfo;
    3633              :   /* NULL lanes are skipped; stmts are compared via vect_orig_stmt.  */
    3634      9911769 :   for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
    3635      7193588 :     if (stmt_vinfo)
    3636              :       {
    3637      7193588 :         stmt_vinfo = vect_orig_stmt (stmt_vinfo);
    3638      7193588 :         last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
    3639              :       }
    3640              : 
    3641      2718181 :   return last;
    3642              : }
    3643              : 
    3644              : /* Find the first scalar stmt in NODE; return NULL if there is none.  */
    3645              : 
    3646              : stmt_vec_info
    3647       530660 : vect_find_first_scalar_stmt_in_slp (slp_tree node)
    3648              : {
    3649       530660 :   stmt_vec_info first = NULL;
    3650       530660 :   stmt_vec_info stmt_vinfo;
    3651              :   /* STMT_VINFO replaces FIRST whenever FIRST is the later of the two.  */
    3652      1798300 :   for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
    3653      1267640 :     if (stmt_vinfo)
    3654              :       {
    3655      1264946 :         stmt_vinfo = vect_orig_stmt (stmt_vinfo);
    3656      1264946 :         if (!first
    3657      1264946 :             || get_later_stmt (stmt_vinfo, first) == first)
    3658              :           first = stmt_vinfo;
    3659              :       }
    3660              : 
    3661       530660 :   return first;
    3662              : }
    3663              : 
    3664              : /* Splits a group of stores, currently beginning at FIRST_VINFO, into
    3665              :    two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
    3666              :    (also containing the first GROUP1_SIZE stmts, since stores are
    3667              :    consecutive), the second containing the remainder.
    3668              :    Return the first stmt in the second group.  */
    3669              : 
    3670              : static stmt_vec_info
    3671       156061 : vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
    3672              : {
    3673       156061 :   gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
    3674       156061 :   gcc_assert (group1_size > 0);
    3675       156061 :   int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
    3676       156061 :   gcc_assert (group2_size > 0);
    3677       156061 :   DR_GROUP_SIZE (first_vinfo) = group1_size;
    3678              :   /* Follow GROUP1_SIZE - 1 links; every element reached must have gap 1.  */
    3679       156061 :   stmt_vec_info stmt_info = first_vinfo;
    3680       523166 :   for (unsigned i = group1_size; i > 1; i--)
    3681              :     {
    3682       367105 :       stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
    3683       367105 :       gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
    3684              :     }
    3685              :   /* STMT_INFO is now the last element of the first group.  */
    3686       156061 :   stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
    3687       156061 :   DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
    3688              : 
    3689       156061 :   DR_GROUP_SIZE (group2) = group2_size;
    3690       436482 :   for (stmt_info = group2; stmt_info;
    3691       280421 :        stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
    3692              :     {
    3693       280421 :       DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
    3694       280421 :       gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
    3695              :     }
    3696              : 
    3697              :   /* For the second group, the DR_GROUP_GAP is that before the original group,
    3698              :      plus skipping over the first group.  */
    3699       156061 :   DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
    3700              : 
    3701              :   /* DR_GROUP_GAP of the first group now has to skip over the second group too.  */
    3702       156061 :   DR_GROUP_GAP (first_vinfo) += group2_size;
    3703              : 
    3704       156061 :   if (dump_enabled_p ())
    3705           61 :     dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
    3706              :                      group1_size, group2_size);
    3707              : 
    3708       156061 :   return group2;
    3709              : }
    3710              : 
    3711              : /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
    3712              :    stmts and NUNITS-element vectors: their common multiple / GROUP_SIZE.  */
    3713              : 
    3714              : static poly_uint64
    3715      4134296 : calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
    3716              : {
    3717      4134296 :   return exact_div (common_multiple (nunits, group_size), group_size);
    3718              : }
    3719              : 
    3720              : /* Return true if ROOT is an internal-def non-permute grouped load node.  */
    3721              : 
    3722              : static inline bool
    3723          108 : vect_is_slp_load_node  (slp_tree root)
    3724              : {
    3725          108 :   return (!SLP_TREE_PERMUTE_P (root)
    3726          108 :           && SLP_TREE_DEF_TYPE (root) == vect_internal_def
    3727          102 :           && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
    3728          172 :           && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root))));
    3729              : }
    3730              : 
    3731              : 
    3732              : /* Helper function of optimize_load_redistribution that performs the operation
    3733              :    recursively on ROOT.  Return the load node replacing ROOT, or NULL.  */
    3734              : 
    3735              : static slp_tree
    3736        20434 : optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
    3737              :                                 vec_info *vinfo, unsigned int group_size,
    3738              :                                 hash_map<slp_tree, slp_tree> *load_map,
    3739              :                                 slp_tree root)
    3740              : {
    3741        20434 :   if (slp_tree *leader = load_map->get (root))
    3742         3669 :     return *leader;
    3743              : 
    3744        16765 :   slp_tree node;
    3745        16765 :   unsigned i;
    3746              : 
    3747              :   /* For now, we don't know anything about externals so do not do anything.  */
    3748        16765 :   if (!root || SLP_TREE_DEF_TYPE (root) != vect_internal_def)
    3749              :     return NULL;
    3750        12385 :   else if (SLP_TREE_PERMUTE_P (root))
    3751              :     {
    3752              :       /* First convert this node into a load node and add it to the leaves
    3753              :          list and flatten the permute from a lane to a load one.  If it's
    3754              :          unneeded it will be elided later.  */
    3755           76 :       vec<stmt_vec_info> stmts;
    3756           76 :       stmts.create (SLP_TREE_LANES (root));
    3757           76 :       lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
    3758          140 :       for (unsigned j = 0; j < lane_perm.length (); j++)
    3759              :         {
    3760          108 :           std::pair<unsigned, unsigned> perm = lane_perm[j];
    3761          108 :           node = SLP_TREE_CHILDREN (root)[perm.first];
    3762              :           /* Give up unless the selected input is a childless load node.  */
    3763          108 :           if (!vect_is_slp_load_node (node)
    3764          108 :               || SLP_TREE_CHILDREN (node).exists ())
    3765              :             {
    3766           44 :               stmts.release ();
    3767           44 :               goto next;
    3768              :             }
    3769              : 
    3770           64 :           stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
    3771              :         }
    3772              : 
    3773           32 :       if (dump_enabled_p ())
    3774            0 :         dump_printf_loc (MSG_NOTE, vect_location,
    3775              :                          "converting stmts on permute node %p\n",
    3776              :                          (void *) root);
    3777              :       /* Build an SLP node for the gathered scalar stmts.  */
    3778           32 :       bool *matches = XALLOCAVEC (bool, group_size);
    3779           32 :       poly_uint64 max_nunits = 1;
    3780           32 :       unsigned tree_size = 0, limit = 1;
    3781           32 :       node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
    3782              :                                   matches, &limit, &tree_size, bst_map);
    3783           32 :       if (!node)
    3784            0 :         stmts.release ();
    3785              : 
    3786           32 :       load_map->put (root, node);
    3787           32 :       return node;
    3788              :     }
    3789              :   /* ROOT itself is not replaced; record that and process its children.  */
    3790        12309 : next:
    3791        12353 :   load_map->put (root, NULL);
    3792              : 
    3793        29030 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
    3794              :     {
    3795        16677 :       slp_tree value
    3796        16677 :         = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
    3797              :                                           node);
    3798        16677 :       if (value)
    3799              :         {
    3800           32 :           SLP_TREE_REF_COUNT (value)++;
    3801           32 :           SLP_TREE_CHILDREN (root)[i] = value;
    3802              :           /* ???  We know the original leafs of the replaced nodes will
    3803              :              be referenced by bst_map, only the permutes created by
    3804              :              pattern matching are not.  */
    3805           32 :           if (SLP_TREE_REF_COUNT (node) == 1)
    3806           32 :             load_map->remove (node);
    3807           32 :           vect_free_slp_tree (node);
    3808              :         }
    3809              :     }
    3810              : 
    3811              :   return NULL;
    3812              : }
    3813              : 
    3814              : /* Temporary workaround for loads not being CSEd during SLP build.  This
    3815              :    function will traverse the SLP tree rooted in ROOT and find VEC_PERM
    3816              :    nodes that blend vectors from multiple nodes that all read from the
    3817              :    same DR such that the final operation is equal to a permuted load.  Such
    3818              :    NODES are then directly converted into LOADS themselves.  The nodes are
    3819              :    CSEd using BST_MAP; LOAD_MAP records the outcome for each visited node.  */
    3820              : 
    3821              : static void
    3822         2851 : optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
    3823              :                               vec_info *vinfo, unsigned int group_size,
    3824              :                               hash_map<slp_tree, slp_tree> *load_map,
    3825              :                               slp_tree root)
    3826              : {
    3827         2851 :   slp_tree node;
    3828         2851 :   unsigned i;
    3829              :   /* Replace each child of ROOT for which the helper returns a node.  */
    3830         6608 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
    3831              :     {
    3832         3757 :       slp_tree value
    3833         3757 :         = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
    3834              :                                           node);
    3835         3757 :       if (value)
    3836              :         {
    3837            0 :           SLP_TREE_REF_COUNT (value)++;
    3838            0 :           SLP_TREE_CHILDREN (root)[i] = value;
    3839              :           /* ???  We know the original leafs of the replaced nodes will
    3840              :              be referenced by bst_map, only the permutes created by
    3841              :              pattern matching are not.  */
    3842            0 :           if (SLP_TREE_REF_COUNT (node) == 1)
    3843            0 :             load_map->remove (node);
    3844            0 :           vect_free_slp_tree (node);
    3845              :         }
    3846              :     }
    3847         2851 : }
    3848              : 
    3849              : /* Helper function of vect_match_slp_patterns.
    3850              : 
    3851              :    Attempts to match patterns against the slp tree rooted in REF_NODE using
    3852              :    VINFO.  Patterns are matched in post-order traversal.
    3853              : 
    3854              :    Return true if a pattern matched anywhere in the tree.  Nodes already
    3855              :    in VISITED are skipped and yield false.  */
    3856              : 
    3857              : static bool
    3858      6082109 : vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
    3859              :                            slp_tree_to_load_perm_map_t *perm_cache,
    3860              :                            slp_compat_nodes_map_t *compat_cache,
    3861              :                            hash_set<slp_tree> *visited)
    3862              : {
    3863      6082109 :   unsigned i;
    3864      6082109 :   slp_tree node = *ref_node;
    3865      6082109 :   bool found_p = false;
    3866      6082109 :   if (!node || visited->add (node))
    3867       868815 :     return false;
    3868              : 
    3869              :   slp_tree child;
    3870      9755493 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    3871      4542199 :     found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
    3872              :                                           vinfo, perm_cache, compat_cache,
    3873              :                                           visited);
    3874              :   /* Children were handled above; now try every pattern on this node.  */
    3875     15639882 :   for (unsigned x = 0; x < num__slp_patterns; x++)
    3876              :     {
    3877     10426588 :       vect_pattern *pattern
    3878     10426588 :         = slp_patterns[x] (perm_cache, compat_cache, ref_node);
    3879     10426588 :       if (pattern)
    3880              :         {
    3881         1171 :           pattern->build (vinfo);
    3882         1171 :           delete pattern;
    3883         1171 :           found_p = true;
    3884              :         }
    3885              :     }
    3886              : 
    3887              :   return found_p;
    3888              : }
    3889              : 
    3890              : /* Applies pattern matching to the SLP tree of INSTANCE using vec_info VINFO.
    3891              :    VISITED, PERM_CACHE and COMPAT_CACHE are passed on to the matcher.
    3892              : 
    3893              :    Return true if a pattern matched.  Patterns are tried in order and multiple
    3894              :    patterns may match.  */
    3895              : 
    3896              : static bool
    3897      1539910 : vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
    3898              :                          hash_set<slp_tree> *visited,
    3899              :                          slp_tree_to_load_perm_map_t *perm_cache,
    3900              :                          slp_compat_nodes_map_t *compat_cache)
    3901              : {
    3902      1539910 :   DUMP_VECT_SCOPE ("vect_match_slp_patterns");
    3903      1539910 :   slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
    3904              : 
    3905      1539910 :   if (dump_enabled_p ())
    3906        30427 :     dump_printf_loc (MSG_NOTE, vect_location,
    3907              :                      "Analyzing SLP tree %p for patterns\n",
    3908        30427 :                      (void *) SLP_INSTANCE_TREE (instance));
    3909              : 
    3910      1539910 :   return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, compat_cache,
    3911      1539910 :                                     visited);
    3912              : }
    3913              : 
    3914              : /* STMT_INFO is a store group of size GROUP_SIZE that we are considering
    3915              :    splitting into groups of NEW_GROUP_SIZE and GROUP_SIZE - NEW_GROUP_SIZE
    3916              :    and vectorizing with VECTYPE that might be NULL.  MASKED_P indicates
    3917              :    whether the stores are masked.  Return true if we could use
    3918              :    IFN_STORE_LANES instead and if that appears to be the better approach.  */
    3919              : 
    3920              : static bool
    3921         5812 : vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
    3922              :                                tree vectype, bool masked_p,
    3923              :                                unsigned int group_size,
    3924              :                                unsigned int new_group_size)
    3925              : {
    3926         5812 :   if (!vectype)
    3927              :     {
    3928         5812 :       tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
    3929         5812 :       vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
    3930              :     }
    3931         5812 :   if (!vectype)
    3932              :     return false;
    3933              :   /* Allow the split if one of the two new groups would operate on full
    3934              :      vectors *within* rather than across one scalar loop iteration.
    3935              :      This is purely a heuristic, but it should work well for group
    3936              :      sizes of 3 and 4, where the possible splits are:
    3937              : 
    3938              :        3->2+1:  OK if the vector has exactly two elements
    3939              :        4->2+2:  Likewise
    3940              :        4->3+1:  Less clear-cut.  */
    3941         5812 :   if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
    3942         3259 :       || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
    3943         2576 :     return false;
    3944         3236 :   return vect_store_lanes_supported (vectype, group_size, masked_p) != IFN_LAST;
    3945              : }
    3946              : 
    3947              : /* Analyze an SLP instance starting from a group of grouped stores.  Call
    3948              :    vect_build_slp_tree to build a tree of packed stmts if possible.
    3949              :    Return FALSE if it's impossible to SLP any stmt in the loop.  */
    3950              : 
    3951              : static bool
    3952              : vect_analyze_slp_instance (vec_info *vinfo,
    3953              :                            scalar_stmts_to_slp_tree_map_t *bst_map,
    3954              :                            stmt_vec_info stmt_info, slp_instance_kind kind,
    3955              :                            unsigned max_tree_size, unsigned *limit,
    3956              :                            bool force_single_lane);
    3957              : 
    3958              : /* Build an interleaving scheme for the store sources RHS_NODES from
    3959              :    SCALAR_STMTS and return it.  MAX_NUNITS is set on every node created.  */
    3960              : 
    3961              : static slp_tree
    3962         7712 : vect_build_slp_store_interleaving (vec<slp_tree> &rhs_nodes,
    3963              :                                    vec<stmt_vec_info> &scalar_stmts,
    3964              :                                    poly_uint64 max_nunits)
    3965              : {
    3966         7712 :   unsigned int group_size = scalar_stmts.length ();
    3967        15424 :   slp_tree node = vect_create_new_slp_node (scalar_stmts,
    3968         7712 :                                             SLP_TREE_CHILDREN
    3969              :                                               (rhs_nodes[0]).length ());
    3970         7712 :   SLP_TREE_VECTYPE (node) = SLP_TREE_VECTYPE (rhs_nodes[0]);
    3971         7712 :   node->max_nunits = max_nunits;
    3972         7712 :   for (unsigned l = 0;
    3973        15451 :        l < SLP_TREE_CHILDREN (rhs_nodes[0]).length (); ++l)
    3974              :     {
    3975              :       /* And a permute merging all RHS SLP trees.  */
    3976         7739 :       slp_tree perm = vect_create_new_slp_node (rhs_nodes.length (),
    3977         7739 :                                                 VEC_PERM_EXPR);
    3978         7739 :       SLP_TREE_CHILDREN (node).quick_push (perm);
    3979         7739 :       SLP_TREE_LANE_PERMUTATION (perm).create (group_size);
    3980         7739 :       SLP_TREE_VECTYPE (perm) = SLP_TREE_VECTYPE (node);
    3981         7739 :       perm->max_nunits = max_nunits;
    3982         7739 :       SLP_TREE_LANES (perm) = group_size;
    3983              :       /* ???  We should set this NULL but that's not expected.  */
    3984         7739 :       SLP_TREE_REPRESENTATIVE (perm)
    3985         7739 :         = SLP_TREE_REPRESENTATIVE (SLP_TREE_CHILDREN (rhs_nodes[0])[l]);
    3986        30312 :       for (unsigned j = 0; j < rhs_nodes.length (); ++j)
    3987              :         {
    3988        22573 :           SLP_TREE_CHILDREN (perm)
    3989        22573 :             .quick_push (SLP_TREE_CHILDREN (rhs_nodes[j])[l]);
    3990        22573 :           SLP_TREE_CHILDREN (rhs_nodes[j])[l]->refcnt++;
    3991        22573 :           for (unsigned k = 0;
    3992        47478 :                k < SLP_TREE_SCALAR_STMTS (rhs_nodes[j]).length (); ++k)
    3993              :             {
    3994              :               /* ???  We should populate SLP_TREE_SCALAR_STMTS
    3995              :                  or SLP_TREE_SCALAR_OPS but then we might have
    3996              :                  a mix of both in our children.  */
    3997        24905 :               SLP_TREE_LANE_PERMUTATION (perm)
    3998        24905 :                 .quick_push (std::make_pair (j, k));
    3999              :             }
    4000              :         }
    4001              : 
    4002              :       /* Now we have a single permute node but we cannot code-generate
    4003              :          the case with more than two inputs.
    4004              :          Perform pairwise reduction, reducing the two inputs
    4005              :          with the least number of lanes to one and then repeat until
    4006              :          we end up with two inputs.  That scheme makes sure we end
    4007              :          up with permutes satisfying the restriction of requiring at
    4008              :          most two vector inputs to produce a single vector output
    4009              :          when the number of lanes is even.  */
    4010        14834 :       while (SLP_TREE_CHILDREN (perm).length () > 2)
    4011              :         {
    4012              :           /* When we have three equal sized groups left the pairwise
    4013              :              reduction does not result in a scheme that avoids using
    4014              :              three vectors.  Instead merge the first two groups
    4015              :              to the final size with do-not-care elements (chosen
    4016              :              from the first group) and then merge with the third.
    4017              :                   { A0, B0,  x, A1, B1,  x, ... }
    4018              :                -> { A0, B0, C0, A1, B1, C1, ... }
    4019              :              This handles group size of three (and at least
    4020              :              power-of-two multiples of that).  */
    4021         7095 :           if (SLP_TREE_CHILDREN (perm).length () == 3
    4022         3271 :               && (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[0])
    4023         3271 :                   == SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[1]))
    4024         7095 :               && (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[0])
    4025         2453 :                   == SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[2])))
    4026              :             {
    4027         2147 :               int ai = 0;
    4028         2147 :               int bi = 1;
    4029         2147 :               slp_tree a = SLP_TREE_CHILDREN (perm)[ai];
    4030         2147 :               slp_tree b = SLP_TREE_CHILDREN (perm)[bi];
    4031         2147 :               unsigned n = SLP_TREE_LANES (perm);
    4032              :               /* The merge of A and B is given all N lanes of PERM.  */
    4033         2147 :               slp_tree permab = vect_create_new_slp_node (2, VEC_PERM_EXPR);
    4034         2147 :               SLP_TREE_LANES (permab) = n;
    4035         2147 :               SLP_TREE_LANE_PERMUTATION (permab).create (n);
    4036         2147 :               SLP_TREE_VECTYPE (permab) = SLP_TREE_VECTYPE (perm);
    4037         2147 :               permab->max_nunits = max_nunits;
    4038              :               /* ???  Should be NULL but that's not expected.  */
    4039         2147 :               SLP_TREE_REPRESENTATIVE (permab) = SLP_TREE_REPRESENTATIVE (perm);
    4040         2147 :               SLP_TREE_CHILDREN (permab).quick_push (a);
    4041         4308 :               for (unsigned k = 0; k < SLP_TREE_LANES (a); ++k)
    4042         2161 :                 SLP_TREE_LANE_PERMUTATION (permab)
    4043         2161 :                   .quick_push (std::make_pair (0, k));
    4044         2147 :               SLP_TREE_CHILDREN (permab).quick_push (b);
    4045         4308 :               for (unsigned k = 0; k < SLP_TREE_LANES (b); ++k)
    4046         2161 :                 SLP_TREE_LANE_PERMUTATION (permab)
    4047         2161 :                   .quick_push (std::make_pair (1, k));
    4048              :               /* Push the do-not-care lanes.  */
    4049         4308 :               for (unsigned k = 0; k < SLP_TREE_LANES (a); ++k)
    4050         2161 :                 SLP_TREE_LANE_PERMUTATION (permab)
    4051         2161 :                   .quick_push (std::make_pair (0, k));
    4052              : 
    4053              :               /* Put the merged node into 'perm', in place of a.  */
    4054         2147 :               SLP_TREE_CHILDREN (perm)[ai] = permab;
    4055              :               /* Adjust the references to b in the permutation
    4056              :                  of perm and to the later children which we'll
    4057              :                  remove.  */
    4058         8630 :               for (unsigned k = 0; k < SLP_TREE_LANES (perm); ++k)
    4059              :                 {
    4060         6483 :                   std::pair<unsigned, unsigned> &p
    4061         6483 :                     = SLP_TREE_LANE_PERMUTATION (perm)[k];
    4062         6483 :                   if (p.first == (unsigned) bi)
    4063              :                     {
    4064         2161 :                       p.first = ai;
    4065         2161 :                       p.second += SLP_TREE_LANES (a);
    4066              :                     }
    4067         4322 :                   else if (p.first > (unsigned) bi)
    4068         2161 :                     p.first--;
    4069              :                 }
    4070         2147 :               SLP_TREE_CHILDREN (perm).ordered_remove (bi);
    4071         2147 :               break;
    4072              :             }
    4073              : 
    4074              :           /* Pick the two nodes with the least number of lanes,
    4075              :              prefer the earliest candidate and maintain ai < bi.  */
    4076              :           int ai = -1;
    4077              :           int bi = -1;
    4078        45078 :           for (unsigned ci = 0; ci < SLP_TREE_CHILDREN (perm).length (); ++ci)
    4079              :             {
    4080        40130 :               if (ai == -1)
    4081         4948 :                 ai = ci;
    4082        35182 :               else if (bi == -1)
    4083         4948 :                 bi = ci;
    4084        30234 :               else if ((SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ci])
    4085        30234 :                         < SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ai]))
    4086        30234 :                        || (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ci])
    4087        24904 :                            < SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[bi])))
    4088              :                 {
    4089        11548 :                   if (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ai])
    4090         5774 :                       <= SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[bi]))
    4091         2687 :                     bi = ci;
    4092              :                   else
    4093              :                     {
    4094         3087 :                       ai = bi;
    4095         3087 :                       bi = ci;
    4096              :                     }
    4097              :                 }
    4098              :             }
    4099              : 
    4100              :           /* Produce a merge of nodes ai and bi.  */
    4101         4948 :           slp_tree a = SLP_TREE_CHILDREN (perm)[ai];
    4102         4948 :           slp_tree b = SLP_TREE_CHILDREN (perm)[bi];
    4103         4948 :           unsigned n = SLP_TREE_LANES (a) + SLP_TREE_LANES (b);
    4104         4948 :           slp_tree permab = vect_create_new_slp_node (2, VEC_PERM_EXPR);
    4105         4948 :           SLP_TREE_LANES (permab) = n;
    4106         4948 :           SLP_TREE_LANE_PERMUTATION (permab).create (n);
    4107         4948 :           SLP_TREE_VECTYPE (permab) = SLP_TREE_VECTYPE (perm);
    4108         4948 :           permab->max_nunits = max_nunits;
    4109              :           /* ???  Should be NULL but that's not expected.  */
    4110         4948 :           SLP_TREE_REPRESENTATIVE (permab) = SLP_TREE_REPRESENTATIVE (perm);
    4111         4948 :           SLP_TREE_CHILDREN (permab).quick_push (a);
    4112        13096 :           for (unsigned k = 0; k < SLP_TREE_LANES (a); ++k)
    4113         8148 :             SLP_TREE_LANE_PERMUTATION (permab)
    4114         8148 :               .quick_push (std::make_pair (0, k));
    4115         4948 :           SLP_TREE_CHILDREN (permab).quick_push (b);
    4116        12420 :           for (unsigned k = 0; k < SLP_TREE_LANES (b); ++k)
    4117         7472 :             SLP_TREE_LANE_PERMUTATION (permab)
    4118         7472 :               .quick_push (std::make_pair (1, k));
    4119              : 
    4120              :           /* Put the merged node into 'perm', in place of a.  */
    4121         4948 :           SLP_TREE_CHILDREN (perm)[ai] = permab;
    4122              :           /* Adjust the references to b in the permutation
    4123              :              of perm and to the later children which we'll
    4124              :              remove.  */
    4125        72097 :           for (unsigned k = 0; k < SLP_TREE_LANES (perm); ++k)
    4126              :             {
    4127        67149 :               std::pair<unsigned, unsigned> &p
    4128        67149 :                 = SLP_TREE_LANE_PERMUTATION (perm)[k];
    4129        67149 :               if (p.first == (unsigned) bi)
    4130              :                 {
    4131         7472 :                   p.first = ai;
    4132         7472 :                   p.second += SLP_TREE_LANES (a);
    4133              :                 }
    4134        59677 :               else if (p.first > (unsigned) bi)
    4135        25082 :                 p.first--;
    4136              :             }
    4137         4948 :           SLP_TREE_CHILDREN (perm).ordered_remove (bi);
    4138              :         }
    4139              :     }
    4140              : 
    4141         7712 :   return node;
    4142              : }
    4143              : 
    4144              : /* Build an SLP instance of KIND from the group SCALAR_STMTS; on success
    4145              :    push it to VINFO->slp_instances and return true.  SCALAR_STMTS is owned
    4146              :    by this function; REMAIN and ROOT_STMT_INFOS ownership is transferred
    4147              :    back to the caller upon failure.  *LIMIT is the discovery budget.  */
    4148              : 
    4149              : static bool
    4150      1869561 : vect_build_slp_instance (vec_info *vinfo,
    4151              :                          slp_instance_kind kind,
    4152              :                          vec<stmt_vec_info> &scalar_stmts,
    4153              :                          vec<stmt_vec_info> &root_stmt_infos,
    4154              :                          vec<tree> &remain,
    4155              :                          unsigned max_tree_size, unsigned *limit,
    4156              :                          scalar_stmts_to_slp_tree_map_t *bst_map,
    4157              :                          bool force_single_lane)
    4158              : {
    4159              :   /* If there's no budget left bail out early.  */
    4160      1869561 :   if (*limit == 0)
    4161              :     {
    4162        27238 :       scalar_stmts.release ();
    4163        27238 :       return false;
    4164              :     }
    4165              : 
    4166      1842323 :   if (kind == slp_inst_kind_ctor)
    4167              :     {
    4168        12824 :       if (dump_enabled_p ())
    4169           86 :         dump_printf_loc (MSG_NOTE, vect_location,
    4170              :                          "Analyzing vectorizable constructor: %G\n",
    4171           43 :                          root_stmt_infos[0]->stmt);
    4172              :     }
    4173      1829499 :   else if (kind == slp_inst_kind_gcond)
    4174              :     {
    4175       275635 :       if (dump_enabled_p ())
    4176         5624 :         dump_printf_loc (MSG_NOTE, vect_location,
    4177              :                          "Analyzing vectorizable control flow: %G",
    4178         2812 :                          root_stmt_infos[0]->stmt);
    4179              :     }
    4180              : 
    4181      1842323 :   if (dump_enabled_p ())
    4182              :     {
    4183        25502 :       dump_printf_loc (MSG_NOTE, vect_location,
    4184              :                        "Starting SLP discovery for\n");
    4185        54440 :       for (unsigned i = 0; i < scalar_stmts.length (); ++i)
    4186        57876 :         dump_printf_loc (MSG_NOTE, vect_location,
    4187        28938 :                          "  %G", scalar_stmts[i]->stmt);
    4188              :     }
    4189              : 
    4190              :   /* Build the tree for the SLP instance.  */
    4191      1842323 :   unsigned int group_size = scalar_stmts.length ();
    4192      1842323 :   bool *matches = XALLOCAVEC (bool, group_size);
    4193      1842323 :   poly_uint64 max_nunits = 1;
    4194      1842323 :   unsigned tree_size = 0;
    4195              : 
    4196      1842323 :   slp_tree node = NULL;
    4197      1842323 :   if (group_size > 1 && force_single_lane)
    4198              :     {
    4199            0 :       matches[0] = true;
    4200            0 :       matches[1] = false;
    4201              :     }
    4202              :   else
    4203      1842323 :     node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
    4204              :                                 &max_nunits, matches, limit,
    4205              :                                 &tree_size, bst_map);
    4206      1842323 :   if (node != NULL)
    4207              :     {
    4208              :       /* Calculate the unrolling factor based on the smallest type.  */
    4209       758165 :       poly_uint64 unrolling_factor
    4210       758165 :         = calculate_unrolling_factor (max_nunits, group_size);
    4211              : 
    4212       758165 :       if (maybe_ne (unrolling_factor, 1U)
    4213       758165 :           && is_a <bb_vec_info> (vinfo))
    4214              :         {
    4215            0 :           unsigned HOST_WIDE_INT const_max_nunits;
    4216            0 :           if (!max_nunits.is_constant (&const_max_nunits)
    4217            0 :               || const_max_nunits > group_size)
    4218              :             {
    4219            0 :               if (dump_enabled_p ())
    4220            0 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    4221              :                                  "Build SLP failed: store group "
    4222              :                                  "size not a multiple of the vector size "
    4223              :                                  "in basic block SLP\n");
    4224            0 :               vect_free_slp_tree (node);
    4225            0 :               return false;
    4226              :             }
    4227              :           /* Fatal mismatch: flag the first lane past the last whole vector.  */
    4228            0 :           if (dump_enabled_p ())
    4229            0 :             dump_printf_loc (MSG_NOTE, vect_location,
    4230              :                              "SLP discovery succeeded but node needs "
    4231              :                              "splitting\n");
    4232            0 :           memset (matches, true, group_size);
    4233            0 :           matches[group_size / const_max_nunits * const_max_nunits] = false;
    4234            0 :           vect_free_slp_tree (node);
    4235              :         }
    4236              :       else
    4237              :         {
    4238              :           /* Create a new SLP instance.  */
    4239       758165 :           slp_instance new_instance = XNEW (class _slp_instance);
    4240       758165 :           SLP_INSTANCE_TREE (new_instance) = node;
    4241       758165 :           SLP_INSTANCE_LOADS (new_instance) = vNULL;
    4242       758165 :           SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
    4243       758165 :           SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
    4244       758165 :           SLP_INSTANCE_KIND (new_instance) = kind;
    4245       758165 :           new_instance->reduc_phis = NULL;
    4246       758165 :           new_instance->cost_vec = vNULL;
    4247       758165 :           new_instance->subgraph_entries = vNULL;
    4248              : 
    4249       758165 :           if (dump_enabled_p ())
    4250        22441 :             dump_printf_loc (MSG_NOTE, vect_location,
    4251              :                              "SLP size %u vs. limit %u.\n",
    4252              :                              tree_size, max_tree_size);
    4253              : 
    4254       758165 :           vinfo->slp_instances.safe_push (new_instance);
    4255              : 
    4256              :           /* ???  We've replaced the old SLP_INSTANCE_GROUP_SIZE with
    4257              :              the number of scalar stmts in the root in a few places.
    4258              :              Verify that assumption holds.  */
    4259      1516330 :           gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
    4260              :                         .length () == group_size);
    4261              : 
    4262       758165 :           if (dump_enabled_p ())
    4263              :             {
    4264        22441 :               if (kind == slp_inst_kind_reduc_group)
    4265         1449 :                 dump_printf_loc (MSG_NOTE, vect_location,
    4266              :                                  "SLP discovery of size %d reduction group "
    4267              :                                  "succeeded\n", group_size);
    4268        22441 :               dump_printf_loc (MSG_NOTE, vect_location,
    4269              :                                "Final SLP tree for instance %p:\n",
    4270              :                                (void *) new_instance);
    4271        22441 :               vect_print_slp_graph (MSG_NOTE, vect_location,
    4272              :                                     SLP_INSTANCE_TREE (new_instance));
    4273              :             }
    4274              : 
    4275       758165 :           return true;
    4276              :         }
    4277              :     }
    4278              :   /* Failed to SLP.  */
    4279              : 
    4280              :   /* While we arrive here even with slp_inst_kind_store we should only
    4281              :      do so for group_size == 1.  The code to split store groups is only
    4282              :      in vect_analyze_slp_instance now.  */
    4283      1084158 :   gcc_assert (kind != slp_inst_kind_store || group_size == 1);
    4284              : 
    4285              :   /* Free the allocated memory.  */
    4286      1084158 :   scalar_stmts.release ();
    4287              : 
    4288              :   /* Failed to SLP.  */
    4289      1084158 :   if (dump_enabled_p ())
    4290         3061 :     dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
    4291              :   return false;
    4292              : }
    4293              : 
    4294              : /* Analyze an SLP instance starting from the start of a reduction chain.
    4295              :    Call vect_build_slp_tree to build a tree of packed stmts if possible.
    4296              :    Return FALSE if SLP build fails.  */
    4297              : 
    4298              : static bool
    4299        63371 : vect_analyze_slp_reduc_chain (loop_vec_info vinfo,
    4300              :                               scalar_stmts_to_slp_tree_map_t *bst_map,
    4301              :                               stmt_vec_info scalar_stmt,
    4302              :                               unsigned max_tree_size, unsigned *limit)
    4303              : {
    4304        63371 :   vec<stmt_vec_info> scalar_stmts = vNULL;
    4305              : 
    4306        63371 :   bool fail = false;
    4307              :   /* ???  We could leave operation code checking to SLP discovery.  */
    4308        63371 :   code_helper code = STMT_VINFO_REDUC_CODE (STMT_VINFO_REDUC_DEF
    4309              :                                               (vect_orig_stmt (scalar_stmt)));
    4310        63371 :   bool first = true;
    4311        63371 :   stmt_vec_info next_stmt = scalar_stmt;
    4312        71558 :   do
    4313              :     {
    4314        71558 :       stmt_vec_info stmt = next_stmt;
    4315        71558 :       gimple_match_op op;
    4316        71558 :       if (!gimple_extract_op (STMT_VINFO_STMT (stmt), &op))
    4317            0 :         gcc_unreachable ();
    4318       143116 :       tree reduc_def = gimple_arg (STMT_VINFO_STMT (stmt),
    4319        71558 :                                    STMT_VINFO_REDUC_IDX (stmt));
    4320        71558 :       next_stmt = vect_stmt_to_vectorize (vinfo->lookup_def (reduc_def));
    4321        71558 :       gcc_assert (is_a <gphi *> (STMT_VINFO_STMT (next_stmt))
    4322              :                   || STMT_VINFO_REDUC_IDX (next_stmt) != -1);
    4323        77102 :       if (!gimple_extract_op (STMT_VINFO_STMT (vect_orig_stmt (stmt)), &op))
    4324            0 :         gcc_unreachable ();
    4325        71558 :       if (CONVERT_EXPR_CODE_P (op.code)
    4326         3421 :           && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0]))
    4327        74967 :           && (first
    4328         1692 :               || is_a <gphi *> (STMT_VINFO_STMT (next_stmt))))
    4329              :         ;
    4330        68153 :       else if (code != op.code)
    4331              :         {
    4332         2553 :           fail = true;
    4333         2553 :           break;
    4334              :         }
    4335              :       else
    4336        65600 :         scalar_stmts.safe_push (stmt);
    4337        69005 :       first = false;
    4338              :     }
    4339        69005 :   while (!is_a <gphi *> (STMT_VINFO_STMT (next_stmt)));
    4340        63371 :   if (fail)
    4341         2553 :     return false;
    4342              : 
    4343              :   /* Remember a stmt with the actual reduction operation.  */
    4344        60818 :   stmt_vec_info reduc_scalar_stmt = scalar_stmts[0];
    4345              : 
    4346              :   /* When the SSA def chain through reduc-idx does not form a natural
    4347              :      reduction chain try to linearize an associative operation manually.  */
    4348        60818 :   if (scalar_stmts.length () == 1
    4349        58199 :       && code.is_tree_code ()
    4350        52141 :       && associative_tree_code ((tree_code)code)
    4351              :       /* We may not associate if a fold-left reduction is required.  */
    4352       112094 :       && !needs_fold_left_reduction_p (TREE_TYPE (gimple_get_lhs
    4353              :                                                     (reduc_scalar_stmt->stmt)),
    4354              :                                        code))
    4355              :     {
    4356        49154 :       auto_vec<chain_op_t> chain;
    4357        49154 :       auto_vec<std::pair<tree_code, gimple *> > worklist;
    4358        49154 :       gimple *op_stmt = NULL, *other_op_stmt = NULL;
    4359        49154 :       if (is_a <gassign *> (scalar_stmts[0]->stmt)
    4360              :           /* We cannot linearize an operation that vect_slp_linearize_chain
    4361              :              would not put on its worklist.  */
    4362        49154 :           && gimple_assign_rhs_code (scalar_stmts[0]->stmt) == (tree_code)code)
    4363              :         {
    4364        48507 :           vect_slp_linearize_chain (vinfo, worklist, chain, (tree_code)code,
    4365        48507 :                                     scalar_stmts[0]->stmt, op_stmt,
    4366              :                                     other_op_stmt,
    4367              :                                     NULL);
    4368              : 
    4369        48507 :           scalar_stmts.truncate (0);
    4370        48507 :           stmt_vec_info tail = NULL;
    4371       242780 :           for (auto el : chain)
    4372              :             {
    4373        97570 :               if (el.dt == vect_external_def
    4374        97570 :                   || el.dt == vect_constant_def
    4375        97570 :                   || el.code != (tree_code) code)
    4376              :                 {
    4377          311 :                   scalar_stmts.release ();
    4378          311 :                   return false;
    4379              :                 }
    4380        97259 :               stmt_vec_info stmt = vinfo->lookup_def (el.op);
    4381        97259 :               if (STMT_VINFO_REDUC_IDX (stmt) != -1
    4382        95715 :                   || STMT_VINFO_REDUC_DEF (stmt))
    4383              :                 {
    4384        48394 :                   gcc_assert (tail == NULL);
    4385        48394 :                   tail = stmt;
    4386        48394 :                   continue;
    4387              :                 }
    4388        48865 :               scalar_stmts.safe_push (stmt);
    4389              :             }
    4390        48196 :           gcc_assert (tail);
    4391              :         }
    4392              : 
    4393              :       /* When this linearization didn't produce a chain see if stripping
    4394              :          a wrapping sign conversion produces one.  */
    4395        48843 :       if (scalar_stmts.length () == 1
    4396        48843 :           && (code == PLUS_EXPR || code == MULT_EXPR || code == BIT_IOR_EXPR
    4397              :               || code == BIT_AND_EXPR || code == BIT_XOR_EXPR))
    4398              :         {
    4399        47113 :           gimple *stmt = scalar_stmts[0]->stmt;
    4400        47113 :           if (!is_gimple_assign (stmt)
    4401        46057 :               || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt))
    4402         4498 :               || TREE_CODE (gimple_assign_rhs1 (stmt)) != SSA_NAME
    4403        51611 :               || !tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
    4404         4498 :                                          TREE_TYPE (gimple_assign_rhs1 (stmt))))
    4405              :             {
    4406        45361 :               scalar_stmts.release ();
    4407        45361 :               return false;
    4408              :             }
    4409         1752 :           stmt = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (stmt));
    4410         1752 :           if (!is_gimple_assign (stmt)
    4411         1752 :               || gimple_assign_rhs_code (stmt) != (tree_code)code)
    4412              :             {
    4413         1733 :               scalar_stmts.release ();
    4414         1733 :               return false;
    4415              :             }
    4416           19 :           chain.truncate (0);
    4417           19 :           vect_slp_linearize_chain (vinfo, worklist, chain, (tree_code)code,
    4418              :                                     stmt, op_stmt, other_op_stmt, NULL);
    4419              : 
    4420           19 :           scalar_stmts.truncate (0);
    4421           19 :           stmt_vec_info tail = NULL;
    4422           93 :           for (auto el : chain)
    4423              :             {
    4424           44 :               if (el.dt == vect_external_def
    4425           44 :                   || el.dt == vect_constant_def
    4426           44 :                   || el.code != (tree_code) code)
    4427              :                 {
    4428            8 :                   scalar_stmts.release ();
    4429            8 :                   return false;
    4430              :                 }
    4431           36 :               stmt_vec_info stmt = vinfo->lookup_def (el.op);
    4432           36 :               if (STMT_VINFO_REDUC_IDX (stmt) != -1
    4433           36 :                   || STMT_VINFO_REDUC_DEF (stmt))
    4434              :                 {
    4435            0 :                   gcc_assert (tail == NULL);
    4436            0 :                   tail = stmt;
    4437            0 :                   continue;
    4438              :                 }
    4439           36 :               scalar_stmts.safe_push (stmt);
    4440              :             }
    4441              :           /* Unlike the above this does not include the reduction SSA
    4442              :              cycle.  */
    4443           11 :           gcc_assert (!tail);
    4444              :         }
    4445              : 
    4446         1741 :       if (scalar_stmts.length () < 2)
    4447              :         {
    4448         1622 :           scalar_stmts.release ();
    4449         1622 :           return false;
    4450              :         }
    4451              : 
    4452          119 :       if (dump_enabled_p ())
    4453              :         {
    4454           34 :           dump_printf_loc (MSG_NOTE, vect_location,
    4455              :                            "Starting SLP discovery of reduction chain for\n");
    4456          140 :           for (unsigned i = 0; i < scalar_stmts.length (); ++i)
    4457          212 :             dump_printf_loc (MSG_NOTE, vect_location,
    4458          106 :                              "  %G", scalar_stmts[i]->stmt);
    4459              :         }
    4460              : 
    4461          119 :       unsigned int group_size = scalar_stmts.length ();
    4462          119 :       bool *matches = XALLOCAVEC (bool, group_size);
    4463          119 :       poly_uint64 max_nunits = 1;
    4464          119 :       unsigned tree_size = 0;
    4465          119 :       slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
    4466              :                                            &max_nunits, matches, limit,
    4467          119 :                                            &tree_size, bst_map);
    4468          119 :       if (!node)
    4469              :         {
    4470           47 :           scalar_stmts.release ();
    4471           47 :           return false;
    4472              :         }
    4473              : 
    4474           72 :       unsigned cycle_id = vinfo->reduc_infos.length ();
    4475           72 :       vect_reduc_info reduc_info = new vect_reduc_info_s ();
    4476           72 :       vinfo->reduc_infos.safe_push (reduc_info);
    4477           72 :       VECT_REDUC_INFO_DEF_TYPE (reduc_info) = STMT_VINFO_DEF_TYPE (next_stmt);
    4478           72 :       VECT_REDUC_INFO_TYPE (reduc_info) = STMT_VINFO_REDUC_TYPE (next_stmt);
    4479           72 :       VECT_REDUC_INFO_CODE (reduc_info) = STMT_VINFO_REDUC_CODE (next_stmt);
    4480           72 :       VECT_REDUC_INFO_FN (reduc_info) = IFN_LAST;
    4481           72 :       reduc_info->is_reduc_chain = true;
    4482              : 
    4483              :       /* Build the node for the PHI and possibly the conversions.  */
    4484           72 :       slp_tree phis = vect_create_new_slp_node (2, ERROR_MARK);
    4485           72 :       SLP_TREE_REPRESENTATIVE (phis) = next_stmt;
    4486           72 :       phis->cycle_info.id = cycle_id;
    4487           72 :       SLP_TREE_LANES (phis) = group_size;
    4488           72 :       if (reduc_scalar_stmt == scalar_stmt)
    4489           68 :         SLP_TREE_VECTYPE (phis) = SLP_TREE_VECTYPE (node);
    4490              :       else
    4491            4 :         SLP_TREE_VECTYPE (phis)
    4492            4 :           = signed_or_unsigned_type_for (TYPE_UNSIGNED
    4493              :                                            (TREE_TYPE (gimple_get_lhs
    4494              :                                                          (scalar_stmt->stmt))),
    4495              :                                          SLP_TREE_VECTYPE (node));
    4496              :       /* ???  vect_cse_slp_nodes cannot cope with cycles without any
    4497              :          SLP_TREE_SCALAR_STMTS.  */
    4498           72 :       SLP_TREE_SCALAR_STMTS (phis).create (group_size);
    4499          375 :       for (unsigned i = 0; i < group_size; ++i)
    4500          303 :         SLP_TREE_SCALAR_STMTS (phis).quick_push (next_stmt);
    4501              : 
    4502           72 :       slp_tree op_input = phis;
    4503           72 :       if (reduc_scalar_stmt != scalar_stmt)
    4504              :         {
    4505            4 :           slp_tree conv = vect_create_new_slp_node (1, ERROR_MARK);
    4506            4 :           SLP_TREE_REPRESENTATIVE (conv)
    4507            4 :             = vinfo->lookup_def (gimple_arg (reduc_scalar_stmt->stmt,
    4508            4 :                                              STMT_VINFO_REDUC_IDX
    4509              :                                                (reduc_scalar_stmt)));
    4510            4 :           SLP_TREE_CHILDREN (conv).quick_push (phis);
    4511            4 :           conv->cycle_info.id = cycle_id;
    4512            4 :           SLP_TREE_REDUC_IDX (conv) = 0;
    4513            4 :           SLP_TREE_LANES (conv) = group_size;
    4514            4 :           SLP_TREE_VECTYPE (conv) = SLP_TREE_VECTYPE (node);
    4515            4 :           SLP_TREE_SCALAR_STMTS (conv) = vNULL;
    4516            4 :           op_input = conv;
    4517              :         }
    4518              : 
    4519           72 :       slp_tree reduc = vect_create_new_slp_node (2, ERROR_MARK);
    4520           72 :       SLP_TREE_REPRESENTATIVE (reduc) = reduc_scalar_stmt;
    4521           72 :       SLP_TREE_CHILDREN (reduc).quick_push (op_input);
    4522           72 :       SLP_TREE_CHILDREN (reduc).quick_push (node);
    4523           72 :       reduc->cycle_info.id = cycle_id;
    4524           72 :       SLP_TREE_REDUC_IDX (reduc) = 0;
    4525           72 :       SLP_TREE_LANES (reduc) = group_size;
    4526           72 :       SLP_TREE_VECTYPE (reduc) = SLP_TREE_VECTYPE (node);
    4527              :       /* ???  For the reduction epilogue we need a live lane.  */
    4528           72 :       SLP_TREE_SCALAR_STMTS (reduc).create (group_size);
    4529           72 :       SLP_TREE_SCALAR_STMTS (reduc).quick_push (reduc_scalar_stmt);
    4530          303 :       for (unsigned i = 1; i < group_size; ++i)
    4531          231 :         SLP_TREE_SCALAR_STMTS (reduc).quick_push (NULL);
    4532              : 
    4533           72 :       if (reduc_scalar_stmt != scalar_stmt)
    4534              :         {
    4535            4 :           slp_tree conv = vect_create_new_slp_node (1, ERROR_MARK);
    4536            4 :           SLP_TREE_REPRESENTATIVE (conv) = scalar_stmt;
    4537            4 :           SLP_TREE_CHILDREN (conv).quick_push (reduc);
    4538            4 :           conv->cycle_info.id = cycle_id;
    4539            4 :           SLP_TREE_REDUC_IDX (conv) = 0;
    4540            4 :           SLP_TREE_LANES (conv) = group_size;
    4541            4 :           SLP_TREE_VECTYPE (conv) = SLP_TREE_VECTYPE (phis);
    4542              :           /* ???  For the reduction epilogue we need a live lane.  */
    4543            4 :           SLP_TREE_SCALAR_STMTS (conv).create (group_size);
    4544            4 :           SLP_TREE_SCALAR_STMTS (conv).quick_push (scalar_stmt);
    4545            8 :           for (unsigned i = 1; i < group_size; ++i)
    4546            4 :             SLP_TREE_SCALAR_STMTS (conv).quick_push (NULL);
    4547            4 :           reduc = conv;
    4548              :         }
    4549              : 
    4550           72 :       edge le = loop_latch_edge (LOOP_VINFO_LOOP (vinfo));
    4551           72 :       SLP_TREE_CHILDREN (phis).quick_push (NULL);
    4552           72 :       SLP_TREE_CHILDREN (phis).quick_push (NULL);
    4553           72 :       SLP_TREE_CHILDREN (phis)[le->dest_idx] = reduc;
    4554           72 :       SLP_TREE_REF_COUNT (reduc)++;
    4555              : 
    4556              :       /* Create a new SLP instance.  */
    4557           72 :       slp_instance new_instance = XNEW (class _slp_instance);
    4558           72 :       SLP_INSTANCE_TREE (new_instance) = reduc;
    4559           72 :       SLP_INSTANCE_LOADS (new_instance) = vNULL;
    4560           72 :       SLP_INSTANCE_ROOT_STMTS (new_instance) = vNULL;
    4561           72 :       SLP_INSTANCE_REMAIN_DEFS (new_instance) = vNULL;
    4562           72 :       SLP_INSTANCE_KIND (new_instance) = slp_inst_kind_reduc_chain;
    4563           72 :       new_instance->reduc_phis = NULL;
    4564           72 :       new_instance->cost_vec = vNULL;
    4565           72 :       new_instance->subgraph_entries = vNULL;
    4566              : 
    4567           72 :       vinfo->slp_instances.safe_push (new_instance);
    4568              : 
    4569           72 :       if (dump_enabled_p ())
    4570              :         {
    4571           24 :           dump_printf_loc (MSG_NOTE, vect_location,
    4572              :                            "Final SLP tree for instance %p:\n",
    4573              :                            (void *) new_instance);
    4574           24 :           vect_print_slp_graph (MSG_NOTE, vect_location,
    4575              :                                 SLP_INSTANCE_TREE (new_instance));
    4576              :         }
    4577              : 
    4578           72 :       return true;
    4579        49154 :     }
    4580              : 
    4581        11664 :   if (scalar_stmts.length () <= 1)
    4582              :     {
    4583         9045 :       scalar_stmts.release ();
    4584         9045 :       return false;
    4585              :     }
    4586              : 
    4587         2619 :   scalar_stmts.reverse ();
    4588         2619 :   stmt_vec_info reduc_phi_info = next_stmt;
    4589              : 
    4590              :   /* Build the tree for the SLP instance.  */
    4591         2619 :   vec<stmt_vec_info> root_stmt_infos = vNULL;
    4592         2619 :   vec<tree> remain = vNULL;
    4593              : 
    4594         2619 :   if (dump_enabled_p ())
    4595              :     {
    4596          180 :       dump_printf_loc (MSG_NOTE, vect_location,
    4597              :                        "Starting SLP discovery of reduction chain for\n");
    4598          966 :       for (unsigned i = 0; i < scalar_stmts.length (); ++i)
    4599         1572 :         dump_printf_loc (MSG_NOTE, vect_location,
    4600          786 :                          "  %G", scalar_stmts[i]->stmt);
    4601              :     }
    4602              : 
    4603              :   /* Build the tree for the SLP instance.  */
    4604         2619 :   unsigned int group_size = scalar_stmts.length ();
    4605         2619 :   bool *matches = XALLOCAVEC (bool, group_size);
    4606         2619 :   poly_uint64 max_nunits = 1;
    4607         2619 :   unsigned tree_size = 0;
    4608              : 
    4609              :   /* ???  We need this only for SLP discovery.  */
    4610        10014 :   for (unsigned i = 0; i < scalar_stmts.length (); ++i)
    4611         7395 :     REDUC_GROUP_FIRST_ELEMENT (scalar_stmts[i]) = scalar_stmts[0];
    4612              : 
    4613         2619 :   slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
    4614              :                                        &max_nunits, matches, limit,
    4615         2619 :                                        &tree_size, bst_map);
    4616              : 
    4617        10014 :   for (unsigned i = 0; i < scalar_stmts.length (); ++i)
    4618         7395 :     REDUC_GROUP_FIRST_ELEMENT (scalar_stmts[i]) = NULL;
    4619              : 
    4620         2619 :   if (node != NULL)
    4621              :     {
    4622              :       /* Create a new SLP instance.  */
    4623         2286 :       slp_instance new_instance = XNEW (class _slp_instance);
    4624         2286 :       SLP_INSTANCE_TREE (new_instance) = node;
    4625         2286 :       SLP_INSTANCE_LOADS (new_instance) = vNULL;
    4626         2286 :       SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
    4627         2286 :       SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
    4628         2286 :       SLP_INSTANCE_KIND (new_instance) = slp_inst_kind_reduc_chain;
    4629         2286 :       new_instance->reduc_phis = NULL;
    4630         2286 :       new_instance->cost_vec = vNULL;
    4631         2286 :       new_instance->subgraph_entries = vNULL;
    4632              : 
    4633         2286 :       vect_reduc_info reduc_info = info_for_reduction (vinfo, node);
    4634         2286 :       reduc_info->is_reduc_chain = true;
    4635              : 
    4636         2286 :       if (dump_enabled_p ())
    4637          135 :         dump_printf_loc (MSG_NOTE, vect_location,
    4638              :                          "SLP size %u vs. limit %u.\n",
    4639              :                          tree_size, max_tree_size);
    4640              : 
    4641              :       /* Fixup SLP reduction chains.  If this is a reduction chain with
    4642              :          a conversion in front amend the SLP tree with a node for that.  */
    4643         2286 :       gimple *scalar_def = STMT_VINFO_REDUC_DEF (reduc_phi_info)->stmt;
    4644         2286 :       if (is_gimple_assign (scalar_def)
    4645         2286 :           && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (scalar_def)))
    4646              :         {
    4647           43 :           stmt_vec_info conv_info = vect_stmt_to_vectorize
    4648           43 :                                         (STMT_VINFO_REDUC_DEF (reduc_phi_info));
    4649           43 :           scalar_stmts = vNULL;
    4650           43 :           scalar_stmts.create (group_size);
    4651          135 :           for (unsigned i = 0; i < group_size; ++i)
    4652           92 :             scalar_stmts.quick_push (conv_info);
    4653           43 :           slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
    4654           43 :           SLP_TREE_VECTYPE (conv)
    4655           43 :             = get_vectype_for_scalar_type (vinfo,
    4656           43 :                                            TREE_TYPE
    4657              :                                              (gimple_assign_lhs (scalar_def)),
    4658              :                                            group_size);
    4659           43 :           SLP_TREE_REDUC_IDX (conv) = 0;
    4660           43 :           conv->cycle_info.id = node->cycle_info.id;
    4661           43 :           SLP_TREE_CHILDREN (conv).quick_push (node);
    4662           43 :           SLP_INSTANCE_TREE (new_instance) = conv;
    4663              :         }
    4664              :       /* Fill the backedge child of the PHI SLP node.  The
    4665              :          general matching code cannot find it because the
    4666              :          scalar code does not reflect how we vectorize the
    4667              :          reduction.  */
    4668         2286 :       use_operand_p use_p;
    4669         2286 :       imm_use_iterator imm_iter;
    4670         2286 :       class loop *loop = LOOP_VINFO_LOOP (vinfo);
    4671        11023 :       FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
    4672              :                              gimple_get_lhs (scalar_def))
    4673              :         /* There are exactly two non-debug uses, the reduction
    4674              :            PHI and the loop-closed PHI node.  */
    4675         6451 :         if (!is_gimple_debug (USE_STMT (use_p))
    4676         6451 :             && gimple_bb (USE_STMT (use_p)) == loop->header)
    4677              :           {
    4678         2286 :             auto_vec<stmt_vec_info, 64> phis (group_size);
    4679         2286 :             stmt_vec_info phi_info = vinfo->lookup_stmt (USE_STMT (use_p));
    4680         8842 :             for (unsigned i = 0; i < group_size; ++i)
    4681         6556 :               phis.quick_push (phi_info);
    4682         2286 :             slp_tree *phi_node = bst_map->get (phis);
    4683         2286 :             unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
    4684         4572 :             SLP_TREE_CHILDREN (*phi_node)[dest_idx]
    4685         2286 :               = SLP_INSTANCE_TREE (new_instance);
    4686         2286 :             SLP_INSTANCE_TREE (new_instance)->refcnt++;
    4687         2286 :           }
    4688              : 
    4689         2286 :       vinfo->slp_instances.safe_push (new_instance);
    4690              : 
    4691              :       /* ???  We've replaced the old SLP_INSTANCE_GROUP_SIZE with
    4692              :          the number of scalar stmts in the root in a few places.
    4693              :          Verify that assumption holds.  */
    4694         4572 :       gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
    4695              :                   .length () == group_size);
    4696              : 
    4697         2286 :       if (dump_enabled_p ())
    4698              :         {
    4699          135 :           dump_printf_loc (MSG_NOTE, vect_location,
    4700              :                            "Final SLP tree for instance %p:\n",
    4701              :                            (void *) new_instance);
    4702          135 :           vect_print_slp_graph (MSG_NOTE, vect_location,
    4703              :                                 SLP_INSTANCE_TREE (new_instance));
    4704              :         }
    4705              : 
    4706         2286 :       return true;
    4707              :     }
    4708              : 
    4709              :   /* Failed to SLP.  */
    4710          333 :   scalar_stmts.release ();
    4711          333 :   if (dump_enabled_p ())
    4712           45 :     dump_printf_loc (MSG_NOTE, vect_location,
    4713              :                      "SLP discovery of reduction chain failed\n");
    4714              :   return false;
    4715              : }
    4716              : 
    4717              : /* Analyze an SLP instance starting from SCALAR_STMTS which are a group
    4718              :    of KIND.  Return true if successful.  */
    4719              : 
    4720              : static bool
    4721        89272 : vect_analyze_slp_reduction (loop_vec_info vinfo,
    4722              :                             stmt_vec_info scalar_stmt,
    4723              :                             unsigned max_tree_size, unsigned *limit,
    4724              :                             scalar_stmts_to_slp_tree_map_t *bst_map,
    4725              :                             bool force_single_lane)
    4726              : {
    4727        89272 :   slp_instance_kind kind = slp_inst_kind_reduc_group;
    4728              : 
    4729              :   /* If there's no budget left bail out early.  */
    4730        89272 :   if (*limit == 0)
    4731              :     return false;
    4732              : 
    4733              :   /* Try to gather a reduction chain.  */
    4734        89272 :   if (! force_single_lane
    4735        63641 :       && STMT_VINFO_DEF_TYPE (scalar_stmt) == vect_reduction_def
    4736       152643 :       && vect_analyze_slp_reduc_chain (vinfo, bst_map, scalar_stmt,
    4737              :                                        max_tree_size, limit))
    4738              :     return true;
    4739              : 
    4740        86914 :   vec<stmt_vec_info> scalar_stmts;
    4741        86914 :   scalar_stmts.create (1);
    4742        86914 :   scalar_stmts.quick_push (scalar_stmt);
    4743              : 
    4744        86914 :   if (dump_enabled_p ())
    4745              :     {
    4746         3483 :       dump_printf_loc (MSG_NOTE, vect_location,
    4747              :                        "Starting SLP discovery for\n");
    4748         6966 :       for (unsigned i = 0; i < scalar_stmts.length (); ++i)
    4749         6966 :         dump_printf_loc (MSG_NOTE, vect_location,
    4750         3483 :                          "  %G", scalar_stmts[i]->stmt);
    4751              :     }
    4752              : 
    4753              :   /* Build the tree for the SLP instance.  */
    4754        86914 :   unsigned int group_size = scalar_stmts.length ();
    4755        86914 :   bool *matches = XALLOCAVEC (bool, group_size);
    4756        86914 :   poly_uint64 max_nunits = 1;
    4757        86914 :   unsigned tree_size = 0;
    4758              : 
    4759        86914 :   slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
    4760              :                                        &max_nunits, matches, limit,
    4761              :                                        &tree_size, bst_map);
    4762        86914 :   if (node != NULL)
    4763              :     {
    4764              :       /* Create a new SLP instance.  */
    4765        83906 :       slp_instance new_instance = XNEW (class _slp_instance);
    4766        83906 :       SLP_INSTANCE_TREE (new_instance) = node;
    4767        83906 :       SLP_INSTANCE_LOADS (new_instance) = vNULL;
    4768        83906 :       SLP_INSTANCE_ROOT_STMTS (new_instance) = vNULL;
    4769        83906 :       SLP_INSTANCE_REMAIN_DEFS (new_instance) = vNULL;
    4770        83906 :       SLP_INSTANCE_KIND (new_instance) = kind;
    4771        83906 :       new_instance->reduc_phis = NULL;
    4772        83906 :       new_instance->cost_vec = vNULL;
    4773        83906 :       new_instance->subgraph_entries = vNULL;
    4774              : 
    4775        83906 :       if (dump_enabled_p ())
    4776         3363 :         dump_printf_loc (MSG_NOTE, vect_location,
    4777              :                          "SLP size %u vs. limit %u.\n",
    4778              :                          tree_size, max_tree_size);
    4779              : 
    4780        83906 :       vinfo->slp_instances.safe_push (new_instance);
    4781              : 
    4782              :       /* ???  We've replaced the old SLP_INSTANCE_GROUP_SIZE with
    4783              :          the number of scalar stmts in the root in a few places.
    4784              :          Verify that assumption holds.  */
    4785       167812 :       gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
    4786              :                   .length () == group_size);
    4787              : 
    4788        83906 :       if (dump_enabled_p ())
    4789              :         {
    4790         3363 :           dump_printf_loc (MSG_NOTE, vect_location,
    4791              :                            "Final SLP tree for instance %p:\n",
    4792              :                            (void *) new_instance);
    4793         3363 :           vect_print_slp_graph (MSG_NOTE, vect_location,
    4794              :                                 SLP_INSTANCE_TREE (new_instance));
    4795              :         }
    4796              : 
    4797        83906 :       return true;
    4798              :     }
    4799              :   /* Failed to SLP.  */
    4800              : 
    4801              :   /* Free the allocated memory.  */
    4802         3008 :   scalar_stmts.release ();
    4803              : 
    4804              :   /* Failed to SLP.  */
    4805         3008 :   if (dump_enabled_p ())
    4806          120 :     dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
    4807              :   return false;
    4808              : }
    4809              : 
    4810              : /* Analyze a single SLP reduction group.  If successful add a SLP instance
    4811              :    for it and return true, otherwise return false and have *MATCHES
    4812              :    populated.  */
    4813              : 
    4814              : static bool
    4815        26943 : vect_analyze_slp_reduction_group (loop_vec_info loop_vinfo,
    4816              :                                   vec<stmt_vec_info> scalar_stmts,
    4817              :                                   scalar_stmts_to_slp_tree_map_t *bst_map,
    4818              :                                   unsigned max_tree_size, unsigned *limit,
    4819              :                                   bool *matches)
    4820              : {
    4821              :   /* Try to form a reduction group.  */
    4822        26943 :   unsigned int group_size = scalar_stmts.length ();
    4823        26943 :   if (!matches)
    4824        11199 :     matches = XALLOCAVEC (bool, group_size);
    4825        26943 :   poly_uint64 max_nunits = 1;
    4826        26943 :   unsigned tree_size = 0;
    4827        26943 :   slp_tree node = vect_build_slp_tree (loop_vinfo, scalar_stmts,
    4828              :                                        group_size,
    4829              :                                        &max_nunits, matches, limit,
    4830              :                                        &tree_size, bst_map);
    4831        26943 :   if (!node)
    4832              :     return false;
    4833              : 
    4834              :   /* Create a new SLP instance.  */
    4835        12237 :   slp_instance new_instance = XNEW (class _slp_instance);
    4836        12237 :   SLP_INSTANCE_TREE (new_instance) = node;
    4837        12237 :   SLP_INSTANCE_LOADS (new_instance) = vNULL;
    4838        12237 :   SLP_INSTANCE_ROOT_STMTS (new_instance) = vNULL;
    4839        12237 :   SLP_INSTANCE_REMAIN_DEFS (new_instance) = vNULL;
    4840        12237 :   SLP_INSTANCE_KIND (new_instance) = slp_inst_kind_reduc_group;
    4841        12237 :   new_instance->reduc_phis = NULL;
    4842        12237 :   new_instance->cost_vec = vNULL;
    4843        12237 :   new_instance->subgraph_entries = vNULL;
    4844              : 
    4845        12237 :   if (dump_enabled_p ())
    4846          571 :     dump_printf_loc (MSG_NOTE, vect_location,
    4847              :                      "SLP size %u vs. limit %u.\n",
    4848              :                      tree_size, max_tree_size);
    4849              : 
    4850        12237 :   loop_vinfo->slp_instances.safe_push (new_instance);
    4851              : 
    4852              :   /* ???  We've replaced the old SLP_INSTANCE_GROUP_SIZE with
    4853              :      the number of scalar stmts in the root in a few places.
    4854              :      Verify that assumption holds.  */
    4855        24474 :   gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
    4856              :               .length () == group_size);
    4857              : 
    4858        12237 :   if (dump_enabled_p ())
    4859              :     {
    4860          571 :       dump_printf_loc (MSG_NOTE, vect_location,
    4861              :                        "SLP discovery of size %d reduction group "
    4862              :                        "succeeded\n", group_size);
    4863          571 :       dump_printf_loc (MSG_NOTE, vect_location,
    4864              :                        "Final SLP tree for instance %p:\n",
    4865              :                        (void *) new_instance);
    4866          571 :       vect_print_slp_graph (MSG_NOTE, vect_location,
    4867              :                             SLP_INSTANCE_TREE (new_instance));
    4868              :     }
    4869              : 
    4870              :   return true;
    4871              : }
    4872              : 
    4873              : /* Analyze reductions in LOOP_VINFO and populate SLP instances
    4874              :    accordingly.  Returns false if something fails.  */
    4875              : 
    4876              : static bool
    4877       488479 : vect_analyze_slp_reductions (loop_vec_info loop_vinfo,
    4878              :                              unsigned max_tree_size, unsigned *limit,
    4879              :                              scalar_stmts_to_slp_tree_map_t *bst_map,
    4880              :                              bool force_single_lane)
    4881              : {
    4882       554153 :   if (loop_vinfo->reductions.is_empty ())
    4883              :     return true;
    4884              : 
    4885              :   /* Collect reduction statements we can combine into
    4886              :      a SLP reduction.  */
    4887        73074 :   vec<stmt_vec_info> scalar_stmts;
    4888        73074 :   scalar_stmts.create (loop_vinfo->reductions.length ());
    4889       324316 :   for (auto next_info : loop_vinfo->reductions)
    4890              :     {
    4891       105094 :       next_info = vect_stmt_to_vectorize (next_info);
    4892       105094 :       if ((STMT_VINFO_RELEVANT_P (next_info)
    4893           14 :            || STMT_VINFO_LIVE_P (next_info))
    4894              :           /* ???  Make sure we didn't skip a conversion around a
    4895              :              reduction path.  In that case we'd have to reverse
    4896              :              engineer that conversion stmt following the chain using
    4897              :              reduc_idx and from the PHI using reduc_def.  */
    4898       105080 :           && (STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def
    4899       105080 :               || (STMT_VINFO_DEF_TYPE (next_info)
    4900              :                   == vect_double_reduction_def)))
    4901              :         {
    4902              :           /* Do not discover SLP reductions combining lane-reducing
    4903              :              ops, that will fail later.  */
    4904       105080 :           if (!force_single_lane
    4905       105080 :               && !lane_reducing_stmt_p (STMT_VINFO_STMT (next_info)))
    4906        78760 :             scalar_stmts.quick_push (next_info);
    4907              :           /* Do SLP discovery for single-lane reductions.  */
    4908        26320 :           else if (! vect_analyze_slp_reduction (loop_vinfo, next_info,
    4909              :                                                  max_tree_size, limit,
    4910              :                                                  bst_map,
    4911              :                                                  force_single_lane))
    4912              :             {
    4913            0 :               scalar_stmts.release ();
    4914            0 :               return false;
    4915              :             }
    4916              :         }
    4917              :     }
    4918              : 
    4919        73074 :   if (scalar_stmts.length () > 1)
    4920              :     {
    4921              :       /* Try to form a reduction group.  */
    4922         4570 :       unsigned int group_size = scalar_stmts.length ();
    4923         4570 :       bool *matches = XALLOCAVEC (bool, group_size);
    4924         4570 :       if (vect_analyze_slp_reduction_group (loop_vinfo, scalar_stmts, bst_map,
    4925              :                                             max_tree_size, limit, matches))
    4926         4417 :         return true;
    4927              : 
    4928              :       /* When analysis as a single SLP reduction group failed try to
    4929              :          form sub-groups by collecting matching lanes.  Do not recurse
    4930              :          that on failure (to limit compile-time costs), but recurse
    4931              :          for the initial non-matching parts.  Everything not covered
    4932              :          by a sub-group gets single-reduction treatment.  */
    4933         3494 :       vec<stmt_vec_info> cands = vNULL;
    4934        11352 :       while (matches[0])
    4935              :         {
    4936        11199 :           cands.truncate (0);
    4937        11199 :           cands.reserve (group_size, true);
    4938        88243 :           for (unsigned i = 0; i < group_size; ++i)
    4939        77044 :             if (matches[i])
    4940        19532 :               cands.quick_push (scalar_stmts[i]);
    4941              : 
    4942              :           /* Try to form a reduction group.  */
    4943        11199 :           if (vect_analyze_slp_reduction_group (loop_vinfo, cands, bst_map,
    4944              :                                                 max_tree_size, limit, NULL))
    4945         7845 :             cands = vNULL;
    4946              :           else
    4947              :             {
    4948              :               /* Do SLP discovery for single-lane reductions.  */
    4949        20489 :               for (auto stmt_info : cands)
    4950        10452 :                 if (! vect_analyze_slp_reduction (loop_vinfo,
    4951              :                                                   vect_stmt_to_vectorize
    4952              :                                                     (stmt_info),
    4953              :                                                   max_tree_size, limit,
    4954              :                                                   bst_map, force_single_lane))
    4955              :                   {
    4956           25 :                     scalar_stmts.release ();
    4957           25 :                     cands.release ();
    4958           25 :                     return false;
    4959              :                   }
    4960              :             }
    4961              :           /* Remove the handled stmts from scalar_stmts and try again,
    4962              :              possibly repeating the above with updated matches[].  */
    4963              :           unsigned j = 0;
    4964        88148 :           for (unsigned i = 0; i < group_size; ++i)
    4965        76974 :             if (!matches[i])
    4966              :               {
    4967        57482 :                 scalar_stmts[j] = scalar_stmts[i];
    4968        57482 :                 ++j;
    4969              :               }
    4970        11174 :           scalar_stmts.truncate (j);
    4971        11174 :           group_size = scalar_stmts.length ();
    4972        11174 :           if (vect_analyze_slp_reduction_group (loop_vinfo, scalar_stmts,
    4973              :                                                 bst_map, max_tree_size, limit,
    4974              :                                                 matches))
    4975              :             return true;
    4976              :         }
    4977              :     }
    4978              :   /* Do SLP discovery for single-lane reductions.  */
    4979       255488 :   for (auto stmt_info : scalar_stmts)
    4980        52500 :     if (! vect_analyze_slp_reduction (loop_vinfo,
    4981              :                                       vect_stmt_to_vectorize (stmt_info),
    4982              :                                       max_tree_size, limit,
    4983              :                                       bst_map, force_single_lane))
    4984              :       {
    4985         2983 :         scalar_stmts.release ();
    4986         2983 :         return false;
    4987              :       }
    4988              : 
    4989        65674 :   scalar_stmts.release ();
    4990        65674 :   return true;
    4991              : }
    4992              : 
    4993              : /* Analyze an SLP instance starting from a group of grouped stores.  Call
    4994              :    vect_build_slp_tree to build a tree of packed stmts if possible.
    4995              :    Return FALSE if it's impossible to SLP any stmt in the group.  */
    4996              : 
    4997              : static bool
    4998      1089839 : vect_analyze_slp_instance (vec_info *vinfo,
    4999              :                            scalar_stmts_to_slp_tree_map_t *bst_map,
    5000              :                            stmt_vec_info stmt_info,
    5001              :                            slp_instance_kind kind,
    5002              :                            unsigned max_tree_size, unsigned *limit,
    5003              :                            bool force_single_lane)
    5004              : {
    5005      1089839 :   vec<stmt_vec_info> scalar_stmts;
    5006              : 
    5007      1089839 :   if (is_a <bb_vec_info> (vinfo))
    5008      1060690 :     vect_location = stmt_info->stmt;
    5009              : 
    5010      1089839 :   gcc_assert (kind == slp_inst_kind_store);
    5011              : 
    5012              :   /* Collect the stores and store them in scalar_stmts.  */
    5013      1089839 :   scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
    5014      1089839 :   stmt_vec_info next_info = stmt_info;
    5015      5419690 :   while (next_info)
    5016              :     {
    5017      3240012 :       scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
    5018      3240012 :       next_info = DR_GROUP_NEXT_ELEMENT (next_info);
    5019              :     }
    5020              : 
    5021      1089839 :   vec<stmt_vec_info> root_stmt_infos = vNULL;
    5022      1089839 :   vec<tree> remain = vNULL;
    5023              : 
    5024              :   /* Build the tree for the SLP instance.  */
    5025              : 
    5026              :   /* If there's no budget left bail out early.  */
    5027      1089839 :   if (*limit == 0)
    5028              :     return false;
    5029              : 
    5030      1089816 :   if (dump_enabled_p ())
    5031              :     {
    5032         4132 :       dump_printf_loc (MSG_NOTE, vect_location,
    5033              :                        "Starting SLP discovery for\n");
    5034        23834 :       for (unsigned i = 0; i < scalar_stmts.length (); ++i)
    5035        39404 :         dump_printf_loc (MSG_NOTE, vect_location,
    5036        19702 :                          "  %G", scalar_stmts[i]->stmt);
    5037              :     }
    5038              : 
    5039              :   /* Build the tree for the SLP instance.  */
    5040      1089816 :   unsigned int group_size = scalar_stmts.length ();
    5041      1089816 :   bool *matches = XALLOCAVEC (bool, group_size);
    5042      1089816 :   poly_uint64 max_nunits = 1;
    5043      1089816 :   unsigned tree_size = 0;
    5044      1089816 :   unsigned i;
    5045              : 
    5046      1089816 :   slp_tree node = NULL;
    5047      1089816 :   if (group_size > 1 && force_single_lane)
    5048              :     {
    5049         1690 :       matches[0] = true;
    5050         1690 :       matches[1] = false;
    5051              :     }
    5052              :   else
    5053      1088126 :     node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
    5054              :                                 &max_nunits, matches, limit,
    5055              :                                 &tree_size, bst_map);
    5056      1089816 :   if (node != NULL)
    5057              :     {
    5058              :       /* Calculate the unrolling factor based on the smallest type.  */
    5059       678468 :       poly_uint64 unrolling_factor
    5060       678468 :         = calculate_unrolling_factor (max_nunits, group_size);
    5061              : 
    5062       678468 :       if (maybe_ne (unrolling_factor, 1U)
    5063       678468 :           && is_a <bb_vec_info> (vinfo))
    5064              :         {
    5065            0 :           unsigned HOST_WIDE_INT const_max_nunits;
    5066            0 :           if (!max_nunits.is_constant (&const_max_nunits)
    5067            0 :               || const_max_nunits > group_size)
    5068              :             {
    5069            0 :               if (dump_enabled_p ())
    5070            0 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    5071              :                                  "Build SLP failed: store group "
    5072              :                                  "size not a multiple of the vector size "
    5073              :                                  "in basic block SLP\n");
    5074            0 :               vect_free_slp_tree (node);
    5075            0 :               return false;
    5076              :             }
    5077              :           /* Fatal mismatch.  */
    5078            0 :           if (dump_enabled_p ())
    5079            0 :             dump_printf_loc (MSG_NOTE, vect_location,
    5080              :                              "SLP discovery succeeded but node needs "
    5081              :                              "splitting\n");
    5082            0 :           memset (matches, true, group_size);
    5083            0 :           matches[group_size / const_max_nunits * const_max_nunits] = false;
    5084            0 :           vect_free_slp_tree (node);
    5085              :         }
    5086              :       else
    5087              :         {
    5088              :           /* Create a new SLP instance.  */
    5089       678468 :           slp_instance new_instance = XNEW (class _slp_instance);
    5090       678468 :           SLP_INSTANCE_TREE (new_instance) = node;
    5091       678468 :           SLP_INSTANCE_LOADS (new_instance) = vNULL;
    5092       678468 :           SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
    5093       678468 :           SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
    5094       678468 :           SLP_INSTANCE_KIND (new_instance) = kind;
    5095       678468 :           new_instance->reduc_phis = NULL;
    5096       678468 :           new_instance->cost_vec = vNULL;
    5097       678468 :           new_instance->subgraph_entries = vNULL;
    5098              : 
    5099       678468 :           if (dump_enabled_p ())
    5100         3148 :             dump_printf_loc (MSG_NOTE, vect_location,
    5101              :                              "SLP size %u vs. limit %u.\n",
    5102              :                              tree_size, max_tree_size);
    5103              : 
    5104       678468 :           vinfo->slp_instances.safe_push (new_instance);
    5105              : 
    5106              :           /* ???  We've replaced the old SLP_INSTANCE_GROUP_SIZE with
    5107              :              the number of scalar stmts in the root in a few places.
    5108              :              Verify that assumption holds.  */
    5109      1356936 :           gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
    5110              :                         .length () == group_size);
    5111              : 
    5112       678468 :           if (dump_enabled_p ())
    5113              :             {
    5114         3148 :               dump_printf_loc (MSG_NOTE, vect_location,
    5115              :                                "Final SLP tree for instance %p:\n",
    5116              :                                (void *) new_instance);
    5117         3148 :               vect_print_slp_graph (MSG_NOTE, vect_location,
    5118              :                                     SLP_INSTANCE_TREE (new_instance));
    5119              :             }
    5120              : 
    5121       678468 :           return true;
    5122              :         }
    5123              :     }
    5124              :   /* Failed to SLP.  */
    5125              : 
    5126              :   /* Try to break the group up into pieces.  */
    5127       411348 :   if (*limit > 0 && kind == slp_inst_kind_store)
    5128              :     {
    5129              :       /* ???  We could delay all the actual splitting of store-groups
    5130              :          until after SLP discovery of the original group completed.
    5131              :          Then we can recurse to vect_build_slp_instance directly.  */
    5132      1076566 :       for (i = 0; i < group_size; i++)
    5133      1076566 :         if (!matches[i])
    5134              :           break;
    5135              : 
    5136              :       /* For basic block SLP, try to break the group up into multiples of
    5137              :          a vector size.  */
    5138       411347 :       if (is_a <bb_vec_info> (vinfo)
    5139       411347 :           && (i > 1 && i < group_size))
    5140              :         {
    5141              :           /* Free the allocated memory.  */
    5142       153652 :           scalar_stmts.release ();
    5143              : 
    5144       153652 :           tree scalar_type
    5145       153652 :             = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
    5146       307304 :           tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
    5147       153652 :                                                       1 << floor_log2 (i));
    5148       153652 :           unsigned HOST_WIDE_INT const_nunits;
    5149       153652 :           if (vectype
    5150       153652 :               && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
    5151              :             {
    5152              :               /* Split into two groups at the first vector boundary.  */
    5153       153652 :               gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
    5154       153652 :               unsigned group1_size = i & ~(const_nunits - 1);
    5155              : 
    5156       153652 :               if (dump_enabled_p ())
    5157           59 :                 dump_printf_loc (MSG_NOTE, vect_location,
    5158              :                                  "Splitting SLP group at stmt %u\n", i);
    5159       153652 :               stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
    5160              :                                                                group1_size);
    5161       153652 :               bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
    5162              :                                                     kind, max_tree_size,
    5163              :                                                     limit, false);
    5164              :               /* Split the rest at the failure point and possibly
    5165              :                  re-analyze the remaining matching part if it has
    5166              :                  at least two lanes.  */
    5167       153652 :               if (group1_size < i
    5168         5271 :                   && (i + 1 < group_size
    5169         2894 :                       || i - group1_size > 1))
    5170              :                 {
    5171         2409 :                   stmt_vec_info rest2 = rest;
    5172         2409 :                   rest = vect_split_slp_store_group (rest, i - group1_size);
    5173         2409 :                   if (i - group1_size > 1)
    5174           61 :                     res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
    5175              :                                                       kind, max_tree_size,
    5176              :                                                       limit, false);
    5177              :                 }
    5178              :               /* Re-analyze the non-matching tail if it has at least
    5179              :                  two lanes.  */
    5180       153652 :               if (i + 1 < group_size)
    5181        21817 :                 res |= vect_analyze_slp_instance (vinfo, bst_map,
    5182              :                                                   rest, kind, max_tree_size,
    5183              :                                                   limit, false);
    5184       153652 :               return res;
    5185              :             }
    5186              :         }
    5187              : 
    5188              :       /* For loop vectorization split the RHS into arbitrary pieces of
    5189              :          size >= 1.  */
    5190       257695 :       else if (is_a <loop_vec_info> (vinfo)
    5191       257695 :                && (group_size != 1 && i < group_size))
    5192              :         {
    5193         7973 :           gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
    5194           28 :           bool masked_p = call
    5195           28 :               && gimple_call_internal_p (call)
    5196           28 :               && internal_fn_mask_index (gimple_call_internal_fn (call)) != -1;
    5197              :           /* There are targets that cannot do even/odd interleaving schemes
    5198              :              so they absolutely need to use load/store-lanes.  For now
    5199              :              force single-lane SLP for them - they would be happy with
    5200              :              uniform power-of-two lanes (but depending on element size),
    5201              :              but even if we can use 'i' as indicator we would need to
    5202              :              backtrack when later lanes fail to discover with the same
    5203              :              granularity.  We cannot turn any of strided or scatter store
    5204              :              into store-lanes.  */
    5205              :           /* ???  If this is not in sync with what get_load_store_type
    5206              :              later decides the SLP representation is not good for other
    5207              :              store vectorization methods.  */
    5208         7973 :           bool want_store_lanes
    5209         7973 :             = (! STMT_VINFO_GATHER_SCATTER_P (stmt_info)
    5210         7973 :                && ! STMT_VINFO_STRIDED_P (stmt_info)
    5211         5896 :                && ! STMT_VINFO_SLP_VECT_ONLY (stmt_info)
    5212         5892 :                && compare_step_with_zero (vinfo, stmt_info) > 0
    5213        13785 :                && vect_slp_prefer_store_lanes_p (vinfo, stmt_info, NULL_TREE,
    5214        15946 :                                                  masked_p, group_size, i));
    5215         7973 :           if (want_store_lanes || force_single_lane)
    5216              :             i = 1;
    5217              : 
    5218              :           /* A fatal discovery fail doesn't always mean single-lane SLP
    5219              :              isn't a possibility, so try.  */
    5220         6283 :           if (i == 0)
    5221              :             i = 1;
    5222              : 
    5223         7973 :           if (dump_enabled_p ())
    5224          883 :             dump_printf_loc (MSG_NOTE, vect_location,
    5225              :                              "Splitting SLP group at stmt %u\n", i);
    5226              : 
    5227              :           /* Analyze the stored values and pinch them together with
    5228              :              a permute node so we can preserve the whole store group.  */
    5229         7973 :           auto_vec<slp_tree> rhs_nodes;
    5230         7973 :           poly_uint64 max_nunits = 1;
    5231              : 
    5232         7973 :           unsigned int rhs_common_nlanes = 0;
    5233         7973 :           unsigned int start = 0, end = i;
    5234        36019 :           while (start < group_size)
    5235              :             {
    5236        28307 :               gcc_assert (end - start >= 1);
    5237        28307 :               vec<stmt_vec_info> substmts;
    5238        28307 :               substmts.create (end - start);
    5239        88748 :               for (unsigned j = start; j < end; ++j)
    5240        60441 :                 substmts.quick_push (scalar_stmts[j]);
    5241        28307 :               max_nunits = 1;
    5242        28307 :               node = vect_build_slp_tree (vinfo, substmts, end - start,
    5243              :                                           &max_nunits,
    5244              :                                           matches, limit, &tree_size, bst_map);
    5245        28307 :               if (node)
    5246              :                 {
    5247        22518 :                   rhs_nodes.safe_push (node);
    5248        22518 :                   vect_update_max_nunits (&max_nunits, node->max_nunits);
    5249        22518 :                   if (start == 0)
    5250         7718 :                     rhs_common_nlanes = SLP_TREE_LANES (node);
    5251        14800 :                   else if (rhs_common_nlanes != SLP_TREE_LANES (node))
    5252         1375 :                     rhs_common_nlanes = 0;
    5253        22518 :                   start = end;
    5254        22518 :                   if (want_store_lanes || force_single_lane)
    5255         5087 :                     end = start + 1;
    5256              :                   else
    5257              :                     end = group_size;
    5258              :                 }
    5259              :               else
    5260              :                 {
    5261         5789 :                   substmts.release ();
    5262         5789 :                   if (end - start == 1)
    5263              :                     {
    5264              :                       /* Single-lane discovery failed.  Free resources.  */
    5265          281 :                       for (auto node : rhs_nodes)
    5266            8 :                         vect_free_slp_tree (node);
    5267          261 :                       scalar_stmts.release ();
    5268          261 :                       if (dump_enabled_p ())
    5269           39 :                         dump_printf_loc (MSG_NOTE, vect_location,
    5270              :                                          "SLP discovery failed\n");
    5271          261 :                       return false;
    5272              :                     }
    5273              : 
    5274              :                   /* ???  It really happens that we soft-fail SLP
    5275              :                      build at a mismatch but the matching part hard-fails
    5276              :                      later.  As we know we arrived here with a group
    5277              :                      larger than one try a group of size one!  */
    5278         5528 :                   if (!matches[0])
    5279           44 :                     end = start + 1;
    5280              :                   else
    5281        12067 :                     for (unsigned j = start; j < end; j++)
    5282        12067 :                       if (!matches[j - start])
    5283              :                         {
    5284              :                           end = j;
    5285              :                           break;
    5286              :                         }
    5287              :                 }
    5288              :             }
    5289              : 
    5290              :           /* Now re-assess whether we want store lanes in case the
    5291              :              discovery ended up producing all single-lane RHSs.  */
    5292         7712 :           if (! want_store_lanes
    5293         7712 :               && rhs_common_nlanes == 1
    5294         6655 :               && ! STMT_VINFO_GATHER_SCATTER_P (stmt_info)
    5295         6655 :               && ! STMT_VINFO_STRIDED_P (stmt_info)
    5296         4952 :               && ! STMT_VINFO_SLP_VECT_ONLY (stmt_info)
    5297         4949 :               && compare_step_with_zero (vinfo, stmt_info) > 0
    5298        12606 :               && (vect_store_lanes_supported (SLP_TREE_VECTYPE (rhs_nodes[0]),
    5299              :                                               group_size, masked_p)
    5300              :                   != IFN_LAST))
    5301              :             want_store_lanes = true;
    5302              : 
    5303              :           /* Now we assume we can build the root SLP node from all stores.  */
    5304         7712 :           if (want_store_lanes)
    5305              :             {
    5306              :               /* For store-lanes feed the store node with all RHS nodes
    5307              :                  in order.  */
    5308            0 :               node = vect_create_new_slp_node (scalar_stmts,
    5309            0 :                                                SLP_TREE_CHILDREN
    5310              :                                                  (rhs_nodes[0]).length ());
    5311            0 :               SLP_TREE_VECTYPE (node) = SLP_TREE_VECTYPE (rhs_nodes[0]);
    5312            0 :               node->max_nunits = max_nunits;
    5313            0 :               node->ldst_lanes = true;
    5314            0 :               SLP_TREE_CHILDREN (node)
    5315            0 :                 .reserve_exact (SLP_TREE_CHILDREN (rhs_nodes[0]).length ()
    5316            0 :                                 + rhs_nodes.length () - 1);
    5317              :               /* First store value and possibly mask.  */
    5318            0 :               SLP_TREE_CHILDREN (node)
    5319            0 :                 .splice (SLP_TREE_CHILDREN (rhs_nodes[0]));
    5320              :               /* Rest of the store values.  All mask nodes are the same,
    5321              :                  this should be guaranteed by dataref group discovery.  */
    5322            0 :               for (unsigned j = 1; j < rhs_nodes.length (); ++j)
    5323            0 :                 SLP_TREE_CHILDREN (node)
    5324            0 :                   .quick_push (SLP_TREE_CHILDREN (rhs_nodes[j])[0]);
    5325            0 :               for (slp_tree child : SLP_TREE_CHILDREN (node))
    5326            0 :                 child->refcnt++;
    5327              :             }
    5328              :           else
    5329         7712 :             node = vect_build_slp_store_interleaving (rhs_nodes, scalar_stmts,
    5330              :                                                       max_nunits);
    5331              : 
    5332        30222 :           while (!rhs_nodes.is_empty ())
    5333        22510 :             vect_free_slp_tree (rhs_nodes.pop ());
    5334              : 
    5335              :           /* Create a new SLP instance.  */
    5336         7712 :           slp_instance new_instance = XNEW (class _slp_instance);
    5337         7712 :           SLP_INSTANCE_TREE (new_instance) = node;
    5338         7712 :           SLP_INSTANCE_LOADS (new_instance) = vNULL;
    5339         7712 :           SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
    5340         7712 :           SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
    5341         7712 :           SLP_INSTANCE_KIND (new_instance) = kind;
    5342         7712 :           new_instance->reduc_phis = NULL;
    5343         7712 :           new_instance->cost_vec = vNULL;
    5344         7712 :           new_instance->subgraph_entries = vNULL;
    5345              : 
    5346         7712 :           if (dump_enabled_p ())
    5347          844 :             dump_printf_loc (MSG_NOTE, vect_location,
    5348              :                              "SLP size %u vs. limit %u.\n",
    5349              :                              tree_size, max_tree_size);
    5350              : 
    5351         7712 :           vinfo->slp_instances.safe_push (new_instance);
    5352              : 
    5353              :           /* ???  We've replaced the old SLP_INSTANCE_GROUP_SIZE with
    5354              :              the number of scalar stmts in the root in a few places.
    5355              :              Verify that assumption holds.  */
    5356        15424 :           gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
    5357              :                         .length () == group_size);
    5358              : 
    5359         7712 :           if (dump_enabled_p ())
    5360              :             {
    5361          844 :               dump_printf_loc (MSG_NOTE, vect_location,
    5362              :                                "Final SLP tree for instance %p:\n",
    5363              :                                (void *) new_instance);
    5364          844 :               vect_print_slp_graph (MSG_NOTE, vect_location,
    5365              :                                     SLP_INSTANCE_TREE (new_instance));
    5366              :             }
    5367         7712 :           return true;
    5368         7973 :         }
    5369              :       else
    5370              :         /* Free the allocated memory.  */
    5371       249722 :         scalar_stmts.release ();
    5372              : 
    5373              :       /* Even though the first vector did not all match, we might be able to SLP
    5374              :          (some) of the remainder.  FORNOW ignore this possibility.  */
    5375              :     }
    5376              :   else
    5377              :     /* Free the allocated memory.  */
    5378            1 :     scalar_stmts.release ();
    5379              : 
    5380              :   /* Failed to SLP.  */
    5381       249723 :   if (dump_enabled_p ())
    5382           42 :     dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
    5383              :   return false;
    5384              : }
    5385              : 
    5386              : /* qsort comparator ordering SLP load nodes.  A_ and B_ each point to
                           an slp_tree.  Nodes whose first scalar stmts are grouped accesses
                           with the same DR_GROUP_FIRST_ELEMENT are ordered by decreasing
                           SLP_TREE_LANES, then nodes without a load permutation before nodes
                           with one, then by the first differing permutation element (an
                           in-order lane first, otherwise the smaller lane number first).
                           All other nodes are ordered by ascending gimple UID of their first
                           scalar stmt, or of that stmt's group leader when it is a grouped
                           access; 0 is returned when both resolve to the same stmt.  */
    5387              : 
    5388              : static int
    5389      2634002 : vllp_cmp (const void *a_, const void *b_)
    5390              : {
    5391      2634002 :   const slp_tree a = *(const slp_tree *)a_;
    5392      2634002 :   const slp_tree b = *(const slp_tree *)b_;
    5393      2634002 :   stmt_vec_info a0 = SLP_TREE_SCALAR_STMTS (a)[0];
    5394      2634002 :   stmt_vec_info b0 = SLP_TREE_SCALAR_STMTS (b)[0];
    5395      2634002 :   if (STMT_VINFO_GROUPED_ACCESS (a0)
    5396      1536055 :       && STMT_VINFO_GROUPED_ACCESS (b0)
    5397      4108635 :       && DR_GROUP_FIRST_ELEMENT (a0) == DR_GROUP_FIRST_ELEMENT (b0))
    5398              :     {
    5399              :       /* Same group: the node using more lanes sorts first.  */
    5400       343047 :       if (SLP_TREE_LANES (a) < SLP_TREE_LANES (b))
    5401              :         return 1;
    5402       334264 :       else if (SLP_TREE_LANES (a) > SLP_TREE_LANES (b))
    5403              :         return -1;
    5404              :       else
    5405              :         {
    5406              :           /* Try to order loads using the same lanes together, breaking
    5407              :              the tie with the lane number that first differs.  A node
                                     without a load permutation sorts before one with a
                                     permutation.  */
    5408       324730 :           if (!SLP_TREE_LOAD_PERMUTATION (a).exists ()
    5409       324730 :               && !SLP_TREE_LOAD_PERMUTATION (b).exists ())
    5410              :             return 0;
    5411       324730 :           else if (SLP_TREE_LOAD_PERMUTATION (a).exists ()
    5412       324730 :                    && !SLP_TREE_LOAD_PERMUTATION (b).exists ())
    5413              :             return 1;
    5414       320687 :           else if (!SLP_TREE_LOAD_PERMUTATION (a).exists ()
    5415       320687 :                    && SLP_TREE_LOAD_PERMUTATION (b).exists ())
    5416              :             return -1;
    5417              :           else
    5418              :             {
    5419       313301 :               for (unsigned i = 0; i < SLP_TREE_LANES (a); ++i)
    5420       313301 :                 if (SLP_TREE_LOAD_PERMUTATION (a)[i]
    5421       313301 :                     != SLP_TREE_LOAD_PERMUTATION (b)[i])
    5422              :                   {
    5423              :                     /* In-order lane first, that's what the above case for
    5424              :                        no permutation does.  Otherwise the smaller lane
                                               number sorts first.  */
    5425       311989 :                     if (SLP_TREE_LOAD_PERMUTATION (a)[i] == i)
    5426              :                       return -1;
    5427       191521 :                     else if (SLP_TREE_LOAD_PERMUTATION (b)[i] == i)
    5428              :                       return 1;
    5429       100787 :                     else if (SLP_TREE_LOAD_PERMUTATION (a)[i]
    5430       100787 :                              < SLP_TREE_LOAD_PERMUTATION (b)[i])
    5431              :                       return -1;
    5432              :                     else
    5433              :                       return 1;
    5434              :                   }
                                      /* Both permutations agree on all lanes.  */
    5435              :               return 0;
    5436              :             }
    5437              :         }
    5438              :     }
    5439              :   else /* Different groups or non-groups.  */
    5440              :     {
    5441              :       /* Order groups as their first element to keep them together.  */
    5442      2290955 :       if (STMT_VINFO_GROUPED_ACCESS (a0))
    5443      2290955 :         a0 = DR_GROUP_FIRST_ELEMENT (a0);
    5444      2290955 :       if (STMT_VINFO_GROUPED_ACCESS (b0))
    5445      2290955 :         b0 = DR_GROUP_FIRST_ELEMENT (b0);
    5446      2290955 :       if (a0 == b0)
    5447              :         return 0;
    5448              :       /* Otherwise order by ascending gimple UID.  */
    5449      2290835 :       else if (gimple_uid (STMT_VINFO_STMT (a0))
    5450      2290835 :                < gimple_uid (STMT_VINFO_STMT (b0)))
    5451              :         return -1;
    5452              :       else
    5453              :         {
                                  /* a0 and b0 are different stmts here; require distinct UIDs.  */
    5454      1017532 :           gcc_assert (gimple_uid (STMT_VINFO_STMT (a0))
    5455              :                       != gimple_uid (STMT_VINFO_STMT (b0)));
    5456              :           return 1;
    5457              :         }
    5458              :     }
    5459              : }
    5460              : 
    5461              : /* Return true if the load permutation of NODE is consecutive, that is
    5462              :    if element I equals START_VAL + I for every element.  If START_VAL is
    5463              :    UINT_MAX (apparently how an omitted START_VAL is passed; the
                           declaration is not visible here) the value of the first element is
                           used instead.  Return false if NODE has no load permutation or
                           the permutation is empty.  */
    5464              : 
    5465              : bool
    5466       619295 : vect_load_perm_consecutive_p (slp_tree node, unsigned start_val)
    5467              : {
    5468       619295 :   load_permutation_t perm = SLP_TREE_LOAD_PERMUTATION (node);
    5469              : 
    5470       619295 :   if (!perm.exists () || !perm.length ())
    5471              :     return false;
    5472              : 
                          /* UINT_MAX selects the first element as the starting value.  */
    5473       619295 :   if (start_val == UINT_MAX)
    5474        79156 :     start_val = perm[0];
    5475              : 
                          /* Fail at the first element that breaks the consecutive run.  */
    5476      1222836 :   for (unsigned int i = 0; i < perm.length (); i++)
    5477       626572 :     if (perm[i] != start_val + (unsigned int) i)
    5478              :       return false;
    5479              : 
    5480              :   return true;
    5481              : }
    5482              : 
    5483              : /* Process the set of LOADS that are all from the same dataref group.  */
    5484              : 
    5485              : static void
    5486       160485 : vect_lower_load_permutations (loop_vec_info loop_vinfo,
    5487              :                               scalar_stmts_to_slp_tree_map_t *bst_map,
    5488              :                               const array_slice<slp_tree> &loads,
    5489              :                               bool force_single_lane)
    5490              : {
    5491              :   /* We at this point want to lower without a fixed VF or vector
    5492              :      size in mind which means we cannot actually compute whether we
    5493              :      need three or more vectors for a load permutation yet.  So always
    5494              :      lower.  */
    5495       160485 :   stmt_vec_info first
    5496       160485 :     = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (loads[0])[0]);
    5497       160485 :   unsigned group_lanes = DR_GROUP_SIZE (first);
    5498              : 
    5499              :   /* Verify if all load permutations can be implemented with a suitably
    5500              :      large element load-lanes operation.  */
    5501       160485 :   unsigned ld_lanes_lanes = SLP_TREE_LANES (loads[0]);
    5502       160485 :   if (STMT_VINFO_STRIDED_P (first)
    5503       158192 :       || compare_step_with_zero (loop_vinfo, first) <= 0
    5504       155847 :       || exact_log2 (ld_lanes_lanes) == -1
    5505              :       /* ???  For now only support the single-lane case as there is
    5506              :          missing support on the store-lane side and code generation
    5507              :          isn't up to the task yet.  */
    5508       153068 :       || ld_lanes_lanes != 1
    5509       302575 :       || vect_load_lanes_supported (SLP_TREE_VECTYPE (loads[0]),
    5510              :                                     group_lanes / ld_lanes_lanes,
    5511              :                                     false) == IFN_LAST)
    5512              :     ld_lanes_lanes = 0;
    5513              :   else
    5514              :     /* Verify the loads access the same number of lanes aligned to
    5515              :        ld_lanes_lanes.  */
    5516            0 :     for (slp_tree load : loads)
    5517              :       {
    5518            0 :         if (SLP_TREE_LANES (load) != ld_lanes_lanes)
    5519              :           {
    5520              :             ld_lanes_lanes = 0;
    5521              :             break;
    5522              :           }
    5523            0 :         unsigned first = SLP_TREE_LOAD_PERMUTATION (load)[0];
    5524            0 :         if (first % ld_lanes_lanes != 0)
    5525              :           {
    5526              :             ld_lanes_lanes = 0;
    5527              :             break;
    5528              :           }
    5529            0 :         if (!vect_load_perm_consecutive_p (load))
    5530              :           {
    5531              :             ld_lanes_lanes = 0;
    5532              :             break;
    5533              :           }
    5534              :       }
    5535              : 
    5536              :   /* Only a power-of-two number of lanes matches interleaving with N levels.
    5537              :      ???  An even number of lanes could be reduced to 1<<ceil_log2(N)-1 lanes
    5538              :      at each step.  */
    5539       261205 :   if (ld_lanes_lanes == 0 && exact_log2 (group_lanes) == -1 && group_lanes != 3)
    5540              :     return;
    5541              : 
    5542       262947 :   for (slp_tree load : loads)
    5543              :     {
    5544              :       /* Leave masked or gather loads alone for now.  */
    5545       185666 :       if (!SLP_TREE_CHILDREN (load).is_empty ())
    5546        59356 :         continue;
    5547              : 
    5548              :       /* For single-element interleaving spanning multiple vectors avoid
    5549              :          lowering, we want to use VMAT_ELEMENTWISE later.  */
    5550       185660 :       if (ld_lanes_lanes == 0
    5551       185660 :           && SLP_TREE_LANES (load) == 1
    5552       166410 :           && !DR_GROUP_NEXT_ELEMENT (first)
    5553       264761 :           && maybe_gt (group_lanes,
    5554              :                        TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (load))))
    5555        51086 :         return;
    5556              : 
    5557              :       /* We want to pattern-match special cases here and keep those
    5558              :          alone.  Candidates are splats and load-lane.  */
    5559              : 
    5560              :       /* We need to lower only loads of less than half of the groups
    5561              :          lanes, including duplicate lanes.  Note this leaves nodes
    5562              :          with a non-1:1 load permutation around instead of canonicalizing
    5563              :          those into a load and a permute node.  Removing this early
    5564              :          check would do such canonicalization.  */
    5565       134574 :       if (SLP_TREE_LANES (load) >= (group_lanes + 1) / 2
    5566        55846 :           && ld_lanes_lanes == 0)
    5567        55846 :         continue;
    5568              : 
    5569              :       /* Build the permute to get the original load permutation order.  */
    5570        78728 :       bool contiguous = vect_load_perm_consecutive_p (load);
    5571        78728 :       lane_permutation_t final_perm;
    5572        78728 :       final_perm.create (SLP_TREE_LANES (load));
    5573       158370 :       for (unsigned i = 0; i < SLP_TREE_LANES (load); ++i)
    5574       159284 :         final_perm.quick_push (
    5575        79642 :           std::make_pair (0, SLP_TREE_LOAD_PERMUTATION (load)[i]));
    5576              : 
    5577              :       /* When the load permutation accesses a contiguous unpermuted,
    5578              :          power-of-two aligned and sized chunk leave the load alone.
    5579              :          We can likely (re-)load it more efficiently rather than
    5580              :          extracting it from the larger load.
    5581              :          ???  Long-term some of the lowering should move to where
    5582              :          the vector types involved are fixed.  */
    5583        82232 :       if (!force_single_lane
    5584        78728 :           && ld_lanes_lanes == 0
    5585        53059 :           && contiguous
    5586        52816 :           && (SLP_TREE_LANES (load) > 1 || loads.size () == 1)
    5587         6499 :           && pow2p_hwi (SLP_TREE_LANES (load))
    5588         6463 :           && pow2p_hwi (group_lanes)
    5589         3504 :           && SLP_TREE_LOAD_PERMUTATION (load)[0] % SLP_TREE_LANES (load) == 0
    5590        82232 :           && group_lanes % SLP_TREE_LANES (load) == 0)
    5591              :         {
    5592         3504 :           final_perm.release ();
    5593         3504 :           continue;
    5594              :         }
    5595              : 
    5596              :       /* First build (and possibly re-use) a load node for the
    5597              :          unpermuted group.  Gaps in the middle and on the end are
    5598              :          represented with NULL stmts.  */
    5599        75224 :       vec<stmt_vec_info> stmts;
    5600        75224 :       stmts.create (group_lanes);
    5601       267329 :       for (stmt_vec_info s = first; s; s = DR_GROUP_NEXT_ELEMENT (s))
    5602              :         {
    5603       192105 :           if (s != first)
    5604       121634 :             for (unsigned i = 1; i < DR_GROUP_GAP (s); ++i)
    5605         4753 :               stmts.quick_push (NULL);
    5606       192105 :           stmts.quick_push (s);
    5607              :         }
    5608       137170 :       for (unsigned i = 0; i < DR_GROUP_GAP (first); ++i)
    5609        61946 :         stmts.quick_push (NULL);
    5610        75224 :       poly_uint64 max_nunits = 1;
    5611        75224 :       bool *matches = XALLOCAVEC (bool, group_lanes);
    5612        75224 :       unsigned limit = 1;
    5613        75224 :       unsigned tree_size = 0;
    5614        75224 :       slp_tree l0 = vect_build_slp_tree (loop_vinfo, stmts,
    5615              :                                          group_lanes,
    5616              :                                          &max_nunits, matches, &limit,
    5617        75224 :                                          &tree_size, bst_map);
    5618        75224 :       gcc_assert (!SLP_TREE_LOAD_PERMUTATION (l0).exists ());
    5619              : 
    5620        75224 :       if (ld_lanes_lanes != 0)
    5621              :         {
    5622              :           /* ???  If this is not in sync with what get_load_store_type
    5623              :              later decides the SLP representation is not good for other
    5624              :              store vectorization methods.  */
    5625            0 :           l0->ldst_lanes = true;
    5626            0 :           load->ldst_lanes = true;
    5627              :         }
    5628              : 
    5629       233338 :       while (1)
    5630              :         {
    5631       154281 :           unsigned group_lanes = SLP_TREE_LANES (l0);
    5632       154281 :           if (ld_lanes_lanes != 0
    5633       154281 :               || SLP_TREE_LANES (load) >= (group_lanes + 1) / 2)
    5634              :             break;
    5635              : 
    5636              :           /* Try to lower by reducing the group to half its size using an
    5637              :              interleaving scheme.  For this try to compute whether all
    5638              :              elements needed for this load are in even or odd elements of
    5639              :              an even/odd decomposition with N consecutive elements.
    5640              :              Thus { e, e, o, o, e, e, o, o } would be an even/odd decomposition
    5641              :              with N == 2.  */
    5642              :           /* ???  Only an even number of lanes can be handled this way, but the
    5643              :              fallback below could work for any number.  We have to make sure
    5644              :              to round up in that case.  */
    5645        79057 :           gcc_assert ((group_lanes & 1) == 0 || group_lanes == 3);
    5646        11009 :           unsigned even = 0, odd = 0;
    5647        11009 :           if ((group_lanes & 1) == 0)
    5648              :             {
    5649        11009 :               even = (1 << ceil_log2 (group_lanes)) - 1;
    5650        11009 :               odd = even;
    5651        44713 :               for (auto l : final_perm)
    5652              :                 {
    5653        11686 :                   even &= ~l.second;
    5654        11686 :                   odd &= l.second;
    5655              :                 }
    5656              :             }
    5657              : 
    5658              :           /* Now build an even or odd extraction from the unpermuted load.  */
    5659        79057 :           lane_permutation_t perm;
    5660        79057 :           perm.create ((group_lanes + 1) / 2);
    5661        79057 :           unsigned even_level = even ? 1 << ctz_hwi (even) : 0;
    5662        79057 :           unsigned odd_level = odd ? 1 << ctz_hwi (odd) : 0;
    5663        79057 :           if (even_level
    5664        10092 :               && group_lanes % (2 * even_level) == 0
    5665              :               /* ???  When code generating permutes we do not try to pun
    5666              :                  to larger component modes so level != 1 isn't a natural
    5667              :                  even/odd extract.  Prefer one if possible.  */
    5668        10092 :               && (even_level == 1 || !odd_level || odd_level != 1))
    5669              :             {
    5670              :               /* { 0, 1, ... 4, 5 ..., } */
    5671        36375 :               for (unsigned i = 0; i < group_lanes / 2 / even_level; ++i)
    5672        57438 :                 for (unsigned j = 0; j < even_level; ++j)
    5673        28892 :                   perm.quick_push (std::make_pair (0, 2 * i * even_level + j));
    5674              :             }
    5675        68965 :           else if (odd_level)
    5676              :             {
    5677              :               /* { ..., 2, 3, ... 6, 7 } */
    5678         3150 :               gcc_assert (group_lanes % (2 * odd_level) == 0);
    5679        13714 :               for (unsigned i = 0; i < group_lanes / 2 / odd_level; ++i)
    5680        21182 :                 for (unsigned j = 0; j < odd_level; ++j)
    5681        10618 :                   perm.quick_push
    5682        10618 :                     (std::make_pair (0, (2 * i + 1) * odd_level + j));
    5683              :             }
    5684              :           else
    5685              :             {
    5686              :               /* As fallback extract all used lanes and fill to half the
    5687              :                  group size by repeating the last element.
    5688              :                  ???  This is quite a bad strategy for re-use - we could
    5689              :                  brute force our way to find more optimal filling lanes to
    5690              :                  maximize re-use when looking at all loads from the group.  */
    5691        68078 :               auto_bitmap l;
    5692       272368 :               for (auto p : final_perm)
    5693        68134 :                 bitmap_set_bit (l, p.second);
    5694        68078 :               unsigned i = 0;
    5695        68078 :               bitmap_iterator bi;
    5696       136212 :               EXECUTE_IF_SET_IN_BITMAP (l, 0, i, bi)
    5697        68134 :                   perm.quick_push (std::make_pair (0, i));
    5698       272464 :               while (perm.length () < (group_lanes + 1) / 2)
    5699        68154 :                 perm.quick_push (perm.last ());
    5700        68078 :             }
    5701              : 
    5702              :           /* Update final_perm with the intermediate permute.  */
    5703       158791 :           for (unsigned i = 0; i < final_perm.length (); ++i)
    5704              :             {
    5705        79734 :               unsigned l = final_perm[i].second;
    5706        79734 :               unsigned j;
    5707        88182 :               for (j = 0; j < perm.length (); ++j)
    5708        88182 :                 if (perm[j].second == l)
    5709              :                   {
    5710        79734 :                     final_perm[i].second = j;
    5711        79734 :                     break;
    5712              :                   }
    5713        79734 :               gcc_assert (j < perm.length ());
    5714              :             }
    5715              : 
    5716              :           /* And create scalar stmts.  */
    5717        79057 :           vec<stmt_vec_info> perm_stmts;
    5718        79057 :           perm_stmts.create (perm.length ());
    5719       254855 :           for (unsigned i = 0; i < perm.length (); ++i)
    5720       175798 :             perm_stmts.quick_push (SLP_TREE_SCALAR_STMTS (l0)[perm[i].second]);
    5721              : 
    5722        79057 :           slp_tree p = vect_create_new_slp_node (1, VEC_PERM_EXPR);
    5723        79057 :           SLP_TREE_CHILDREN (p).quick_push (l0);
    5724        79057 :           SLP_TREE_LANE_PERMUTATION (p) = perm;
    5725        79057 :           SLP_TREE_VECTYPE (p) = SLP_TREE_VECTYPE (load);
    5726        79057 :           SLP_TREE_LANES (p) = perm.length ();
    5727        79057 :           SLP_TREE_REPRESENTATIVE (p) = SLP_TREE_REPRESENTATIVE (load);
    5728              :           /* ???  As we have scalar stmts for this intermediate permute we
    5729              :              could CSE it via bst_map but we do not want to pick up
    5730              :              another SLP node with a load permutation.  We instead should
    5731              :              have a "local" CSE map here.  */
    5732        79057 :           SLP_TREE_SCALAR_STMTS (p) = perm_stmts;
    5733              : 
    5734              :           /* We now have a node for (group_lanes + 1) / 2 lanes.  */
    5735        79057 :           l0 = p;
    5736        79057 :         }
    5737              : 
    5738              :       /* And finally from the ordered reduction node create the
    5739              :          permute to shuffle the lanes into the original load-permutation
    5740              :          order.  We replace the original load node with this.  */
    5741        75224 :       SLP_TREE_CODE (load) = VEC_PERM_EXPR;
    5742        75224 :       SLP_TREE_LOAD_PERMUTATION (load).release ();
    5743        75224 :       SLP_TREE_LANE_PERMUTATION (load) = final_perm;
    5744        75224 :       SLP_TREE_CHILDREN (load).create (1);
    5745        75224 :       SLP_TREE_CHILDREN (load).quick_push (l0);
    5746              :     }
    5747              : }
    5748              : 
    5749              : /* Transform SLP loads in the SLP graph created by SLP discovery: collect
    5750              :    the loads of all SLP instances of LOOP_VINFO, group loads from the same
    5751              :    dataref group and lower load permutations that are unlikely to be
    5752              :    supported into a series of permutes.  BST_MAP and FORCE_SINGLE_LANE are
    5753              :    passed on to the per-group worker.  With only single-lane SLP instances
    5754              :    this should result in permute nodes emulating an interleaving scheme.  */
    5755              : 
    5756              : static void
    5757       470799 : vect_lower_load_permutations (loop_vec_info loop_vinfo,
    5758              :                               scalar_stmts_to_slp_tree_map_t *bst_map,
    5759              :                               bool force_single_lane)
    5760              : {
    5761              :   /* Gather the loads of all SLP instances, then sort them with vllp_cmp.  */
    5762       470799 :   hash_set<slp_tree> visited;
    5763       470799 :   auto_vec<slp_tree> loads;
    5764      2165297 :   for (auto inst : loop_vinfo->slp_instances)
    5765       754820 :     vect_gather_slp_loads (loads, SLP_INSTANCE_TREE (inst), visited);
    5766       470799 :   if (loads.is_empty ())
    5767        89249 :     return;
    5768       381550 :   loads.qsort (vllp_cmp);
    5769              : 
    5770              :   /* Now process each dataref group separately; FIRSTI starts the run.  */
    5771       381550 :   unsigned firsti = 0;
    5772       714720 :   for (unsigned i = 1; i < loads.length (); ++i)
    5773              :     {
    5774       333170 :       slp_tree first = loads[firsti];
    5775       333170 :       slp_tree next = loads[i];
    5776       333170 :       stmt_vec_info a0 = SLP_TREE_SCALAR_STMTS (first)[0];
    5777       333170 :       stmt_vec_info b0 = SLP_TREE_SCALAR_STMTS (next)[0];
    5778       333170 :       if (STMT_VINFO_GROUPED_ACCESS (a0)
    5779       157187 :           && STMT_VINFO_GROUPED_ACCESS (b0)
    5780       477306 :           && DR_GROUP_FIRST_ELEMENT (a0) == DR_GROUP_FIRST_ELEMENT (b0))
    5781        62442 :         continue;
    5782              :       /* NEXT starts a new run, so loads[firsti .. i - 1] are one or multiple
    5783              :          SLP loads of the same group.  Only grouped accesses are lowered.  */
    5784       270728 :       if (STMT_VINFO_GROUPED_ACCESS (a0))
    5785        94745 :         vect_lower_load_permutations (loop_vinfo, bst_map,
    5786        94745 :                                       make_array_slice (&loads[firsti],
    5787              :                                                         i - firsti),
    5788              :                                       force_single_lane);
    5789              :       firsti = i;
    5790              :     }
    5791       763100 :   if (firsti < loads.length ()  /* Lower the final run as well.  */
    5792       763100 :       && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (loads[firsti])[0]))
    5793        65740 :     vect_lower_load_permutations (loop_vinfo, bst_map,
    5794        65740 :                                   make_array_slice (&loads[firsti],
    5795        65740 :                                                     loads.length () - firsti),
    5796              :                                   force_single_lane);
    5797       470799 : }
    5798              : 
    5799              : /* Check if there are stmts in the loop that can be vectorized using SLP.
    5800              :    Build SLP trees of packed scalar stmts if SLP is possible.  */
    5801              : 
    5802              : opt_result
    5803      1105777 : vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size,
    5804              :                   bool force_single_lane)
    5805              : {
    5806      1105777 :   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
    5807      1105777 :   unsigned int i;
    5808      1105777 :   stmt_vec_info first_element;
    5809      1105777 :   slp_instance instance;
    5810              : 
    5811      1105777 :   DUMP_VECT_SCOPE ("vect_analyze_slp");
    5812              : 
    5813      1105777 :   unsigned limit = max_tree_size;
    5814              : 
    5815      1105777 :   scalar_stmts_to_slp_tree_map_t *bst_map
    5816      1105777 :     = new scalar_stmts_to_slp_tree_map_t ();
    5817              : 
    5818              :   /* Find SLP sequences starting from groups of grouped stores.  */
    5819      3125594 :   FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
    5820       914309 :     if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
    5821              :                                      slp_inst_kind_store, max_tree_size, &limit,
    5822              :                                      force_single_lane)
    5823       914309 :         && loop_vinfo)
    5824              :       {
    5825          269 :         release_scalar_stmts_to_slp_tree_map (bst_map);
    5826          269 :         return opt_result::failure_at (vect_location, "SLP build failed.\n");
    5827              :       }
    5828              : 
    5829              :   /* For loops also start SLP discovery from non-grouped stores.  */
    5830      1105508 :   if (loop_vinfo)
    5831              :     {
    5832              :       data_reference_p dr;
    5833      1620564 :       FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr)
    5834      1132085 :         if (DR_IS_WRITE (dr))
    5835              :           {
    5836       368441 :             stmt_vec_info stmt_info = vinfo->lookup_dr (dr)->stmt;
    5837              :             /* Grouped stores are already handled above.  */
    5838       368441 :             if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    5839        99342 :               continue;
    5840       269099 :             vec<stmt_vec_info> stmts;
    5841       269099 :             vec<stmt_vec_info> roots = vNULL;
    5842       269099 :             vec<tree> remain = vNULL;
    5843       269099 :             stmts.create (1);
    5844       269099 :             stmts.quick_push (stmt_info);
    5845       269099 :             if (! vect_build_slp_instance (vinfo, slp_inst_kind_store,
    5846              :                                            stmts, roots, remain, max_tree_size,
    5847              :                                            &limit, bst_map, force_single_lane))
    5848              :               {
    5849         6929 :                 release_scalar_stmts_to_slp_tree_map (bst_map);
    5850         6929 :                 return opt_result::failure_at (vect_location,
    5851              :                                                "SLP build failed.\n");
    5852              :               }
    5853              :           }
    5854              : 
    5855              :       stmt_vec_info stmt_info;
    5856       488519 :       FOR_EACH_VEC_ELT (LOOP_VINFO_ALTERNATE_DEFS (loop_vinfo), i, stmt_info)
    5857              :         {
    5858           20 :           vec<stmt_vec_info> stmts;
    5859           20 :           vec<stmt_vec_info> roots = vNULL;
    5860           20 :           vec<tree> remain = vNULL;
    5861           20 :           stmts.create (1);
    5862           20 :           stmts.quick_push (stmt_info);
    5863           20 :           if (! vect_build_slp_instance (vinfo, slp_inst_kind_store,
    5864              :                                          stmts, roots, remain, max_tree_size,
    5865              :                                          &limit, bst_map, force_single_lane))
    5866              :             {
    5867            0 :               release_scalar_stmts_to_slp_tree_map (bst_map);
    5868            0 :               return opt_result::failure_at (vect_location,
    5869              :                                              "SLP build failed.\n");
    5870              :             }
    5871              :         }
    5872              :     }
    5873              : 
    5874      1098579 :   if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
    5875              :     {
    5876      1829331 :       for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
    5877              :         {
    5878      1219231 :           vect_location = bb_vinfo->roots[i].roots[0]->stmt;
    5879              :           /* Apply patterns.  */
    5880      3812183 :           for (unsigned j = 0; j < bb_vinfo->roots[i].stmts.length (); ++j)
    5881      5185904 :             bb_vinfo->roots[i].stmts[j]
    5882      2672028 :               = vect_stmt_to_vectorize (bb_vinfo->roots[i].stmts[j]);
    5883      1219231 :           if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
    5884      1219231 :                                        bb_vinfo->roots[i].stmts,
    5885      1219231 :                                        bb_vinfo->roots[i].roots,
    5886      1219231 :                                        bb_vinfo->roots[i].remain,
    5887              :                                        max_tree_size, &limit, bst_map, false))
    5888              :             {
    5889       127790 :               bb_vinfo->roots[i].roots = vNULL;
    5890       127790 :               bb_vinfo->roots[i].remain = vNULL;
    5891              :             }
    5892      1219231 :           bb_vinfo->roots[i].stmts = vNULL;
    5893              :         }
    5894              :     }
    5895              : 
    5896      1098579 :   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
    5897              :     {
    5898              :       /* Find SLP sequences starting from groups of reductions.  */
    5899       488479 :       if (!vect_analyze_slp_reductions (loop_vinfo, max_tree_size, &limit,
    5900              :                                         bst_map, force_single_lane))
    5901              :         {
    5902         3008 :           release_scalar_stmts_to_slp_tree_map (bst_map);
    5903         3008 :           return opt_result::failure_at (vect_location, "SLP build failed.\n");
    5904              :         }
    5905              : 
    5906              :       /* Make sure to vectorize only-live stmts, usually inductions.  */
    5907      2187289 :       for (edge e : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
    5908      1416093 :         for (auto gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi);
    5909       675953 :              gsi_next (&gsi))
    5910              :           {
    5911       685217 :             gphi *lc_phi = *gsi;
    5912       685217 :             tree def = gimple_phi_arg_def_from_edge (lc_phi, e);
    5913       685217 :             stmt_vec_info stmt_info;
    5914       685217 :             if (TREE_CODE (def) == SSA_NAME
    5915       573196 :                 && !virtual_operand_p (def)
    5916       297765 :                 && (stmt_info = loop_vinfo->lookup_def (def))
    5917       266805 :                 && ((stmt_info = vect_stmt_to_vectorize (stmt_info)), true)
    5918       266805 :                 && STMT_VINFO_RELEVANT (stmt_info) == vect_used_only_live
    5919       207090 :                 && STMT_VINFO_LIVE_P (stmt_info)
    5920       207090 :                 && !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info))
    5921       790880 :                 && STMT_VINFO_REDUC_IDX (stmt_info) == -1)
    5922              :               {
    5923       105576 :                 vec<stmt_vec_info> stmts;
    5924       105576 :                 vec<stmt_vec_info> roots = vNULL;
    5925       105576 :                 vec<tree> remain = vNULL;
    5926       105576 :                 stmts.create (1);
    5927       105576 :                 stmts.quick_push (vect_stmt_to_vectorize (stmt_info));
    5928       105576 :                 if (! vect_build_slp_instance (vinfo,
    5929              :                                                slp_inst_kind_reduc_group,
    5930              :                                                stmts, roots, remain,
    5931              :                                                max_tree_size, &limit,
    5932              :                                                bst_map, force_single_lane))
    5933              :                   {
    5934         9264 :                     release_scalar_stmts_to_slp_tree_map (bst_map);
    5935         9264 :                     return opt_result::failure_at (vect_location,
    5936              :                                                    "SLP build failed.\n");
    5937              :                   }
    5938              :               }
    5939         9264 :           }
    5940              : 
    5941              :       /* Find SLP sequences starting from gconds.  */
    5942      1181640 :       for (auto cond : LOOP_VINFO_LOOP_CONDS (loop_vinfo))
    5943              :         {
    5944       277281 :           auto cond_info = loop_vinfo->lookup_stmt (cond);
    5945              : 
    5946       277281 :           cond_info = vect_stmt_to_vectorize (cond_info);
    5947       277281 :           vec<stmt_vec_info> roots = vNULL;
    5948       277281 :           roots.safe_push (cond_info);
    5949       277281 :           gimple *stmt = STMT_VINFO_STMT (cond_info);
    5950       277281 :           tree args0 = gimple_cond_lhs (stmt);
    5951       277281 :           tree args1 = gimple_cond_rhs (stmt);
    5952              : 
    5953              :           /* These should be enforced by cond lowering, but if it failed
    5954              :              bail.  */
    5955       277281 :           if (gimple_cond_code (stmt) != NE_EXPR
    5956       276201 :               || TREE_TYPE (args0) != boolean_type_node
    5957       552916 :               || !integer_zerop (args1))
    5958              :             {
    5959         1646 :               roots.release ();
    5960         1646 :               release_scalar_stmts_to_slp_tree_map (bst_map);
    5961         1646 :               return opt_result::failure_at (vect_location,
    5962              :                                              "SLP build failed.\n");
    5963              :             }
    5964              : 
    5965              :           /* An argument without a loop def will be codegened from vectorizing the
    5966              :              root gcond itself.  As such we don't need to try to build an SLP tree
    5967              :              from them.  It's highly likely that the resulting SLP tree here if both
    5968              :              arguments have a def will be incompatible, but we rely on it being split
    5969              :              later on.  */
    5970       275635 :           auto varg = loop_vinfo->lookup_def (args0);
    5971       275635 :           vec<stmt_vec_info> stmts;
    5972       275635 :           vec<tree> remain = vNULL;
    5973       275635 :           stmts.create (1);
    5974       275635 :           stmts.quick_push (vect_stmt_to_vectorize (varg));
    5975              : 
    5976       275635 :           if (! vect_build_slp_instance (vinfo, slp_inst_kind_gcond,
    5977              :                                          stmts, roots, remain,
    5978              :                                          max_tree_size, &limit,
    5979              :                                          bst_map, force_single_lane))
    5980              :             {
    5981         3762 :               roots.release ();
    5982         3762 :               release_scalar_stmts_to_slp_tree_map (bst_map);
    5983         3762 :               return opt_result::failure_at (vect_location,
    5984              :                                              "SLP build failed.\n");
    5985              :             }
    5986              :         }
    5987              :     }
    5988              : 
    5989      1080899 :   hash_set<slp_tree> visited_patterns;
    5990      1080899 :   slp_tree_to_load_perm_map_t perm_cache;
    5991      1080899 :   slp_compat_nodes_map_t compat_cache;
    5992              : 
    5993              :   /* See if any patterns can be found in the SLP tree.  */
    5994      1080899 :   bool pattern_found = false;
    5995      3701708 :   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
    5996      1539910 :     pattern_found |= vect_match_slp_patterns (instance, vinfo,
    5997              :                                               &visited_patterns, &perm_cache,
    5998              :                                               &compat_cache);
    5999              : 
    6000              :   /* If any were found optimize permutations of loads.  */
    6001      1080899 :   if (pattern_found)
    6002              :     {
    6003          285 :       hash_map<slp_tree, slp_tree> load_map;
    6004         3421 :       FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
    6005              :         {
    6006         2851 :           slp_tree root = SLP_INSTANCE_TREE (instance);
    6007         2851 :           optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
    6008              :                                         &load_map, root);
    6009              :         }
    6010          285 :     }
    6011              : 
    6012              :   /* Check whether we should force some SLP instances to use load/store-lanes
    6013              :      and do so by forcing SLP re-discovery with single lanes.  We used
    6014              :      to cancel SLP when this applied to all instances in a loop but now
    6015              :      we decide this per SLP instance.  It's important to do this only
    6016              :      after SLP pattern recognition.  */
    6017      1080899 :   if (is_a <loop_vec_info> (vinfo))
    6018      1225619 :     FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
    6019       754820 :       if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
    6020       289225 :           && !SLP_INSTANCE_TREE (instance)->ldst_lanes)
    6021              :         {
    6022       289225 :           slp_tree slp_root = SLP_INSTANCE_TREE (instance);
    6023       289225 :           unsigned int group_size = SLP_TREE_LANES (slp_root);
    6024       289225 :           tree vectype = SLP_TREE_VECTYPE (slp_root);
    6025              : 
    6026       289225 :           stmt_vec_info rep_info = SLP_TREE_REPRESENTATIVE (slp_root);
    6027       289225 :           gimple *rep = STMT_VINFO_STMT (rep_info);
    6028       289225 :           bool masked = (is_gimple_call (rep)
    6029         2556 :                          && gimple_call_internal_p (rep)
    6030       291761 :                          && internal_fn_mask_index
    6031         2536 :                               (gimple_call_internal_fn (rep)) != -1);
    6032       289205 :           if (!STMT_VINFO_GROUPED_ACCESS (rep_info)
    6033        28845 :               || slp_root->ldst_lanes
    6034       318070 :               || (vect_store_lanes_supported (vectype, group_size, masked)
    6035              :                   == IFN_LAST))
    6036       289225 :             continue;
    6037              : 
    6038            0 :           auto_vec<slp_tree> loads;
    6039            0 :           hash_set<slp_tree> visited;
    6040            0 :           vect_gather_slp_loads (loads, slp_root, visited);
    6041              : 
    6042              :           /* Check whether any load in the SLP instance is possibly
    6043              :              permuted.  */
    6044            0 :           bool loads_permuted = false;
    6045            0 :           slp_tree load_node;
    6046            0 :           unsigned j;
    6047            0 :           FOR_EACH_VEC_ELT (loads, j, load_node)
    6048              :             {
    6049            0 :               if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
    6050            0 :                 continue;
    6051              :               unsigned k;
    6052              :               stmt_vec_info load_info;
    6053            0 :               FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), k, load_info)
    6054            0 :                 if (SLP_TREE_LOAD_PERMUTATION (load_node)[k] != k)
    6055              :                   {
    6056              :                     loads_permuted = true;
    6057              :                     break;
    6058              :                   }
    6059              :             }
    6060              : 
    6061              :           /* If the loads and stores can use load/store-lanes force re-discovery
    6062              :              with single lanes.  */
    6063            0 :           if (loads_permuted)
    6064              :             {
    6065            0 :               bool can_use_lanes = true;
    6066              :               bool prefer_load_lanes = false;
    6067            0 :               FOR_EACH_VEC_ELT (loads, j, load_node)
    6068            0 :                 if (STMT_VINFO_GROUPED_ACCESS
    6069              :                       (SLP_TREE_REPRESENTATIVE (load_node)))
    6070              :                   {
    6071            0 :                     stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
    6072              :                         (SLP_TREE_REPRESENTATIVE (load_node));
    6073            0 :                     rep = STMT_VINFO_STMT (stmt_vinfo);
    6074            0 :                     masked = (is_gimple_call (rep)
    6075            0 :                               && gimple_call_internal_p (rep)
    6076            0 :                               && internal_fn_mask_index
    6077            0 :                                    (gimple_call_internal_fn (rep)));
    6078              :                     /* Use SLP for strided accesses (or if we can't
    6079              :                        load-lanes).  */
    6080            0 :                     if (STMT_VINFO_STRIDED_P (stmt_vinfo)
    6081            0 :                         || compare_step_with_zero (vinfo, stmt_vinfo) <= 0
    6082            0 :                         || vect_load_lanes_supported
    6083            0 :                              (SLP_TREE_VECTYPE (load_node),
    6084            0 :                               DR_GROUP_SIZE (stmt_vinfo), masked) == IFN_LAST
    6085              :                         /* ???  During SLP re-discovery with a single lane
    6086              :                            a masked grouped load will appear permuted and
    6087              :                            discovery will fail.  We have to rework this
    6088              :                            on the discovery side - for now avoid ICEing.  */
    6089            0 :                         || masked)
    6090              :                       {
    6091              :                         can_use_lanes = false;
    6092              :                         break;
    6093              :                       }
    6094              :                     /* Make sure that the target would prefer store-lanes
    6095              :                        for at least one of the loads.
    6096              : 
    6097              :                        ??? Perhaps we should instead require this for
    6098              :                        all loads?  */
    6099            0 :                     prefer_load_lanes
    6100              :                       = (prefer_load_lanes
    6101            0 :                          || SLP_TREE_LANES (load_node) == group_size
    6102            0 :                          || (vect_slp_prefer_store_lanes_p
    6103            0 :                              (vinfo, stmt_vinfo,
    6104              :                               SLP_TREE_VECTYPE (load_node), masked,
    6105              :                               group_size, SLP_TREE_LANES (load_node))));
    6106              :                   }
    6107              : 
    6108            0 :               if (can_use_lanes && prefer_load_lanes)
    6109              :                 {
    6110            0 :                   if (dump_enabled_p ())
    6111            0 :                     dump_printf_loc (MSG_NOTE, vect_location,
    6112              :                                      "SLP instance %p can use load/store-lanes,"
    6113              :                                      " re-discovering with single-lanes\n",
    6114              :                                      (void *) instance);
    6115              : 
    6116            0 :                   stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (slp_root);
    6117              : 
    6118            0 :                   vect_free_slp_instance (instance);
    6119            0 :                   limit = max_tree_size;
    6120            0 :                   bool res = vect_analyze_slp_instance (vinfo, bst_map,
    6121              :                                                         stmt_info,
    6122              :                                                         slp_inst_kind_store,
    6123              :                                                         max_tree_size, &limit,
    6124              :                                                         true);
    6125            0 :                   gcc_assert (res);
    6126            0 :                   auto new_inst = LOOP_VINFO_SLP_INSTANCES (vinfo).pop ();
    6127            0 :                   LOOP_VINFO_SLP_INSTANCES (vinfo)[i] = new_inst;
    6128              :                 }
    6129              :             }
    6130            0 :         }
    6131              : 
    6132              :   /* When we end up with load permutations that we cannot possibly handle,
    6133              :      like those requiring three vector inputs, lower them using interleaving
    6134              :      like schemes.  */
    6135      1080899 :   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
    6136              :     {
    6137       470799 :       vect_lower_load_permutations (loop_vinfo, bst_map, force_single_lane);
    6138       470799 :       if (dump_enabled_p ())
    6139              :         {
    6140        19939 :           dump_printf_loc (MSG_NOTE, vect_location,
    6141              :                            "SLP graph after lowering permutations:\n");
    6142        19939 :           hash_set<slp_tree> visited;
    6143        88917 :           FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
    6144        29125 :             vect_print_slp_graph (MSG_NOTE, vect_location,
    6145              :                                   SLP_INSTANCE_TREE (instance), visited);
    6146        19939 :         }
    6147              :     }
    6148              : 
    6149      1080899 :   release_scalar_stmts_to_slp_tree_map (bst_map);
    6150              : 
    6151      1080899 :   if (pattern_found && dump_enabled_p ())
    6152              :     {
    6153           18 :       dump_printf_loc (MSG_NOTE, vect_location,
    6154              :                        "Pattern matched SLP tree\n");
    6155           18 :       hash_set<slp_tree> visited;
    6156           90 :       FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
    6157           36 :         vect_print_slp_graph (MSG_NOTE, vect_location,
    6158              :                               SLP_INSTANCE_TREE (instance), visited);
    6159           18 :     }
    6160              : 
    6161      1080899 :   return opt_result::success ();
    6162      1080899 : }
    6163              : 
    6164              : /* Estimates the cost of inserting layout changes into the SLP graph.
    6165              :    It can also say that the insertion is impossible.  */
    6166              : 
    6167              : struct slpg_layout_cost
    6168              : {
    6169     10563064 :   slpg_layout_cost () = default;
    6170              :   slpg_layout_cost (sreal, bool);
    6171              : 
    6172       497140 :   static slpg_layout_cost impossible () { return { sreal::max (), 0 }; }
    6173      5487987 :   bool is_possible () const { return depth != sreal::max (); }
    6174              : 
    6175              :   bool operator== (const slpg_layout_cost &) const;
    6176              :   bool operator!= (const slpg_layout_cost &) const;
    6177              : 
    6178              :   bool is_better_than (const slpg_layout_cost &, bool) const;
    6179              : 
    6180              :   void add_parallel_cost (const slpg_layout_cost &);
    6181              :   void add_serial_cost (const slpg_layout_cost &);
    6182              :   void split (unsigned int);
    6183              : 
    6184              :   /* The longest sequence of layout changes needed during any traversal
    6185              :      of the partition dag, weighted by execution frequency.
    6186              : 
    6187              :      This is the most important metric when optimizing for speed, since
    6188              :      it helps to ensure that we keep the number of operations on
    6189              :      critical paths to a minimum.  */
    6190              :   sreal depth = 0;
    6191              : 
    6192              :   /* An estimate of the total number of operations needed.  It is weighted by
    6193              :      execution frequency when optimizing for speed but not when optimizing for
    6194              :      size.  In order to avoid double-counting, a node with a fanout of N will
    6195              :      distribute 1/N of its total cost to each successor.
    6196              : 
    6197              :      This is the most important metric when optimizing for size, since
    6198              :      it helps to keep the total number of operations to a minimum.  */
    6199              :   sreal total = 0;
    6200              : };
    6201              : 
    6202              : /* Construct costs for a node with weight WEIGHT.  A higher weight
    6203              :    indicates more frequent execution.  IS_FOR_SIZE is true if we are
    6204              :    optimizing for size rather than speed.  */
    6205              : 
    6206      1296275 : slpg_layout_cost::slpg_layout_cost (sreal weight, bool is_for_size)
    6207      1297143 :   : depth (weight), total (is_for_size && weight > 0 ? 1 : weight)
    6208              : {
    6209      1296275 : }
    6210              : 
    6211              : bool
    6212            0 : slpg_layout_cost::operator== (const slpg_layout_cost &other) const
    6213              : {
    6214            0 :   return depth == other.depth && total == other.total;
    6215              : }
    6216              : 
    6217              : bool
    6218            0 : slpg_layout_cost::operator!= (const slpg_layout_cost &other) const
    6219              : {
    6220            0 :   return !operator== (other);
    6221              : }
    6222              : 
    6223              : /* Return true if these costs are better than OTHER.  IS_FOR_SIZE is
    6224              :    true if we are optimizing for size rather than speed.  */
    6225              : 
    6226              : bool
    6227       320448 : slpg_layout_cost::is_better_than (const slpg_layout_cost &other,
    6228              :                                   bool is_for_size) const
    6229              : {
    6230       320448 :   if (is_for_size)
    6231              :     {
    6232          382 :       if (total != other.total)
    6233          159 :         return total < other.total;
    6234          223 :       return depth < other.depth;
    6235              :     }
    6236              :   else
    6237              :     {
    6238       320066 :       if (depth != other.depth)
    6239       136331 :         return depth < other.depth;
    6240       183735 :       return total < other.total;
    6241              :     }
    6242              : }
    6243              : 
    6244              : /* Increase the costs to account for something with cost INPUT_COST
    6245              :    happening in parallel with the current costs.  */
    6246              : 
    6247              : void
    6248       384014 : slpg_layout_cost::add_parallel_cost (const slpg_layout_cost &input_cost)
    6249              : {
    6250       384014 :   depth = std::max (depth, input_cost.depth);
    6251       384014 :   total += input_cost.total;
    6252       384014 : }
    6253              : 
    6254              : /* Increase the costs to account for something with cost OTHER
    6255              :    happening in series with the current costs.  */
    6256              : 
    6257              : void
    6258      1554342 : slpg_layout_cost::add_serial_cost (const slpg_layout_cost &other)
    6259              : {
    6260      1554342 :   depth += other.depth;
    6261      1554342 :   total += other.total;
    6262      1554342 : }
    6263              : 
    6264              : /* Split the total cost among TIMES successors or predecessors.  */
    6265              : 
    6266              : void
    6267      1293208 : slpg_layout_cost::split (unsigned int times)
    6268              : {
    6269      1293208 :   if (times > 1)
    6270       566190 :     total /= times;
    6271      1293208 : }
    6272              : 
    6273              : /* Information about one node in the SLP graph, for use during
    6274              :    vect_optimize_slp_pass.  */
    6275              : 
    6276              : struct slpg_vertex
    6277              : {
    6278      9884527 :   slpg_vertex (slp_tree node_) : node (node_) {}
    6279              : 
    6280              :   /* The node itself.  */
    6281              :   slp_tree node;
    6282              : 
    6283              :   /* Which partition the node belongs to, or -1 if none.  Nodes outside of
    6284              :      partitions are flexible; they can have whichever layout consumers
    6285              :      want them to have.  */
    6286              :   int partition = -1;
    6287              : 
    6288              :   /* The number of nodes that directly use the result of this one
    6289              :      (i.e. the number of nodes that count this one as a child).  */
    6290              :   unsigned int out_degree = 0;
    6291              : 
    6292              :   /* The execution frequency of the node.  */
    6293              :   sreal weight = 0;
    6294              : 
    6295              :   /* The total execution frequency of all nodes that directly use the
    6296              :      result of this one.  */
    6297              :   sreal out_weight = 0;
    6298              : };
    6299              : 
    6300              : /* Information about one partition of the SLP graph, for use during
    6301              :    vect_optimize_slp_pass.  */
    6302              : 
    6303              : struct slpg_partition_info
    6304              : {
    6305              :   /* The nodes in the partition occupy indices [NODE_BEGIN, NODE_END)
    6306              :      of m_partitioned_nodes.  */
    6307              :   unsigned int node_begin = 0;
    6308              :   unsigned int node_end = 0;
    6309              : 
    6310              :   /* Which layout we've chosen to use for this partition, or -1 if
    6311              :      we haven't picked one yet.  */
    6312              :   int layout = -1;
    6313              : 
    6314              :   /* The number of predecessors and successors in the partition dag.
    6315              :      The predecessors always have lower partition numbers and the
    6316              :      successors always have higher partition numbers.
    6317              : 
    6318              :      Note that the directions of these edges are not necessarily the
    6319              :      same as in the data flow graph.  For example, if an SCC has separate
    6320              :      partitions for an inner loop and an outer loop, the inner loop's
    6321              :      partition will have at least two incoming edges from the outer loop's
    6322              :      partition: one for a live-in value and one for a live-out value.
    6323              :      In data flow terms, one of these edges would also be from the outer loop
    6324              :      to the inner loop, but the other would be in the opposite direction.  */
    6325              :   unsigned int in_degree = 0;
    6326              :   unsigned int out_degree = 0;
    6327              : };
    6328              : 
    6329              : /* Information about the costs of using a particular layout for a
    6330              :    particular partition.  It can also say that the combination is
    6331              :    impossible.  */
    6332              : 
    6333              : struct slpg_partition_layout_costs
    6334              : {
    6335      1565677 :   bool is_possible () const { return internal_cost.is_possible (); }
    6336        55522 :   void mark_impossible () { internal_cost = slpg_layout_cost::impossible (); }
    6337              : 
    6338              :   /* The costs inherited from predecessor partitions.  */
    6339              :   slpg_layout_cost in_cost;
    6340              : 
    6341              :   /* The inherent cost of the layout within the node itself.  For example,
    6342              :      this is nonzero for a load if choosing a particular layout would require
    6343              :      the load to permute the loaded elements.  It is nonzero for a
    6344              :      VEC_PERM_EXPR if the permutation cannot be eliminated or converted
    6345              :      to full-vector moves.  */
    6346              :   slpg_layout_cost internal_cost;
    6347              : 
    6348              :   /* The costs inherited from successor partitions.  */
    6349              :   slpg_layout_cost out_cost;
    6350              : };
    6351              : 
    6352              : /* This class tries to optimize the layout of vectors in order to avoid
    6353              :    unnecessary shuffling.  At the moment, the set of possible layouts is
    6354              :    restricted to bijective permutations.
    6355              : 
    6356              :    The goal of the pass depends on whether we're optimizing for size or
    6357              :    for speed.  When optimizing for size, the goal is to reduce the overall
    6358              :    number of layout changes (including layout changes implied by things
    6359              :    like load permutations).  When optimizing for speed, the goal is to
    6360              :    reduce the maximum latency attributable to layout changes on any
    6361              :    non-cyclical path through the data flow graph.
    6362              : 
    6363              :    For example, when optimizing a loop nest for speed, we will prefer
    6364              :    to make layout changes outside of a loop rather than inside of a loop,
    6365              :    and will prefer to make layout changes in parallel rather than serially,
    6366              :    even if that increases the overall number of layout changes.
    6367              : 
    6368              :    The high-level procedure is:
    6369              : 
    6370              :    (1) Build a graph in which edges go from uses (parents) to definitions
    6371              :        (children).
    6372              : 
    6373              :    (2) Divide the graph into a dag of strongly-connected components (SCCs).
    6374              : 
    6375              :    (3) When optimizing for speed, partition the nodes in each SCC based
    6376              :        on their containing cfg loop.  When optimizing for size, treat
    6377              :        each SCC as a single partition.
    6378              : 
    6379              :        This gives us a dag of partitions.  The goal is now to assign a
    6380              :        layout to each partition.
    6381              : 
    6382              :    (4) Construct a set of vector layouts that are worth considering.
    6383              :        Record which nodes must keep their current layout.
    6384              : 
    6385              :    (5) Perform a forward walk over the partition dag (from loads to stores)
    6386              :        accumulating the "forward" cost of using each layout.  When visiting
    6387              :        each partition, assign a tentative choice of layout to the partition
    6388              :        and use that choice when calculating the cost of using a different
    6389              :        layout in successor partitions.
    6390              : 
    6391              :    (6) Perform a backward walk over the partition dag (from stores to loads),
    6392              :        accumulating the "backward" cost of using each layout.  When visiting
    6393              :        each partition, make a final choice of layout for that partition based
    6394              :        on the accumulated forward costs (from (5)) and backward costs
    6395              :        (from (6)).
    6396              : 
    6397              :    (7) Apply the chosen layouts to the SLP graph.
    6398              : 
    6399              :    For example, consider the SLP statements:
    6400              : 
    6401              :    S1:      a_1 = load
    6402              :        loop:
    6403              :    S2:      a_2 = PHI<a_1, a_3>
    6404              :    S3:      b_1 = load
    6405              :    S4:      a_3 = a_2 + b_1
    6406              :        exit:
    6407              :    S5:      a_4 = PHI<a_3>
    6408              :    S6:      store a_4
    6409              : 
    6410              :    S2 and S4 form an SCC and are part of the same loop.  Every other
    6411              :    statement is in a singleton SCC.  In this example there is a one-to-one
    6412              :    mapping between SCCs and partitions and the partition dag looks like this:
    6413              : 
    6414              :         S1     S3
    6415              :          \     /
    6416              :           S2+S4
    6417              :             |
    6418              :            S5
    6419              :             |
    6420              :            S6
    6421              : 
    6422              :    S2, S3 and S4 will have a higher execution frequency than the other
    6423              :    statements, so when optimizing for speed, the goal is to avoid any
    6424              :    layout changes:
    6425              : 
    6426              :    - within S3
    6427              :    - within S2+S4
    6428              :    - on the S3->S2+S4 edge
    6429              : 
    6430              :    For example, if S3 was originally a reversing load, the goal of the
    6431              :    pass is to make it an unreversed load and change the layout on the
    6432              :    S1->S2+S4 and S2+S4->S5 edges to compensate.  (Changing the layout
    6433              :    on S1->S2+S4 and S5->S6 would also be acceptable.)
    6434              : 
    6435              :    The difference between SCCs and partitions becomes important if we
    6436              :    add an outer loop:
    6437              : 
    6438              :    S1:      a_1 = ...
    6439              :        loop1:
    6440              :    S2:      a_2 = PHI<a_1, a_6>
    6441              :    S3:      b_1 = load
    6442              :    S4:      a_3 = a_2 + b_1
    6443              :        loop2:
    6444              :    S5:      a_4 = PHI<a_3, a_5>
    6445              :    S6:      c_1 = load
    6446              :    S7:      a_5 = a_4 + c_1
    6447              :        exit2:
    6448              :    S8:      a_6 = PHI<a_5>
    6449              :    S9:      store a_6
    6450              :        exit1:
    6451              : 
    6452              :    Here, S2, S4, S5, S7 and S8 form a single SCC.  However, when optimizing
    6453              :    for speed, we usually do not want restrictions in the outer loop to "infect"
    6454              :    the decision for the inner loop.  For example, if an outer-loop node
    6455              :    in the SCC contains a statement with a fixed layout, that should not
    6456              :    prevent the inner loop from using a different layout.  Conversely,
    6457              :    the inner loop should not dictate a layout to the outer loop: if the
    6458              :    outer loop does a lot of computation, then it may not be efficient to
    6459              :    do all of that computation in the inner loop's preferred layout.
    6460              : 
    6461              :    So when optimizing for speed, we partition the SCC into S2+S4+S8 (outer)
    6462              :    and S5+S7 (inner).  We also try to arrange partitions so that:
    6463              : 
    6464              :    - the partition for an outer loop comes before the partition for
    6465              :      an inner loop
    6466              : 
    6467              :    - if a sibling loop A dominates a sibling loop B, A's partition
    6468              :      comes before B's
    6469              : 
    6470              :    This gives the following partition dag for the example above:
    6471              : 
    6472              :         S1        S3
    6473              :          \        /
    6474              :           S2+S4+S8   S6
    6475              :            |   \\    /
    6476              :            |    S5+S7
    6477              :            |
    6478              :           S9
    6479              : 
    6480              :    There are two edges from S2+S4+S8 to S5+S7: one for the edge S4->S5 and
    6481              :    one for a reversal of the edge S7->S8.
    6482              : 
    6483              :    The backward walk picks a layout for S5+S7 before S2+S4+S8.  The choice
    6484              :    for S2+S4+S8 therefore has to balance the cost of using the outer loop's
    6485              :    preferred layout against the cost of changing the layout on entry to the
    6486              :    inner loop (S4->S5) and on exit from the inner loop (S7->S8 reversed).
    6487              : 
    6488              :    Although this works well when optimizing for speed, it has the downside
    6489              :    when optimizing for size that the choice of layout for S5+S7 is completely
    6490              :    independent of S9, which lessens the chance of reducing the overall number
    6491              :    of permutations.  We therefore do not partition SCCs when optimizing
    6492              :    for size.
    6493              : 
    6494              :    To give a concrete example of the difference between optimizing
    6495              :    for size and speed, consider:
    6496              : 
    6497              :    a[0] = (b[1] << c[3]) - d[1];
    6498              :    a[1] = (b[0] << c[2]) - d[0];
    6499              :    a[2] = (b[3] << c[1]) - d[3];
    6500              :    a[3] = (b[2] << c[0]) - d[2];
    6501              : 
    6502              :    There are three different layouts here: one for a, one for b and d,
    6503              :    and one for c.  When optimizing for speed it is better to permute each
    6504              :    of b, c and d into the order required by a, since those permutations
    6505              :    happen in parallel.  But when optimizing for size, it is better to:
    6506              : 
    6507              :    - permute c into the same order as b
    6508              :    - do the arithmetic
    6509              :    - permute the result into the order required by a
    6510              : 
    6511              :    This gives 2 permutations rather than 3.  */
    6512              : 
    6513              : class vect_optimize_slp_pass
    6514              : {
    6515              : public:
    6516       676294 :   vect_optimize_slp_pass (vec_info *vinfo) : m_vinfo (vinfo) {}
    6517              :   void run ();
    6518              : 
    6519              : private:
    6520              :   /* Graph building.  */
    6521              :   struct loop *containing_loop (slp_tree);
    6522              :   bool is_cfg_latch_edge (graph_edge *);
    6523              :   void build_vertices (hash_set<slp_tree> &, slp_tree);
    6524              :   void build_vertices ();
    6525              :   void build_graph ();
    6526              : 
    6527              :   /* Partitioning.  */
    6528              :   void create_partitions ();
    6529              :   template<typename T> void for_each_partition_edge (unsigned int, T);
    6530              : 
    6531              :   /* Layout selection.  */
    6532              :   bool is_compatible_layout (slp_tree, unsigned int);
    6533              :   bool is_compatible_layout (const slpg_partition_info &, unsigned int);
    6534              :   int change_layout_cost (slp_tree, unsigned int, unsigned int);
    6535              :   slpg_partition_layout_costs &partition_layout_costs (unsigned int,
    6536              :                                                        unsigned int);
    6537              :   void change_vec_perm_layout (slp_tree, lane_permutation_t &,
    6538              :                                int, unsigned int);
    6539              :   int internal_node_cost (slp_tree, int, unsigned int);
    6540              :   void start_choosing_layouts ();
    6541              :   bool legitimize ();
    6542              : 
    6543              :   /* Cost propagation.  */
    6544              :   slpg_layout_cost edge_layout_cost (graph_edge *, unsigned int,
    6545              :                                      unsigned int, unsigned int);
    6546              :   slpg_layout_cost total_in_cost (unsigned int);
    6547              :   slpg_layout_cost forward_cost (graph_edge *, unsigned int, unsigned int);
    6548              :   slpg_layout_cost backward_cost (graph_edge *, unsigned int, unsigned int);
    6549              :   void forward_pass ();
    6550              :   void backward_pass ();
    6551              : 
    6552              :   /* Rematerialization.  */
    6553              :   slp_tree get_result_with_layout (slp_tree, unsigned int);
    6554              :   void materialize ();
    6555              : 
    6556              :   /* Clean-up.  */
    6557              :   void remove_redundant_permutations ();
    6558              : 
    6559              :   /* Masked load lanes discovery.  */
    6560              :   void decide_masked_load_lanes ();
    6561              : 
    6562              :   void dump ();
    6563              : 
    6564              :   vec_info *m_vinfo;
    6565              : 
    6566              :   /* True if we should optimize the graph for size, false if we should
    6567              :      optimize it for speed.  (It wouldn't be easy to make this decision
    6568              :      more locally.)  */
    6569              :   bool m_optimize_size;
    6570              : 
    6571              :   /* A graph of all SLP nodes, with edges leading from uses to definitions.
    6572              :      In other words, a node's predecessors are its slp_tree parents and
    6573              :      a node's successors are its slp_tree children.  */
    6574              :   graph *m_slpg = nullptr;
    6575              : 
    6576              :   /* The vertices of M_SLPG, indexed by slp_tree::vertex.  */
    6577              :   auto_vec<slpg_vertex> m_vertices;
    6578              : 
    6579              :   /* The list of all leaves of M_SLPG, such as external definitions, constants,
    6580              :      and loads.  */
    6581              :   auto_vec<int> m_leafs;
    6582              : 
    6583              :   /* This array has one entry for every vector layout that we're considering.
    6584              :      Element 0 is null and indicates "no change".  Other entries describe
    6585              :      permutations that are inherent in the current graph and that we would
    6586              :      like to reverse if possible.
    6587              : 
    6588              :      For example, a permutation { 1, 2, 3, 0 } means that something has
    6589              :      effectively been permuted in that way, such as a load group
    6590              :      { a[1], a[2], a[3], a[0] } (viewed as a permutation of a[0:3]).
    6591              :      We'd then like to apply the reverse permutation { 3, 0, 1, 2 }
    6592              :      in order to put things "back" in order.  */
    6593              :   auto_vec<vec<unsigned> > m_perms;
    6594              : 
    6595              :   /* A partitioning of the nodes for which a layout must be chosen.
    6596              :      Each partition represents an <SCC, cfg loop> pair; that is,
    6597              :      nodes in different SCCs belong to different partitions, and nodes
    6598              :      within an SCC can be further partitioned according to a containing
    6599              :      cfg loop.  Partition <SCC1, L1> comes before <SCC2, L2> if:
    6600              : 
    6601              :      - SCC1 != SCC2 and SCC1 is a predecessor of SCC2 in a forward walk
    6602              :        from leaves (such as loads) to roots (such as stores).
    6603              : 
    6604              :      - SCC1 == SCC2 and L1's header strictly dominates L2's header.  */
    6605              :   auto_vec<slpg_partition_info> m_partitions;
    6606              : 
    6607              :   /* The list of all nodes for which a layout must be chosen.  Nodes for
    6608              :      partition P come before the nodes for partition P+1.  Nodes within a
    6609              :      partition are in reverse postorder.  */
    6610              :   auto_vec<unsigned int> m_partitioned_nodes;
    6611              : 
    6612              :   /* Index P * num-layouts + L contains the cost of using layout L
    6613              :      for partition P.  */
    6614              :   auto_vec<slpg_partition_layout_costs> m_partition_layout_costs;
    6615              : 
    6616              :   /* Index N * num-layouts + L, if nonnull, is a node that provides the
    6617              :      original output of node N adjusted to have layout L.  */
    6618              :   auto_vec<slp_tree> m_node_layouts;
    6619              : };
    6620              : 
    6621              : /* Fill the vertices and leafs vector with all nodes in the SLP graph.
    6622              :    Also record whether we should optimize anything for speed rather
    6623              :    than size.  */
    6624              : 
    6625              : void
    6626     10690847 : vect_optimize_slp_pass::build_vertices (hash_set<slp_tree> &visited,
    6627              :                                         slp_tree node)
    6628              : {
    6629     10690847 :   unsigned i;
    6630     10690847 :   slp_tree child;
    6631              : 
    6632     10690847 :   if (visited.add (node))
    6633     10690847 :     return;
    6634              : 
    6635      9884527 :   if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
    6636              :     {
    6637      7783816 :       basic_block bb = gimple_bb (vect_orig_stmt (rep)->stmt);
    6638      6936210 :       if (optimize_bb_for_speed_p (bb))
    6639      6817800 :         m_optimize_size = false;
    6640              :     }
    6641              : 
    6642      9884527 :   node->vertex = m_vertices.length ();
    6643      9884527 :   m_vertices.safe_push (slpg_vertex (node));
    6644              : 
    6645      9884527 :   bool leaf = true;
    6646      9884527 :   bool force_leaf = false;
    6647     18524390 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    6648      8639863 :     if (child)
    6649              :       {
    6650      7783545 :         leaf = false;
    6651      7783545 :         build_vertices (visited, child);
    6652              :       }
    6653              :     else
    6654              :       force_leaf = true;
    6655              :   /* Since SLP discovery works along use-def edges all cycles have an
    6656              :      entry - but there's the exception of cycles where we do not handle
    6657              :      the entry explicitly (but with a NULL SLP node), like some reductions
    6658              :      and inductions.  Force those SLP PHIs to act as leafs to make them
    6659              :      backwards reachable.  */
    6660      9884527 :   if (leaf || force_leaf)
    6661      4878512 :     m_leafs.safe_push (node->vertex);
    6662              : }
    6663              : 
    6664              : /* Fill the vertices and leafs vector with all nodes in the SLP graph.  */
    6665              : 
    6666              : void
    6667      1352588 : vect_optimize_slp_pass::build_vertices ()
    6668              : {
    6669      1352588 :   hash_set<slp_tree> visited;
    6670      1352588 :   unsigned i;
    6671      1352588 :   slp_instance instance;
    6672      1352588 :   m_vertices.truncate (0);
    6673      1352588 :   m_leafs.truncate (0);
    6674      6965066 :   FOR_EACH_VEC_ELT (m_vinfo->slp_instances, i, instance)
    6675      2907302 :     build_vertices (visited, SLP_INSTANCE_TREE (instance));
    6676      1352588 : }
    6677              : 
    6678              : /* Apply (reverse) bijective PERM to VEC.  */
    6679              : 
    6680              : template <class T>
    6681              : static void
    6682       207113 : vect_slp_permute (vec<unsigned> perm,
    6683              :                   vec<T> &vec, bool reverse)
    6684              : {
    6685       207113 :   auto_vec<T, 64> saved;
    6686       207113 :   saved.create (vec.length ());
    6687       672043 :   for (unsigned i = 0; i < vec.length (); ++i)
    6688       464930 :     saved.quick_push (vec[i]);
    6689              : 
    6690       207113 :   if (reverse)
    6691              :     {
    6692      1333885 :       for (unsigned i = 0; i < vec.length (); ++i)
    6693       463706 :         vec[perm[i]] = saved[i];
    6694       670275 :       for (unsigned i = 0; i < vec.length (); ++i)
    6695       820309 :         gcc_assert (vec[perm[i]] == saved[i]);
    6696              :     }
    6697              :   else
    6698              :     {
    6699         3536 :       for (unsigned i = 0; i < vec.length (); ++i)
    6700         1224 :         vec[i] = saved[perm[i]];
    6701       208337 :       for (unsigned i = 0; i < vec.length (); ++i)
    6702         1836 :         gcc_assert (vec[i] == saved[perm[i]]);
    6703              :     }
    6704       207113 : }
    6705              : 
    6706              : /* Return the cfg loop that contains NODE.  */
    6707              : 
    6708              : struct loop *
    6709      3868912 : vect_optimize_slp_pass::containing_loop (slp_tree node)
    6710              : {
    6711      3868912 :   stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
    6712      3868912 :   if (!rep)
    6713         5300 :     return ENTRY_BLOCK_PTR_FOR_FN (cfun)->loop_father;
    6714      4300594 :   return gimple_bb (vect_orig_stmt (rep)->stmt)->loop_father;
    6715              : }
    6716              : 
    6717              : /* Return true if UD (an edge from a use to a definition) is associated
    6718              :    with a loop latch edge in the cfg.  */
    6719              : 
    6720              : bool
    6721      7783545 : vect_optimize_slp_pass::is_cfg_latch_edge (graph_edge *ud)
    6722              : {
    6723      7783545 :   slp_tree use = m_vertices[ud->src].node;
    6724      7783545 :   slp_tree def = m_vertices[ud->dest].node;
    6725      7783545 :   if ((SLP_TREE_DEF_TYPE (use) != vect_internal_def
    6726      7783545 :        || SLP_TREE_PERMUTE_P (use))
    6727      7473426 :       || SLP_TREE_DEF_TYPE (def) != vect_internal_def)
    6728              :     return false;
    6729              : 
    6730      4538132 :   stmt_vec_info use_rep = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (use));
    6731      4538132 :   return (is_a<gphi *> (use_rep->stmt)
    6732       372244 :           && bb_loop_header_p (gimple_bb (use_rep->stmt))
    6733      4748180 :           && containing_loop (def) == containing_loop (use));
    6734              : }
    6735              : 
    6736              : /* Build the graph.  Mark edges that correspond to cfg loop latch edges with
    6737              :    a nonnull data field.  */
    6738              : 
    6739              : void
    6740      1352588 : vect_optimize_slp_pass::build_graph ()
    6741              : {
    6742      1352588 :   m_optimize_size = true;
    6743      1352588 :   build_vertices ();
    6744              : 
    6745      2705176 :   m_slpg = new_graph (m_vertices.length ());
    6746     13942291 :   for (slpg_vertex &v : m_vertices)
    6747     29573200 :     for (slp_tree child : SLP_TREE_CHILDREN (v.node))
    6748      8639863 :       if (child)
    6749              :         {
    6750      7783545 :           graph_edge *ud = add_edge (m_slpg, v.node->vertex, child->vertex);
    6751      7783545 :           if (is_cfg_latch_edge (ud))
    6752       201472 :             ud->data = this;
    6753              :         }
    6754      1352588 : }
    6755              : 
    6756              : /* Return true if E corresponds to a loop latch edge in the cfg.  */
    6757              : 
    6758              : static bool
    6759      3992236 : skip_cfg_latch_edges (graph_edge *e)
    6760              : {
    6761      3992236 :   return e->data;
    6762              : }
    6763              : 
    6764              : /* Create the node partitions.  */
    6765              : 
    6766              : void
    6767       676294 : vect_optimize_slp_pass::create_partitions ()
    6768              : {
    6769              :   /* Calculate a postorder of the graph, ignoring edges that correspond
    6770              :      to natural latch edges in the cfg.  Reading the vector from the end
    6771              :      to the beginning gives the reverse postorder.  */
    6772       676294 :   auto_vec<int> initial_rpo;
    6773      1352588 :   graphds_dfs (m_slpg, &m_leafs[0], m_leafs.length (), &initial_rpo,
    6774              :                false, NULL, skip_cfg_latch_edges);
    6775      2028882 :   gcc_assert (initial_rpo.length () == m_vertices.length ());
    6776              : 
    6777              :   /* Calculate the strongly connected components of the graph.  */
    6778       676294 :   auto_vec<int> scc_grouping;
    6779       676294 :   unsigned int num_sccs = graphds_scc (m_slpg, NULL, NULL, &scc_grouping);
    6780              : 
    6781              :   /* Create a new index order in which all nodes from the same SCC are
    6782              :      consecutive.  Use scc_pos to record the index of the first node in
    6783              :      each SCC.  */
    6784       676294 :   auto_vec<unsigned int> scc_pos (num_sccs);
    6785       676294 :   int last_component = -1;
    6786       676294 :   unsigned int node_count = 0;
    6787      6970875 :   for (unsigned int node_i : scc_grouping)
    6788              :     {
    6789      4941993 :       if (last_component != m_slpg->vertices[node_i].component)
    6790              :         {
    6791      4815412 :           last_component = m_slpg->vertices[node_i].component;
    6792      9630824 :           gcc_assert (last_component == int (scc_pos.length ()));
    6793      4815412 :           scc_pos.quick_push (node_count);
    6794              :         }
    6795      4941993 :       node_count += 1;
    6796              :     }
    6797      1352588 :   gcc_assert (node_count == initial_rpo.length ()
    6798              :               && last_component + 1 == int (num_sccs));
    6799              : 
    6800              :   /* Use m_partitioned_nodes to group nodes into SCC order, with the nodes
    6801              :      inside each SCC following the RPO we calculated above.  The fact that
    6802              :      we ignored natural latch edges when calculating the RPO should ensure
    6803              :      that, for natural loop nests:
    6804              : 
    6805              :      - the first node that we encounter in a cfg loop is the loop header phi
    6806              :      - the loop header phis are in dominance order
    6807              : 
    6808              :      Arranging for this is an optimization (see below) rather than a
    6809              :      correctness issue.  Unnatural loops with a tangled mess of backedges
    6810              :      will still work correctly, but might give poorer results.
    6811              : 
    6812              :      Also update scc_pos so that it gives 1 + the index of the last node
    6813              :      in the SCC.  */
    6814       676294 :   m_partitioned_nodes.safe_grow (node_count);
    6815      6294581 :   for (unsigned int old_i = initial_rpo.length (); old_i-- > 0;)
    6816              :     {
    6817      4941993 :       unsigned int node_i = initial_rpo[old_i];
    6818      4941993 :       unsigned int new_i = scc_pos[m_slpg->vertices[node_i].component]++;
    6819      4941993 :       m_partitioned_nodes[new_i] = node_i;
    6820              :     }
    6821              : 
    6822              :   /* When optimizing for speed, partition each SCC based on the containing
    6823              :      cfg loop. The order we constructed above should ensure that, for natural
    6824              :      cfg loops, we'll create sub-SCC partitions for outer loops before
    6825              :      the corresponding sub-SCC partitions for inner loops.  Similarly,
    6826              :      when one sibling loop A dominates another sibling loop B, we should
    6827              :      create a sub-SCC partition for A before a sub-SCC partition for B.
    6828              : 
    6829              :      As above, nothing depends for correctness on whether this achieves
    6830              :      a natural nesting, but we should get better results when it does.  */
    6831      1352588 :   m_partitions.reserve (m_vertices.length ());
    6832       676294 :   unsigned int next_partition_i = 0;
    6833       676294 :   hash_map<struct loop *, int> loop_partitions;
    6834       676294 :   unsigned int rpo_begin = 0;
    6835       676294 :   unsigned int num_partitioned_nodes = 0;
    6836      6844294 :   for (unsigned int rpo_end : scc_pos)
    6837              :     {
    6838      4815412 :       loop_partitions.empty ();
    6839              :       unsigned int partition_i = next_partition_i;
    6840      9757405 :       for (unsigned int rpo_i = rpo_begin; rpo_i < rpo_end; ++rpo_i)
    6841              :         {
    6842              :           /* Handle externals and constants optimistically throughout.
    6843              :              But treat existing vectors as fixed since we do not handle
    6844              :              permuting them.  */
    6845      4941993 :           unsigned int node_i = m_partitioned_nodes[rpo_i];
    6846      4941993 :           auto &vertex = m_vertices[node_i];
    6847      4941993 :           if ((SLP_TREE_DEF_TYPE (vertex.node) == vect_external_def
    6848       501031 :                && !SLP_TREE_VEC_DEFS (vertex.node).exists ())
    6849      4944213 :               || SLP_TREE_DEF_TYPE (vertex.node) == vect_constant_def)
    6850      1468858 :             vertex.partition = -1;
    6851              :           else
    6852              :             {
    6853      3473135 :               bool existed;
    6854      3473135 :               if (m_optimize_size)
    6855        24319 :                 existed = next_partition_i > partition_i;
    6856              :               else
    6857              :                 {
    6858      3448816 :                   struct loop *loop = containing_loop (vertex.node);
    6859      3448816 :                   auto &entry = loop_partitions.get_or_insert (loop, &existed);
    6860      3448816 :                   if (!existed)
    6861      3323252 :                     entry = next_partition_i;
    6862      3448816 :                   partition_i = entry;
    6863              :                 }
    6864      3473135 :               if (!existed)
    6865              :                 {
    6866      3347493 :                   m_partitions.quick_push (slpg_partition_info ());
    6867      3347493 :                   next_partition_i += 1;
    6868              :                 }
    6869      3473135 :               vertex.partition = partition_i;
    6870      3473135 :               num_partitioned_nodes += 1;
    6871      3473135 :               m_partitions[partition_i].node_end += 1;
    6872              :             }
    6873              :         }
    6874      4815412 :       rpo_begin = rpo_end;
    6875              :     }
    6876              : 
    6877              :   /* Assign ranges of consecutive node indices to each partition,
    6878              :      in partition order.  Start with node_end being the same as
    6879              :      node_begin so that the next loop can use it as a counter.  */
    6880       676294 :   unsigned int node_begin = 0;
    6881      5376375 :   for (auto &partition : m_partitions)
    6882              :     {
    6883      3347493 :       partition.node_begin = node_begin;
    6884      3347493 :       node_begin += partition.node_end;
    6885      3347493 :       partition.node_end = partition.node_begin;
    6886              :     }
    6887       676294 :   gcc_assert (node_begin == num_partitioned_nodes);
    6888              : 
    6889              :   /* Finally build the list of nodes in partition order.  */
    6890       676294 :   m_partitioned_nodes.truncate (num_partitioned_nodes);
    6891      5618287 :   for (unsigned int node_i = 0; node_i < m_vertices.length (); ++node_i)
    6892              :     {
    6893      4941993 :       int partition_i = m_vertices[node_i].partition;
    6894      4941993 :       if (partition_i >= 0)
    6895              :         {
    6896      3473135 :           unsigned int order_i = m_partitions[partition_i].node_end++;
    6897      3473135 :           m_partitioned_nodes[order_i] = node_i;
    6898              :         }
    6899              :     }
    6900       676294 : }
    6901              : 
    6902              : /* Look for edges from earlier partitions into node NODE_I and edges from
    6903              :    node NODE_I into later partitions.  Call:
    6904              : 
    6905              :       FN (ud, other_node_i)
    6906              : 
    6907              :    for each such use-to-def edge ud, where other_node_i is the node at the
    6908              :    other end of the edge.  */
    6909              : 
    6910              : template<typename T>
    6911              : void
    6912      3913556 : vect_optimize_slp_pass::for_each_partition_edge (unsigned int node_i, T fn)
    6913              : {
    6914      3913556 :   int partition_i = m_vertices[node_i].partition;
    6915      3913556 :   for (graph_edge *pred = m_slpg->vertices[node_i].pred;
    6916      6792827 :        pred; pred = pred->pred_next)
    6917              :     {
    6918      2879271 :       int src_partition_i = m_vertices[pred->src].partition;
    6919      2879271 :       if (src_partition_i >= 0 && src_partition_i != partition_i)
    6920      2559336 :         fn (pred, pred->src);
    6921              :     }
    6922      3913556 :   for (graph_edge *succ = m_slpg->vertices[node_i].succ;
    6923      8415691 :        succ; succ = succ->succ_next)
    6924              :     {
    6925      4502135 :       int dest_partition_i = m_vertices[succ->dest].partition;
    6926      4502135 :       if (dest_partition_i >= 0 && dest_partition_i != partition_i)
    6927      2587243 :         fn (succ, succ->dest);
    6928              :     }
    6929      3913556 : }
    6930              : 
    6931              : /* Return true if layout LAYOUT_I is compatible with the number of SLP lanes
    6932              :    that NODE would operate on.  This test is independent of NODE's actual
    6933              :    operation.  */
    6934              : 
    6935              : bool
    6936      1762004 : vect_optimize_slp_pass::is_compatible_layout (slp_tree node,
    6937              :                                               unsigned int layout_i)
    6938              : {
    6939      1762004 :   if (layout_i == 0)
    6940              :     return true;
    6941              : 
    6942      1007222 :   if (SLP_TREE_LANES (node) != m_perms[layout_i].length ())
    6943        14802 :     return false;
    6944              : 
    6945              :   return true;
    6946              : }
    6947              : 
    6948              : /* Return true if layout LAYOUT_I is compatible with the number of SLP lanes
    6949              :    that NODE would operate on for each NODE in PARTITION.
    6950              :    This test is independent of NODE's actual operations.  */
    6951              : 
    6952              : bool
    6953        17733 : vect_optimize_slp_pass::is_compatible_layout (const slpg_partition_info
    6954              :                                                 &partition,
    6955              :                                               unsigned int layout_i)
    6956              : {
    6957        35738 :   for (unsigned int order_i = partition.node_begin;
    6958        35738 :        order_i < partition.node_end; ++order_i)
    6959              :     {
    6960        18071 :       unsigned int node_i = m_partitioned_nodes[order_i];
    6961        18071 :       auto &vertex = m_vertices[node_i];
    6962              : 
    6963              :       /* The layout is incompatible if it is individually incompatible
    6964              :          with any node in the partition.  */
    6965        18071 :       if (!is_compatible_layout (vertex.node, layout_i))
    6966              :         return false;
    6967              :     }
    6968              :   return true;
    6969              : }
    6970              : 
    6971              : /* Return the cost (in arbitrary units) of going from layout FROM_LAYOUT_I
    6972              :    to layout TO_LAYOUT_I for a node like NODE.  Return -1 if either of the
    6973              :    layouts is incompatible with NODE or if the change is not possible for
    6974              :    some other reason.
    6975              : 
    6976              :    The properties taken from NODE include the number of lanes and the
    6977              :    vector type.  The actual operation doesn't matter.  */
    6978              : 
    6979              : int
    6980       753796 : vect_optimize_slp_pass::change_layout_cost (slp_tree node,
    6981              :                                             unsigned int from_layout_i,
    6982              :                                             unsigned int to_layout_i)
    6983              : {
    6984       753796 :   if (!is_compatible_layout (node, from_layout_i)
    6985       753796 :       || !is_compatible_layout (node, to_layout_i))
    6986          563 :     return -1;
    6987              : 
    6988       753233 :   if (from_layout_i == to_layout_i)
    6989              :     return 0;
    6990              : 
    6991       319259 :   auto_vec<slp_tree, 1> children (1);
    6992       319259 :   children.quick_push (node);
    6993       319259 :   auto_lane_permutation_t perm (SLP_TREE_LANES (node));
    6994       319259 :   if (from_layout_i > 0)
    6995       895698 :     for (unsigned int i : m_perms[from_layout_i])
    6996       391719 :       perm.quick_push ({ 0, i });
    6997              :   else
    6998       486613 :     for (unsigned int i = 0; i < SLP_TREE_LANES (node); ++i)
    6999       335347 :       perm.quick_push ({ 0, i });
    7000       319259 :   if (to_layout_i > 0)
    7001       151693 :     vect_slp_permute (m_perms[to_layout_i], perm, true);
    7002       319259 :   auto count = vectorizable_slp_permutation_1 (m_vinfo, nullptr, node, perm,
    7003              :                                                children, false);
    7004       319259 :   if (count >= 0)
    7005       315109 :     return MAX (count, 1);
    7006              : 
    7007              :   /* ??? In principle we could try changing via layout 0, giving two
    7008              :      layout changes rather than 1.  Doing that would require
    7009              :      corresponding support in get_result_with_layout.  */
    7010              :   return -1;
    7011       319259 : }
    7012              : 
    7013              : /* Return the costs of assigning layout LAYOUT_I to partition PARTITION_I.  */
    7014              : 
    7015              : inline slpg_partition_layout_costs &
    7016      1078769 : vect_optimize_slp_pass::partition_layout_costs (unsigned int partition_i,
    7017              :                                                 unsigned int layout_i)
    7018              : {
    7019      2157538 :   return m_partition_layout_costs[partition_i * m_perms.length () + layout_i];
    7020              : }
    7021              : 
    7022              : /* Change PERM in one of two ways:
    7023              : 
    7024              :    - if IN_LAYOUT_I < 0, accept input operand I in the layout that has been
    7025              :      chosen for child I of NODE.
    7026              : 
    7027              :    - if IN_LAYOUT_I >= 0, accept all input operands with that layout.
    7028              : 
    7029              :    In both cases, arrange for the output to have layout OUT_LAYOUT_I.  */
    7030              : 
    7031              : void
    7032        30626 : vect_optimize_slp_pass::
    7033              : change_vec_perm_layout (slp_tree node, lane_permutation_t &perm,
    7034              :                         int in_layout_i, unsigned int out_layout_i)
    7035              : {
    7036       177996 :   for (auto &entry : perm)
    7037              :     {
    7038        86118 :       int this_in_layout_i = in_layout_i;
    7039        86118 :       if (this_in_layout_i < 0)
    7040              :         {
    7041        59875 :           slp_tree in_node = SLP_TREE_CHILDREN (node)[entry.first];
    7042        59875 :           unsigned int in_partition_i = m_vertices[in_node->vertex].partition;
    7043        59875 :           if (in_partition_i == -1u)
    7044          329 :             continue;
    7045        59546 :           this_in_layout_i = m_partitions[in_partition_i].layout;
    7046              :         }
    7047        85789 :       if (this_in_layout_i > 0)
    7048        19141 :         entry.second = m_perms[this_in_layout_i][entry.second];
    7049              :     }
    7050        30626 :   if (out_layout_i > 0)
    7051         7147 :     vect_slp_permute (m_perms[out_layout_i], perm, true);
    7052        30626 : }
    7053              : 
    7054              : /* Check whether the target allows NODE to be rearranged so that the node's
    7055              :    output has layout OUT_LAYOUT_I.  Return the cost of the change if so,
    7056              :    in the same arbitrary units as for change_layout_cost.  Return -1 otherwise.
    7057              : 
    7058              :    If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I < 0, also check whether
    7059              :    NODE can adapt to the layout changes that have (perhaps provisionally)
    7060              :    been chosen for NODE's children, so that no extra permutations are
    7061              :    needed on either the input or the output of NODE.
    7062              : 
    7063              :    If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I >= 0, instead assume
    7064              :    that all inputs will be forced into layout IN_LAYOUT_I beforehand.
    7065              : 
    7066              :    IN_LAYOUT_I has no meaning for other types of node.
    7067              : 
    7068              :    Keeping the node as-is is always valid.  If the target doesn't appear
    7069              :    to support the node as-is, but might realistically support other layouts,
    7070              :    then layout 0 instead has the cost of a worst-case permutation.  On the
    7071              :    one hand, this ensures that every node has at least one valid layout,
    7072              :    avoiding what would otherwise be an awkward special case.  On the other,
    7073              :    it still encourages the pass to change an invalid pre-existing layout
    7074              :    choice into a valid one.  */
    7075              : 
    7076              : int
    7077       232627 : vect_optimize_slp_pass::internal_node_cost (slp_tree node, int in_layout_i,
    7078              :                                             unsigned int out_layout_i)
    7079              : {
    7080       232627 :   const int fallback_cost = 1;
    7081              : 
    7082       232627 :   if (SLP_TREE_PERMUTE_P (node))
    7083              :     {
    7084        25485 :       auto_lane_permutation_t tmp_perm;
    7085        25485 :       tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
    7086              : 
    7087              :       /* Check that the child nodes support the chosen layout.  Checking
    7088              :          the first child is enough, since any second child would have the
    7089              :          same shape.  */
    7090        25485 :       auto first_child = SLP_TREE_CHILDREN (node)[0];
    7091        25485 :       if (in_layout_i > 0
    7092        25485 :           && !is_compatible_layout (first_child, in_layout_i))
    7093              :         return -1;
    7094              : 
    7095        24926 :       change_vec_perm_layout (node, tmp_perm, in_layout_i, out_layout_i);
    7096        49852 :       int count = vectorizable_slp_permutation_1 (m_vinfo, nullptr,
    7097              :                                                   node, tmp_perm,
    7098        24926 :                                                   SLP_TREE_CHILDREN (node),
    7099              :                                                   false);
    7100        24926 :       if (count < 0)
    7101              :         {
    7102         1498 :           if (in_layout_i == 0 && out_layout_i == 0)
    7103              :             {
    7104              :               /* Use the fallback cost if the node could in principle support
    7105              :                  some nonzero layout for both the inputs and the outputs.
    7106              :                  Otherwise assume that the node will be rejected later
    7107              :                  and rebuilt from scalars.  */
    7108          363 :               if (SLP_TREE_LANES (node) == SLP_TREE_LANES (first_child))
    7109              :                 return fallback_cost;
    7110          293 :               return 0;
    7111              :             }
    7112              :           return -1;
    7113              :         }
    7114              : 
    7115              :       /* We currently have no way of telling whether the new layout is cheaper
    7116              :          or more expensive than the old one.  But at least in principle,
    7117              :          it should be worth making zero permutations (whole-vector shuffles)
    7118              :          cheaper than real permutations, in case the pass is able to remove
    7119              :          the latter.  */
    7120        23428 :       return count == 0 ? 0 : 1;
    7121        25485 :     }
    7122              : 
    7123       207142 :   stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
    7124       207142 :   if (rep
    7125       206211 :       && STMT_VINFO_DATA_REF (rep)
    7126        63722 :       && DR_IS_READ (STMT_VINFO_DATA_REF (rep))
    7127       253580 :       && SLP_TREE_LOAD_PERMUTATION (node).exists ())
    7128              :     {
    7129        39406 :       auto_load_permutation_t tmp_perm;
    7130        39406 :       tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
    7131        39406 :       if (out_layout_i > 0)
    7132        13507 :         vect_slp_permute (m_perms[out_layout_i], tmp_perm, true);
    7133              : 
    7134        39406 :       poly_uint64 vf = 1;
    7135        39406 :       if (auto loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo))
    7136        12066 :         vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
    7137        39406 :       unsigned int n_perms;
    7138        39406 :       if (!vect_transform_slp_perm_load_1 (m_vinfo, node, tmp_perm, vNULL,
    7139              :                                            nullptr, vf, true, false, &n_perms))
    7140              :         {
    7141         1491 :           auto rep = SLP_TREE_REPRESENTATIVE (node);
    7142         1491 :           if (out_layout_i == 0)
    7143              :             {
    7144              :               /* Use the fallback cost if the load is an N-to-N permutation.
    7145              :                  Otherwise assume that the node will be rejected later
    7146              :                  and rebuilt from scalars.  */
    7147         1088 :               if (STMT_VINFO_GROUPED_ACCESS (rep)
    7148         2176 :                   && (DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (rep))
    7149         1088 :                       == SLP_TREE_LANES (node)))
    7150          592 :                 return fallback_cost;
    7151              :               return 0;
    7152              :             }
    7153              :           return -1;
    7154              :         }
    7155              : 
    7156              :       /* See the comment above the corresponding VEC_PERM_EXPR handling.  */
    7157        37915 :       return n_perms == 0 ? 0 : 1;
    7158        39406 :     }
    7159              : 
    7160              :   return 0;
    7161              : }
    7162              : 
    7163              : /* Decide which element layouts we should consider using.  Calculate the
    7164              :    weights associated with inserting layout changes on partition edges.
    7165              :    Also mark partitions that cannot change layout, by setting their
    7166              :    layout to zero.  */
    7167              : 
    7168              : void
    7169       676294 : vect_optimize_slp_pass::start_choosing_layouts ()
    7170              : {
    7171              :   /* Used to assign unique permutation indices.  */
    7172       676294 :   using perm_hash = unbounded_hashmap_traits<
    7173              :     vec_free_hash_base<int_hash_base<unsigned>>,
    7174              :     int_hash<int, -1, -2>
    7175              :   >;
    7176       676294 :   hash_map<vec<unsigned>, int, perm_hash> layout_ids;
    7177              : 
    7178              :   /* Layout 0 is "no change".  */
    7179       676294 :   m_perms.safe_push (vNULL);
    7180              : 
    7181              :   /* Create layouts from existing permutations.  */
    7182       676294 :   auto_load_permutation_t tmp_perm;
    7183      5502017 :   for (unsigned int node_i : m_partitioned_nodes)
    7184              :     {
    7185              :       /* Leaves also double as entries to the reverse graph.  Allow the
    7186              :          layout of those to be changed.  */
    7187      3473135 :       auto &vertex = m_vertices[node_i];
    7188      3473135 :       auto &partition = m_partitions[vertex.partition];
                              /* Zero is only the starting value here: the load handling
                                 below can still overwrite it with -1 or a recorded
                                 layout.  */
    7189      3473135 :       if (!m_slpg->vertices[node_i].succ)
    7190       880167 :         partition.layout = 0;
    7191              : 
    7192              :       /* Loads and VEC_PERM_EXPRs are the only things generating permutes.  */
    7193      3473135 :       slp_tree node = vertex.node;
    7194      3473135 :       stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
    7195      3473135 :       slp_tree child;
    7196      3473135 :       unsigned HOST_WIDE_INT imin, imax = 0;
    7197      3473135 :       bool any_permute = false;
    7198      3473135 :       tmp_perm.truncate (0);
    7199      3473135 :       if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
    7200              :         {
    7201              :           /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the node
    7202              :              unpermuted, record a layout that reverses this permutation.
    7203              : 
    7204              :              We would need more work to cope with loads that are internally
    7205              :              permuted and also have inputs (such as masks for
    7206              :              IFN_MASK_LOADs).  */
    7207       592008 :           gcc_assert (partition.layout == 0 && !m_slpg->vertices[node_i].succ);
    7208       592008 :           if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
    7209              :             {
    7210       420131 :               partition.layout = -1;
    7211      3455884 :               continue;
    7212              :             }
    7213       171877 :           dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
                                  /* Seed IMIN above the group size so that the MIN scan
                                     below settles on the smallest index actually used.  */
    7214       171877 :           imin = DR_GROUP_SIZE (dr_stmt) + 1;
    7215       171877 :           tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
    7216              :         }
    7217      5644173 :       else if (SLP_TREE_PERMUTE_P (node)
    7218       136434 :                && SLP_TREE_CHILDREN (node).length () == 1
    7219       118081 :                && (child = SLP_TREE_CHILDREN (node)[0])
    7220      2999208 :                && (TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (child))
    7221       118081 :                    .is_constant (&imin)))
    7222              :         {
    7223              :           /* If the child has the same vector size as this node,
    7224              :              reversing the permutation can make the permutation a no-op.
    7225              :              In other cases it can change a true permutation into a
    7226              :              full-vector extract.  */
    7227       118081 :           tmp_perm.reserve (SLP_TREE_LANES (node));
    7228       316788 :           for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
    7229       198707 :             tmp_perm.quick_push (SLP_TREE_LANE_PERMUTATION (node)[j].second);
    7230              :         }
    7231              :       else
    7232      2763046 :         continue;
    7233              : 
    7234       765279 :       for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
    7235              :         {
    7236       475321 :           unsigned idx = tmp_perm[j];
    7237       475321 :           imin = MIN (imin, idx);
    7238       475321 :           imax = MAX (imax, idx);
                                  /* Lanes are unpermuted only when the indices count up
                                     by one from tmp_perm[0].  */
    7239       475321 :           if (idx - tmp_perm[0] != j)
    7240       139172 :             any_permute = true;
    7241              :         }
    7242              :       /* If the span doesn't match we'd disrupt VF computation, avoid
    7243              :          that for now.  */
    7244       289958 :       if (imax - imin + 1 != SLP_TREE_LANES (node))
    7245        82689 :         continue;
    7246              :       /* If there's no permute no need to split one out.  In this case
    7247              :          we can consider turning a load into a permuted load, if that
    7248              :          turns out to be cheaper than alternatives.  */
    7249       207269 :       if (!any_permute)
    7250              :         {
    7251       189880 :           partition.layout = -1;
    7252       189880 :           continue;
    7253              :         }
    7254              : 
    7255              :       /* For now only handle true permutes, like
    7256              :          vect_attempt_slp_rearrange_stmts did.  This allows us to be lazy
    7257              :          when permuting constants and invariants keeping the permute
    7258              :          bijective.  */
    7259        17389 :       auto_sbitmap load_index (SLP_TREE_LANES (node));
    7260        17389 :       bitmap_clear (load_index);
    7261        66381 :       for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
    7262        48992 :         bitmap_set_bit (load_index, tmp_perm[j] - imin);
    7263              :       unsigned j;
                              /* J stops at the first offset that no lane selects.  Since
                                 the span equals the lane count, running to the end means
                                 every offset is selected exactly once.  */
    7264        65697 :       for (j = 0; j < SLP_TREE_LANES (node); ++j)
    7265        48446 :         if (!bitmap_bit_p (load_index, j))
    7266              :           break;
    7267        17389 :       if (j != SLP_TREE_LANES (node))
    7268          138 :         continue;
    7269              : 
                              /* Rebase the indices to start at zero; the rebased vector is
                                 the key under which layouts are shared in LAYOUT_IDS.  */
    7270        17251 :       vec<unsigned> perm = vNULL;
    7271        17251 :       perm.safe_grow (SLP_TREE_LANES (node), true);
    7272        65458 :       for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
    7273        48207 :         perm[j] = tmp_perm[j] - imin;
    7274              : 
    7275        34502 :       if (int (m_perms.length ()) >= param_vect_max_layout_candidates)
    7276              :         {
    7277              :           /* Continue to use existing layouts, but don't add any more.  */
    7278            0 :           int *entry = layout_ids.get (perm);
    7279            0 :           partition.layout = entry ? *entry : 0;
    7280            0 :           perm.release ();
    7281              :         }
    7282              :       else
    7283              :         {
    7284        17251 :           bool existed;
    7285        17251 :           int &layout_i = layout_ids.get_or_insert (perm, &existed);
    7286        17251 :           if (existed)
    7287         6225 :             perm.release ();
    7288              :           else
    7289              :             {
                                      /* First sighting of this layout: PERM is kept, as
                                         the map key and as the new M_PERMS entry.  */
    7290        11026 :               layout_i = m_perms.length ();
    7291        11026 :               m_perms.safe_push (perm);
    7292              :             }
    7293        17251 :           partition.layout = layout_i;
    7294              :         }
    7295        17389 :     }
    7296              : 
    7297              :   /* Initially assume that every layout is possible and has zero cost
    7298              :      in every partition.  */
    7299       676294 :   m_partition_layout_costs.safe_grow_cleared (m_partitions.length ()
    7300      1352588 :                                               * m_perms.length ());
    7301              : 
    7302              :   /* We have to mark outgoing permutations facing non-associating-reduction
    7303              :      graph entries that are not represented as to be materialized.
    7304              :      slp_inst_kind_bb_reduc currently only covers associatable reductions.  */
    7305      3482533 :   for (slp_instance instance : m_vinfo->slp_instances)
    7306      1453651 :     if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor)
    7307              :       {
    7308         6320 :         unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
    7309         6320 :         m_partitions[m_vertices[node_i].partition].layout = 0;
    7310              :       }
    7311      1447331 :     else if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_reduc_chain)
    7312              :       {
    7313         2255 :         stmt_vec_info stmt_info
    7314         2255 :           = SLP_TREE_REPRESENTATIVE (SLP_INSTANCE_TREE (instance));
    7315         2255 :         vect_reduc_info reduc_info
    7316         2255 :           = info_for_reduction (as_a <loop_vec_info> (m_vinfo),
    7317              :                                 SLP_INSTANCE_TREE (instance));
    7318         2255 :         if (needs_fold_left_reduction_p (TREE_TYPE
    7319              :                                            (gimple_get_lhs (stmt_info->stmt)),
    7320              :                                          VECT_REDUC_INFO_CODE (reduc_info)))
    7321              :           {
    7322           97 :             unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
    7323           97 :             m_partitions[m_vertices[node_i].partition].layout = 0;
    7324              :           }
    7325              :       }
    7326              : 
    7327              :   /* Check which layouts each node and partition can handle.  Calculate the
    7328              :      weights associated with inserting layout changes on edges.  */
    7329      5502017 :   for (unsigned int node_i : m_partitioned_nodes)
    7330              :     {
    7331      3473135 :       auto &vertex = m_vertices[node_i];
    7332      3473135 :       auto &partition = m_partitions[vertex.partition];
    7333      3473135 :       slp_tree node = vertex.node;
    7334              : 
    7335      3473135 :       if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
    7336              :         {
    7337      3467835 :           vertex.weight = vect_slp_node_weight (node);
    7338              : 
    7339              :           /* We do not handle stores with a permutation, so all
    7340              :              incoming permutations must have been materialized.
    7341              : 
    7342              :              We also don't handle masked grouped loads, which lack a
    7343              :              permutation vector.  In this case the memory locations
    7344              :              form an implicit second input to the loads, on top of the
    7345              :              explicit mask input, and the memory input's layout cannot
    7346              :              be changed.
    7347              : 
    7348              :              On the other hand, we do support permuting gather loads and
    7349              :              masked gather loads, where each scalar load is independent
    7350              :              of the others.  This can be useful if the address/index input
    7351              :              benefits from permutation.  */
    7352      3467835 :           if (STMT_VINFO_DATA_REF (rep)
    7353      1748773 :               && STMT_VINFO_GROUPED_ACCESS (rep)
    7354      4553839 :               && !SLP_TREE_LOAD_PERMUTATION (node).exists ())
    7355       914127 :             partition.layout = 0;
    7356              : 
    7357              :           /* We cannot change the layout of an operation that is
    7358              :              not independent of lanes.  Note this is an explicit
    7359              :              negative list since that's much shorter than the respective
    7360              :              positive one but it's critical to keep maintaining it.  */
    7361      3467835 :           if (is_gimple_call (STMT_VINFO_STMT (rep)))
    7362        31662 :             switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep)))
    7363              :               {
    7364         1155 :               case CFN_COMPLEX_ADD_ROT90:
    7365         1155 :               case CFN_COMPLEX_ADD_ROT270:
    7366         1155 :               case CFN_COMPLEX_MUL:
    7367         1155 :               case CFN_COMPLEX_MUL_CONJ:
    7368         1155 :               case CFN_VEC_ADDSUB:
    7369         1155 :               case CFN_VEC_FMADDSUB:
    7370         1155 :               case CFN_VEC_FMSUBADD:
    7371         1155 :                 partition.layout = 0;
                                      /* Falls through into the empty default; calls not
                                         listed above are left alone.  */
    7372              :               default:;
    7373              :               }
    7374              :         }
    7375              : 
    7376      7838157 :       auto process_edge = [&](graph_edge *ud, unsigned int other_node_i)
    7377              :         {
    7378      4365022 :           auto &other_vertex = m_vertices[other_node_i];
    7379              : 
    7380              :           /* Count the number of edges from earlier partitions and the number
    7381              :              of edges to later partitions.  */
    7382      4365022 :           if (other_vertex.partition < vertex.partition)
    7383      2182511 :             partition.in_degree += 1;
    7384              :           else
    7385      2182511 :             partition.out_degree += 1;
    7386              : 
    7387              :           /* If the current node uses the result of OTHER_NODE_I, accumulate
    7388              :              the effects of that.  */
    7389      4365022 :           if (ud->src == int (node_i))
    7390              :             {
    7391      2182511 :               other_vertex.out_weight += vertex.weight;
    7392      2182511 :               other_vertex.out_degree += 1;
    7393              :             }
    7394      7838157 :         };
    7395      3473135 :       for_each_partition_edge (node_i, process_edge);
    7396              :     }
    7397       676294 : }
    7398              : 
    7399              : /* Return the incoming costs for node NODE_I, assuming that each input keeps
    7400              :    its current (provisional) choice of layout.  The inputs do not necessarily
    7401              :    have the same layout as each other.  */
    7402              : 
    7403              : slpg_layout_cost
    7404         3180 : vect_optimize_slp_pass::total_in_cost (unsigned int node_i)
    7405              : {
    7406         3180 :   auto &vertex = m_vertices[node_i];
    7407         3180 :   slpg_layout_cost cost;
    7408        11625 :   auto add_cost = [&](graph_edge *, unsigned int other_node_i)
    7409              :     {
    7410         8445 :       auto &other_vertex = m_vertices[other_node_i];
                              /* Edges whose other end is not in an earlier partition
                                 contribute nothing to the result.  */
    7411         8445 :       if (other_vertex.partition < vertex.partition)
    7412              :         {
    7413         5352 :           auto &other_partition = m_partitions[other_vertex.partition];
    7414        10704 :           auto &other_costs = partition_layout_costs (other_vertex.partition,
    7415         5352 :                                                       other_partition.layout);
                                  /* Start from what flows into the other partition under
                                     its current layout, add what it spends internally,
                                     pass the sum through split () with its out-degree,
                                     and fold that into COST as a parallel term.  */
    7416         5352 :           slpg_layout_cost this_cost = other_costs.in_cost;
    7417         5352 :           this_cost.add_serial_cost (other_costs.internal_cost);
    7418         5352 :           this_cost.split (other_partition.out_degree);
    7419         5352 :           cost.add_parallel_cost (this_cost);
    7420              :         }
    7421        11625 :     };
    7422         3180 :   for_each_partition_edge (node_i, add_cost);
    7423         3180 :   return cost;
    7424              : }
    7425              : 
    7426              : /* Return the cost of switching between layout LAYOUT1_I (at node NODE1_I)
    7427              :    and layout LAYOUT2_I on cross-partition use-to-def edge UD.  Return
    7428              :    slpg_layout_cost::impossible () if the change isn't possible.  */
    7429              : 
    7430              : slpg_layout_cost
    7431       753796 : vect_optimize_slp_pass::
    7432              : edge_layout_cost (graph_edge *ud, unsigned int node1_i, unsigned int layout1_i,
    7433              :                   unsigned int layout2_i)
    7434              : {
    7435       753796 :   auto &def_vertex = m_vertices[ud->dest];
    7436       753796 :   auto &use_vertex = m_vertices[ud->src];
                          /* NODE1_I can be either end of UD, so sort the two layouts into
                             the one on the definition side and the one on the use side.  */
    7437       753796 :   auto def_layout_i = ud->dest == int (node1_i) ? layout1_i : layout2_i;
    7438       753796 :   auto use_layout_i = ud->dest == int (node1_i) ? layout2_i : layout1_i;
    7439       753796 :   auto factor = change_layout_cost (def_vertex.node, def_layout_i,
    7440              :                                     use_layout_i);
                          /* A negative factor from change_layout_cost is reported to the
                             caller as an impossible change.  */
    7441       753796 :   if (factor < 0)
    7442         4713 :     return slpg_layout_cost::impossible ();
    7443              : 
    7444              :   /* We have a choice of putting the layout change at the site of the
    7445              :      definition or at the site of the use.  Prefer the former when
    7446              :      optimizing for size or when the execution frequency of the
    7447              :      definition is no greater than the combined execution frequencies of
    7448              :      the uses.  When putting the layout change at the site of the definition,
    7449              :      divvy up the cost among all consumers.  */
    7450       749083 :   if (m_optimize_size || def_vertex.weight <= def_vertex.out_weight)
    7451              :     {
    7452       733027 :       slpg_layout_cost cost = { def_vertex.weight * factor, m_optimize_size };
    7453       733027 :       cost.split (def_vertex.out_degree);
    7454       733027 :       return cost;
    7455              :     }
                          /* Otherwise charge the change at the use, scaled by the use
                             vertex's own weight and without any splitting.  */
    7456        16056 :   return { use_vertex.weight * factor, m_optimize_size };
    7457              : }
    7458              : 
    7459              : /* UD represents a use-def link between FROM_NODE_I and a node in a later
    7460              :    partition; FROM_NODE_I could be the definition node or the use node.
    7461              :    The node at the other end of the link wants to use layout TO_LAYOUT_I.
    7462              :    Return the cost of any necessary fix-ups on edge UD, or return
    7463              :    slpg_layout_cost::impossible () if the change isn't possible.
    7464              : 
    7465              :    At this point, FROM_NODE_I's partition has chosen the cheapest
    7466              :    layout based on the information available so far, but this choice
    7467              :    is only provisional.  */
    7468              : 
    7469              : slpg_layout_cost
    7470       198373 : vect_optimize_slp_pass::forward_cost (graph_edge *ud, unsigned int from_node_i,
    7471              :                                       unsigned int to_layout_i)
    7472              : {
    7473       198373 :   auto &from_vertex = m_vertices[from_node_i];
    7474       198373 :   unsigned int from_partition_i = from_vertex.partition;
    7475       198373 :   slpg_partition_info &from_partition = m_partitions[from_partition_i];
    7476       198373 :   gcc_assert (from_partition.layout >= 0);
    7477              : 
    7478              :   /* First calculate the cost on the assumption that FROM_PARTITION sticks
    7479              :      with its current layout preference.  */
    7480       198373 :   slpg_layout_cost cost = slpg_layout_cost::impossible ();
    7481       198373 :   auto edge_cost = edge_layout_cost (ud, from_node_i,
    7482       198373 :                                      from_partition.layout, to_layout_i);
    7483       198373 :   if (edge_cost.is_possible ())
    7484              :     {
    7485       391820 :       auto &from_costs = partition_layout_costs (from_partition_i,
    7486       195910 :                                                  from_partition.layout);
                              /* Incoming cost of FROM_PARTITION plus its internal cost,
                                 passed through split () with its out-degree, followed by
                                 the fix-up on UD itself.  */
    7487       195910 :       cost = from_costs.in_cost;
    7488       195910 :       cost.add_serial_cost (from_costs.internal_cost);
    7489       195910 :       cost.split (from_partition.out_degree);
    7490       195910 :       cost.add_serial_cost (edge_cost);
    7491              :     }
    7492         2463 :   else if (from_partition.layout == 0)
    7493              :     /* We must allow the source partition to have layout 0 as a fallback,
    7494              :        in case all other options turn out to be impossible.  */
    7495         2463 :     return cost;
    7496              : 
                          /* If the edge change was impossible with a non-zero current
                             layout, COST is still impossible at this point and only the
                             direct switch below can produce a usable answer.  */
    7497              :   /* Take the minimum of that cost and the cost that applies if
    7498              :      FROM_PARTITION instead switches to TO_LAYOUT_I.  */
    7499       195910 :   auto &direct_layout_costs = partition_layout_costs (from_partition_i,
    7500              :                                                       to_layout_i);
    7501       195910 :   if (direct_layout_costs.is_possible ())
    7502              :     {
                              /* No layout change is charged on UD for this alternative.  */
    7503       176167 :       slpg_layout_cost direct_cost = direct_layout_costs.in_cost;
    7504       176167 :       direct_cost.add_serial_cost (direct_layout_costs.internal_cost);
    7505       176167 :       direct_cost.split (from_partition.out_degree);
    7506       176167 :       if (!cost.is_possible ()
    7507       176167 :           || direct_cost.is_better_than (cost, m_optimize_size))
    7508        44996 :         cost = direct_cost;
    7509              :     }
    7510              : 
    7511       195910 :   return cost;
    7512              : }
    7513              : 
    7514              : /* UD represents a use-def link between TO_NODE_I and a node in an earlier
    7515              :    partition; TO_NODE_I could be the definition node or the use node.
    7516              :    The node at the other end of the link wants to use layout FROM_LAYOUT_I;
    7517              :    return the cost of any necessary fix-ups on edge UD, or
    7518              :    slpg_layout_cost::impossible () if the choice cannot be made.
    7519              : 
    7520              :    At this point, TO_NODE_I's partition has a fixed choice of layout.  */
    7521              : 
    7522              : slpg_layout_cost
    7523       182752 : vect_optimize_slp_pass::backward_cost (graph_edge *ud, unsigned int to_node_i,
    7524              :                                        unsigned int from_layout_i)
    7525              : {
    7526       182752 :   auto &to_vertex = m_vertices[to_node_i];
    7527       182752 :   unsigned int to_partition_i = to_vertex.partition;
    7528       182752 :   slpg_partition_info &to_partition = m_partitions[to_partition_i];
    7529       182752 :   gcc_assert (to_partition.layout >= 0);
    7530              : 
    7531              :   /* If TO_NODE_I is a VEC_PERM_EXPR consumer, see whether it can be
    7532              :      adjusted for this input having layout FROM_LAYOUT_I.  Assume that
    7533              :      any other inputs keep their current choice of layout.  */
    7534       182752 :   auto &to_costs = partition_layout_costs (to_partition_i,
    7535              :                                            to_partition.layout);
    7536       182752 :   if (ud->src == int (to_node_i)
    7537       182598 :       && SLP_TREE_PERMUTE_P (to_vertex.node))
    7538              :     {
    7539         9498 :       auto &from_partition = m_partitions[m_vertices[ud->dest].partition];
                              /* Install FROM_LAYOUT_I on the defining partition only for
                                 the duration of the internal_node_cost query, then put
                                 the previous value back.  */
    7540         9498 :       auto old_layout = from_partition.layout;
    7541         9498 :       from_partition.layout = from_layout_i;
    7542        18996 :       int factor = internal_node_cost (to_vertex.node, -1,
    7543         9498 :                                        to_partition.layout);
    7544         9498 :       from_partition.layout = old_layout;
    7545         9498 :       if (factor >= 0)
    7546              :         {
                                  /* Here the freshly computed FACTOR is charged in place
                                     of TO_COSTS.internal_cost, which the edge fix-up path
                                     below uses instead.  */
    7547         8872 :           slpg_layout_cost cost = to_costs.out_cost;
    7548        17744 :           cost.add_serial_cost ({ to_vertex.weight * factor,
    7549         8872 :                                   m_optimize_size });
    7550         8872 :           cost.split (to_partition.in_degree);
    7551         8872 :           return cost;
    7552              :         }
                              /* A negative FACTOR falls through to the generic edge
                                 fix-up below.  */
    7553              :     }
    7554              : 
    7555              :   /* Compute the cost if we insert any necessary layout change on edge UD.  */
    7556       173880 :   auto edge_cost = edge_layout_cost (ud, to_node_i,
    7557       173880 :                                      to_partition.layout, from_layout_i);
    7558       173880 :   if (edge_cost.is_possible ())
    7559              :     {
    7560       173880 :       slpg_layout_cost cost = to_costs.out_cost;
    7561       173880 :       cost.add_serial_cost (to_costs.internal_cost);
    7562       173880 :       cost.split (to_partition.in_degree);
    7563       173880 :       cost.add_serial_cost (edge_cost);
    7564       173880 :       return cost;
    7565              :     }
    7566              : 
    7567            0 :   return slpg_layout_cost::impossible ();
    7568              : }
    7569              : 
    7570              : /* Make a forward pass through the partitions, accumulating input costs.
    7571              :    Make a tentative (provisional) choice of layout for each partition,
    7572              :    ensuring that this choice still allows later partitions to keep
    7573              :    their original layout.  */
    7574              : 
    7575              : void
    7576         5654 : vect_optimize_slp_pass::forward_pass ()
    7577              : {
    7578       124920 :   for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
    7579              :        ++partition_i)
    7580              :     {
    7581       119266 :       auto &partition = m_partitions[partition_i];
    7582              : 
    7583              :       /* If the partition consists of a single VEC_PERM_EXPR, precompute
    7584              :          the incoming cost that would apply if every predecessor partition
    7585              :          keeps its current layout.  This is used within the loop below.  */
    7586       119266 :       slpg_layout_cost in_cost;
    7587       119266 :       slp_tree single_node = nullptr;
    7588       119266 :       if (partition.node_end == partition.node_begin + 1)
    7589              :         {
    7590       113161 :           unsigned int node_i = m_partitioned_nodes[partition.node_begin];
    7591       113161 :           single_node = m_vertices[node_i].node;
    7592       113161 :           if (SLP_TREE_PERMUTE_P (single_node))
    7593         3180 :             in_cost = total_in_cost (node_i);
    7594              :         }
    7595              : 
    7596              :       /* Go through the possible layouts.  Decide which ones are valid
    7597              :          for this partition and record which of the valid layouts has
    7598              :          the lowest cost.  */
    7599       119266 :       unsigned int min_layout_i = 0;
    7600       119266 :       slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
    7601       363651 :       for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
    7602              :         {
    7603       244385 :           auto &layout_costs = partition_layout_costs (partition_i, layout_i);
    7604       244385 :           if (!layout_costs.is_possible ())
    7605        55522 :             continue;
    7606              : 
    7607              :           /* If the recorded layout is already 0 then the layout cannot
    7608              :              change.  */
    7609       244385 :           if (partition.layout == 0 && layout_i != 0)
    7610              :             {
    7611        38801 :               layout_costs.mark_impossible ();
    7612        38801 :               continue;
    7613              :             }
    7614              : 
    7615       205584 :           bool is_possible = true;
    7616       422155 :           for (unsigned int order_i = partition.node_begin;
    7617       422155 :                order_i < partition.node_end; ++order_i)
    7618              :             {
    7619       231217 :               unsigned int node_i = m_partitioned_nodes[order_i];
    7620       231217 :               auto &vertex = m_vertices[node_i];
    7621              : 
    7622              :               /* Reject the layout if it is individually incompatible
    7623              :                  with any node in the partition.  */
    7624       231217 :               if (!is_compatible_layout (vertex.node, layout_i))
    7625              :                 {
    7626        13614 :                   is_possible = false;
    7627        14646 :                   break;
    7628              :                 }
    7629              : 
    7630       601734 :               auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
    7631              :                 {
    7632       384131 :                   auto &other_vertex = m_vertices[other_node_i];
    7633       384131 :                   if (other_vertex.partition < vertex.partition)
    7634              :                     {
    7635              :                       /* Accumulate the incoming costs from earlier
    7636              :                          partitions, plus the cost of any layout changes
    7637              :                          on UD itself.  */
    7638       198373 :                       auto cost = forward_cost (ud, other_node_i, layout_i);
    7639       198373 :                       if (!cost.is_possible ())
    7640         2463 :                         is_possible = false;
    7641              :                       else
    7642       195910 :                         layout_costs.in_cost.add_parallel_cost (cost);
    7643              :                     }
    7644              :                   else
    7645              :                     /* Reject the layout if it would make layout 0 impossible
    7646              :                        for later partitions.  This amounts to testing that the
    7647              :                        target supports reversing the layout change on edges
    7648              :                        to later partitions.
    7649              : 
    7650              :                        In principle, it might be possible to push a layout
    7651              :                        change all the way down a graph, so that it never
    7652              :                        needs to be reversed and so that the target doesn't
    7653              :                        need to support the reverse operation.  But it would
    7654              :                        be awkward to bail out if we hit a partition that
    7655              :                        does not support the new layout, especially since
    7656              :                        we are not dealing with a lattice.  */
    7657       185758 :                     is_possible &= edge_layout_cost (ud, other_node_i, 0,
    7658       185758 :                                                      layout_i).is_possible ();
    7659       601734 :                 };
    7660       217603 :               for_each_partition_edge (node_i, add_cost);
    7661              : 
    7662              :               /* Accumulate the cost of using LAYOUT_I within NODE,
    7663              :                  both for the inputs and the outputs.  */
    7664       217603 :               int factor = internal_node_cost (vertex.node, layout_i,
    7665              :                                                layout_i);
    7666       217603 :               if (factor < 0)
    7667              :                 {
    7668         1032 :                   is_possible = false;
    7669         1032 :                   break;
    7670              :                 }
    7671       216571 :               else if (factor)
    7672        36093 :                 layout_costs.internal_cost.add_serial_cost
    7673        36093 :                   ({ vertex.weight * factor, m_optimize_size });
    7674              :             }
    7675       205584 :           if (!is_possible)
    7676              :             {
    7677        16721 :               layout_costs.mark_impossible ();
    7678        16721 :               continue;
    7679              :             }
    7680              : 
    7681              :           /* Combine the incoming and partition-internal costs.  */
    7682       188863 :           slpg_layout_cost combined_cost = layout_costs.in_cost;
    7683       188863 :           combined_cost.add_serial_cost (layout_costs.internal_cost);
    7684              : 
    7685              :           /* If this partition consists of a single VEC_PERM_EXPR, see
    7686              :              if the VEC_PERM_EXPR can be changed to support output layout
    7687              :              LAYOUT_I while keeping all the provisional choices of input
    7688              :              layout.  */
    7689       188863 :           if (single_node && SLP_TREE_PERMUTE_P (single_node))
    7690              :             {
    7691         5526 :               int factor = internal_node_cost (single_node, -1, layout_i);
    7692         5526 :               if (factor >= 0)
    7693              :                 {
    7694         5087 :                   auto weight = m_vertices[single_node->vertex].weight;
    7695         5087 :                   slpg_layout_cost internal_cost
    7696         5087 :                     = { weight * factor, m_optimize_size };
    7697              : 
    7698         5087 :                   slpg_layout_cost alt_cost = in_cost;
    7699         5087 :                   alt_cost.add_serial_cost (internal_cost);
    7700         5087 :                   if (alt_cost.is_better_than (combined_cost, m_optimize_size))
    7701              :                     {
    7702         1602 :                       combined_cost = alt_cost;
    7703         1602 :                       layout_costs.in_cost = in_cost;
    7704         1602 :                       layout_costs.internal_cost = internal_cost;
    7705              :                     }
    7706              :                 }
    7707              :             }
    7708              : 
    7709              :           /* Record the layout with the lowest cost.  Prefer layout 0 in
    7710              :              the event of a tie between it and another layout.  */
    7711       188863 :           if (!min_layout_cost.is_possible ()
    7712        69597 :               || combined_cost.is_better_than (min_layout_cost,
    7713        69597 :                                                m_optimize_size))
    7714              :             {
    7715       133891 :               min_layout_i = layout_i;
    7716       133891 :               min_layout_cost = combined_cost;
    7717              :             }
    7718              :         }
    7719              : 
    7720              :       /* This loop's handling of earlier partitions should ensure that
    7721              :          choosing the original layout for the current partition is no
    7722              :          less valid than it was in the original graph, even with the
    7723              :          provisional layout choices for those earlier partitions.  */
    7724       119266 :       gcc_assert (min_layout_cost.is_possible ());
    7725       119266 :       partition.layout = min_layout_i;
    7726              :     }
    7727         5654 : }
    7728              : 
    7729              : /* Make a backward pass through the partitions, accumulating output costs.
    7730              :    Make a final choice of layout for each partition.  */
    7731              : 
    7732              : void
    7733         5654 : vect_optimize_slp_pass::backward_pass ()
    7734              : {
    7735       130574 :   for (unsigned int partition_i = m_partitions.length (); partition_i-- > 0;)
    7736              :     {
    7737       119266 :       auto &partition = m_partitions[partition_i];
    7738              : 
    7739       119266 :       unsigned int min_layout_i = 0;
    7740       119266 :       slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
    7741       363651 :       for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
    7742              :         {
    7743       244385 :           auto &layout_costs = partition_layout_costs (partition_i, layout_i);
    7744       244385 :           if (!layout_costs.is_possible ())
    7745        55522 :             continue;
    7746              : 
    7747              :           /* Accumulate the costs from successor partitions.  */
    7748       188863 :           bool is_possible = true;
    7749       403328 :           for (unsigned int order_i = partition.node_begin;
    7750       403328 :                order_i < partition.node_end; ++order_i)
    7751              :             {
    7752       214465 :               unsigned int node_i = m_partitioned_nodes[order_i];
    7753       214465 :               auto &vertex = m_vertices[node_i];
    7754       593002 :               auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
    7755              :                 {
    7756       378537 :                   auto &other_vertex = m_vertices[other_node_i];
    7757       378537 :                   auto &other_partition = m_partitions[other_vertex.partition];
    7758       378537 :                   if (other_vertex.partition > vertex.partition)
    7759              :                     {
    7760              :                       /* Accumulate the incoming costs from later
    7761              :                          partitions, plus the cost of any layout changes
    7762              :                          on UD itself.  */
    7763       182752 :                       auto cost = backward_cost (ud, other_node_i, layout_i);
    7764       182752 :                       if (!cost.is_possible ())
    7765            0 :                         is_possible = false;
    7766              :                       else
    7767       182752 :                         layout_costs.out_cost.add_parallel_cost (cost);
    7768              :                     }
    7769              :                   else
    7770              :                     /* Make sure that earlier partitions can (if necessary
    7771              :                        or beneficial) keep the layout that they chose in
    7772              :                        the forward pass.  This ensures that there is at
    7773              :                        least one valid choice of layout.  */
    7774       195785 :                     is_possible &= edge_layout_cost (ud, other_node_i,
    7775       195785 :                                                      other_partition.layout,
    7776       195785 :                                                      layout_i).is_possible ();
    7777       593002 :                 };
    7778       214465 :               for_each_partition_edge (node_i, add_cost);
    7779              :             }
    7780       188863 :           if (!is_possible)
    7781              :             {
    7782            0 :               layout_costs.mark_impossible ();
    7783            0 :               continue;
    7784              :             }
    7785              : 
    7786              :           /* Locally combine the costs from the forward and backward passes.
    7787              :              (This combined cost is not passed on, since that would lead
    7788              :              to double counting.)  */
    7789       188863 :           slpg_layout_cost combined_cost = layout_costs.in_cost;
    7790       188863 :           combined_cost.add_serial_cost (layout_costs.internal_cost);
    7791       188863 :           combined_cost.add_serial_cost (layout_costs.out_cost);
    7792              : 
    7793              :           /* Record the layout with the lowest cost.  Prefer layout 0 in
    7794              :              the event of a tie between it and another layout.  */
    7795       188863 :           if (!min_layout_cost.is_possible ()
    7796        69597 :               || combined_cost.is_better_than (min_layout_cost,
    7797        69597 :                                                m_optimize_size))
    7798              :             {
    7799       127263 :               min_layout_i = layout_i;
    7800       127263 :               min_layout_cost = combined_cost;
    7801              :             }
    7802              :         }
    7803              : 
    7804       119266 :       gcc_assert (min_layout_cost.is_possible ());
    7805       119266 :       partition.layout = min_layout_i;
    7806              :     }
    7807         5654 : }
    7808              : 
    7809              : /* Return a node that applies layout TO_LAYOUT_I to the original form of NODE.
    7810              :    NODE already has the layout that was selected for its partition.  */
    7811              : 
    7812              : slp_tree
    7813       165612 : vect_optimize_slp_pass::get_result_with_layout (slp_tree node,
    7814              :                                                 unsigned int to_layout_i)
    7815              : {
    7816       165612 :   unsigned int result_i = node->vertex * m_perms.length () + to_layout_i;
    7817       165612 :   slp_tree result = m_node_layouts[result_i];
    7818       165612 :   if (result)
    7819              :     return result;
    7820              : 
    7821       165146 :   if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
    7822       165146 :       || (SLP_TREE_DEF_TYPE (node) == vect_external_def
    7823              :           /* We can't permute vector defs in place.  */
    7824        20220 :           && SLP_TREE_VEC_DEFS (node).is_empty ()))
    7825              :     {
    7826              :       /* If the vector is uniform or unchanged, there's nothing to do.  */
    7827        37931 :       if (to_layout_i == 0 || vect_slp_tree_uniform_p (node))
    7828              :         result = node;
    7829              :       else
    7830              :         {
    7831         1982 :           auto scalar_ops = SLP_TREE_SCALAR_OPS (node).copy ();
    7832         1982 :           result = vect_create_new_slp_node (scalar_ops);
    7833         1982 :           vect_slp_permute (m_perms[to_layout_i], scalar_ops, true);
    7834              :         }
    7835              :     }
    7836              :   else
    7837              :     {
    7838       127215 :       unsigned int partition_i = m_vertices[node->vertex].partition;
    7839       127215 :       unsigned int from_layout_i = m_partitions[partition_i].layout;
    7840       127215 :       if (from_layout_i == to_layout_i)
    7841       126672 :         return node;
    7842              : 
    7843              :       /* If NODE is itself a VEC_PERM_EXPR, try to create a parallel
    7844              :          permutation instead of a serial one.  Leave the new permutation
    7845              :          in TMP_PERM on success.  */
    7846          543 :       auto_lane_permutation_t tmp_perm;
    7847          543 :       unsigned int num_inputs = 1;
    7848          543 :       if (SLP_TREE_PERMUTE_P (node))
    7849              :         {
    7850            7 :           tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
    7851            7 :           if (from_layout_i != 0)
    7852            7 :             vect_slp_permute (m_perms[from_layout_i], tmp_perm, false);
    7853            7 :           if (to_layout_i != 0)
    7854            4 :             vect_slp_permute (m_perms[to_layout_i], tmp_perm, true);
    7855            7 :           if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
    7856              :                                               tmp_perm,
    7857            7 :                                               SLP_TREE_CHILDREN (node),
    7858              :                                               false) >= 0)
    7859            7 :             num_inputs = SLP_TREE_CHILDREN (node).length ();
    7860              :           else
    7861            0 :             tmp_perm.truncate (0);
    7862              :         }
    7863              : 
    7864          543 :       if (dump_enabled_p ())
    7865              :         {
    7866           68 :           if (tmp_perm.length () > 0)
    7867            6 :             dump_printf_loc (MSG_NOTE, vect_location,
    7868              :                              "duplicating permutation node %p with"
    7869              :                              " layout %d\n",
    7870              :                              (void *) node, to_layout_i);
    7871              :           else
    7872           62 :             dump_printf_loc (MSG_NOTE, vect_location,
    7873              :                              "inserting permutation node in place of %p\n",
    7874              :                              (void *) node);
    7875              :         }
    7876              : 
    7877          543 :       unsigned int num_lanes = SLP_TREE_LANES (node);
    7878          543 :       result = vect_create_new_slp_node (num_inputs, VEC_PERM_EXPR);
    7879          543 :       if (SLP_TREE_SCALAR_STMTS (node).length ())
    7880              :         {
    7881          542 :           auto &stmts = SLP_TREE_SCALAR_STMTS (result);
    7882          542 :           stmts.safe_splice (SLP_TREE_SCALAR_STMTS (node));
    7883          542 :           if (from_layout_i != 0)
    7884          272 :             vect_slp_permute (m_perms[from_layout_i], stmts, false);
    7885          542 :           if (to_layout_i != 0)
    7886          274 :             vect_slp_permute (m_perms[to_layout_i], stmts, true);
    7887              :         }
    7888          543 :       SLP_TREE_REPRESENTATIVE (result) = SLP_TREE_REPRESENTATIVE (node);
    7889          543 :       SLP_TREE_LANES (result) = num_lanes;
    7890          543 :       SLP_TREE_VECTYPE (result) = SLP_TREE_VECTYPE (node);
    7891          543 :       result->vertex = -1;
    7892              : 
    7893          543 :       auto &lane_perm = SLP_TREE_LANE_PERMUTATION (result);
    7894          543 :       if (tmp_perm.length ())
    7895              :         {
    7896            7 :           lane_perm.safe_splice (tmp_perm);
    7897            7 :           SLP_TREE_CHILDREN (result).safe_splice (SLP_TREE_CHILDREN (node));
    7898              :         }
    7899              :       else
    7900              :         {
    7901          536 :           lane_perm.create (num_lanes);
    7902         1672 :           for (unsigned j = 0; j < num_lanes; ++j)
    7903         1136 :             lane_perm.quick_push ({ 0, j });
    7904          536 :           if (from_layout_i != 0)
    7905          265 :             vect_slp_permute (m_perms[from_layout_i], lane_perm, false);
    7906          536 :           if (to_layout_i != 0)
    7907          271 :             vect_slp_permute (m_perms[to_layout_i], lane_perm, true);
    7908          536 :           SLP_TREE_CHILDREN (result).safe_push (node);
    7909              :         }
    7910         2176 :       for (slp_tree child : SLP_TREE_CHILDREN (result))
    7911          547 :         child->refcnt++;
    7912          543 :     }
    7913        38474 :   m_node_layouts[result_i] = result;
    7914        38474 :   return result;
    7915              : }
    7916              : 
    7917              : /* Apply the chosen vector layouts to the SLP graph.  */
    7918              : 
    7919              : void
    7920        10609 : vect_optimize_slp_pass::materialize ()
    7921              : {
    7922              :   /* We no longer need the costs, so avoid having two O(N * P) arrays
    7923              :      live at the same time.  */
    7924        10609 :   m_partition_layout_costs.release ();
    7925        31827 :   m_node_layouts.safe_grow_cleared (m_vertices.length () * m_perms.length ());
    7926              : 
    7927        21218 :   auto_sbitmap fully_folded (m_vertices.length ());
    7928        10609 :   bitmap_clear (fully_folded);
    7929       173634 :   for (unsigned int node_i : m_partitioned_nodes)
    7930              :     {
    7931       141807 :       auto &vertex = m_vertices[node_i];
    7932       141807 :       slp_tree node = vertex.node;
    7933       141807 :       int layout_i = m_partitions[vertex.partition].layout;
    7934       141807 :       gcc_assert (layout_i >= 0);
    7935              : 
    7936              :       /* Rearrange the scalar statements to match the chosen layout.  */
    7937       141807 :       if (layout_i > 0)
    7938        15897 :         vect_slp_permute (m_perms[layout_i],
    7939        15897 :                           SLP_TREE_SCALAR_STMTS (node), true);
    7940              : 
    7941              :       /* Update load and lane permutations.  */
    7942       141807 :       if (SLP_TREE_PERMUTE_P (node))
    7943              :         {
    7944              :           /* First try to absorb the input vector layouts.  If that fails,
    7945              :              force the inputs to have layout LAYOUT_I too.  We checked that
    7946              :              that was possible before deciding to use nonzero output layouts.
    7947              :              (Note that at this stage we don't really have any guarantee that
    7948              :              the target supports the original VEC_PERM_EXPR.)  */
    7949         5337 :           auto &perm = SLP_TREE_LANE_PERMUTATION (node);
    7950         5337 :           auto_lane_permutation_t tmp_perm;
    7951         5337 :           tmp_perm.safe_splice (perm);
    7952         5337 :           change_vec_perm_layout (node, tmp_perm, -1, layout_i);
    7953         5337 :           if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
    7954              :                                               tmp_perm,
    7955         5337 :                                               SLP_TREE_CHILDREN (node),
    7956              :                                               false) >= 0)
    7957              :             {
    7958         4974 :               if (dump_enabled_p ()
    7959         5894 :                   && !std::equal (tmp_perm.begin (), tmp_perm.end (),
    7960              :                                   perm.begin ()))
    7961           58 :                 dump_printf_loc (MSG_NOTE, vect_location,
    7962              :                                  "absorbing input layouts into %p\n",
    7963              :                                  (void *) node);
    7964        28019 :               std::copy (tmp_perm.begin (), tmp_perm.end (), perm.begin ());
    7965         4974 :               bitmap_set_bit (fully_folded, node_i);
    7966              :             }
    7967              :           else
    7968              :             {
    7969              :               /* Not MSG_MISSED because it would make no sense to users.  */
    7970          363 :               if (dump_enabled_p ())
    7971           46 :                 dump_printf_loc (MSG_NOTE, vect_location,
    7972              :                                  "failed to absorb input layouts into %p\n",
    7973              :                                  (void *) node);
    7974          363 :               change_vec_perm_layout (nullptr, perm, layout_i, layout_i);
    7975              :             }
    7976         5337 :         }
    7977              :       else
    7978              :         {
    7979       136470 :           gcc_assert (!SLP_TREE_LANE_PERMUTATION (node).exists ());
    7980       136470 :           auto &load_perm = SLP_TREE_LOAD_PERMUTATION (node);
    7981       136470 :           if (layout_i > 0)
    7982              :             /* ???  When we handle non-bijective permutes the idea
    7983              :                is that we can force the load-permutation to be
    7984              :                { min, min + 1, min + 2, ... max }.  But then the
    7985              :                scalar defs might no longer match the lane content
    7986              :                which means wrong-code with live lane vectorization.
    7987              :                So we possibly have to have NULL entries for those.  */
    7988        15794 :             vect_slp_permute (m_perms[layout_i], load_perm, true);
    7989              :         }
    7990              :     }
    7991              : 
    7992              :   /* Do this before any nodes disappear, since it involves a walk
    7993              :      over the leaves.  */
    7994        10609 :   remove_redundant_permutations ();
    7995              : 
    7996              :   /* Replace each child with a correctly laid-out version.  */
    7997       173634 :   for (unsigned int node_i : m_partitioned_nodes)
    7998              :     {
    7999              :       /* Skip nodes that have already been handled above.  */
    8000       141807 :       if (bitmap_bit_p (fully_folded, node_i))
    8001         4974 :         continue;
    8002              : 
    8003       136833 :       auto &vertex = m_vertices[node_i];
    8004       136833 :       int in_layout_i = m_partitions[vertex.partition].layout;
    8005       136833 :       gcc_assert (in_layout_i >= 0);
    8006              : 
    8007              :       unsigned j;
    8008              :       slp_tree child;
    8009       410963 :       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (vertex.node), j, child)
    8010              :         {
    8011       171570 :           if (!child)
    8012         5958 :             continue;
    8013              : 
    8014       165612 :           slp_tree new_child = get_result_with_layout (child, in_layout_i);
    8015       165612 :           if (new_child != child)
    8016              :             {
    8017         2734 :               vect_free_slp_tree (child);
    8018         2734 :               SLP_TREE_CHILDREN (vertex.node)[j] = new_child;
    8019         2734 :               new_child->refcnt += 1;
    8020              :             }
    8021              :         }
    8022              :     }
    8023        10609 : }
    8024              : 
    8025              : /* Elide load permutations that are not necessary.  Such permutations might
    8026              :    be pre-existing, rather than created by the layout optimizations.  */
    8027              : 
    8028              : void
    8029       676294 : vect_optimize_slp_pass::remove_redundant_permutations ()
    8030              : {
    8031      4468138 :   for (unsigned int node_i : m_leafs)
    8032              :     {
    8033      2439256 :       slp_tree node = m_vertices[node_i].node;
    8034      2439256 :       if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
    8035      1847248 :         continue;
    8036              : 
    8037              :       /* In basic block vectorization we allow any subchain of an interleaving
    8038              :          chain.
    8039              :          FORNOW: not in loop SLP because of realignment complications.  */
    8040       592008 :       if (is_a <bb_vec_info> (m_vinfo))
    8041              :         {
    8042       157690 :           bool subchain_p = true;
    8043              :           stmt_vec_info next_load_info = NULL;
    8044              :           stmt_vec_info load_info;
    8045              :           unsigned j;
    8046       157690 :           FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
    8047              :             {
    8048       128449 :               if (j != 0
    8049       128449 :                   && (next_load_info != load_info
    8050        61091 :                       || ! load_info
    8051        61091 :                       || DR_GROUP_GAP (load_info) != 1))
    8052              :                 {
    8053              :                   subchain_p = false;
    8054              :                   break;
    8055              :                 }
    8056       105821 :               next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
    8057              :             }
    8058        51869 :           if (subchain_p)
    8059              :             {
    8060        29241 :               SLP_TREE_LOAD_PERMUTATION (node).release ();
    8061        29241 :               continue;
    8062              :             }
    8063              :         }
    8064              :       else
    8065              :         {
    8066       540139 :           loop_vec_info loop_vinfo = as_a<loop_vec_info> (m_vinfo);
    8067       540139 :           bool this_load_permuted = !vect_load_perm_consecutive_p (node, 0);
    8068              :           /* When this isn't a grouped access we know it's single element
    8069              :              and contiguous.  */
    8070       540139 :           if (!STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (node)[0]))
    8071              :             {
    8072       420131 :               if (!this_load_permuted
    8073       420131 :                   && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
    8074       419336 :                       || SLP_TREE_LANES (node) == 1))
    8075       419338 :                 SLP_TREE_LOAD_PERMUTATION (node).release ();
    8076       420131 :               continue;
    8077              :             }
    8078       120008 :           stmt_vec_info first_stmt_info
    8079       120008 :             = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
    8080       120513 :           if (!this_load_permuted
    8081              :               /* The load requires permutation when unrolling exposes
    8082              :                  a gap either because the group is larger than the SLP
    8083              :                  group-size or because there is a gap between the groups.  */
    8084       120008 :               && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
    8085        97981 :                   || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
    8086          140 :                       && DR_GROUP_GAP (first_stmt_info) == 0)))
    8087              :             {
    8088          505 :               SLP_TREE_LOAD_PERMUTATION (node).release ();
    8089          505 :               continue;
    8090              :             }
    8091              :         }
    8092              :     }
    8093       676294 : }
    8094              : 
    8095              : /* Print the partition graph and layout information to the dump file.  */
    8096              : 
    8097              : void
    8098          674 : vect_optimize_slp_pass::dump ()
    8099              : {
    8100          674 :   dump_printf_loc (MSG_NOTE, vect_location,
    8101              :                    "SLP optimize permutations:\n");
    8102         1361 :   for (unsigned int layout_i = 1; layout_i < m_perms.length (); ++layout_i)
    8103              :     {
    8104          687 :       dump_printf_loc (MSG_NOTE, vect_location, "  %d: { ", layout_i);
    8105          687 :       const char *sep = "";
    8106         5866 :       for (unsigned int idx : m_perms[layout_i])
    8107              :         {
    8108         3805 :           dump_printf (MSG_NOTE, "%s%d", sep, idx);
    8109         3805 :           sep = ", ";
    8110              :         }
    8111          687 :       dump_printf (MSG_NOTE, " }\n");
    8112              :     }
    8113          674 :   dump_printf_loc (MSG_NOTE, vect_location,
    8114              :                    "SLP optimize partitions:\n");
    8115         5612 :   for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
    8116              :        ++partition_i)
    8117              :     {
    8118         4938 :       auto &partition = m_partitions[partition_i];
    8119         4938 :       dump_printf_loc (MSG_NOTE, vect_location,  "  -------------\n");
    8120         4938 :       dump_printf_loc (MSG_NOTE, vect_location,
    8121              :                        "  partition %d (layout %d):\n",
    8122              :                        partition_i, partition.layout);
    8123         4938 :       dump_printf_loc (MSG_NOTE, vect_location, "    nodes:\n");
    8124        10111 :       for (unsigned int order_i = partition.node_begin;
    8125        10111 :            order_i < partition.node_end; ++order_i)
    8126              :         {
    8127         5173 :           auto &vertex = m_vertices[m_partitioned_nodes[order_i]];
    8128        10346 :           dump_printf_loc (MSG_NOTE, vect_location, "      - %p:\n",
    8129         5173 :                            (void *) vertex.node);
    8130         5173 :           dump_printf_loc (MSG_NOTE, vect_location,
    8131              :                            "          weight: %f\n",
    8132              :                            vertex.weight.to_double ());
    8133         5173 :           if (vertex.out_degree)
    8134         4050 :             dump_printf_loc (MSG_NOTE, vect_location,
    8135              :                              "          out weight: %f (degree %d)\n",
    8136              :                              vertex.out_weight.to_double (),
    8137              :                              vertex.out_degree);
    8138         5173 :           if (SLP_TREE_PERMUTE_P (vertex.node))
    8139          506 :             dump_printf_loc (MSG_NOTE, vect_location,
    8140              :                              "          op: VEC_PERM_EXPR\n");
    8141         4667 :           else if (auto rep = SLP_TREE_REPRESENTATIVE (vertex.node))
    8142         4649 :             dump_printf_loc (MSG_NOTE, vect_location,
    8143              :                              "          op template: %G", rep->stmt);
    8144              :         }
    8145         4938 :       dump_printf_loc (MSG_NOTE, vect_location, "    edges:\n");
    8146        10111 :       for (unsigned int order_i = partition.node_begin;
    8147        10111 :            order_i < partition.node_end; ++order_i)
    8148              :         {
    8149         5173 :           unsigned int node_i = m_partitioned_nodes[order_i];
    8150         5173 :           auto &vertex = m_vertices[node_i];
    8151        15617 :           auto print_edge = [&](graph_edge *, unsigned int other_node_i)
    8152              :             {
    8153        10444 :               auto &other_vertex = m_vertices[other_node_i];
    8154        10444 :               if (other_vertex.partition < vertex.partition)
    8155         5222 :                 dump_printf_loc (MSG_NOTE, vect_location,
    8156              :                                  "      - %p [%d] --> %p\n",
    8157         5222 :                                  (void *) other_vertex.node,
    8158              :                                  other_vertex.partition,
    8159         5222 :                                  (void *) vertex.node);
    8160              :               else
    8161         5222 :                 dump_printf_loc (MSG_NOTE, vect_location,
    8162              :                                  "      - %p --> [%d] %p\n",
    8163         5222 :                                  (void *) vertex.node,
    8164              :                                  other_vertex.partition,
    8165         5222 :                                  (void *) other_vertex.node);
    8166        15617 :             };
    8167         5173 :           for_each_partition_edge (node_i, print_edge);
    8168              :         }
    8169              : 
    8170        15013 :       for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
    8171              :         {
    8172        10075 :           auto &layout_costs = partition_layout_costs (partition_i, layout_i);
    8173        10075 :           if (layout_costs.is_possible ())
    8174              :             {
    8175         8301 :               dump_printf_loc (MSG_NOTE, vect_location,
    8176              :                                "    layout %d:%s\n", layout_i,
    8177         8301 :                                partition.layout == int (layout_i)
    8178              :                                ? " (*)" : "");
    8179         8301 :               slpg_layout_cost combined_cost = layout_costs.in_cost;
    8180         8301 :               combined_cost.add_serial_cost (layout_costs.internal_cost);
    8181         8301 :               combined_cost.add_serial_cost (layout_costs.out_cost);
    8182              : #define TEMPLATE "{depth: %f, total: %f}"
    8183         8301 :               dump_printf_loc (MSG_NOTE, vect_location,
    8184              :                                "        " TEMPLATE "\n",
    8185              :                                layout_costs.in_cost.depth.to_double (),
    8186              :                                layout_costs.in_cost.total.to_double ());
    8187         8301 :               dump_printf_loc (MSG_NOTE, vect_location,
    8188              :                                "      + " TEMPLATE "\n",
    8189              :                                layout_costs.internal_cost.depth.to_double (),
    8190              :                                layout_costs.internal_cost.total.to_double ());
    8191         8301 :               dump_printf_loc (MSG_NOTE, vect_location,
    8192              :                                "      + " TEMPLATE "\n",
    8193              :                                layout_costs.out_cost.depth.to_double (),
    8194              :                                layout_costs.out_cost.total.to_double ());
    8195         8301 :               dump_printf_loc (MSG_NOTE, vect_location,
    8196              :                                "      = " TEMPLATE "\n",
    8197              :                                combined_cost.depth.to_double (),
    8198              :                                combined_cost.total.to_double ());
    8199              : #undef TEMPLATE
    8200              :             }
    8201              :           else
    8202         1774 :             dump_printf_loc (MSG_NOTE, vect_location,
    8203              :                              "    layout %d: rejected\n", layout_i);
    8204              :         }
    8205              :     }
    8206          674 : }
    8207              : 
    8208              : /* Masked load lanes discovery.  Scan all vertices for internal,
                      :    non-permute nodes whose representative is a grouped IFN_MASK_LOAD
                      :    call.  Such a node is marked ldst_lanes when
                      :    vect_load_lanes_supported does not return IFN_LAST for its group
                      :    size, its mask child is a single-child permute whose lane
                      :    permutation only contains {0, 0} entries and every predecessor in
                      :    the SLP graph is a single-lane permute.  In that case the
                      :    predecessors are marked ldst_lanes as well and the mask permute is
                      :    replaced by its only child.  */
    8209              : 
    8210              : void
    8211       676294 : vect_optimize_slp_pass::decide_masked_load_lanes ()
    8212              : {
    8213      6971416 :   for (auto v : m_vertices)
    8214              :     {
    8215      4942534 :       slp_tree node = v.node;
    8216      4942534 :       if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
    8217      3471447 :           || SLP_TREE_PERMUTE_P (node))
    8218      1608062 :         continue;
    8219      3334472 :       stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
    8220      1629831 :       if (! STMT_VINFO_GROUPED_ACCESS (stmt_info)
    8221              :           /* The mask has to be uniform.  */
    8222       967517 :           || STMT_VINFO_SLP_VECT_ONLY (stmt_info)
    8223       967386 :           || ! is_a <gcall *> (STMT_VINFO_STMT (stmt_info))
    8224      3334557 :           || ! gimple_call_internal_p (STMT_VINFO_STMT (stmt_info),
    8225              :                                        IFN_MASK_LOAD))
    8226      3334439 :         continue;
    8227           33 :       stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
    8228           66 :       if (STMT_VINFO_STRIDED_P (stmt_info)
    8229           33 :           || compare_step_with_zero (m_vinfo, stmt_info) <= 0
    8230           63 :           || vect_load_lanes_supported (SLP_TREE_VECTYPE (node),
    8231           30 :                                         DR_GROUP_SIZE (stmt_info),
    8232              :                                         true) == IFN_LAST)
    8233           33 :         continue;
    8234              : 
    8235              :       /* Uniform masks need to be suitably represented, that is by a
                      :          permute node with exactly one child whose lane permutation
                      :          entries are all {0, 0}.  */
    8236            0 :       slp_tree mask = SLP_TREE_CHILDREN (node)[0];
    8237            0 :       if (!SLP_TREE_PERMUTE_P (mask)
    8238            0 :           || SLP_TREE_CHILDREN (mask).length () != 1)
    8239            0 :         continue;
    8240            0 :       bool match = true;
    8241            0 :       for (auto perm : SLP_TREE_LANE_PERMUTATION (mask))
    8242            0 :         if (perm.first != 0 || perm.second != 0)
    8243              :           {
    8244              :             match = false;
    8245              :             break;
    8246              :           }
    8247            0 :       if (!match)
    8248            0 :         continue;
    8249              : 
    8250              :       /* Now see if the consumer side matches.  The consumers are the
                      :          sources of the predecessor edges of NODE's graph vertex.  */
    8251            0 :       for (graph_edge *pred = m_slpg->vertices[node->vertex].pred;
    8252            0 :            pred; pred = pred->pred_next)
    8253              :         {
    8254            0 :           slp_tree pred_node = m_vertices[pred->src].node;
    8255              :           /* All consumers should be a permute with a single outgoing lane.  */
    8256            0 :           if (!SLP_TREE_PERMUTE_P (pred_node)
    8257            0 :               || SLP_TREE_LANES (pred_node) != 1)
    8258              :             {
    8259              :               match = false;
    8260              :               break;
    8261              :             }
    8262            0 :           gcc_assert (SLP_TREE_CHILDREN (pred_node).length () == 1);
    8263              :         }
    8264            0 :       if (!match)
    8265            0 :         continue;
    8266              :       /* Now we can mark the nodes as to use load lanes.  */
    8267            0 :       node->ldst_lanes = true;
    8268            0 :       for (graph_edge *pred = m_slpg->vertices[node->vertex].pred;
    8269            0 :            pred; pred = pred->pred_next)
    8270            0 :         m_vertices[pred->src].node->ldst_lanes = true;
    8271              :       /* The catch is we have to massage the mask.  We have arranged
    8272              :          analyzed uniform masks to be represented by a splat VEC_PERM
    8273              :          which we can now simply elide as we cannot easily re-do SLP
    8274              :          discovery here.  The new mask gains a reference before the
                      :          old permute node is handed to vect_free_slp_tree.  */
    8275            0 :       slp_tree new_mask = SLP_TREE_CHILDREN (mask)[0];
    8276            0 :       SLP_TREE_REF_COUNT (new_mask)++;
    8277            0 :       SLP_TREE_CHILDREN (node)[0] = new_mask;
    8278            0 :       vect_free_slp_tree (mask);
    8279              :     }
    8280       676294 : }
    8281              : 
    8282              : /* Perform legitimizing attempts.  This is intended to improve the
    8283              :    situation when layout 0 is not valid which is a situation the cost
    8284              :    based propagation does not handle well.
    8285              :    Return true if further layout optimization is possible, false if
    8286              :    the layout configuration should be considered final.  When false
                      :    is returned the layout of every partition has been set to the
                      :    single common layout that was found.  */
    8287              : 
    8288              : bool
    8289        10609 : vect_optimize_slp_pass::legitimize ()
    8290              : {
    8291              :   /* Perform a very simple legitimizing attempt by attempting to choose
    8292              :      a single layout for all partitions that will make all permutations
    8293              :      a noop.  That should also be the optimal layout choice in case
    8294              :      layout zero is legitimate.
    8295              :      ???  Disconnected components of the SLP graph could have distinct
    8296              :      single layouts.
                      :      SINGLE_LAYOUT_I stays -1 while only partitions with layout -1
                      :      have been seen; those partitions are not checked against the
                      :      candidate in the first loop.  DEFERRED_UP_TO is the index of the
                      :      partition that supplied the candidate, so partitions below it
                      :      are checked for compatibility afterwards.  */
    8297        10609 :   int single_layout_i = -1;
    8298        10609 :   unsigned deferred_up_to = -1U;
    8299        31402 :   for (unsigned partition_i = 0; partition_i < m_partitions.length ();
    8300              :        ++partition_i)
    8301              :     {
    8302        26441 :       auto &partition = m_partitions[partition_i];
    8303        26441 :       if (single_layout_i == -1)
    8304              :         {
    8305        13851 :           single_layout_i = partition.layout;
    8306        13851 :           deferred_up_to = partition_i;
    8307              :         }
    8308        12590 :       else if (partition.layout == single_layout_i || partition.layout == -1)
    8309              :         ;
    8310              :       else
    8311              :         single_layout_i = 0;
    8312        23181 :       if (single_layout_i == 0)
    8313              :         return true;
    8314              : 
    8315        20853 :       if (single_layout_i != -1
    8316        20853 :           && !is_compatible_layout (partition, single_layout_i))
    8317              :         return true;
    8318              :     }
    8319              : 
    8320         4961 :   if (single_layout_i <= 0)
    8321              :     return true;
    8322              : 
    8323         5077 :   for (unsigned partition_i = 0; partition_i < deferred_up_to; ++partition_i)
    8324          122 :     if (!is_compatible_layout (m_partitions[partition_i],
    8325              :                                single_layout_i))
    8326              :       return true;
    8327              : 
    8328        12549 :   for (unsigned partition_i = 0; partition_i < m_partitions.length ();
    8329              :        ++partition_i)
    8330              :     {
    8331         7594 :       auto &partition = m_partitions[partition_i];
    8332         7594 :       partition.layout = single_layout_i;
    8333              :     }
    8334              : 
    8335              :   return false;
    8336              : }
    8337              : 
    8338              : /* Main entry point for the SLP graph optimization pass.  Builds the
                      :    graph and the partitions and starts choosing layouts.  When more
                      :    than one entry is present in m_perms, legitimize is tried first
                      :    and the forward and backward passes only run if it returns true;
                      :    the result is then dumped (when dumping is enabled) and
                      :    materialized, and all entries of m_perms are released.  Otherwise
                      :    only redundant permutations are removed.  Finally the graph is
                      :    freed and rebuilt for decide_masked_load_lanes, then freed
                      :    again.  */
    8339              : 
    8340              : void
    8341       676294 : vect_optimize_slp_pass::run ()
    8342              : {
    8343       676294 :   build_graph ();
    8344       676294 :   create_partitions ();
    8345       676294 :   start_choosing_layouts ();
    8346       676294 :   if (m_perms.length () > 1)
    8347              :     {
    8348        10609 :       if (legitimize ())
    8349              :         {
    8350         5654 :           forward_pass ();
    8351         5654 :           backward_pass ();
    8352              :         }
    8353        10609 :       if (dump_enabled_p ())
    8354          674 :         dump ();
    8355        10609 :       materialize ();
    8356        42853 :       while (!m_perms.is_empty ())
    8357        21635 :         m_perms.pop ().release ();
    8358              :     }
    8359              :   else
    8360       665685 :     remove_redundant_permutations ();
    8361       676294 :   free_graph (m_slpg);
    8362       676294 :   build_graph ();
    8363       676294 :   decide_masked_load_lanes ();
    8364       676294 :   free_graph (m_slpg);
    8365       676294 : }
    8366              : 
    8367              : /* Apply CSE to NODE and its children using BST_MAP.  NODE is passed
                      :    by reference: when BST_MAP already records a different node for the
                      :    same scalar stmts, NODE is handed to vect_free_slp_tree, the
                      :    recorded leader gets an additional reference and NODE is redirected
                      :    to that leader.  Only internal nodes with a non-empty scalar stmts
                      :    vector take part in the map; children of all other nodes are still
                      :    visited.  */
    8368              : 
    8369              : static void
    8370      5342030 : vect_cse_slp_nodes (scalar_stmts_to_slp_tree_map_t *bst_map, slp_tree& node)
    8371              : {
    8372      5342030 :   bool put_p = false;
    8373      5342030 :   if (SLP_TREE_DEF_TYPE (node) == vect_internal_def
    8374              :       /* Besides some VEC_PERM_EXPR, two-operator nodes also
    8375              :          lack scalar stmts and thus CSE doesn't work via bst_map.  Ideally
    8376              :          we'd have sth that works for all internal and external nodes.  */
    8377      5342030 :       && !SLP_TREE_SCALAR_STMTS (node).is_empty ())
    8378              :     {
    8379      3845041 :       slp_tree *leader = bst_map->get (SLP_TREE_SCALAR_STMTS (node));
    8380      3845041 :       if (leader)
    8381              :         {
    8382              :           /* We've visited this node already.  A null entry is the
                      :              placeholder of a node whose children are still being
                      :              processed, see below.  */
    8383       401865 :           if (!*leader || *leader == node)
    8384              :             return;
    8385              : 
    8386         2776 :           if (dump_enabled_p ())
    8387          907 :             dump_printf_loc (MSG_NOTE, vect_location,
    8388              :                              "re-using SLP tree %p for %p\n",
    8389              :                              (void *)*leader, (void *)node);
    8390         2776 :           vect_free_slp_tree (node);
    8391         2776 :           (*leader)->refcnt += 1;
    8392         2776 :           node = *leader;
    8393         2776 :           return;
    8394              :         }
    8395              : 
    8396              :       /* Avoid creating a cycle by populating the map only after recursion.
                      :          Until then the entry, keyed by a copy of the scalar stmts,
                      :          holds nullptr.  NOTE(review): the refcnt increment presumably
                      :          accounts for the reference held by the map - confirm against
                      :          release_scalar_stmts_to_slp_tree_map.  */
    8397      3443176 :       bst_map->put (SLP_TREE_SCALAR_STMTS (node).copy (), nullptr);
    8398      3443176 :       node->refcnt += 1;
    8399      3443176 :       put_p = true;
    8400              :       /* And recurse.  */
    8401              :     }
    8402              : 
    8403     14774473 :   for (slp_tree &child : SLP_TREE_CHILDREN (node))
    8404      4316532 :     if (child)
    8405      3888379 :       vect_cse_slp_nodes (bst_map, child);
    8406              : 
    8407              :   /* Now record the node for CSE in other siblings.  */
    8408      4940165 :   if (put_p)
    8409      3443176 :     *bst_map->get (SLP_TREE_SCALAR_STMTS (node)) = node;
    8410              : }
    8411              : 
    8412              : /* Optimize the SLP graph of VINFO.  Nothing is done when VINFO has no
                      :    SLP instances.  Otherwise vect_optimize_slp_pass is run and CSE is
                      :    re-applied to the tree of every instance, using a freshly allocated
                      :    map that is released before returning.  */
    8413              : 
    8414              : void
    8415      1020077 : vect_optimize_slp (vec_info *vinfo)
    8416              : {
    8417      1020077 :   if (vinfo->slp_instances.is_empty ())
    8418              :     return;
    8419       676294 :   vect_optimize_slp_pass (vinfo).run ();
    8420              : 
    8421              :   /* Apply CSE again to nodes after permute optimization.  */
    8422       676294 :   scalar_stmts_to_slp_tree_map_t *bst_map
    8423       676294 :     = new scalar_stmts_to_slp_tree_map_t ();
    8424              : 
    8425      3482533 :   for (auto inst : vinfo->slp_instances)
    8426      1453651 :     vect_cse_slp_nodes (bst_map, SLP_INSTANCE_TREE (inst));
    8427              : 
    8428       676294 :   release_scalar_stmts_to_slp_tree_map (bst_map);
    8429              : }
    8430              : 
    8431              : /* Gather loads reachable from the individual SLP graph entries.  For
                      :    each SLP instance of VINFO the three-argument overload of
                      :    vect_gather_slp_loads is called on the instance tree with the
                      :    instance's load vector and a visited set that is fresh for every
                      :    instance.  */
    8432              : 
    8433              : void
    8434      1020077 : vect_gather_slp_loads (vec_info *vinfo)
    8435              : {
    8436      1020077 :   unsigned i;
    8437      1020077 :   slp_instance instance;
    8438      2473728 :   FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
    8439              :     {
    8440      1453651 :       hash_set<slp_tree> visited;
    8441      1453651 :       vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
    8442              :                              SLP_INSTANCE_TREE (instance), visited);
    8443      1453651 :     }
    8444      1020077 : }
    8445              : 
    8446              : /* For NODE update VF based on the number of lanes and the vector types
    8447              :    used.  VF is updated in place to the common multiple of its previous
                      :    value and the unrolling factor computed for NODE and, recursively,
                      :    for its children.  Null nodes and nodes that are not internal defs
                      :    are ignored; VISITED makes sure each node is processed only
                      :    once.  */
    8448              : 
    8449              : static void
    8450      4206179 : vect_update_slp_vf_for_node (slp_tree node, poly_uint64 &vf,
    8451              :                              hash_set<slp_tree> &visited)
    8452              : {
    8453      4206179 :   if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    8454      1511965 :     return;
    8455      3057011 :   if (visited.add (node))
    8456              :     return;
    8457              : 
    8458     10220707 :   for (slp_tree child : SLP_TREE_CHILDREN (node))
    8459      3451359 :     vect_update_slp_vf_for_node (child, vf, visited);
    8460              : 
    8461              :   /* We do not visit SLP nodes for constants or externals - those neither
    8462              :      have a vector type set yet (vectorizable_* does this) nor do they
    8463              :      have max_nunits set.  Instead we rely on internal nodes' max_nunits
    8464              :      to cover constant/external operands.
    8465              :      Note that when we stop using fixed size vectors externs and constants
    8466              :      shouldn't influence the (minimum) vectorization factor, instead
    8467              :      vectorizable_* should honor the vectorization factor when trying to
    8468              :      assign vector types to constants and externals and cause iteration
    8469              :      to a higher vectorization factor when required.  */
    8470      2694214 :   poly_uint64 node_vf
    8471      2694214 :     = calculate_unrolling_factor (node->max_nunits, SLP_TREE_LANES (node));
    8472      2694214 :   vf = force_common_multiple (vf, node_vf);
    8473              : 
    8474              :   /* For permute nodes that are fed from externs or constants we have to
    8475              :      consider their number of lanes as well.  Likewise for store-lanes.
                      :      The factor is computed from NODE's max_nunits and the lane count
                      :      of the non-internal child.  */
    8476      2694214 :   if (SLP_TREE_PERMUTE_P (node) || node->ldst_lanes)
    8477       705503 :     for (slp_tree child : SLP_TREE_CHILDREN (node))
    8478       189602 :       if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
    8479              :         {
    8480         3449 :           poly_uint64 child_vf
    8481         3449 :             = calculate_unrolling_factor (node->max_nunits,
    8482              :                                           SLP_TREE_LANES (child));
    8483         3449 :           vf = force_common_multiple (vf, child_vf);
    8484              :         }
    8485              : }
    8486              : 
    8487              : /* For each possible SLP instance decide whether to SLP it and calculate overall
    8488              :    unrolling factor needed to SLP the loop.  Return TRUE if decided to SLP at
    8489              :    least one instance.  The unrolling factor is accumulated over all
                      :    instances and stored in LOOP_VINFO_VECT_FACTOR of LOOP_VINFO
                      :    regardless of the return value.  An instance only counts as
                      :    decided when its root has a vector type with more than one
                      :    subpart.  */
    8490              : 
    8491              : bool
    8492       470799 : vect_make_slp_decision (loop_vec_info loop_vinfo)
    8493              : {
    8494       470799 :   unsigned int i;
    8495       470799 :   poly_uint64 unrolling_factor = 1;
    8496       470799 :   const vec<slp_instance> &slp_instances
    8497              :     = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
    8498       470799 :   slp_instance instance;
    8499       470799 :   int decided_to_slp = 0;
    8500              : 
    8501       470799 :   DUMP_VECT_SCOPE ("vect_make_slp_decision");
    8502              : 
    8503       470799 :   hash_set<slp_tree> visited;
    8504      1225619 :   FOR_EACH_VEC_ELT (slp_instances, i, instance)
    8505              :     {
    8506       754820 :       slp_tree root = SLP_INSTANCE_TREE (instance);
    8507              : 
    8508              :       /* All unroll factors have the form:
    8509              : 
    8510              :            GET_MODE_SIZE (vinfo->vector_mode) * X
    8511              : 
    8512              :          for some rational X, so they must have a common multiple.  */
    8513       754820 :       vect_update_slp_vf_for_node (root, unrolling_factor, visited);
    8514              : 
    8515              :       /* If all instances ended up with vector(1) T roots make sure to
    8516              :          not vectorize.  RVV for example relies on loop vectorization
    8517              :          when some instances are essentially kept scalar.  See PR121048.  */
    8518       754820 :       if (SLP_TREE_VECTYPE (root)
    8519       754820 :           && known_gt (TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (root)), 1U))
    8520       617794 :         decided_to_slp++;
    8521              :     }
    8522              : 
    8523       470799 :   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = unrolling_factor;
    8524              : 
    8525       470799 :   if (decided_to_slp && dump_enabled_p ())
    8526              :     {
    8527        19042 :       dump_printf_loc (MSG_NOTE, vect_location,
    8528              :                        "Decided to SLP %d instances. Unrolling factor ",
    8529              :                        decided_to_slp);
    8530        19042 :       dump_dec (MSG_NOTE, unrolling_factor);
    8531        19042 :       dump_printf (MSG_NOTE, "\n");
    8532              :     }
    8533              : 
    8534       470799 :   return (decided_to_slp > 0);
    8535       470799 : }
    8536              : 
    8537              : /* Initialize a bb_vec_info struct for the statements in BBS basic blocks.
                      :    The UID of every PHI outside the first block and of every statement
                      :    in the region is set to 0.  Those PHIs and all non-debug statements
                      :    are registered with add_stmt; debug statements get their UID set
                      :    but are not added.  */
    8538              : 
    8539      2190696 : _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
    8540              :   : vec_info (vec_info::bb, shared),
    8541      2190696 :     roots (vNULL)
    8542              : {
    8543              :   /* The region we are operating on.  bbs[0] is the entry, excluding
    8544              :      its PHI nodes.  In the future we might want to track an explicit
    8545              :      entry edge to cover bbs[0] PHI nodes and have a region entry
    8546              :      insert location.
                      :      NOTE(review): only the address of _BBS's storage is recorded,
                      :      not a copy - presumably the caller keeps that vector alive for
                      :      the lifetime of this object; confirm.  */
    8547      2190696 :   bbs = _bbs.address ();
    8548      2190696 :   nbbs = _bbs.length ();
    8549              : 
    8550     17515250 :   for (unsigned i = 0; i < nbbs; ++i)
    8551              :     {
    8552     15324554 :       if (i != 0)
    8553     19924579 :         for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
    8554      6790721 :              gsi_next (&si))
    8555              :           {
    8556      6790721 :             gphi *phi = si.phi ();
    8557      6790721 :             gimple_set_uid (phi, 0);
    8558      6790721 :             add_stmt (phi);
    8559              :           }
    8560     30649108 :       for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
    8561    134374416 :            !gsi_end_p (gsi); gsi_next (&gsi))
    8562              :         {
    8563    119049862 :           gimple *stmt = gsi_stmt (gsi);
    8564    119049862 :           gimple_set_uid (stmt, 0);
    8565    119049862 :           if (is_gimple_debug (stmt))
    8566     74188716 :             continue;
    8567     44861146 :           add_stmt (stmt);
    8568              :         }
    8569              :     }
    8570      2190696 : }
    8571              : 
    8572              : 
    8573              : /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
    8574              :    stmts in the basic block.  This destructor itself resets the UID of
                      :    every PHI outside the first block and of every statement in the
                      :    region to -1, then releases the stmts, roots and remain vectors of
                      :    each entry of ROOTS and finally ROOTS itself.  */
    8575              : 
    8576      2190696 : _bb_vec_info::~_bb_vec_info ()
    8577              : {
    8578              :   /* Reset region marker.  */
    8579     17515250 :   for (unsigned i = 0; i < nbbs; ++i)
    8580              :     {
    8581     15324554 :       if (i != 0)
    8582     19940358 :         for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
    8583      6806500 :              gsi_next (&si))
    8584              :           {
    8585      6806500 :             gphi *phi = si.phi ();
    8586      6806500 :             gimple_set_uid (phi, -1);
    8587              :           }
    8588     30649108 :       for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
    8589    134315052 :            !gsi_end_p (gsi); gsi_next (&gsi))
    8590              :         {
    8591    118990498 :           gimple *stmt = gsi_stmt (gsi);
    8592    118990498 :           gimple_set_uid (stmt, -1);
    8593              :         }
    8594              :     }
    8595              : 
    8596      3409927 :   for (unsigned i = 0; i < roots.length (); ++i)
    8597              :     {
    8598      1219231 :       roots[i].stmts.release ();
    8599      1219231 :       roots[i].roots.release ();
    8600      1219231 :       roots[i].remain.release ();
    8601              :     }
    8602      2190696 :   roots.release ();
    8603      2190696 : }
    8604              : 
    8605              : /* Subroutine of vect_slp_analyze_node_operations.  Handle the root of NODE,
    8606              :    given that child nodes have already been processed, and that
    8607              :    their def types currently match their SLP node's def type.
                      :    Permute nodes are checked with vectorizable_slp_permutation and,
                      :    for each of their live scalar stmts, vectorizable_live_operation;
                      :    on success their SLP_TREE_TYPE is set to permute_info_type.  All
                      :    other nodes are handed to vect_analyze_stmt.  Return false if any
                      :    of these checks fails.  */
    8608              : 
    8609              : static bool
    8610      2802904 : vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
    8611              :                                     slp_instance node_instance,
    8612              :                                     stmt_vector_for_cost *cost_vec)
    8613              : {
    8614              :   /* Handle purely internal nodes.  */
    8615      2802904 :   if (SLP_TREE_PERMUTE_P (node))
    8616              :     {
    8617       122737 :       if (!vectorizable_slp_permutation (vinfo, NULL, node, cost_vec))
    8618              :         return false;
    8619              : 
    8620              :       stmt_vec_info slp_stmt_info;
    8621              :       unsigned int i;
    8622       323753 :       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
    8623              :         {
    8624       202343 :           if (slp_stmt_info
    8625       196802 :               && STMT_VINFO_LIVE_P (slp_stmt_info)
    8626       202343 :               && !vectorizable_live_operation (vinfo, slp_stmt_info, node,
    8627              :                                                node_instance, i,
    8628              :                                                false, cost_vec))
    8629              :             return false;
    8630              :         }
    8631       121410 :       SLP_TREE_TYPE (node) = permute_info_type;
    8632       121410 :       return true;
    8633              :     }
    8634              : 
    8635      2680167 :   return vect_analyze_stmt (vinfo, node, node_instance, cost_vec);
    8636              : }
    8637              : /* Comparison callback for sorting ints in ascending order.  A_ and B_
                      :    point to the two ints to compare; the result is their difference.
                      :    NOTE(review): the subtraction cannot overflow as long as both
                      :    values are non-negative, which presumably holds for the basic
                      :    block indexes this is used on - confirm before reusing it for
                      :    arbitrary ints.  */
    8638              : static int
    8639      1848484 : sort_ints (const void *a_, const void *b_)
    8640              : {
    8641      1848484 :   int a = *(const int *)a_;
    8642      1848484 :   int b = *(const int *)b_;
    8643      1848484 :   return a - b;
    8644              : }
    8645              : 
    8646              : /* Verify if we can externalize a set of internal defs.  Return false
                      :    if any entry of STMTS is null.  Return true if all defs are in the
                      :    same basic block.  For defs from exactly two blocks return whether
                      :    one of the blocks dominates the other.  For more blocks return
                      :    whether each block, in order of increasing block index, is
                      :    dominated by the previous one.  */
    8647              : 
    8648              : static bool
    8649       379303 : vect_slp_can_convert_to_external (const vec<stmt_vec_info> &stmts)
    8650              : {
    8651              :   /* Constant generation uses get_later_stmt which can only handle
    8652              :      defs from the same BB or a set of defs that can be ordered
    8653              :      with a dominance query.  */
    8654       379303 :   basic_block bb = NULL;
    8655       379303 :   bool all_same = true;
    8656       379303 :   auto_vec<int> bbs;
    8657       758606 :   bbs.reserve_exact (stmts.length ());
    8658      2052041 :   for (stmt_vec_info stmt : stmts)
    8659              :     {
    8660       914132 :       if (!stmt)
    8661              :         return false;
    8662       914132 :       else if (!bb)
    8663       379303 :         bb = gimple_bb (stmt->stmt);
    8664       534829 :       else if (gimple_bb (stmt->stmt) != bb)
    8665       172529 :         all_same = false;
    8666       914132 :       bbs.quick_push (gimple_bb (stmt->stmt)->index);
    8667              :     }
    8668       379303 :   if (all_same)
    8669              :     return true;
    8670              : 
    8671              :   /* Produce a vector of unique BB indexes for the defs.  After
                      :      sorting, duplicates are adjacent and are compacted in place; J
                      :      ends up as the number of distinct indexes, which is at least two
                      :      here because not all defs share one block.  */
    8672       129235 :   bbs.qsort (sort_ints);
    8673              :   unsigned i, j;
    8674       314914 :   for (i = 1, j = 1; i < bbs.length (); ++i)
    8675       185679 :     if (bbs[i] != bbs[j-1])
    8676       137991 :       bbs[j++] = bbs[i];
    8677       129235 :   gcc_assert (j >= 2);
    8678       129235 :   bbs.truncate (j);
    8679              : 
    8680       258470 :   if (bbs.length () == 2)
    8681       125729 :     return (dominated_by_p (CDI_DOMINATORS,
    8682       125729 :                             BASIC_BLOCK_FOR_FN (cfun, bbs[0]),
    8683       125729 :                             BASIC_BLOCK_FOR_FN (cfun, bbs[1]))
    8684       244788 :             || dominated_by_p (CDI_DOMINATORS,
    8685       119059 :                                BASIC_BLOCK_FOR_FN (cfun, bbs[1]),
    8686       119059 :                                BASIC_BLOCK_FOR_FN (cfun, bbs[0])));
    8687              : 
    8688              :   /* ???  For more than two BBs we can sort the vector and verify the
    8689              :      result is a total order.  But we can't use vec::qsort with a
    8690              :      compare function using a dominance query since there's no way to
    8691              :      signal failure and any fallback for an unordered pair would
    8692              :      fail qsort_chk later.
    8693              :      For now simply hope that ordering after BB index provides the
    8694              :      best candidate total order.  If required we can implement our
    8695              :      own mergesort or export an entry without checking.  */
    8696       395018 :   for (unsigned i = 1; i < bbs.length (); ++i)
    8697        12238 :     if (!dominated_by_p (CDI_DOMINATORS,
    8698        12238 :                          BASIC_BLOCK_FOR_FN (cfun, bbs[i]),
    8699        12238 :                          BASIC_BLOCK_FOR_FN (cfun, bbs[i-1])))
    8700              :       return false;
    8701              : 
    8702              :   return true;
    8703       379303 : }
    8704              : 
    8705              : /* Try to build NODE from scalars, returning true on success.
    8706              :    NODE_INSTANCE is the SLP instance that contains NODE.  */
    8707              : 
    8708              : static bool
    8709       560330 : vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
    8710              :                               slp_instance node_instance)
    8711              : {
    8712       560330 :   stmt_vec_info stmt_info;
    8713       560330 :   unsigned int i;
    8714              : 
    8715       560330 :   if (!is_a <bb_vec_info> (vinfo)
    8716        70693 :       || node == SLP_INSTANCE_TREE (node_instance)
    8717        22231 :       || !SLP_TREE_SCALAR_STMTS (node).exists ()
    8718        22190 :       || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node))
    8719              :       /* Force the mask use to be built from scalars instead.  */
    8720        20013 :       || VECTOR_BOOLEAN_TYPE_P (SLP_TREE_VECTYPE (node))
    8721       580136 :       || !vect_slp_can_convert_to_external (SLP_TREE_SCALAR_STMTS (node)))
    8722       540524 :     return false;
    8723              : 
    8724        19806 :   if (dump_enabled_p ())
    8725           76 :     dump_printf_loc (MSG_NOTE, vect_location,
    8726              :                      "Building vector operands of %p from scalars instead\n",
    8727              :                      (void *) node);
    8728              : 
    8729              :   /* Don't remove and free the child nodes here, since they could be
    8730              :      referenced by other structures.  The analysis and scheduling phases
    8731              :      (need to) ignore child nodes of anything that isn't vect_internal_def.  */
    8732        19806 :   unsigned int group_size = SLP_TREE_LANES (node);
    8733        19806 :   SLP_TREE_DEF_TYPE (node) = vect_external_def;
    8734              :   /* Invariants get their vector type from the uses.  */
    8735        19806 :   SLP_TREE_VECTYPE (node) = NULL_TREE;
    8736        19806 :   SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
    8737        19806 :   SLP_TREE_LOAD_PERMUTATION (node).release ();
    8738        68868 :   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    8739              :     {
    8740        49062 :       tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
    8741        49062 :       SLP_TREE_SCALAR_OPS (node)[i] = lhs;
    8742              :     }
    8743              :   return true;
    8744              : }
    8745              : 
    8746              : /* Return true if all elements of the slice are the same.  */
    8747              : bool
    8748       479014 : vect_scalar_ops_slice::all_same_p () const
    8749              : {
    8750       526602 :   for (unsigned int i = 1; i < length; ++i)
    8751       444462 :     if (!operand_equal_p (op (0), op (i)))
    8752              :       return false;
    8753              :   return true;
    8754              : }
    8755              : 
    8756              : hashval_t
    8757       404172 : vect_scalar_ops_slice_hash::hash (const value_type &s)
    8758              : {
    8759       404172 :   hashval_t hash = 0;
    8760      1555086 :   for (unsigned i = 0; i < s.length; ++i)
    8761      1150914 :     hash = iterative_hash_expr (s.op (i), hash);
    8762       404172 :   return hash;
    8763              : }
    8764              : 
    8765              : bool
    8766       221773 : vect_scalar_ops_slice_hash::equal (const value_type &s1,
    8767              :                                    const compare_type &s2)
    8768              : {
    8769       221773 :   if (s1.length != s2.length)
    8770              :     return false;
    8771       385983 :   for (unsigned i = 0; i < s1.length; ++i)
    8772       336033 :     if (!operand_equal_p (s1.op (i), s2.op (i)))
    8773              :       return false;
    8774              :   return true;
    8775              : }
    8776              : 
    8777              : /* Compute the prologue cost for invariant or constant operands represented
    8778              :    by NODE.  */
    8779              : 
    8780              : static void
    8781      1104383 : vect_prologue_cost_for_slp (vec_info *vinfo, slp_tree node,
    8782              :                             stmt_vector_for_cost *cost_vec)
    8783              : {
    8784              :   /* There's a special case of an existing vector, that costs nothing.  */
    8785      1104383 :   if (SLP_TREE_SCALAR_OPS (node).length () == 0
    8786      1104383 :       && !SLP_TREE_VEC_DEFS (node).is_empty ())
    8787         1570 :     return;
    8788              :   /* Without looking at the actual initializer a vector of
    8789              :      constants can be implemented as load from the constant pool.
    8790              :      When all elements are the same we can use a splat.  */
    8791      1102813 :   tree vectype = SLP_TREE_VECTYPE (node);
    8792      1102813 :   unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
    8793      1102813 :   unsigned HOST_WIDE_INT const_nunits;
    8794      1102813 :   unsigned nelt_limit;
    8795      1102813 :   unsigned nvectors = vect_get_num_copies (vinfo, node);
    8796      1102813 :   auto ops = &SLP_TREE_SCALAR_OPS (node);
    8797      1102813 :   auto_vec<unsigned int> starts (nvectors);
    8798      1102813 :   if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
    8799      1102813 :       && ! multiple_p (const_nunits, group_size))
    8800              :     {
    8801        63602 :       nelt_limit = const_nunits;
    8802        63602 :       hash_set<vect_scalar_ops_slice_hash> vector_ops;
    8803       264979 :       for (unsigned int i = 0; i < nvectors; ++i)
    8804       201377 :         if (!vector_ops.add ({ ops, i * nelt_limit, nelt_limit }))
    8805       151427 :           starts.quick_push (i * nelt_limit);
    8806        63602 :     }
    8807              :   else
    8808              :     {
    8809              :       /* If either the vector has variable length or the vectors
    8810              :          are composed of repeated whole groups we only need to
    8811              :          cost construction once.  All vectors will be the same.  */
    8812      1039211 :       nelt_limit = group_size;
    8813      1039211 :       starts.quick_push (0);
    8814              :     }
    8815              :   /* ???  We're just tracking whether vectors in a single node are the same.
    8816              :      Ideally we'd do something more global.  */
    8817      1102813 :   bool passed = false;
    8818      4499077 :   for (unsigned int start : starts)
    8819              :     {
    8820      1190638 :       vect_cost_for_stmt kind;
    8821      1190638 :       if (SLP_TREE_DEF_TYPE (node) == vect_constant_def)
    8822              :         kind = vector_load;
    8823       479014 :       else if (vect_scalar_ops_slice { ops, start, nelt_limit }.all_same_p ())
    8824              :         kind = scalar_to_vec;
    8825              :       else
    8826       396874 :         kind = vec_construct;
    8827              :       /* The target cost hook has no idea which part of the SLP node
    8828              :          we are costing so avoid passing it down more than once.  Pass
    8829              :          it to the first vec_construct or scalar_to_vec part since for those
    8830              :          the x86 backend tries to account for GPR to XMM register moves.  */
    8831      1190638 :       record_stmt_cost (cost_vec, 1, kind, nullptr,
    8832      1190638 :                         (kind != vector_load && !passed) ? node : nullptr,
    8833              :                         vectype, 0, vect_prologue);
    8834      1190638 :       if (kind != vector_load)
    8835       479014 :         passed = true;
    8836              :     }
    8837      1102813 : }
    8838              : 
    8839              : /* Analyze statements contained in SLP tree NODE after recursively analyzing
    8840              :    the subtree.  NODE_INSTANCE contains NODE; VINFO contains NODE_INSTANCE.
    8841              : 
    8842              :    Return true if the operations are supported.  */
    8843              : 
    8844              : static bool
    8845      5178027 : vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
    8846              :                                   slp_instance node_instance,
    8847              :                                   hash_set<slp_tree> &visited_set,
    8848              :                                   vec<slp_tree> &visited_vec,
    8849              :                                   stmt_vector_for_cost *cost_vec)
    8850              : {
    8851      5178027 :   int i, j;
    8852      5178027 :   slp_tree child;
    8853              : 
    8854              :   /* Assume we can code-generate all invariants.  */
    8855      5178027 :   if (!node
    8856      4807011 :       || SLP_TREE_DEF_TYPE (node) == vect_constant_def
    8857      4040464 :       || SLP_TREE_DEF_TYPE (node) == vect_external_def)
    8858              :     return true;
    8859              : 
    8860      3489268 :   if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
    8861              :     {
    8862            9 :       if (dump_enabled_p ())
    8863            0 :         dump_printf_loc (MSG_NOTE, vect_location,
    8864              :                          "Failed cyclic SLP reference in %p\n", (void *) node);
    8865            9 :       return false;
    8866              :     }
    8867      3489259 :   gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
    8868              : 
    8869              :   /* If we already analyzed the exact same set of scalar stmts we're done.
    8870              :      We share the generated vector stmts for those.  */
    8871      3489259 :   if (visited_set.add (node))
    8872              :     return true;
    8873      3114118 :   visited_vec.safe_push (node);
    8874              : 
    8875      3114118 :   bool res = true;
    8876      3114118 :   unsigned visited_rec_start = visited_vec.length ();
    8877      3114118 :   unsigned cost_vec_rec_start = cost_vec->length ();
    8878      3114118 :   bool seen_non_constant_child = false;
    8879      6685376 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    8880              :     {
    8881      3882288 :       res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
    8882              :                                               visited_set, visited_vec,
    8883              :                                               cost_vec);
    8884      3882288 :       if (!res)
    8885              :         break;
    8886      3571258 :       if (child && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
    8887      3571258 :         seen_non_constant_child = true;
    8888              :     }
    8889              :   /* We're having difficulties scheduling nodes with just constant
    8890              :      operands and no scalar stmts since we then cannot compute a stmt
    8891              :      insertion place.  */
    8892      3114118 :   if (res
    8893      3114118 :       && !seen_non_constant_child
    8894      3114118 :       && SLP_TREE_SCALAR_STMTS (node).is_empty ())
    8895              :     {
    8896          184 :       if (dump_enabled_p ())
    8897            6 :         dump_printf_loc (MSG_NOTE, vect_location,
    8898              :                          "Cannot vectorize all-constant op node %p\n",
    8899              :                          (void *) node);
    8900              :       res = false;
    8901              :     }
    8902              : 
    8903      3113934 :   if (res)
    8904      2802904 :     res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
    8905              :                                               cost_vec);
    8906              :   /* If analysis failed we have to pop all recursive visited nodes
    8907              :      plus ourselves.  */
    8908      3114118 :   if (!res)
    8909              :     {
    8910      2807062 :       while (visited_vec.length () >= visited_rec_start)
    8911       843201 :         visited_set.remove (visited_vec.pop ());
    8912       560330 :       cost_vec->truncate (cost_vec_rec_start);
    8913              :     }
    8914              : 
    8915              :   /* When the node can be vectorized, cost the invariant nodes it references.
    8916              :      This is not done in DFS order to allow the referring node's
    8917              :      vectorizable_* calls to nail down the invariant node's vector type
    8918              :      and possibly unshare it if it needs a different vector type than
    8919              :      other referrers.  */
    8920      3114118 :   if (res)
    8921      5811370 :     FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
    8922      3257582 :       if (child
    8923      2952223 :           && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
    8924      2952223 :               || SLP_TREE_DEF_TYPE (child) == vect_external_def)
    8925              :           /* Perform usual caching, note code-generation still
    8926              :              code-gens these nodes multiple times but we expect
    8927              :              to CSE them later.  */
    8928      4452153 :           && !visited_set.add (child))
    8929              :         {
    8930      1149755 :           visited_vec.safe_push (child);
    8931              :           /* ???  After auditing more code paths make a "default"
    8932              :              and push the vector type from NODE to all children
    8933              :              if it is not already set.  */
    8934              :           /* Compute the number of vectors to be generated.  */
    8935      1149755 :           tree vector_type = SLP_TREE_VECTYPE (child);
    8936      1149755 :           if (!vector_type)
    8937              :             {
    8938              :               /* Masked loads can have an undefined (default SSA definition)
    8939              :                  else operand.  We do not need to cost it.  */
    8940        45372 :               vec<tree> ops = SLP_TREE_SCALAR_OPS (child);
    8941        46807 :               if (SLP_TREE_TYPE (node) == load_vec_info_type
    8942        46807 :                   && ((ops.length ()
    8943         1435 :                        && TREE_CODE (ops[0]) == SSA_NAME
    8944            0 :                        && SSA_NAME_IS_DEFAULT_DEF (ops[0])
    8945            0 :                        && VAR_P (SSA_NAME_VAR (ops[0])))
    8946         1435 :                       || SLP_TREE_DEF_TYPE (child) == vect_constant_def))
    8947         1435 :                 continue;
    8948              : 
    8949              :               /* For shifts with a scalar argument we don't need
    8950              :                  to cost or code-generate anything.
    8951              :                  ???  Represent this more explicitly.  */
    8952        43937 :               gcc_assert (SLP_TREE_TYPE (node) == shift_vec_info_type
    8953              :                           && j == 1);
    8954        43937 :               continue;
    8955        43937 :             }
    8956              : 
    8957              :           /* And cost them.  */
    8958      1104383 :           vect_prologue_cost_for_slp (vinfo, child, cost_vec);
    8959              :         }
    8960              : 
    8961              :   /* If this node or any of its children can't be vectorized, try pruning
    8962              :      the tree here rather than felling the whole thing.  */
    8963       560330 :   if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
    8964              :     {
    8965              :       /* We'll need to revisit this for invariant costing and number
    8966              :          of vectorized stmt setting.   */
    8967              :       res = true;
    8968              :     }
    8969              : 
    8970              :   return res;
    8971              : }
    8972              : 
    8973              : /* Mark lanes of NODE that are live outside of the basic-block vectorized
    8974              :    region and that can be vectorized using vectorizable_live_operation
    8975              :    with STMT_VINFO_LIVE_P.  Not handled live operations will cause the
    8976              :    scalar code computing it to be retained.  */
    8977              : 
    8978              : static void
    8979       909711 : vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
    8980              :                              slp_instance instance,
    8981              :                              stmt_vector_for_cost *cost_vec,
    8982              :                              hash_set<stmt_vec_info> &svisited,
    8983              :                              hash_set<slp_tree> &visited)
    8984              : {
    8985       909711 :   if (visited.add (node))
    8986        41691 :     return;
    8987              : 
    8988       868020 :   unsigned i;
    8989       868020 :   stmt_vec_info stmt_info;
    8990       868020 :   stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
    8991      3144472 :   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    8992              :     {
    8993      2276452 :       if (!stmt_info || svisited.contains (stmt_info))
    8994        55511 :         continue;
    8995      2251277 :       stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
    8996      2251277 :       if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
    8997        12044 :           && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
    8998              :         /* Only the pattern root stmt computes the original scalar value.  */
    8999         8963 :         continue;
    9000      2242314 :       if (!PURE_SLP_STMT (orig_stmt_info))
    9001              :         /* If the stmt is not part of the vector coverage because it or
    9002              :            uses of it are used by SLP graph leaves as external input, there
    9003              :            is no point in trying to live code-generate from a vector stmt as
    9004              :            the scalar stmt will survive anyway.  */
    9005        21373 :         continue;
    9006      2220941 :       bool mark_visited = true;
    9007      2220941 :       gimple *orig_stmt = orig_stmt_info->stmt;
    9008      2220941 :       ssa_op_iter op_iter;
    9009      2220941 :       def_operand_p def_p;
    9010      4934481 :       FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
    9011              :         {
    9012              :           /* We have to verify whether we can insert the lane extract
    9013              :              before all uses.  The following is a conservative approximation.
    9014              :              We cannot put this into vectorizable_live_operation because
    9015              :              iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
    9016              :              doesn't work.
    9017              :              Note that while the fact that we emit code for loads at the
    9018              :              first load should make this a non-problem, leaves we construct
    9019              :              from scalars are vectorized after the last scalar def.
    9020              :              ???  If we'd actually compute the insert location during
    9021              :              analysis we could use something less conservative than the
    9022              :              last scalar stmt in the node for the dominance check.  */
    9023              :           /* ???  What remains is "live" uses in vector CTORs in the same
    9024              :              SLP graph which is where those uses can end up code-generated
    9025              :              right after their definition instead of close to their original
    9026              :              use.  But that would restrict us to code-generate lane-extracts
    9027              :              from the latest stmt in a node.  So we compensate for this
    9028              :              during code-generation, simply not replacing uses for those
    9029              :              hopefully rare cases.  */
    9030       492599 :           imm_use_iterator use_iter;
    9031       492599 :           gimple *use_stmt;
    9032       492599 :           stmt_vec_info use_stmt_info;
    9033              : 
    9034       492599 :           bool live_p = false;
    9035       492599 :           bool can_insert = true;
    9036      1893164 :           FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
    9037       923637 :             if (!is_gimple_debug (use_stmt)
    9038       923637 :                 && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
    9039       689036 :                     || !PURE_SLP_STMT (use_stmt_info)))
    9040              :               {
    9041       144350 :                 live_p = true;
    9042       144350 :                 if (!vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
    9043              :                   {
    9044        15671 :                     if (dump_enabled_p ())
    9045           46 :                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    9046              :                                        "Cannot determine insertion place for "
    9047              :                                        "lane extract\n");
    9048              :                     can_insert = false;
    9049              :                     break;
    9050              :                   }
    9051       492599 :               }
    9052       492599 :           if (live_p && can_insert)
    9053              :             {
    9054              :               /* Only record a live stmt when we can replace all uses.  We
    9055              :                  record from which SLP tree we vectorize the uses, so we'll
    9056              :                  cost once and can deal with the case that not all SLP nodes
    9057              :                  may be suitable for code-generation of all live uses.
    9058              :                  ???  But we never split up the work between multiple SLP
    9059              :                  nodes.  */
    9060        64415 :               STMT_VINFO_LIVE_P (stmt_info) = true;
    9061        64415 :               if (!vectorizable_live_operation (bb_vinfo, stmt_info, node,
    9062              :                                                 instance, i, false, cost_vec))
    9063              :                 {
    9064            0 :                   STMT_VINFO_LIVE_P (stmt_info) = false;
    9065            0 :                   mark_visited = false;
    9066              :                 }
    9067              :             }
    9068              :         }
    9069      2220941 :       if (mark_visited)
    9070      2220941 :         svisited.add (stmt_info);
    9071              :     }
    9072              : 
    9073              :   slp_tree child;
    9074      2507736 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    9075       877809 :     if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
    9076       232701 :       vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance, cost_vec,
    9077              :                                    svisited, visited);
    9078              : }
    9079              : 
    9080              : /* Traverse all slp instances of BB_VINFO, and mark lanes of every node that
    9081              :    are live outside of the basic-block vectorized region and that can be
    9082              :    vectorized using vectorizable_live_operation with STMT_VINFO_LIVE_P.  */
    9083              : 
    9084              : static void
    9085       234604 : vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo)
    9086              : {
    9087       234604 :   if (bb_vinfo->slp_instances.is_empty ())
    9088            0 :     return;
    9089              : 
    9090       234604 :   hash_set<slp_tree> visited;
    9091       234604 :   hash_set<stmt_vec_info> svisited;
    9092      1380822 :   for (slp_instance instance : bb_vinfo->slp_instances)
    9093              :     {
    9094       677010 :       if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
    9095        28847 :         STMT_VINFO_LIVE_P (SLP_INSTANCE_ROOT_STMTS (instance)[0]) = true;
    9096       677010 :       vect_location = instance->location ();
    9097       677010 :       vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
    9098              :                                    instance, &instance->cost_vec,
    9099              :                                    svisited, visited);
    9100              :     }
    9101       234604 : }
    9102              : 
    9103              : /* Determine whether we can vectorize the reduction epilogue for INSTANCE.  */
    9104              : 
    9105              : static bool
    9106        74335 : vectorizable_bb_reduc_epilogue (slp_instance instance,
    9107              :                                 stmt_vector_for_cost *cost_vec)
    9108              : {
    9109        74335 :   gassign *stmt = as_a <gassign *> (instance->root_stmts[0]->stmt);
    9110        74335 :   enum tree_code reduc_code = gimple_assign_rhs_code (stmt);
    9111        74335 :   if (reduc_code == MINUS_EXPR)
    9112            0 :     reduc_code = PLUS_EXPR;
    9113        74335 :   internal_fn reduc_fn;
    9114        74335 :   tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
    9115        74335 :   if (!vectype
    9116        74323 :       || !reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
    9117        74323 :       || reduc_fn == IFN_LAST
    9118        74323 :       || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH)
    9119       109421 :       || !useless_type_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
    9120        35086 :                                      TREE_TYPE (vectype)))
    9121              :     {
    9122        49569 :       if (dump_enabled_p ())
    9123          277 :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    9124              :                          "not vectorized: basic block reduction epilogue "
    9125              :                          "operation unsupported.\n");
    9126        49569 :       return false;
    9127              :     }
    9128              : 
    9129              :   /* There's no way to cost a horizontal vector reduction via REDUC_FN so
    9130              :      cost log2 vector operations plus shuffles and one extraction.  */
    9131        24766 :   unsigned steps = floor_log2 (vect_nunits_for_cost (vectype));
    9132        24766 :   record_stmt_cost (cost_vec, steps, vector_stmt, instance->root_stmts[0],
    9133              :                     vectype, 0, vect_body);
    9134        24766 :   record_stmt_cost (cost_vec, steps, vec_perm, instance->root_stmts[0],
    9135              :                     vectype, 0, vect_body);
    9136        24766 :   record_stmt_cost (cost_vec, 1, vec_to_scalar, instance->root_stmts[0],
    9137              :                     vectype, 0, vect_body);
    9138              : 
    9139              :   /* Since we replace all stmts of a possibly longer scalar reduction
    9140              :      chain account for the extra scalar stmts for that.  */
    9141        24766 :   if (!instance->remain_defs.is_empty ())
    9142        20102 :     record_stmt_cost (cost_vec, instance->remain_defs.length (), scalar_stmt,
    9143        10051 :                       instance->root_stmts[0], 0, vect_body);
    9144              :   return true;
    9145              : }
    9146              : 
    9147              : /* Prune from ROOTS all stmts that are computed as part of lanes of NODE
    9148              :    and recurse to children.  */
    9149              : 
    9150              : static void
    9151       189392 : vect_slp_prune_covered_roots (slp_tree node, hash_set<stmt_vec_info> &roots,
    9152              :                               hash_set<slp_tree> &visited)
    9153              : {
    9154       189392 :   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
    9155       189392 :       || visited.add (node))
    9156        83380 :     return;
    9157              : 
    9158              :   stmt_vec_info stmt;
    9159              :   unsigned i;
    9160       359663 :   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    9161       253651 :     if (stmt)
    9162       258974 :       roots.remove (vect_orig_stmt (stmt));
    9163              : 
    9164              :   slp_tree child;
    9165       234476 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    9166       128464 :     if (child)
    9167       127074 :       vect_slp_prune_covered_roots (child, roots, visited);
    9168              : }
    9169              : 
    9170              : /* Analyze statements in SLP instances of VINFO.  Return true if the
    9171              :    operations are supported. */
    9172              : 
    9173              : bool
    9174       657820 : vect_slp_analyze_operations (vec_info *vinfo)
    9175              : {
    9176       657820 :   slp_instance instance;
    9177       657820 :   int i;
    9178              : 
    9179       657820 :   DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
    9180              : 
    9181       657820 :   hash_set<slp_tree> visited;
    9182      1715057 :   for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
    9183              :     {
    9184      1295739 :       auto_vec<slp_tree> visited_vec;
    9185      1295739 :       stmt_vector_for_cost cost_vec;
    9186      1295739 :       cost_vec.create (2);
    9187      1295739 :       if (is_a <bb_vec_info> (vinfo))
    9188       776571 :         vect_location = instance->location ();
    9189      1295739 :       if (!vect_slp_analyze_node_operations (vinfo,
    9190              :                                              SLP_INSTANCE_TREE (instance),
    9191              :                                              instance, visited, visited_vec,
    9192              :                                              &cost_vec)
    9193              :           /* CTOR instances require vectorized defs for the SLP tree root.  */
    9194      1066236 :           || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor
    9195         5611 :               && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
    9196              :                   != vect_internal_def
    9197              :                   /* Make sure we vectorized with the expected type.  */
    9198         5611 :                   || !useless_type_conversion_p
    9199         5611 :                         (TREE_TYPE (TREE_TYPE (gimple_assign_rhs1
    9200              :                                               (instance->root_stmts[0]->stmt))),
    9201         5611 :                          TREE_TYPE (SLP_TREE_VECTYPE
    9202              :                                             (SLP_INSTANCE_TREE (instance))))))
    9203              :           /* Check we can vectorize the reduction.  */
    9204      1066221 :           || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc
    9205        74335 :               && !vectorizable_bb_reduc_epilogue (instance, &cost_vec))
    9206              :           /* Check we can vectorize the gcond.  */
    9207      2312391 :           || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_gcond
    9208        60059 :               && !vectorizable_early_exit (as_a <loop_vec_info> (vinfo),
    9209        60059 :                                            SLP_INSTANCE_ROOT_STMTS (instance)[0],
    9210              :                                            NULL,
    9211              :                                            SLP_INSTANCE_TREE (instance),
    9212              :                                            &cost_vec)))
    9213              :         {
    9214       336548 :           cost_vec.release ();
    9215       336548 :           slp_tree node = SLP_INSTANCE_TREE (instance);
    9216       336548 :           stmt_vec_info stmt_info;
    9217       336548 :           if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
    9218       254157 :             stmt_info = SLP_INSTANCE_ROOT_STMTS (instance)[0];
    9219        82391 :           else if (!SLP_TREE_SCALAR_STMTS (node).is_empty ()
    9220        82391 :                    && SLP_TREE_SCALAR_STMTS (node)[0])
    9221              :             stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
    9222              :           else
    9223            0 :             stmt_info = SLP_TREE_REPRESENTATIVE (node);
    9224       336548 :           if (is_a <loop_vec_info> (vinfo))
    9225              :             {
    9226       238502 :               if (dump_enabled_p ())
    9227         6477 :                 dump_printf_loc (MSG_NOTE, vect_location,
    9228              :                                  "unsupported SLP instance starting from: %G",
    9229              :                                  stmt_info->stmt);
    9230       238502 :               return false;
    9231              :             }
    9232        98046 :           if (dump_enabled_p ())
    9233          331 :             dump_printf_loc (MSG_NOTE, vect_location,
    9234              :                              "removing SLP instance operations starting from: %G",
    9235              :                              stmt_info->stmt);
    9236       538594 :           while (!visited_vec.is_empty ())
    9237              :             {
    9238       440548 :               slp_tree node = visited_vec.pop ();
    9239       440548 :               SLP_TREE_TYPE (node) = undef_vec_info_type;
    9240       440548 :               if (node->data)
    9241              :                 {
    9242        12290 :                   delete node->data;
    9243        12290 :                   node->data = nullptr;
    9244              :                 }
    9245       440548 :               visited.remove (node);
    9246              :             }
    9247        98046 :           vect_free_slp_instance (instance);
    9248        98046 :           vinfo->slp_instances.ordered_remove (i);
    9249              :         }
    9250              :       else
    9251              :         {
    9252       959191 :           i++;
    9253       959191 :           if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
    9254              :             {
    9255       280666 :               add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
    9256       280666 :               cost_vec.release ();
    9257              :             }
    9258              :           else
    9259              :             /* For BB vectorization remember the SLP graph entry
    9260              :                cost for later.  */
    9261       678525 :             instance->cost_vec = cost_vec;
    9262              :         }
    9263      1295739 :     }
    9264              : 
    9265              :   /* Now look for SLP instances with a root that are covered by other
    9266              :      instances and remove them.  */
    9267       419318 :   hash_set<stmt_vec_info> roots;
    9268      1729986 :   for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
    9269       924298 :     if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
    9270        32948 :       roots.add (SLP_INSTANCE_ROOT_STMTS (instance)[0]);
    9271       419318 :   if (!roots.is_empty ())
    9272              :     {
    9273        13128 :       visited.empty ();
    9274        75446 :       for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
    9275        62318 :         vect_slp_prune_covered_roots (SLP_INSTANCE_TREE (instance), roots,
    9276              :                                       visited);
    9277        75446 :       for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
    9278        62318 :         if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()
    9279        32948 :             && !roots.contains (SLP_INSTANCE_ROOT_STMTS (instance)[0]))
    9280              :           {
    9281         1515 :             stmt_vec_info root = SLP_INSTANCE_ROOT_STMTS (instance)[0];
    9282         1515 :             if (dump_enabled_p ())
    9283           20 :               dump_printf_loc (MSG_NOTE, vect_location,
    9284              :                                "removing SLP instance operations starting "
    9285              :                                "from: %G", root->stmt);
    9286         1515 :             vect_free_slp_instance (instance);
    9287         1515 :             vinfo->slp_instances.ordered_remove (i);
    9288              :           }
    9289              :         else
    9290        60803 :           ++i;
    9291              :     }
    9292              : 
    9293       838636 :   return !vinfo->slp_instances.is_empty ();
    9294      1077138 : }
    9295              : 
    9296              : /* Get the SLP instance leader from INSTANCE_LEADER thereby transitively
    9297              :    closing the eventual chain.  */
    9298              : 
    9299              : static slp_instance
    9300       742532 : get_ultimate_leader (slp_instance instance,
    9301              :                      hash_map<slp_instance, slp_instance> &instance_leader)
    9302              : {
    9303       742532 :   auto_vec<slp_instance *, 8> chain;
    9304       742532 :   slp_instance *tem;
    9305       820011 :   while (*(tem = instance_leader.get (instance)) != instance)
    9306              :     {
    9307        77479 :       chain.safe_push (tem);
    9308        77479 :       instance = *tem;
    9309              :     }
    9310       820011 :   while (!chain.is_empty ())
    9311        77479 :     *chain.pop () = instance;
    9312       742532 :   return instance;
    9313       742532 : }
    9314              : 
    9315              : namespace {
    9316              : /* Subroutine of vect_bb_partition_graph_r.  Map KEY to INSTANCE in
    9317              :    KEY_TO_INSTANCE, making INSTANCE the leader of any previous mapping
    9318              :    for KEY.  Return true if KEY was already in KEY_TO_INSTANCE.
    9319              : 
    9320              :    INSTANCE_LEADER is as for get_ultimate_leader.  */
    9321              : 
    9322              : template<typename T>
    9323              : bool
    9324      3288225 : vect_map_to_instance (slp_instance instance, T key,
    9325              :                       hash_map<T, slp_instance> &key_to_instance,
    9326              :                       hash_map<slp_instance, slp_instance> &instance_leader)
    9327              : {
    9328              :   bool existed_p;
    9329      3288225 :   slp_instance &key_instance = key_to_instance.get_or_insert (key, &existed_p);
    9330      3288225 :   if (!existed_p)
    9331              :     ;
    9332       174586 :   else if (key_instance != instance)
    9333              :     {
    9334              :       /* If we're running into a previously marked key make us the
    9335              :          leader of the current ultimate leader.  This keeps the
    9336              :          leader chain acyclic and works even when the current instance
    9337              :          connects two previously independent graph parts.  */
    9338        65522 :       slp_instance key_leader
    9339        65522 :         = get_ultimate_leader (key_instance, instance_leader);
    9340        65522 :       if (key_leader != instance)
    9341        19457 :         instance_leader.put (key_leader, instance);
    9342              :     }
    9343      3288225 :   key_instance = instance;
    9344      3288225 :   return existed_p;
    9345              : }
    9346              : }
    9347              : 
    9348              : /* Worker of vect_bb_partition_graph, recurse on NODE.  */
    9349              : 
    9350              : static void
    9351       909711 : vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
    9352              :                            slp_instance instance, slp_tree node,
    9353              :                            hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
    9354              :                            hash_map<slp_tree, slp_instance> &node_to_instance,
    9355              :                            hash_map<slp_instance, slp_instance> &instance_leader)
    9356              : {
    9357       909711 :   stmt_vec_info stmt_info;
    9358       909711 :   unsigned i;
    9359              : 
    9360      3288225 :   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    9361      2378514 :     if (stmt_info)
    9362      2378514 :       vect_map_to_instance (instance, stmt_info, stmt_to_instance,
    9363              :                             instance_leader);
    9364              : 
    9365       909711 :   if (vect_map_to_instance (instance, node, node_to_instance,
    9366              :                             instance_leader))
    9367       909711 :     return;
    9368              : 
    9369              :   slp_tree child;
    9370      1745829 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    9371       877809 :     if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
    9372       232701 :       vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
    9373              :                                  node_to_instance, instance_leader);
    9374              : }
    9375              : 
    9376              : /* Partition the SLP graph into pieces that can be costed independently.  */
    9377              : 
    9378              : static void
    9379       234604 : vect_bb_partition_graph (bb_vec_info bb_vinfo)
    9380              : {
    9381       234604 :   DUMP_VECT_SCOPE ("vect_bb_partition_graph");
    9382              : 
    9383              :   /* First walk the SLP graph assigning each involved scalar stmt a
    9384              :      corresponding SLP graph entry and upon visiting a previously
    9385              :      marked stmt, make the stmts leader the current SLP graph entry.  */
    9386       234604 :   hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
    9387       234604 :   hash_map<slp_tree, slp_instance> node_to_instance;
    9388       234604 :   hash_map<slp_instance, slp_instance> instance_leader;
    9389       234604 :   slp_instance instance;
    9390       911614 :   for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
    9391              :     {
    9392       677010 :       instance_leader.put (instance, instance);
    9393       677010 :       vect_bb_partition_graph_r (bb_vinfo,
    9394              :                                  instance, SLP_INSTANCE_TREE (instance),
    9395              :                                  stmt_to_instance, node_to_instance,
    9396              :                                  instance_leader);
    9397              :     }
    9398              : 
    9399              :   /* Then collect entries to each independent subgraph.  */
    9400      1146218 :   for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
    9401              :     {
    9402       677010 :       slp_instance leader = get_ultimate_leader (instance, instance_leader);
    9403       677010 :       leader->subgraph_entries.safe_push (instance);
    9404       677010 :       if (dump_enabled_p ()
    9405       677010 :           && leader != instance)
    9406           69 :         dump_printf_loc (MSG_NOTE, vect_location,
    9407              :                          "instance %p is leader of %p\n",
    9408              :                          (void *) leader, (void *) instance);
    9409              :     }
    9410       234604 : }
    9411              : 
    9412              : /* Compute the scalar cost of the SLP node NODE and its children
    9413              :    and return it.  Do not account defs that are marked in LIFE and
    9414              :    update LIFE according to uses of NODE.  */
    9415              : 
    9416              : static void
    9417       673512 : vect_bb_slp_scalar_cost (bb_vec_info vinfo,
    9418              :                          vec<stmt_vec_info> &worklist,
    9419              :                          stmt_vector_for_cost *cost_vec,
    9420              :                          hash_set<stmt_vec_info> &visited)
    9421              : {
    9422      3125389 :   while (!worklist.is_empty ())
    9423              :     {
    9424      2451877 :       stmt_vec_info stmt = worklist.pop ();
    9425      2737996 :       if (!PURE_SLP_STMT (stmt))
    9426       301567 :         continue;
    9427              : 
    9428              :       /* When the stmt is live but not actually vectorized we have
    9429              :          to keep the feeding scalar defs.  */
    9430      2168646 :       if (!STMT_VINFO_LIVE_P (vect_stmt_to_vectorize (stmt)))
    9431              :         {
    9432      2102918 :           bool live_p = false;
    9433      2102918 :           ssa_op_iter op_iter;
    9434      2102918 :           def_operand_p def_p;
    9435      4603190 :           FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt->stmt, op_iter, SSA_OP_DEF)
    9436              :             {
    9437       397354 :               imm_use_iterator use_iter;
    9438       397354 :               gimple *use_stmt;
    9439      1435577 :               FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
    9440       640869 :                 if (!is_gimple_debug (use_stmt))
    9441              :                   {
    9442       473186 :                     stmt_vec_info use_stmt_info = vinfo->lookup_stmt (use_stmt);
    9443       473186 :                     if (!use_stmt_info || !PURE_SLP_STMT (use_stmt_info))
    9444              :                       {
    9445        24123 :                         if (dump_enabled_p ())
    9446              :                           {
    9447           36 :                             dump_printf_loc (MSG_NOTE, vect_location,
    9448              :                                              "stmt considered live: %G",
    9449              :                                              stmt->stmt);
    9450           36 :                             dump_printf_loc (MSG_NOTE, vect_location,
    9451              :                                              "because of use in: %G",
    9452              :                                              use_stmt);
    9453              :                           }
    9454              :                         live_p = true;
    9455              :                       }
    9456       397354 :                   }
    9457              :             }
    9458      2102918 :           if (live_p)
    9459        15448 :             continue;
    9460              :         }
    9461              : 
    9462              :       /* The following assert verifies that vect_bb_partition_graph
    9463              :          partitions the SLP graph in a way that each scalar stmt of
    9464              :          the coverage of the SLP graph belongs to exactly one subgraph.
    9465              :          ???  This is currently not guaranteed since the function
    9466              :          works purely on SLP_TREE_SCALAR_STMTS, resulting in the assert
    9467              :          tripping or scalar stmts costed multiple times, making vectorization
    9468              :          more profitable than it really is.  */
    9469              :       /* gcc_checking_assert (!gimple_visited_p (stmt->stmt)); */
    9470              : 
    9471      2150310 :       if (vect_nop_conversion_p (stmt))
    9472              :         ;
    9473              :       /* For single-argument PHIs assume coalescing which means zero
    9474              :          cost for the scalar and the vector PHIs.  This avoids
    9475              :          artificially favoring the vector path (but may pessimize it
    9476              :          in some cases).  */
    9477      2129210 :       else if (is_a <gphi *> (stmt->stmt)
    9478      2129210 :                && gimple_phi_num_args (as_a <gphi *> (stmt->stmt)) == 1)
    9479              :         ;
    9480              :       else
    9481              :         {
    9482      2120495 :           vect_cost_for_stmt kind;
    9483      2120495 :           if (STMT_VINFO_DATA_REF (stmt))
    9484              :             {
    9485      1946884 :               data_reference_p dr = STMT_VINFO_DATA_REF (stmt);
    9486      1946884 :               tree base = get_base_address (DR_REF (dr));
    9487              :               /* When the scalar access is to a non-global not
    9488              :                  address-taken decl that is not BLKmode assume we can
    9489              :                  access it with a single non-load/store instruction.  */
    9490      1946884 :               if (DECL_P (base)
    9491      1500150 :                   && !is_global_var (base)
    9492      1424301 :                   && !TREE_ADDRESSABLE (base)
    9493      2495859 :                   && DECL_MODE (base) != BLKmode)
    9494              :                 kind = scalar_stmt;
    9495      1803660 :               else if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt)))
    9496              :                 kind = scalar_load;
    9497              :               else
    9498      1578508 :                 kind = scalar_store;
    9499              :             }
    9500              :           else
    9501              :             kind = scalar_stmt;
    9502              :           /* Cost each scalar stmt only once.  */
    9503      2120495 :           gimple_set_visited (stmt->stmt, true);
    9504      2120495 :           record_stmt_cost (cost_vec, 1, kind, stmt, NULL_TREE, 0, vect_body);
    9505              :         }
    9506              : 
    9507              :       /* Now walk relevant parts of the SSA use-def graph.  */
    9508      2150310 :       slp_oprnds child_ops (stmt);
    9509      4509517 :       for (unsigned i = 0; i < child_ops.num_slp_children; ++i)
    9510              :         {
    9511      2359207 :           tree op = child_ops.get_op_for_slp_child (stmt, i);
    9512      2359207 :           stmt_vec_info def = vinfo->lookup_def (op);
    9513      2359207 :           if (def && !visited.add (def))
    9514       688266 :             worklist.safe_push (def);
    9515              :         }
    9516              :     }
    9517       673512 : }
    9518              : 
    9519              : 
    9520              : /* Comparator for the loop-index sorted cost vectors.  */
    9521              : 
    9522              : static int
    9523     17328373 : li_cost_vec_cmp (const void *a_, const void *b_)
    9524              : {
    9525     17328373 :   auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
    9526     17328373 :   auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
    9527     17328373 :   if (a->first < b->first)
    9528              :     return -1;
    9529     16588493 :   else if (a->first == b->first)
    9530     15947238 :     return 0;
    9531              :   return 1;
    9532              : }
    9533              : 
    9534              : /* Check if vectorization of the basic block is profitable for the
    9535              :    subgraph denoted by SLP_INSTANCES.  */
    9536              : 
    9537              : static bool
    9538       654192 : vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
    9539              :                                     vec<slp_instance> slp_instances,
    9540              :                                     loop_p orig_loop)
    9541              : {
    9542       654192 :   slp_instance instance;
    9543       654192 :   int i;
    9544       654192 :   unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
    9545       654192 :   unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
    9546              : 
    9547       654192 :   if (dump_enabled_p ())
    9548              :     {
    9549           99 :       dump_printf_loc (MSG_NOTE, vect_location, "Costing subgraph: \n");
    9550           99 :       hash_set<slp_tree> visited;
    9551          399 :       FOR_EACH_VEC_ELT (slp_instances, i, instance)
    9552          102 :         vect_print_slp_graph (MSG_NOTE, vect_location,
    9553              :                               SLP_INSTANCE_TREE (instance), visited);
    9554           99 :     }
    9555              : 
    9556              :   /* Then DFS walk scalar stmts, performing costing and handling
    9557              :      still live scalar stmts via the previously computed vector coverage.  */
    9558       654192 :   stmt_vector_for_cost scalar_costs = vNULL;
    9559       654192 :   stmt_vector_for_cost vector_costs = vNULL;
    9560       654192 :   hash_set<slp_tree> visited;
    9561       654192 :   hash_set<stmt_vec_info> svisited;
    9562      1327704 :   FOR_EACH_VEC_ELT (slp_instances, i, instance)
    9563              :     {
    9564       673512 :       auto_vec<stmt_vec_info> worklist;
    9565       673512 :       if (SLP_INSTANCE_ROOT_STMTS (instance).exists ())
    9566        57138 :         record_stmt_cost (&scalar_costs,
    9567        28569 :                           SLP_INSTANCE_ROOT_STMTS (instance).length (),
    9568              :                           scalar_stmt,
    9569        28569 :                           SLP_INSTANCE_ROOT_STMTS (instance)[0], 0, vect_body);
    9570      3792390 :       for (auto stmt : SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance)))
    9571              :         {
    9572      1771854 :           stmt = vect_orig_stmt (stmt);
    9573      1771854 :           if (!svisited.add (stmt))
    9574      1763611 :             worklist.safe_push (stmt);
    9575              :         }
    9576       673512 :       vect_bb_slp_scalar_cost (bb_vinfo, worklist, &scalar_costs, svisited);
    9577       673512 :       vector_costs.safe_splice (instance->cost_vec);
    9578       673512 :       instance->cost_vec.release ();
    9579       673512 :     }
    9580              : 
    9581       654192 :   if (dump_enabled_p ())
    9582           99 :     dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
    9583              : 
    9584              :   /* When costing non-loop vectorization we need to consider each covered
    9585              :      loop independently and make sure vectorization is profitable.  For
    9586              :      now we assume a loop may be not entered or executed an arbitrary
    9587              :      number of iterations (???  static information can provide more
    9588              :      precise info here) which means we can simply cost each containing
    9589              :      loops stmts separately.  */
    9590              : 
    9591              :   /* First produce cost vectors sorted by loop index.  */
    9592       654192 :   auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
    9593       654192 :     li_scalar_costs (scalar_costs.length ());
    9594       654192 :   auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
    9595       654192 :     li_vector_costs (vector_costs.length ());
    9596       654192 :   stmt_info_for_cost *cost;
    9597      2803256 :   FOR_EACH_VEC_ELT (scalar_costs, i, cost)
    9598              :     {
    9599      2149064 :       unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
    9600      2149064 :       li_scalar_costs.quick_push (std::make_pair (l, cost));
    9601              :     }
    9602              :   /* Use a random used loop as fallback in case the first vector_costs
    9603              :      entry does not have a stmt_info associated with it.  */
    9604       654192 :   unsigned l = li_scalar_costs[0].first;
    9605      2386216 :   FOR_EACH_VEC_ELT (vector_costs, i, cost)
    9606              :     {
    9607              :       /* We inherit from the previous COST, invariants, externals and
    9608              :          extracts immediately follow the cost for the related stmt.  */
    9609      1732024 :       if (cost->stmt_info)
    9610      1017044 :         l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
    9611      1732024 :       li_vector_costs.quick_push (std::make_pair (l, cost));
    9612              :     }
    9613       654192 :   li_scalar_costs.qsort (li_cost_vec_cmp);
    9614       654192 :   li_vector_costs.qsort (li_cost_vec_cmp);
    9615              : 
    9616              :   /* Now cost the portions individually.  */
    9617              :   unsigned vi = 0;
    9618              :   unsigned si = 0;
    9619      1135720 :   bool profitable = true;
    9620      1135720 :   while (si < li_scalar_costs.length ()
    9621      1794504 :          && vi < li_vector_costs.length ())
    9622              :     {
    9623       658772 :       unsigned sl = li_scalar_costs[si].first;
    9624       658772 :       unsigned vl = li_vector_costs[vi].first;
    9625       658772 :       if (sl != vl)
    9626              :         {
    9627         1026 :           if (dump_enabled_p ())
    9628            0 :             dump_printf_loc (MSG_NOTE, vect_location,
    9629              :                              "Scalar %d and vector %d loop part do not "
    9630              :                              "match up, skipping scalar part\n", sl, vl);
    9631              :           /* Skip the scalar part, assuming zero cost on the vector side.  */
    9632         1694 :           do
    9633              :             {
    9634         1694 :               si++;
    9635              :             }
    9636         1694 :           while (si < li_scalar_costs.length ()
    9637         3497 :                  && li_scalar_costs[si].first == sl);
    9638         1026 :           continue;
    9639              :         }
    9640              : 
    9641       657746 :       class vector_costs *scalar_target_cost_data = init_cost (bb_vinfo, true);
    9642      2129981 :       do
    9643              :         {
    9644      2129981 :           add_stmt_cost (scalar_target_cost_data, li_scalar_costs[si].second);
    9645      2129981 :           si++;
    9646              :         }
    9647      2129981 :       while (si < li_scalar_costs.length ()
    9648      4267423 :              && li_scalar_costs[si].first == sl);
    9649       657746 :       scalar_target_cost_data->finish_cost (nullptr);
    9650       657746 :       scalar_cost = scalar_target_cost_data->body_cost ();
    9651              : 
    9652              :       /* Complete the target-specific vector cost calculation.  */
    9653       657746 :       class vector_costs *vect_target_cost_data = init_cost (bb_vinfo, false);
    9654      1704799 :       do
    9655              :         {
    9656      1704799 :           add_stmt_cost (vect_target_cost_data, li_vector_costs[vi].second);
    9657      1704799 :           vi++;
    9658              :         }
    9659      1704799 :       while (vi < li_vector_costs.length ()
    9660      3418199 :              && li_vector_costs[vi].first == vl);
    9661       657746 :       vect_target_cost_data->finish_cost (scalar_target_cost_data);
    9662       657746 :       vec_prologue_cost = vect_target_cost_data->prologue_cost ();
    9663       657746 :       vec_inside_cost = vect_target_cost_data->body_cost ();
    9664       657746 :       vec_epilogue_cost = vect_target_cost_data->epilogue_cost ();
    9665       657746 :       delete scalar_target_cost_data;
    9666       657746 :       delete vect_target_cost_data;
    9667              : 
    9668       657746 :       vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
    9669              : 
    9670       657746 :       if (dump_enabled_p ())
    9671              :         {
    9672           99 :           dump_printf_loc (MSG_NOTE, vect_location,
    9673              :                            "Cost model analysis for part in loop %d:\n", sl);
    9674           99 :           dump_printf (MSG_NOTE, "  Vector cost: %d\n",
    9675              :                        vec_inside_cost + vec_outside_cost);
    9676           99 :           dump_printf (MSG_NOTE, "  Scalar cost: %d\n", scalar_cost);
    9677              :         }
    9678              : 
    9679              :       /* Vectorization is profitable if its cost is more than the cost of scalar
    9680              :          version.  Note that we err on the vector side for equal cost because
    9681              :          the cost estimate is otherwise quite pessimistic (constant uses are
    9682              :          free on the scalar side but cost a load on the vector side for
    9683              :          example).  */
    9684       657746 :       if (vec_outside_cost + vec_inside_cost > scalar_cost)
    9685              :         {
    9686              :           profitable = false;
    9687              :           break;
    9688              :         }
    9689              :     }
    9690      1131127 :   if (profitable && vi < li_vector_costs.length ())
    9691              :     {
    9692         1084 :       if (dump_enabled_p ())
    9693           12 :         dump_printf_loc (MSG_NOTE, vect_location,
    9694              :                          "Excess vector cost for part in loop %d:\n",
    9695            6 :                          li_vector_costs[vi].first);
    9696              :       profitable = false;
    9697              :     }
    9698              : 
    9699              :   /* Unset visited flag.  This is delayed when the subgraph is profitable
    9700              :      and we process the loop for remaining unvectorized if-converted code.  */
    9701       654192 :   if (!orig_loop || !profitable)
    9702      2801837 :     FOR_EACH_VEC_ELT (scalar_costs, i, cost)
    9703      2147748 :       gimple_set_visited  (cost->stmt_info->stmt, false);
    9704              : 
    9705       654192 :   scalar_costs.release ();
    9706       654192 :   vector_costs.release ();
    9707              : 
    9708       654192 :   return profitable;
    9709       654192 : }
    9710              : 
    9711              : /* qsort comparator for lane defs.  */
    9712              : 
    9713              : static int
    9714           40 : vld_cmp (const void *a_, const void *b_)
    9715              : {
    9716           40 :   auto *a = (const std::pair<unsigned, tree> *)a_;
    9717           40 :   auto *b = (const std::pair<unsigned, tree> *)b_;
    9718           40 :   return a->first - b->first;
    9719              : }
    9720              : 
    9721              : /* Return true if USE_STMT is a vector lane insert into VEC (or into any
    9722              :    vector if VEC is NULL_TREE) and set *THIS_LANE to the lane number that is set.  */
    9723              : 
    9724              : static bool
    9725          248 : vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
    9726              : {
    9727          248 :   gassign *use_ass = dyn_cast <gassign *> (use_stmt);
    9728           91 :   if (!use_ass
    9729           91 :       || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
    9730           22 :       || (vec
    9731           22 :           ? gimple_assign_rhs1 (use_ass) != vec
    9732           24 :           : ((vec = gimple_assign_rhs1 (use_ass)), false))
    9733           46 :       || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
    9734           46 :                                      TREE_TYPE (gimple_assign_rhs2 (use_ass)))
    9735           46 :       || !constant_multiple_p
    9736           46 :             (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
    9737           92 :              tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
    9738              :              this_lane))
    9739          202 :     return false;
    9740              :   return true;
    9741              : }
    9742              : 
    9743              : /* Find any vectorizable constructors, vector lane-insert chains and
    9744              :    association chains and record them as roots in the BB_VINFO->roots array.  */
    9745              : 
    9746              : static void
    9747      2190696 : vect_slp_check_for_roots (bb_vec_info bb_vinfo)
    9748              : {
    9749     17515250 :   for (unsigned i = 0; i < bb_vinfo->nbbs; ++i)
    9750     30649108 :     for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
    9751    134374416 :          !gsi_end_p (gsi); gsi_next (&gsi))
    9752              :     {
    9753    119049862 :       gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
    9754              :       /* This can be used to start SLP discovery for BB early breaks
    9755              :          when we get that far.  */
    9756    119049862 :       if (!assign)
    9757    178649264 :         continue;
    9758              : 
    9759     30641742 :       tree rhs = gimple_assign_rhs1 (assign);
    9760     30641742 :       enum tree_code code = gimple_assign_rhs_code (assign);
    9761     30641742 :       use_operand_p use_p;
    9762     30641742 :       gimple *use_stmt;
    9763     30641742 :       if (code == CONSTRUCTOR)
    9764              :         {
    9765      1566051 :           if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
    9766        63519 :               || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
    9767        92549 :                            CONSTRUCTOR_NELTS (rhs))
    9768        42885 :               || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
    9769      1608932 :               || uniform_vector_p (rhs))
    9770      1553229 :             continue;
    9771              : 
    9772              :           unsigned j;
    9773              :           tree val;
    9774        63555 :           FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
    9775        50733 :             if (TREE_CODE (val) != SSA_NAME
    9776        50733 :                 || !bb_vinfo->lookup_def (val))
    9777              :               break;
    9778        31676 :           if (j != CONSTRUCTOR_NELTS (rhs))
    9779         3016 :             continue;
    9780              : 
    9781        12822 :           vec<stmt_vec_info> roots = vNULL;
    9782        12822 :           roots.safe_push (bb_vinfo->lookup_stmt (assign));
    9783        12822 :           vec<stmt_vec_info> stmts;
    9784        12822 :           stmts.create (CONSTRUCTOR_NELTS (rhs));
    9785        71720 :           FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
    9786        46076 :             stmts.quick_push
    9787        46076 :               (vect_stmt_to_vectorize (bb_vinfo->lookup_def (val)));
    9788        12822 :           bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
    9789        12822 :                                                stmts, roots));
    9790              :         }
    9791     29075691 :       else if (code == BIT_INSERT_EXPR
    9792          929 :                && VECTOR_TYPE_P (TREE_TYPE (rhs))
    9793          611 :                && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
    9794          611 :                && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
    9795          608 :                && integer_zerop (gimple_assign_rhs3 (assign))
    9796          341 :                && useless_type_conversion_p
    9797          341 :                     (TREE_TYPE (TREE_TYPE (rhs)),
    9798          341 :                      TREE_TYPE (gimple_assign_rhs2 (assign)))
    9799     29076313 :                && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
    9800              :         {
    9801              :           /* We start to match on insert to lane zero but since the
    9802              :              inserts need not be ordered we'd have to search both
    9803              :              the def and the use chains.  */
    9804          215 :           tree vectype = TREE_TYPE (rhs);
    9805          215 :           unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
    9806          215 :           auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
    9807          215 :           auto_sbitmap lanes (nlanes);
    9808          215 :           bitmap_clear (lanes);
    9809          215 :           bitmap_set_bit (lanes, 0);
    9810          215 :           tree def = gimple_assign_lhs (assign);
    9811          215 :           lane_defs.quick_push
    9812          215 :                       (std::make_pair (0, gimple_assign_rhs2 (assign)));
    9813          215 :           unsigned lanes_found = 1;
    9814              :           /* Start with the use chains, the last stmt will be the root.  */
    9815          215 :           stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
    9816          215 :           vec<stmt_vec_info> roots = vNULL;
    9817          215 :           roots.safe_push (last);
    9818          217 :           do
    9819              :             {
    9820          217 :               use_operand_p use_p;
    9821          217 :               gimple *use_stmt;
    9822          217 :               if (!single_imm_use (def, &use_p, &use_stmt))
    9823              :                 break;
    9824          211 :               unsigned this_lane;
    9825          211 :               if (!bb_vinfo->lookup_stmt (use_stmt)
    9826          211 :                   || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
    9827          233 :                   || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
    9828              :                 break;
    9829           22 :               if (bitmap_bit_p (lanes, this_lane))
    9830              :                 break;
    9831            2 :               lanes_found++;
    9832            2 :               bitmap_set_bit (lanes, this_lane);
    9833            2 :               gassign *use_ass = as_a <gassign *> (use_stmt);
    9834            2 :               lane_defs.quick_push (std::make_pair
    9835            2 :                                      (this_lane, gimple_assign_rhs2 (use_ass)));
    9836            2 :               last = bb_vinfo->lookup_stmt (use_ass);
    9837            2 :               roots.safe_push (last);
    9838            2 :               def = gimple_assign_lhs (use_ass);
    9839              :             }
    9840            2 :           while (lanes_found < nlanes);
    9841          215 :           if (roots.length () > 1)
    9842            2 :             std::swap(roots[0], roots[roots.length () - 1]);
    9843          215 :           if (lanes_found < nlanes)
    9844              :             {
    9845              :               /* Now search the def chain.  */
    9846          215 :               def = gimple_assign_rhs1 (assign);
    9847          217 :               do
    9848              :                 {
    9849          217 :                   if (TREE_CODE (def) != SSA_NAME
    9850          217 :                       || !has_single_use (def))
    9851              :                     break;
    9852           56 :                   gimple *def_stmt = SSA_NAME_DEF_STMT (def);
    9853           56 :                   unsigned this_lane;
    9854           56 :                   if (!bb_vinfo->lookup_stmt (def_stmt)
    9855           37 :                       || !vect_slp_is_lane_insert (def_stmt,
    9856              :                                                    NULL_TREE, &this_lane)
    9857           80 :                       || !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
    9858              :                     break;
    9859           24 :                   if (bitmap_bit_p (lanes, this_lane))
    9860              :                     break;
    9861            4 :                   lanes_found++;
    9862            4 :                   bitmap_set_bit (lanes, this_lane);
    9863            8 :                   lane_defs.quick_push (std::make_pair
    9864            4 :                                           (this_lane,
    9865            4 :                                            gimple_assign_rhs2 (def_stmt)));
    9866            4 :                   roots.safe_push (bb_vinfo->lookup_stmt (def_stmt));
    9867            4 :                   def = gimple_assign_rhs1 (def_stmt);
    9868              :                 }
    9869            4 :               while (lanes_found < nlanes);
    9870              :             }
    9871          215 :           if (lanes_found == nlanes)
    9872              :             {
    9873              :               /* Sort lane_defs by the lane index and register the root.  */
    9874            2 :               lane_defs.qsort (vld_cmp);
    9875            2 :               vec<stmt_vec_info> stmts;
    9876            2 :               stmts.create (nlanes);
    9877           10 :               for (unsigned i = 0; i < nlanes; ++i)
    9878            8 :                 stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
    9879            2 :               bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
    9880            2 :                                                    stmts, roots));
    9881              :             }
    9882              :           else
    9883          213 :             roots.release ();
    9884          215 :         }
    9885     29075476 :       else if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
    9886     28095926 :                && (associative_tree_code (code) || code == MINUS_EXPR)
    9887              :                /* ???  This pessimizes a two-element reduction.  PR54400.
    9888              :                   ???  In-order reduction could be handled if we only
    9889              :                   traverse one operand chain in vect_slp_linearize_chain.  */
    9890     32969010 :                && !needs_fold_left_reduction_p (TREE_TYPE (rhs), code)
    9891              :                /* Ops with constants at the tail can be stripped here.  */
    9892      5741956 :                && TREE_CODE (rhs) == SSA_NAME
    9893      5686394 :                && TREE_CODE (gimple_assign_rhs2 (assign)) == SSA_NAME
    9894              :                /* Should be the chain end.  */
    9895     31330112 :                && (!single_imm_use (gimple_assign_lhs (assign),
    9896              :                                     &use_p, &use_stmt)
    9897      1743661 :                    || !is_gimple_assign (use_stmt)
    9898      1193890 :                    || (gimple_assign_rhs_code (use_stmt) != code
    9899       884705 :                        && ((code != PLUS_EXPR && code != MINUS_EXPR)
    9900       498929 :                            || (gimple_assign_rhs_code (use_stmt)
    9901       498929 :                                != (code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR))))))
    9902              :         {
    9903              :           /* We start the match at the end of a possible association
    9904              :              chain.  */
    9905      1848422 :           auto_vec<chain_op_t> chain;
    9906      1848422 :           auto_vec<std::pair<tree_code, gimple *> > worklist;
    9907      1848422 :           auto_vec<gimple *> chain_stmts;
    9908      1848422 :           gimple *code_stmt = NULL, *alt_code_stmt = NULL;
    9909      1848422 :           if (code == MINUS_EXPR)
    9910       297670 :             code = PLUS_EXPR;
    9911      1848422 :           internal_fn reduc_fn;
    9912      2128217 :           if (!reduction_fn_for_scalar_code (code, &reduc_fn)
    9913      1848422 :               || reduc_fn == IFN_LAST)
    9914       279795 :             continue;
    9915      1568627 :           vect_slp_linearize_chain (bb_vinfo, worklist, chain, code, assign,
    9916              :                                     /* ??? */
    9917              :                                     code_stmt, alt_code_stmt, &chain_stmts,
    9918              :                                     false);
    9919      3137254 :           if (chain.length () > 1)
    9920              :             {
    9921              :               /* Sort the chain according to def_type and operation.  */
    9922      1568627 :               chain.sort (dt_sort_cmp, bb_vinfo);
    9923              :               /* ???  Now we'd want to strip externals and constants
    9924              :                  but record those to be handled in the epilogue.  */
    9925              :               /* ???  For now do not allow mixing ops or externs/constants.  */
    9926      1568627 :               bool invalid = false;
    9927      1568627 :               unsigned remain_cnt = 0;
    9928      1568627 :               unsigned last_idx = 0;
    9929      4742725 :               for (unsigned i = 0; i < chain.length (); ++i)
    9930              :                 {
    9931      3471768 :                   if (chain[i].code != code)
    9932              :                     {
    9933              :                       invalid = true;
    9934              :                       break;
    9935              :                     }
    9936      3174098 :                   if (chain[i].dt != vect_internal_def
    9937              :                       /* Avoid stmts where the def is not the LHS, like
    9938              :                          ASMs.  */
    9939      6137650 :                       || (gimple_get_lhs (bb_vinfo->lookup_def
    9940      2963552 :                                                       (chain[i].op)->stmt)
    9941      2963552 :                           != chain[i].op))
    9942       213490 :                     remain_cnt++;
    9943              :                   else
    9944              :                     last_idx = i;
    9945              :                 }
    9946              :               /* Make sure to have an even number of lanes as we later do
    9947              :                  all-or-nothing discovery, not trying to split further.  */
    9948      1568627 :               if ((chain.length () - remain_cnt) & 1)
    9949       168627 :                 remain_cnt++;
    9950      1568627 :               if (!invalid && chain.length () - remain_cnt > 1)
    9951              :                 {
    9952      1206407 :                   vec<stmt_vec_info> stmts;
    9953      1206407 :                   vec<tree> remain = vNULL;
    9954      1206407 :                   stmts.create (chain.length ());
    9955      1206407 :                   if (remain_cnt > 0)
    9956       114594 :                     remain.create (remain_cnt);
    9957      3879226 :                   for (unsigned i = 0; i < chain.length (); ++i)
    9958              :                     {
    9959      2672819 :                       stmt_vec_info stmt_info;
    9960      2672819 :                       if (chain[i].dt == vect_internal_def
    9961      2633089 :                           && ((stmt_info = bb_vinfo->lookup_def (chain[i].op)),
    9962      2633089 :                               gimple_get_lhs (stmt_info->stmt) == chain[i].op)
    9963      5305824 :                           && (i != last_idx
    9964      1206407 :                               || (stmts.length () & 1)))
    9965      2546868 :                         stmts.quick_push (stmt_info);
    9966              :                       else
    9967       125951 :                         remain.quick_push (chain[i].op);
    9968              :                     }
    9969      1206407 :                   vec<stmt_vec_info> roots;
    9970      1206407 :                   roots.create (chain_stmts.length ());
    9971      2672819 :                   for (unsigned i = 0; i < chain_stmts.length (); ++i)
    9972      1466412 :                     roots.quick_push (bb_vinfo->lookup_stmt (chain_stmts[i]));
    9973      1206407 :                   bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_bb_reduc,
    9974      1206407 :                                                        stmts, roots, remain));
    9975              :                 }
    9976              :             }
    9977      1848422 :         }
    9978              :     }
    9979      2190696 : }
    9980              : 
    9981              : /* Walk the grouped store chains and replace entries with their
    9982              :    pattern variant if any.  */
    9983              : 
    9984              : static void
    9985       610100 : vect_fixup_store_groups_with_patterns (vec_info *vinfo)
    9986              : {
    9987       610100 :   stmt_vec_info first_element;
    9988       610100 :   unsigned i;
    9989              : 
    9990      1495260 :   FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
    9991              :     {
    9992              :       /* We also have CTORs in this array.  */
    9993       885160 :       if (!STMT_VINFO_GROUPED_ACCESS (first_element))
    9994            0 :         continue;
    9995       885160 :       if (STMT_VINFO_IN_PATTERN_P (first_element))
    9996              :         {
    9997          252 :           stmt_vec_info orig = first_element;
    9998          252 :           first_element = STMT_VINFO_RELATED_STMT (first_element);
    9999          252 :           DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
   10000          252 :           DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
   10001          252 :           DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
   10002          252 :           DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
   10003          252 :           vinfo->grouped_stores[i] = first_element;
   10004              :         }
   10005       885160 :       stmt_vec_info prev = first_element;
   10006      2488431 :       while (DR_GROUP_NEXT_ELEMENT (prev))
   10007              :         {
   10008      1603271 :           stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
   10009      1603271 :           if (STMT_VINFO_IN_PATTERN_P (elt))
   10010              :             {
   10011          849 :               stmt_vec_info orig = elt;
   10012          849 :               elt = STMT_VINFO_RELATED_STMT (elt);
   10013          849 :               DR_GROUP_NEXT_ELEMENT (prev) = elt;
   10014          849 :               DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
   10015          849 :               DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
   10016              :             }
   10017      1603271 :           DR_GROUP_FIRST_ELEMENT (elt) = first_element;
   10018      1603271 :           prev = elt;
   10019              :         }
   10020              :     }
   10021       610100 : }
   10022              : 
   10023              : /* Check if the region described by BB_VINFO can be vectorized, returning
   10024              :    true if so.  When returning false, set FATAL to true if the same failure
   10025              :    would prevent vectorization at other vector sizes, false if it is still
   10026              :    worth trying other sizes.  N_STMTS is the number of statements in the
   10027              :    region.  */
   10028              : 
   10029              : static bool
   10030      2190696 : vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
   10031              :                        vec<int> *dataref_groups)
   10032              : {
   10033      2190696 :   DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
   10034              : 
   10035      2190696 :   slp_instance instance;
   10036      2190696 :   int i;
   10037              : 
   10038              :   /* The first group of checks is independent of the vector size.  */
   10039      2190696 :   fatal = true;
   10040              : 
   10041              :   /* Analyze the data references.  */
   10042              : 
   10043      2190696 :   if (!vect_analyze_data_refs (bb_vinfo, NULL))
   10044              :     {
   10045            0 :       if (dump_enabled_p ())
   10046            0 :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   10047              :                          "not vectorized: unhandled data-ref in basic "
   10048              :                          "block.\n");
   10049            0 :       return false;
   10050              :     }
   10051              : 
   10052      2190696 :   if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
   10053              :     {
   10054            0 :      if (dump_enabled_p ())
   10055            0 :        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   10056              :                         "not vectorized: unhandled data access in "
   10057              :                         "basic block.\n");
   10058            0 :       return false;
   10059              :     }
   10060              : 
   10061      2190696 :   vect_slp_check_for_roots (bb_vinfo);
   10062              : 
   10063              :   /* If there are no grouped stores and no recorded roots in the region
   10064              :      there is no need to continue with pattern recog as vect_analyze_slp
   10065              :      will fail anyway.  */
   10066      2190696 :   if (bb_vinfo->grouped_stores.is_empty ()
   10067      1849545 :       && bb_vinfo->roots.is_empty ())
   10068              :     {
   10069      1580596 :       if (dump_enabled_p ())
   10070         1022 :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   10071              :                          "not vectorized: no grouped stores in "
   10072              :                          "basic block.\n");
   10073      1580596 :       return false;
   10074              :     }
   10075              : 
   10076              :   /* The rest of the analysis below depends on the vector size in some way.  */
   10077       610100 :   fatal = false;
   10078              : 
   10079       610100 :   vect_pattern_recog (bb_vinfo);
   10080              : 
   10081              :   /* Update store groups from pattern processing.  */
   10082       610100 :   vect_fixup_store_groups_with_patterns (bb_vinfo);
   10083              : 
   10084              :   /* Check the SLP opportunities in the basic block, analyze and build SLP
   10085              :      trees.  */
   10086       610100 :   if (!vect_analyze_slp (bb_vinfo, n_stmts, false))
   10087              :     {
   10088            0 :       if (dump_enabled_p ())
   10089              :         {
   10090            0 :           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   10091              :                            "Failed to SLP the basic block.\n");
   10092            0 :           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   10093              :                            "not vectorized: failed to find SLP opportunities "
   10094              :                            "in basic block.\n");
   10095              :         }
   10096            0 :       return false;
   10097              :     }
   10098              : 
   10099              :   /* Optimize permutations.  */
   10100       610100 :   vect_optimize_slp (bb_vinfo);
   10101              : 
   10102              :   /* Gather the loads reachable from the SLP graph entries.  */
   10103       610100 :   vect_gather_slp_loads (bb_vinfo);
   10104              : 
   10105       610100 :   vect_record_base_alignments (bb_vinfo);
   10106              : 
   10107              :   /* Analyze and verify the alignment of data references and the
   10108              :      dependence in the SLP instances.  */
   10109      1395190 :   for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
   10110              :     {
   10111       785090 :       vect_location = instance->location ();
   10112       785090 :       if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
   10113       785090 :           || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
   10114              :         {
   10115         8519 :           slp_tree node = SLP_INSTANCE_TREE (instance);
   10116         8519 :           stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
   10117         8519 :           if (dump_enabled_p ())
   10118            4 :             dump_printf_loc (MSG_NOTE, vect_location,
   10119              :                              "removing SLP instance operations starting from: %G",
   10120              :                              stmt_info->stmt);
   10121         8519 :           vect_free_slp_instance (instance);
   10122         8519 :           BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
   10123         8519 :           continue;
   10124         8519 :         }
   10125              : 
   10126              :       /* Mark all the statements that we want to vectorize as relevant.  */
   10127       776571 :       vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
   10128              : 
   10129       776571 :       i++;
   10130              :     }
   10131      2220485 :   if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
   10132              :     return false;
   10133              : 
   10134       264393 :   if (!vect_slp_analyze_operations (bb_vinfo))
   10135              :     {
   10136        29789 :       if (dump_enabled_p ())
   10137           87 :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   10138              :                          "not vectorized: bad operation in basic block.\n");
   10139        29789 :       return false;
   10140              :     }
   10141              : 
   10142              :   /* Mark all the statements that we vectorize.  */
   10143       234604 :   vect_bb_slp_mark_stmts_vectorized (bb_vinfo);
   10144              : 
   10145              :   /* Compute vectorizable live stmts.  */
   10146       234604 :   vect_bb_slp_mark_live_stmts (bb_vinfo);
   10147              : 
   10148       234604 :   vect_bb_partition_graph (bb_vinfo);
   10149              : 
   10150       234604 :   return true;
   10151              : }
   10152              : 
   10153              : /* Subroutine of vect_slp_bb.  Try to vectorize the statements for all
   10154              :    basic blocks in BBS, returning true on success.
   10155              :    The region has N_STMTS statements and has the datarefs given by DATAREFS.  */
   10156              : 
   10157              : static bool
   10158      1871569 : vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
   10159              :                  vec<int> *dataref_groups, unsigned int n_stmts,
   10160              :                  loop_p orig_loop)
   10161              : {
   10162      1871569 :   bb_vec_info bb_vinfo;
   10163      1871569 :   auto_vector_modes vector_modes;
   10164              : 
   10165              :   /* Autodetect first vector size we try.  */
   10166      1871569 :   machine_mode next_vector_mode = VOIDmode;
   10167      1871569 :   targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
   10168      1871569 :   unsigned int mode_i = 0;
   10169              : 
   10170      1871569 :   vec_info_shared shared;
   10171              : 
   10172      1871569 :   machine_mode autodetected_vector_mode = VOIDmode;
   10173      2509823 :   while (1)
   10174              :     {
   10175      2190696 :       bool vectorized = false;
   10176      2190696 :       bool fatal = false;
   10177      2190696 :       bb_vinfo = new _bb_vec_info (bbs, &shared);
   10178              : 
   10179      2190696 :       bool first_time_p = shared.datarefs.is_empty ();
   10180      2190696 :       BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
   10181      2190696 :       if (first_time_p)
   10182      1894098 :         bb_vinfo->shared->save_datarefs ();
   10183              :       else
   10184       296598 :         bb_vinfo->shared->check_datarefs ();
   10185      2190696 :       bb_vinfo->vector_mode = next_vector_mode;
   10186              : 
   10187      2190696 :       if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
   10188              :         {
   10189       234604 :           if (dump_enabled_p ())
   10190              :             {
   10191         1508 :               dump_printf_loc (MSG_NOTE, vect_location,
   10192              :                                "***** Analysis succeeded with vector mode"
   10193          754 :                                " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
   10194          754 :               dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
   10195              :             }
   10196              : 
   10197       234604 :           bb_vinfo->shared->check_datarefs ();
   10198              : 
   10199       234604 :           bool force_clear = false;
   10200       234604 :           auto_vec<slp_instance> profitable_subgraphs;
   10201      1380822 :           for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo))
   10202              :             {
   10203       677010 :               if (instance->subgraph_entries.is_empty ())
   10204       217242 :                 continue;
   10205              : 
   10206       657553 :               dump_user_location_t saved_vect_location = vect_location;
   10207       657553 :               vect_location = instance->location ();
   10208       657553 :               if (!unlimited_cost_model (NULL)
   10209       654197 :                   && !param_vect_allow_possibly_not_worthwhile_vectorizations
   10210      1311745 :                   && !vect_bb_vectorization_profitable_p
   10211       654192 :                         (bb_vinfo, instance->subgraph_entries, orig_loop))
   10212              :                 {
   10213       178328 :                   if (dump_enabled_p ())
   10214           28 :                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   10215              :                                      "not vectorized: vectorization is not "
   10216              :                                      "profitable.\n");
   10217       178328 :                   vect_location = saved_vect_location;
   10218       178328 :                   continue;
   10219              :                 }
   10220              : 
   10221       479225 :               vect_location = saved_vect_location;
   10222       479225 :               if (!dbg_cnt (vect_slp))
   10223              :                 {
   10224            0 :                   force_clear = true;
   10225            0 :                   continue;
   10226              :                 }
   10227              : 
   10228       479225 :               profitable_subgraphs.safe_push (instance);
   10229              :             }
   10230              : 
   10231              :           /* When we're vectorizing an if-converted loop body make sure
   10232              :              we vectorized all if-converted code.  */
   10233       393486 :           if ((!profitable_subgraphs.is_empty () || force_clear) && orig_loop)
   10234              :             {
   10235          106 :               gcc_assert (bb_vinfo->nbbs == 1);
   10236          212 :               for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]);
   10237         4388 :                    !gsi_end_p (gsi); gsi_next (&gsi))
   10238              :                 {
   10239              :                   /* The costing above left us with DCEable vectorized scalar
   10240              :                      stmts having the visited flag set on profitable
   10241              :                      subgraphs.  Do the delayed clearing of the flag here.  */
   10242         4282 :                   if (gimple_visited_p (gsi_stmt (gsi)))
   10243              :                     {
   10244         1260 :                       gimple_set_visited (gsi_stmt (gsi), false);
   10245         1260 :                       continue;
   10246              :                     }
   10247         3022 :                   if (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED)
   10248          813 :                     continue;
   10249              : 
   10250         6334 :                   if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi)))
   10251         2666 :                     if (gimple_assign_rhs_code (ass) == COND_EXPR)
   10252              :                       {
   10253           69 :                         if (!profitable_subgraphs.is_empty ()
   10254           31 :                             && dump_enabled_p ())
   10255            0 :                           dump_printf_loc (MSG_NOTE, vect_location,
   10256              :                                            "not profitable because of "
   10257              :                                            "unprofitable if-converted scalar "
   10258              :                                            "code\n");
   10259           38 :                         profitable_subgraphs.truncate (0);
   10260              :                       }
   10261              :                 }
   10262              :             }
   10263              : 
   10264              :           /* Finally schedule the profitable subgraphs.  */
   10265      1031547 :           for (slp_instance instance : profitable_subgraphs)
   10266              :             {
   10267       479179 :               if (!vectorized && dump_enabled_p ())
   10268          729 :                 dump_printf_loc (MSG_NOTE, vect_location,
   10269              :                                  "Basic block will be vectorized "
   10270              :                                  "using SLP\n");
   10271       479179 :               vectorized = true;
   10272              : 
   10273              :               /* Dump before scheduling as store vectorization will remove
   10274              :                  the original stores and mess with the instance tree
   10275              :                  so querying its location will eventually ICE.  */
   10276       479179 :               if (flag_checking)
   10277      1927295 :                 for (slp_instance sub : instance->subgraph_entries)
   10278       489758 :                   gcc_assert (SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub)));
   10279       479179 :               unsigned HOST_WIDE_INT bytes;
   10280       479179 :               if (dump_enabled_p ())
   10281         3469 :                 for (slp_instance sub : instance->subgraph_entries)
   10282              :                   {
   10283          919 :                     tree vtype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub));
   10284         1838 :                     if (GET_MODE_SIZE (TYPE_MODE (vtype)).is_constant (&bytes))
   10285          919 :                       dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
   10286          919 :                                        sub->location (),
   10287              :                                        "basic block part vectorized using %wu "
   10288              :                                        "byte vectors\n", bytes);
   10289              :                     else
   10290              :                       dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
   10291              :                                        sub->location (),
   10292              :                                        "basic block part vectorized using "
   10293              :                                        "variable length vectors\n");
   10294              :                   }
   10295              : 
   10296       479179 :               dump_user_location_t saved_vect_location = vect_location;
   10297       479179 :               vect_location = instance->location ();
   10298              : 
   10299       479179 :               vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
   10300              : 
   10301       479179 :               vect_location = saved_vect_location;
   10302              :             }
   10303              : 
   10304              : 
   10305              :           /* Generate the invariant statements.  */
   10306       234604 :           if (!gimple_seq_empty_p (bb_vinfo->inv_pattern_def_seq))
   10307              :             {
   10308           23 :               if (dump_enabled_p ())
   10309            0 :                 dump_printf_loc (MSG_NOTE, vect_location,
   10310              :                          "------>generating invariant statements\n");
   10311              : 
   10312           23 :               bb_vinfo->insert_seq_on_entry (NULL,
   10313              :                                              bb_vinfo->inv_pattern_def_seq);
   10314              :             }
   10315       234604 :         }
   10316              :       else
   10317              :         {
   10318      1956092 :           if (dump_enabled_p ())
   10319         1314 :             dump_printf_loc (MSG_NOTE, vect_location,
   10320              :                              "***** Analysis failed with vector mode %s\n",
   10321         1314 :                              GET_MODE_NAME (bb_vinfo->vector_mode));
   10322              :         }
   10323              : 
   10324      2190696 :       if (mode_i == 0)
   10325      1871569 :         autodetected_vector_mode = bb_vinfo->vector_mode;
   10326              : 
   10327      2190696 :       if (!fatal)
   10328      3134089 :         while (mode_i < vector_modes.length ()
   10329      1754619 :                && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
   10330              :           {
   10331       333293 :             if (dump_enabled_p ())
   10332         1660 :               dump_printf_loc (MSG_NOTE, vect_location,
   10333              :                                "***** The result for vector mode %s would"
   10334              :                                " be the same\n",
   10335          830 :                                GET_MODE_NAME (vector_modes[mode_i]));
   10336       333293 :             mode_i += 1;
   10337              :           }
   10338              : 
   10339      2190696 :       delete bb_vinfo;
   10340              : 
   10341      2190696 :       if (mode_i < vector_modes.length ()
   10342      2014175 :           && VECTOR_MODE_P (autodetected_vector_mode)
   10343      1995328 :           && (related_vector_mode (vector_modes[mode_i],
   10344              :                                    GET_MODE_INNER (autodetected_vector_mode))
   10345       997664 :               == autodetected_vector_mode)
   10346      4204871 :           && (related_vector_mode (autodetected_vector_mode,
   10347       519273 :                                    GET_MODE_INNER (vector_modes[mode_i]))
   10348      1038546 :               == vector_modes[mode_i]))
   10349              :         {
   10350       519273 :           if (dump_enabled_p ())
   10351          205 :             dump_printf_loc (MSG_NOTE, vect_location,
   10352              :                              "***** Skipping vector mode %s, which would"
   10353              :                              " repeat the analysis for %s\n",
   10354          205 :                              GET_MODE_NAME (vector_modes[mode_i]),
   10355          205 :                              GET_MODE_NAME (autodetected_vector_mode));
   10356       519273 :           mode_i += 1;
   10357              :         }
   10358              : 
   10359      2190696 :       if (vectorized
   10360      2031845 :           || mode_i == vector_modes.length ()
   10361      1855369 :           || autodetected_vector_mode == VOIDmode
   10362              :           /* If vect_slp_analyze_bb_1 signaled that analysis for all
   10363              :              vector sizes will fail do not bother iterating.  */
   10364      3029554 :           || fatal)
   10365      3743138 :         return vectorized;
   10366              : 
   10367              :       /* Try the next biggest vector size.  */
   10368       319127 :       next_vector_mode = vector_modes[mode_i++];
   10369       319127 :       if (dump_enabled_p ())
   10370          218 :         dump_printf_loc (MSG_NOTE, vect_location,
   10371              :                          "***** Re-trying analysis with vector mode %s\n",
   10372          218 :                          GET_MODE_NAME (next_vector_mode));
   10373       319127 :     }
   10374      1871569 : }
   10375              : 
   10376              : 
   10377              : /* Main entry for the BB vectorizer.  Analyze and transform BBS, returns
   10378              :    true if anything in the basic-block was vectorized.
                      : 
                      :    Every non-debug statement following the labels of each block in BBS
                      :    is visited once: it is counted, vect_location is moved to it when it
                      :    carries a known source location, and vect_find_stmt_data_reference
                      :    is called with the data-reference vector, the group vector and the
                      :    current group number.  The group number is advanced whenever that
                      :    call returns false and again at the end of every block.  The
                      :    collected data references, their groups, the statement count and
                      :    ORIG_LOOP are then handed to vect_slp_region, whose result is
                      :    returned unchanged.  */
   10379              : 
   10380              : static bool
   10381      1871569 : vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop)
   10382              : {
   10383      1871569 :   vec<data_reference_p> datarefs = vNULL;
   10384      1871569 :   auto_vec<int> dataref_groups;
   10385      1871569 :   int insns = 0;
   10386      1871569 :   int current_group = 0;
   10387              : 
   10388     12392371 :   for (unsigned i = 0; i < bbs.length (); i++)
   10389              :     {
   10390     10520802 :       basic_block bb = bbs[i];
   10391     88140584 :       for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
   10392     77619782 :            gsi_next (&gsi))
   10393              :         {
   10394     77619782 :           gimple *stmt = gsi_stmt (gsi);
   10395     77619782 :           if (is_gimple_debug (stmt))
   10396     48154919 :             continue;
   10397              : 
   10398     29464863 :           insns++;
   10399              : 
   10400     29464863 :           if (gimple_location (stmt) != UNKNOWN_LOCATION)
   10401     26432853 :             vect_location = stmt;
   10402              : 
   10403     29464863 :           if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
   10404              :                                               &dataref_groups, current_group))
   10405      5078363 :             ++current_group;
   10406              :         }
   10407              :       /* New BBs always start a new DR group.  This increment is
                      :          unconditional, so it comes on top of any increment done for
                      :          the last statement of the block.  */
   10408     10520802 :       ++current_group;
   10409              :     }
   10410              : 
   10411      1871569 :   return vect_slp_region (bbs, datarefs, &dataref_groups, insns, orig_loop);
   10412      1871569 : }
   10413              : 
   10414              : /* Special entry for the BB vectorizer.  Analyze and transform a single
   10415              :    if-converted BB with ORIG_LOOPs body being the not if-converted
   10416              :    representation.  Returns true if anything in the basic-block was
   10417              :    vectorized.
                      : 
                      :    This is a thin wrapper: BB is placed in a one-element vector and
                      :    forwarded, together with ORIG_LOOP, to vect_slp_bbs.  */
   10418              : 
   10419              : bool
   10420        19219 : vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop)
   10421              : {
   10422        19219 :   auto_vec<basic_block> bbs;
   10423        19219 :   bbs.safe_push (bb);
   10424        19219 :   return vect_slp_bbs (bbs, orig_loop);
   10425        19219 : }
   10426              : 
   10427              : /* Main entry for the BB vectorizer.  Analyze and transform the basic
   10428              :    blocks of FUN, returns true if anything in them was vectorized.
                      : 
                      :    The blocks are visited in the order computed by
                      :    rev_post_order_and_mark_dfs_back_seme, starting at the single
                      :    successor edge of the entry block and with EXIT_BLOCK as the only
                      :    exit block.  Consecutive blocks are accumulated into a region; the
                      :    region is handed to vect_slp_bbs (with a NULL ORIG_LOOP) and then
                      :    restarted when
                      :      - the next block is not dominated by the region's first block,
                      :      - the loop determined by the region's first block is exited,
                      :      - the next block is the header of a loop marked dont_vectorize, or
                      :      - the block just added ends in a control altering statement
                      :        that has a LHS.
                      :    A block is not allowed to start a region when its first statement
                      :    is a returns-twice call or when its loop is marked dont_vectorize;
                      :    such a block is left out.  Whatever region remains after the walk
                      :    is processed as well.  The results of all vect_slp_bbs calls are
                      :    OR-ed together and the RPO array is freed before returning.  */
   10429              : 
   10430              : bool
   10431       906490 : vect_slp_function (function *fun)
   10432              : {
   10433       906490 :   bool r = false;
   10434       906490 :   int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
   10435       906490 :   auto_bitmap exit_bbs;
   10436       906490 :   bitmap_set_bit (exit_bbs, EXIT_BLOCK);
   10437       906490 :   edge entry = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (fun));
   10438       906490 :   unsigned n = rev_post_order_and_mark_dfs_back_seme (fun, entry, exit_bbs,
   10439       906490 :                                                       true, rpo, NULL);
   10440              : 
   10441              :   /* For the moment split the function into pieces to avoid making
   10442              :      the iteration on the vector mode moot.  Split at points we know
   10443              :      to not handle well which is CFG merges (SLP discovery doesn't
   10444              :      handle non-loop-header PHIs) and loop exits.  Since pattern
   10445              :      recog requires reverse iteration to visit uses before defs
   10446              :      simply chop RPO into pieces.  */
   10447       906490 :   auto_vec<basic_block> bbs;
   10448     11439017 :   for (unsigned i = 0; i < n; i++)
   10449              :     {
   10450     10532527 :       basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
   10451     10532527 :       bool split = false;
   10452              : 
   10453              :       /* Split when a BB is not dominated by the first block.  */
   10454     19852247 :       if (!bbs.is_empty ()
   10455      9319720 :           && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
   10456              :         {
   10457       659411 :           if (dump_enabled_p ())
   10458          146 :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   10459              :                              "splitting region at dominance boundary bb%d\n",
   10460              :                              bb->index);
   10461              :           split = true;
   10462              :         }
   10463              :       /* Split when the loop determined by the first block
   10464              :          is exited.  This is because we eventually insert
   10465              :          invariants at region begin.  */
   10466     18533425 :       else if (!bbs.is_empty ()
   10467      8660309 :                && bbs[0]->loop_father != bb->loop_father
   10468      2259274 :                && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
   10469              :         {
   10470         3805 :           if (dump_enabled_p ())
   10471            6 :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   10472              :                              "splitting region at loop %d exit at bb%d\n",
   10473            3 :                              bbs[0]->loop_father->num, bb->index);
   10474              :           split = true;
   10475              :         }
   10476      9869311 :       else if (!bbs.is_empty ()
   10477      8656504 :                && bb->loop_father->header == bb
   10478       468363 :                && bb->loop_father->dont_vectorize)
   10479              :         {
   10480         7271 :           if (dump_enabled_p ())
   10481           72 :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   10482              :                              "splitting region at dont-vectorize loop %d "
   10483              :                              "entry at bb%d\n",
   10484              :                              bb->loop_father->num, bb->index);
   10485              :           split = true;
   10486              :         }
   10487              : 
   10488     11203014 :       if (split && !bbs.is_empty ())
   10489              :         {
   10490       670487 :           r |= vect_slp_bbs (bbs, NULL);
   10491       670487 :           bbs.truncate (0);
   10492              :         }
   10493              : 
   10494     10532527 :       if (bbs.is_empty ())
   10495              :         {
   10496              :           /* We need to be able to insert at the head of the region which
   10497              :              we cannot for region starting with a returns-twice call.  */
   10498      1883294 :           if (gcall *first = safe_dyn_cast <gcall *> (first_stmt (bb)))
   10499       400344 :             if (gimple_call_flags (first) & ECF_RETURNS_TWICE)
   10500              :               {
   10501          301 :                 if (dump_enabled_p ())
   10502            2 :                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   10503              :                                    "skipping bb%d as start of region as it "
   10504              :                                    "starts with returns-twice call\n",
   10505              :                                    bb->index);
   10506        30944 :                 continue;
   10507              :               }
   10508              :           /* If the loop this BB belongs to is marked as not to be vectorized
   10509              :              honor that also for BB vectorization.  */
   10510      1882993 :           if (bb->loop_father->dont_vectorize)
   10511        30643 :             continue;
   10512              :         }
   10513              : 
   10514     10501583 :       bbs.safe_push (bb);
   10515              : 
   10516              :       /* When we have a stmt ending this block and defining a
   10517              :          value we have to insert on edges when inserting after it for
   10518              :          a vector containing its definition.  Avoid this for now.
                      :          The region is therefore closed right after this block, which
                      :          stays the region's last member.  */
   10519     21003166 :       if (gimple *last = *gsi_last_bb (bb))
   10520      8508410 :         if (gimple_get_lhs (last)
   10521      8508410 :             && is_ctrl_altering_stmt (last))
   10522              :           {
   10523       275380 :             if (dump_enabled_p ())
   10524            2 :               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   10525              :                                "splitting region at control altering "
   10526              :                                "definition %G", last);
   10527       275380 :             r |= vect_slp_bbs (bbs, NULL);
   10528       275380 :             bbs.truncate (0);
   10529              :           }
   10530              :     }
   10531              : 
   10532       906490 :   if (!bbs.is_empty ())
   10533       906483 :     r |= vect_slp_bbs (bbs, NULL);
   10534              : 
   10535       906490 :   free (rpo);
   10536              : 
   10537       906490 :   return r;
   10538       906490 : }
   10539              : 
   10540              : /* Build a variable-length vector in which the elements in ELTS are repeated
   10541              :    to fill NRESULTS vectors of type VECTOR_TYPE.  Store the vectors in
   10542              :    RESULTS and add any new instructions to SEQ.
   10543              : 
   10544              :    The approach we use is:
   10545              : 
   10546              :    (1) Find a vector mode VM with integer elements of mode IM.
   10547              : 
   10548              :    (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
   10549              :        ELTS' has mode IM.  This involves creating NELTS' VIEW_CONVERT_EXPRs
   10550              :        from small vectors to IM.
   10551              : 
   10552              :    (3) Duplicate each ELTS'[I] into a vector of mode VM.
   10553              : 
   10554              :    (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
   10555              :        correct byte contents.
   10556              : 
   10557              :    (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
   10558              : 
   10559              :    We try to find the largest IM for which this sequence works, in order
   10560              :    to cut down on the number of interleaves.
                      : 
                      :    VINFO is only passed on to can_duplicate_and_interleave_p.  That
                      :    query must succeed for the length of ELTS and the element type of
                      :    VECTOR_TYPE; if it does not, gcc_unreachable is reached, so callers
                      :    are expected to have checked it beforehand.  The generated vectors
                      :    are appended to RESULTS; when fewer than NRESULTS distinct vectors
                      :    are produced, the ones already pushed are repeated.  */
   10561              : 
   10562              : void
   10563            0 : duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
   10564              :                           const vec<tree> &elts, unsigned int nresults,
   10565              :                           vec<tree> &results)
   10566              : {
   10567            0 :   unsigned int nelts = elts.length ();
   10568            0 :   tree element_type = TREE_TYPE (vector_type);
   10569              : 
   10570              :   /* (1) Find a vector mode VM with integer elements of mode IM.  */
   10571            0 :   unsigned int nvectors = 1;
   10572            0 :   tree new_vector_type;
   10573            0 :   tree permutes[2];
   10574            0 :   if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
   10575              :                                        &nvectors, &new_vector_type,
   10576              :                                        permutes))
   10577            0 :     gcc_unreachable ();
   10578              : 
   10579              :   /* Get a vector type that holds ELTS[0:NELTS/NELTS'].  */
   10580            0 :   unsigned int partial_nelts = nelts / nvectors;
   10581            0 :   tree partial_vector_type = build_vector_type (element_type, partial_nelts);
   10582              : 
   10583            0 :   tree_vector_builder partial_elts;
   10584            0 :   auto_vec<tree, 32> pieces (nvectors * 2);
   10585            0 :   pieces.quick_grow_cleared (nvectors * 2);
   10586            0 :   for (unsigned int i = 0; i < nvectors; ++i)
   10587              :     {
   10588              :       /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
   10589              :              ELTS' has mode IM.  */
   10590            0 :       partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
   10591            0 :       for (unsigned int j = 0; j < partial_nelts; ++j)
   10592            0 :         partial_elts.quick_push (elts[i * partial_nelts + j]);
   10593            0 :       tree t = gimple_build_vector (seq, &partial_elts);
   10594            0 :       t = gimple_build (seq, VIEW_CONVERT_EXPR,
   10595            0 :                         TREE_TYPE (new_vector_type), t);
   10596              : 
   10597              :       /* (3) Duplicate each ELTS'[I] into a vector of mode VM.  */
   10598            0 :       pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
   10599              :     }
   10600              : 
   10601              :   /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
   10602              :          correct byte contents.
   10603              : 
   10604              :      Conceptually, we need to repeat the following operation log2(nvectors)
   10605              :      times, where hi_start = nvectors / 2:
   10606              : 
   10607              :         out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
   10608              :         out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
   10609              : 
   10610              :      However, if each input repeats every N elements and the VF is
   10611              :      a multiple of N * 2, the HI result is the same as the LO result.
   10612              :      This will be true for the first N1 iterations of the outer loop,
   10613              :      followed by N2 iterations for which both the LO and HI results
   10614              :      are needed.  I.e.:
   10615              : 
   10616              :         N1 + N2 = log2(nvectors)
   10617              : 
   10618              :      Each "N1 iteration" doubles the number of redundant vectors and the
   10619              :      effect of the process as a whole is to have a sequence of nvectors/2**N1
   10620              :      vectors that repeats 2**N1 times.  Rather than generate these redundant
   10621              :      vectors, we halve the number of vectors for each N1 iteration.
                      : 
                      :      PIECES holds two banks of NVECTORS entries each.  Every round reads
                      :      from the bank at IN_START and writes to the bank at OUT_START, after
                      :      which the two offsets are swapped, so IN_START always designates
                      :      the most recently written bank once the loop is done.  */
   10622              :   unsigned int in_start = 0;
   10623              :   unsigned int out_start = nvectors;
   10624              :   unsigned int new_nvectors = nvectors;
   10625            0 :   for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
   10626              :     {
   10627            0 :       unsigned int hi_start = new_nvectors / 2;
   10628            0 :       unsigned int out_i = 0;
   10629            0 :       for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i)
   10630              :         {
   10631            0 :           if ((in_i & 1) != 0
   10632            0 :               && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
   10633              :                              2 * in_repeat))
   10634            0 :             continue;
   10635              : 
   10636            0 :           tree output = make_ssa_name (new_vector_type);
   10637            0 :           tree input1 = pieces[in_start + (in_i / 2)];
   10638            0 :           tree input2 = pieces[in_start + (in_i / 2) + hi_start];
   10639            0 :           gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
   10640              :                                                input1, input2,
   10641              :                                                permutes[in_i & 1]);
   10642            0 :           gimple_seq_add_stmt (seq, stmt);
   10643            0 :           pieces[out_start + out_i] = output;
   10644            0 :           out_i += 1;
   10645              :         }
   10646            0 :       std::swap (in_start, out_start);
   10647            0 :       new_nvectors = out_i;
   10648              :     }
   10649              : 
   10650              :   /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type.
                      :      Only NEW_NVECTORS distinct vectors exist at this point; any
                      :      further result repeats an earlier entry of RESULTS.
                      :      NOTE(review): the index I - NEW_NVECTORS looks like it assumes
                      :      RESULTS is empty on entry -- confirm against callers.  */
   10651            0 :   results.reserve (nresults);
   10652            0 :   for (unsigned int i = 0; i < nresults; ++i)
   10653            0 :     if (i < new_nvectors)
   10654            0 :       results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
   10655            0 :                                         pieces[in_start + i]));
   10656              :     else
   10657            0 :       results.quick_push (results[i - new_nvectors]);
   10658            0 : }
   10659              : 
   10660              : 
   10661              : /* For constant and loop invariant defs in OP_NODE this function creates
   10662              :    vector defs that will be used in the vectorized stmts and stores them
   10663              :    to SLP_TREE_VEC_DEFS of OP_NODE.  */
   10664              : 
   10665              : static void
   10666       488821 : vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
   10667              : {
   10668       488821 :   unsigned HOST_WIDE_INT nunits;
   10669       488821 :   tree vec_cst;
   10670       488821 :   unsigned j, number_of_places_left_in_vector;
   10671       488821 :   tree vector_type;
   10672       488821 :   tree vop;
   10673       488821 :   int group_size = op_node->ops.length ();
   10674       488821 :   unsigned int vec_num, i;
   10675       488821 :   unsigned number_of_copies = 1;
   10676       488821 :   bool constant_p;
   10677       488821 :   gimple_seq ctor_seq = NULL;
   10678       488821 :   auto_vec<tree, 16> permute_results;
   10679              : 
   10680              :   /* We always want SLP_TREE_VECTYPE (op_node) here correctly set.  */
   10681       488821 :   vector_type = SLP_TREE_VECTYPE (op_node);
   10682              : 
   10683       488821 :   unsigned int number_of_vectors = vect_get_num_copies (vinfo, op_node);
   10684       488821 :   SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
   10685       488821 :   auto_vec<tree> voprnds (number_of_vectors);
   10686              : 
   10687              :   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
   10688              :      created vectors. It is greater than 1 if unrolling is performed.
   10689              : 
   10690              :      For example, we have two scalar operands, s1 and s2 (e.g., group of
   10691              :      strided accesses of size two), while NUNITS is four (i.e., four scalars
   10692              :      of this type can be packed in a vector).  The output vector will contain
   10693              :      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
   10694              :      will be 2).
   10695              : 
   10696              :      If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
   10697              :      containing the operands.
   10698              : 
   10699              :      For example, NUNITS is four as before, and the group size is 8
   10700              :      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
   10701              :      {s5, s6, s7, s8}.  */
   10702              : 
   10703              :   /* When using duplicate_and_interleave, we just need one element for
   10704              :      each scalar statement.  */
   10705       488821 :   if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
   10706              :     nunits = group_size;
   10707              : 
   10708       488821 :   number_of_copies = nunits * number_of_vectors / group_size;
   10709              : 
   10710       488821 :   number_of_places_left_in_vector = nunits;
   10711       488821 :   constant_p = true;
   10712       488821 :   tree uniform_elt = NULL_TREE;
   10713       488821 :   tree_vector_builder elts (vector_type, nunits, 1);
   10714       488821 :   elts.quick_grow (nunits);
   10715       488821 :   stmt_vec_info insert_after = NULL;
   10716      1458816 :   for (j = 0; j < number_of_copies; j++)
   10717              :     {
   10718       969995 :       tree op;
   10719      3720084 :       for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
   10720              :         {
   10721              :           /* Create 'vect_ = {op0,op1,...,opn}'.  */
   10722      1780094 :           tree orig_op = op;
   10723      1780094 :           if (number_of_places_left_in_vector == nunits)
   10724              :             uniform_elt = op;
   10725      1162205 :           else if (uniform_elt && operand_equal_p (uniform_elt, op))
   10726       739371 :             op = elts[number_of_places_left_in_vector];
   10727              :           else
   10728              :             uniform_elt = NULL_TREE;
   10729      1780094 :           number_of_places_left_in_vector--;
   10730      1780094 :           if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
   10731              :             {
   10732       273277 :               if (CONSTANT_CLASS_P (op))
   10733              :                 {
   10734        99826 :                   if (VECTOR_BOOLEAN_TYPE_P (vector_type))
   10735              :                     {
   10736              :                       /* Can't use VIEW_CONVERT_EXPR for booleans because
   10737              :                          of possibly different sizes of scalar value and
   10738              :                          vector element.  */
   10739           51 :                       if (integer_zerop (op))
   10740           51 :                         op = build_int_cst (TREE_TYPE (vector_type), 0);
   10741            0 :                       else if (integer_onep (op))
   10742            0 :                         op = build_all_ones_cst (TREE_TYPE (vector_type));
   10743              :                       else
   10744            0 :                         gcc_unreachable ();
   10745              :                     }
   10746              :                   else
   10747        99775 :                     op = fold_unary (VIEW_CONVERT_EXPR,
   10748              :                                      TREE_TYPE (vector_type), op);
   10749        99826 :                   gcc_assert (op && CONSTANT_CLASS_P (op));
   10750              :                 }
   10751              :               else
   10752              :                 {
   10753       173451 :                   tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
   10754       173451 :                   gimple *init_stmt;
   10755       173451 :                   if (VECTOR_BOOLEAN_TYPE_P (vector_type))
   10756              :                     {
   10757          403 :                       tree true_val
   10758          403 :                         = build_all_ones_cst (TREE_TYPE (vector_type));
   10759          403 :                       tree false_val
   10760          403 :                         = build_zero_cst (TREE_TYPE (vector_type));
   10761          403 :                       gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
   10762          403 :                       init_stmt = gimple_build_assign (new_temp, COND_EXPR,
   10763              :                                                        op, true_val,
   10764              :                                                        false_val);
   10765              :                     }
   10766              :                   else
   10767              :                     {
   10768       173048 :                       op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
   10769              :                                    op);
   10770       173048 :                       init_stmt
   10771       173048 :                         = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
   10772              :                                                op);
   10773              :                     }
   10774       173451 :                   gimple_seq_add_stmt (&ctor_seq, init_stmt);
   10775       173451 :                   op = new_temp;
   10776              :                 }
   10777              :             }
   10778      1780094 :           elts[number_of_places_left_in_vector] = op;
   10779      1780094 :           if (!CONSTANT_CLASS_P (op))
   10780       314833 :             constant_p = false;
   10781              :           /* For BB vectorization we have to compute an insert location
   10782              :              when a def is inside the analyzed region since we cannot
   10783              :              simply insert at the BB start in this case.  */
   10784      1780094 :           stmt_vec_info opdef;
   10785      1780094 :           if (TREE_CODE (orig_op) == SSA_NAME
   10786       181014 :               && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
   10787       161182 :               && is_a <bb_vec_info> (vinfo)
   10788      1883611 :               && (opdef = vinfo->lookup_def (orig_op)))
   10789              :             {
   10790        84671 :               if (!insert_after)
   10791              :                 insert_after = opdef;
   10792              :               else
   10793        46730 :                 insert_after = get_later_stmt (insert_after, opdef);
   10794              :             }
   10795              : 
   10796      1780094 :           if (number_of_places_left_in_vector == 0)
   10797              :             {
   10798       617889 :               auto type_nunits = TYPE_VECTOR_SUBPARTS (vector_type);
   10799       617889 :               if (uniform_elt)
   10800       645152 :                 vec_cst = gimple_build_vector_from_val (&ctor_seq, vector_type,
   10801       322576 :                                                         elts[0]);
   10802       590626 :               else if (constant_p
   10803       590626 :                        ? multiple_p (type_nunits, nunits)
   10804       108388 :                        : known_eq (type_nunits, nunits))
   10805       295313 :                 vec_cst = gimple_build_vector (&ctor_seq, &elts);
   10806              :               else
   10807              :                 {
   10808            0 :                   if (permute_results.is_empty ())
   10809            0 :                     duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
   10810              :                                               elts, number_of_vectors,
   10811              :                                               permute_results);
   10812            0 :                   vec_cst = permute_results[number_of_vectors - j - 1];
   10813              :                 }
   10814       617889 :               if (!gimple_seq_empty_p (ctor_seq))
   10815              :                 {
   10816       135485 :                   if (insert_after)
   10817              :                     {
   10818        37941 :                       gimple_stmt_iterator gsi;
   10819        37941 :                       if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
   10820              :                         {
   10821          592 :                           gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
   10822          592 :                           gsi_insert_seq_before (&gsi, ctor_seq,
   10823              :                                                  GSI_CONTINUE_LINKING);
   10824              :                         }
   10825        37349 :                       else if (!stmt_ends_bb_p (insert_after->stmt))
   10826              :                         {
   10827        37349 :                           gsi = gsi_for_stmt (insert_after->stmt);
   10828        37349 :                           gsi_insert_seq_after (&gsi, ctor_seq,
   10829              :                                                 GSI_CONTINUE_LINKING);
   10830              :                         }
   10831              :                       else
   10832              :                         {
   10833              :                           /* When we want to insert after a def where the
   10834              :                              defining stmt throws then insert on the fallthru
   10835              :                              edge.  */
   10836            0 :                           edge e = find_fallthru_edge
   10837            0 :                                      (gimple_bb (insert_after->stmt)->succs);
   10838            0 :                           basic_block new_bb
   10839            0 :                             = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
   10840            0 :                           gcc_assert (!new_bb);
   10841              :                         }
   10842              :                     }
   10843              :                   else
   10844        97544 :                     vinfo->insert_seq_on_entry (NULL, ctor_seq);
   10845       135485 :                   ctor_seq = NULL;
   10846              :                 }
   10847       617889 :               voprnds.quick_push (vec_cst);
   10848       617889 :               insert_after = NULL;
   10849       617889 :               number_of_places_left_in_vector = nunits;
   10850       617889 :               constant_p = true;
   10851       617889 :               elts.new_vector (vector_type, nunits, 1);
   10852       617889 :               elts.quick_grow (nunits);
   10853              :             }
   10854              :         }
   10855              :     }
   10856              : 
   10857              :   /* Since the vectors are created in the reverse order, we should invert
   10858              :      them.  */
   10859       488821 :   vec_num = voprnds.length ();
   10860      1106710 :   for (j = vec_num; j != 0; j--)
   10861              :     {
   10862       617889 :       vop = voprnds[j - 1];
   10863       617889 :       SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
   10864              :     }
   10865              : 
   10866              :   /* In case that VF is greater than the unrolling factor needed for the SLP
   10867              :      group of stmts, NUMBER_OF_VECTORS to be created is greater than
   10868              :      NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
   10869              :      to replicate the vectors.  */
   10870       488821 :   while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
   10871       488821 :     for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
   10872              :          i++)
   10873            0 :       SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
   10874       488821 : }
   10875              : 
   10876              : /* Get the scalar definition of the Nth lane from SLP_NODE or NULL_TREE
   10877              :    if there is no definition for it in the scalar IL or it is not known.
                      :    For a vect_internal_def node the definition is the LHS of the Nth
                      :    scalar stmt; NULL_TREE is returned when the node has no scalar
                      :    stmts or when no stmt is recorded for lane N.  For any other def
                      :    type the Nth entry of SLP_TREE_SCALAR_OPS is returned as is.  */
   10878              : 
   10879              : tree
   10880         2665 : vect_get_slp_scalar_def (slp_tree slp_node, unsigned n)
   10881              : {
   10882         2665 :   if (SLP_TREE_DEF_TYPE (slp_node) == vect_internal_def)
   10883              :     {
   10884         2653 :       if (!SLP_TREE_SCALAR_STMTS (slp_node).exists ())
   10885              :         return NULL_TREE;
   10886         2653 :       stmt_vec_info def = SLP_TREE_SCALAR_STMTS (slp_node)[n];
   10887         2653 :       if (!def)
   10888              :         return NULL_TREE;
   10889         2653 :       return gimple_get_lhs (STMT_VINFO_STMT (def));
   10890              :     }
   10891              :   else
   10892           12 :     return SLP_TREE_SCALAR_OPS (slp_node)[n];
   10893              : }
   10894              : 
   10895              : /* Get the Ith vectorized definition from SLP_NODE, that is element I
                      :    of SLP_TREE_VEC_DEFS (SLP_NODE).  */
   10896              : 
   10897              : tree
   10898       145197 : vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
   10899              : {
   10900       145197 :   return SLP_TREE_VEC_DEFS (slp_node)[i];
   10901              : }
   10902              : 
   10903              : /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS.  *VEC_DEFS
                      :    is created here with room for exactly the number of entries in
                      :    SLP_TREE_VEC_DEFS (SLP_NODE), which are then spliced into it.  */
   10904              : 
   10905              : void
   10906       926415 : vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
   10907              : {
   10908      1852830 :   vec_defs->create (SLP_TREE_VEC_DEFS (slp_node).length ());
   10909       926415 :   vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
   10910       926415 : }
   10911              : 
   10912              : /* Get N vectorized definitions for SLP_NODE.  For each of the first N
                      :    children of SLP_NODE a new vector holding that child's vectorized
                      :    definitions is pushed onto *VEC_OPRNDS.  N == -1U selects all
                      :    children.  The vec_info argument is unused.
                      :    NOTE(review): the results are added with quick_push, so
                      :    *VEC_OPRNDS presumably needs space reserved by the caller --
                      :    confirm against callers.  */
   10913              : 
   10914              : void
   10915         2965 : vect_get_slp_defs (vec_info *,
   10916              :                    slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
   10917              : {
   10918         2965 :   if (n == -1U)
   10919         2965 :     n = SLP_TREE_CHILDREN (slp_node).length ();
   10920              : 
   10921        10681 :   for (unsigned i = 0; i < n; ++i)
   10922              :     {
   10923         7716 :       slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
   10924         7716 :       vec<tree> vec_defs = vNULL;
   10925         7716 :       vect_get_slp_defs (child, &vec_defs);
   10926         7716 :       vec_oprnds->quick_push (vec_defs);
   10927              :     }
   10928         2965 : }
   10929              : 
   10930              : /* A subroutine of vect_transform_slp_perm_load with two extra arguments:
   10931              :    - PERM gives the permutation that the caller wants to use for NODE,
   10932              :      which might be different from SLP_LOAD_PERMUTATION.
   10933              :    - DUMP_P controls whether the function dumps information.
                      :
                      :    Return false if the permutation cannot be handled: when a mask has
                      :    to be built per vector statement but the number of vector elements
                      :    or VF is not constant, when a single permute would need three or
                      :    more input vectors, or when can_vec_perm_const_p rejects a mask.
                      :    The latter two are asserted to only happen with ANALYZE_ONLY.
                      :    Return true otherwise.
                      :
                      :    *N_PERMS is reset to zero and counts the non-identity permutes
                      :    required.  If N_LOADS is nonnull, *N_LOADS is set to NSTMTS when
                      :    one mask serves all vector statements, otherwise to the number of
                      :    CONST_NUNITS-sized groups of input lanes with at least one lane
                      :    used.  If DCE_CHAIN, the definitions of the DR_CHAIN entries not
                      :    recorded as permute inputs are removed before returning true.
                      :    NOTE(review): inputs are only recorded when !ANALYZE_ONLY, so
                      :    DCE_CHAIN is presumably only requested for the transform --
                      :    confirm against callers.  */
   10934              : 
   10935              : static bool
   10936       129718 : vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node,
   10937              :                                 load_permutation_t &perm,
   10938              :                                 const vec<tree> &dr_chain,
   10939              :                                 gimple_stmt_iterator *gsi, poly_uint64 vf,
   10940              :                                 bool analyze_only, bool dump_p,
   10941              :                                 unsigned *n_perms, unsigned int *n_loads,
   10942              :                                 bool dce_chain)
   10943              : {
   10944       129718 :   stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
   10945       129718 :   int vec_index = 0;
   10946       129718 :   tree vectype = SLP_TREE_VECTYPE (node);
   10947       129718 :   unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
   10948       129718 :   unsigned int mask_element;
   10949       129718 :   unsigned dr_group_size;
   10950       129718 :   machine_mode mode;
   10951              : 
   10952       129718 :   if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
   10953              :     {
   10954              :       /* We have both splats of the same non-grouped load and groups
   10955              :          of distinct invariant loads entering here.  Size the group by
                      :          the largest index PERM refers to.  */
   10956         1603 :       unsigned max_idx = 0;
   10957         8819 :       for (auto idx : perm)
   10958         4010 :         max_idx = idx > max_idx ? idx : max_idx;
   10959         1603 :       dr_group_size = max_idx + 1;
   10960              :     }
   10961              :   else
   10962              :     {
   10963       128115 :       stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
   10964       128115 :       dr_group_size = DR_GROUP_SIZE (stmt_info);
   10965              :     }
   10966              : 
   10967       129718 :   mode = TYPE_MODE (vectype);
   10968       129718 :   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
   10969       129718 :   unsigned int nstmts = vect_get_num_copies (vinfo, node);
   10970              : 
   10971              :   /* Initialize the vect stmts of NODE to properly insert the generated
   10972              :      stmts later.  */
   10973       129718 :   if (! analyze_only)
   10974        56993 :     for (unsigned i = SLP_TREE_VEC_DEFS (node).length (); i < nstmts; i++)
   10975        21997 :       SLP_TREE_VEC_DEFS (node).quick_push (NULL_TREE);
   10976              : 
   10977              :   /* Generate permutation masks for every NODE. Number of masks for each NODE
   10978              :      is equal to GROUP_SIZE.
   10979              :      E.g., we have a group of three nodes with three loads from the same
   10980              :      location in each node, and the vector size is 4. I.e., we have a
   10981              :      a0b0c0a1b1c1... sequence and we need to create the following vectors:
   10982              :      for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
   10983              :      for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
   10984              :      ...
   10985              : 
   10986              :      The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
   10987              :      The last mask is illegal since we assume two operands for permute
   10988              :      operation, and the mask element values can't be outside that range.
   10989              :      Hence, the last mask must be converted into {2,5,5,5}.
   10990              :      For the first two permutations we need the first and the second input
   10991              :      vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
   10992              :      we need the second and the third vectors: {b1,c1,a2,b2} and
   10993              :      {c2,a3,b3,c3}.  */
   10994              : 
   10995       129718 :   int vect_stmts_counter = 0;
   10996       129718 :   unsigned int index = 0;
   10997       129718 :   int first_vec_index = -1;
   10998       129718 :   int second_vec_index = -1;
   10999       129718 :   bool noop_p = true;
   11000       129718 :   *n_perms = 0;
   11001              : 
   11002       129718 :   vec_perm_builder mask;
   11003       129718 :   unsigned int nelts_to_build;
   11004       129718 :   unsigned int nvectors_per_build;
   11005       129718 :   unsigned int in_nlanes;
   11006       129718 :   bool repeating_p = (group_size == dr_group_size
   11007       164787 :                       && multiple_p (nunits, group_size));
   11008       129718 :   if (repeating_p)
   11009              :     {
   11010              :       /* A single vector contains a whole number of copies of the node, so:
   11011              :          (a) all permutes can use the same mask; and
   11012              :          (b) the permutes only need a single vector input.  */
   11013        32843 :       mask.new_vector (nunits, group_size, 3);
   11014        32843 :       nelts_to_build = mask.encoded_nelts ();
   11015              :       /* It's possible to obtain zero nstmts during analyze_only, so make
   11016              :          it at least one to ensure the later computation for n_perms
   11017              :          proceeds.  */
   11018        32843 :       nvectors_per_build = nstmts > 0 ? nstmts : 1;
   11019        32843 :       in_nlanes = dr_group_size * 3;
   11020              :     }
   11021              :   else
   11022              :     {
   11023              :       /* We need to construct a separate mask for each vector statement.  */
   11024        96875 :       unsigned HOST_WIDE_INT const_nunits, const_vf;
   11025        96875 :       if (!nunits.is_constant (&const_nunits)
   11026        96875 :           || !vf.is_constant (&const_vf))
   11027              :         return false;
   11028        96875 :       mask.new_vector (const_nunits, const_nunits, 1);
   11029        96875 :       nelts_to_build = const_vf * group_size;
   11030        96875 :       nvectors_per_build = 1;
   11031        96875 :       in_nlanes = const_vf * dr_group_size;
   11032              :     }
   11033       129718 :   auto_sbitmap used_in_lanes (in_nlanes);
   11034       129718 :   bitmap_clear (used_in_lanes);
   11035       129718 :   auto_bitmap used_defs;
   11036              : 
   11037       129718 :   unsigned int count = mask.encoded_nelts ();
   11038       129718 :   mask.quick_grow (count);
   11039       129718 :   vec_perm_indices indices;
   11040              : 
   11041       689367 :   for (unsigned int j = 0; j < nelts_to_build; j++)
   11042              :     {
   11043       569265 :       unsigned int iter_num = j / group_size;
   11044       569265 :       unsigned int stmt_num = j % group_size;
   11045       569265 :       unsigned int i = (iter_num * dr_group_size + perm[stmt_num]);
   11046       569265 :       bitmap_set_bit (used_in_lanes, i);
   11047       569265 :       if (repeating_p)
   11048              :         {
   11049              :           first_vec_index = 0;
   11050              :           mask_element = i;
   11051              :         }
   11052              :       else
   11053              :         {
   11054              :           /* Enforced before the loop when !repeating_p.  */
   11055       358797 :           unsigned int const_nunits = nunits.to_constant ();
   11056       358797 :           vec_index = i / const_nunits;
   11057       358797 :           mask_element = i % const_nunits;
   11058       358797 :           if (vec_index == first_vec_index
   11059       358797 :               || first_vec_index == -1)
   11060              :             {
   11061              :               first_vec_index = vec_index;
   11062              :             }
   11063       143598 :           else if (vec_index == second_vec_index
   11064       143598 :                    || second_vec_index == -1)
   11065              :             {
   11066       137505 :               second_vec_index = vec_index;
   11067       137505 :               mask_element += const_nunits;
   11068              :             }
   11069              :           else
   11070              :             {
   11071         6093 :               if (dump_p)
   11072          280 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   11073              :                                  "permutation requires at "
   11074              :                                  "least three vectors %G",
   11075              :                                  stmt_info->stmt);
   11076         6093 :               gcc_assert (analyze_only);
   11077              :               return false;
   11078              :             }
   11079              : 
   11080       352704 :           gcc_assert (mask_element < 2 * const_nunits);
   11081              :         }
   11082              : 
   11083       563172 :       if (mask_element != index)
   11084       362611 :         noop_p = false;
   11085       563172 :       mask[index++] = mask_element;
   11086              : 
   11087       563172 :       if (index == count)
   11088              :         {
   11089       152983 :           if (!noop_p)
   11090              :             {
   11091       210392 :               indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
   11092       124574 :               if (!can_vec_perm_const_p (mode, mode, indices))
   11093              :                 {
   11094         3523 :                   if (dump_p)
   11095              :                     {
   11096           79 :                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   11097              :                                        "unsupported vect permute { ");
   11098          669 :                       for (i = 0; i < count; ++i)  /* Reuses the lane index I; we return right below.  */
   11099              :                         {
   11100          590 :                           dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
   11101          590 :                           dump_printf (MSG_MISSED_OPTIMIZATION, " ");
   11102              :                         }
   11103           79 :                       dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
   11104              :                     }
   11105         3523 :                   gcc_assert (analyze_only);
   11106              :                   return false;
   11107              :                 }
   11108              : 
   11109       121051 :               tree mask_vec = NULL_TREE;
   11110       121051 :               if (!analyze_only)
   11111        20320 :                 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
   11112              : 
   11113       121051 :               if (second_vec_index == -1)
   11114        36812 :                 second_vec_index = first_vec_index;
   11115              : 
   11116       244976 :               for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
   11117              :                 {
   11118       123925 :                   ++*n_perms;
   11119       123925 :                   if (analyze_only)
   11120       103323 :                     continue;
   11121              :                   /* Generate the permute statement if necessary.  */
   11122        20602 :                   tree first_vec = dr_chain[first_vec_index + ri];
   11123        20602 :                   tree second_vec = dr_chain[second_vec_index + ri];
   11124        20602 :                   gassign *stmt = as_a<gassign *> (stmt_info->stmt);
   11125        20602 :                   tree perm_dest
   11126        20602 :                     = vect_create_destination_var (gimple_assign_lhs (stmt),
   11127              :                                                    vectype);
   11128        20602 :                   perm_dest = make_ssa_name (perm_dest);
   11129        20602 :                   gimple *perm_stmt
   11130        20602 :                     = gimple_build_assign (perm_dest, VEC_PERM_EXPR, first_vec,
   11131              :                                            second_vec, mask_vec);
   11132        20602 :                   vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
   11133              :                                                gsi);
   11134        20602 :                   if (dce_chain)
   11135              :                     {
   11136        19913 :                       bitmap_set_bit (used_defs, first_vec_index + ri);
   11137        19913 :                       bitmap_set_bit (used_defs, second_vec_index + ri);
   11138              :                     }
   11139              : 
   11140              :                   /* Store the vector statement in NODE.  */
   11141        20602 :                   SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = perm_dest;
   11142              :                 }
   11143              :             }
   11144        28409 :           else if (!analyze_only)
   11145              :             {
   11146         2790 :               for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
   11147              :                 {
   11148         1395 :                   tree first_vec = dr_chain[first_vec_index + ri];
   11149              :                   /* The permutation is a no-op (NOOP_P), so the input
   11150              :                      vector itself serves as the result.  */
   11151         1395 :                   if (dce_chain)
   11152         1388 :                     bitmap_set_bit (used_defs, first_vec_index + ri);
   11153              : 
   11154              :                   /* Store the vector statement in NODE.  */
   11155         1395 :                   SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = first_vec;
   11156              :                 }
   11157              :             }
   11158              : 
   11159              :           index = 0;
   11160              :           first_vec_index = -1;
   11161              :           second_vec_index = -1;
   11162              :           noop_p = true;
   11163              :         }
   11164              :     }
   11165              : 
   11166       120102 :   if (n_loads)
   11167              :     {
   11168        81631 :       if (repeating_p)
   11169        10774 :         *n_loads = nstmts;
   11170              :       else
   11171              :         {
   11172              :           /* Enforced above when !repeating_p.  */
   11173        70857 :           unsigned int const_nunits = nunits.to_constant ();
   11174        70857 :           *n_loads = 0;
   11175        70857 :           bool load_seen = false;
   11176       990289 :           for (unsigned i = 0; i < in_nlanes; ++i)
   11177              :             {
   11178       919432 :               if (i % const_nunits == 0)
   11179              :                 {
   11180       389136 :                   if (load_seen)
   11181       110468 :                     *n_loads += 1;
   11182              :                   load_seen = false;
   11183              :                 }
   11184       919432 :               if (bitmap_bit_p (used_in_lanes, i))
   11185       253112 :                 load_seen = true;
   11186              :             }
   11187        70857 :           if (load_seen)
   11188        48327 :             *n_loads += 1;
   11189              :         }
   11190              :     }
   11191              : 
   11192       120102 :   if (dce_chain)
   11193       218550 :     for (unsigned i = 0; i < dr_chain.length (); ++i)
   11194        71890 :       if (!bitmap_bit_p (used_defs, i))
   11195              :         {
   11196        39337 :           tree def = dr_chain[i];
   11197        39681 :           do
   11198              :             {
   11199        39681 :               gimple *stmt = SSA_NAME_DEF_STMT (def);
   11200        39681 :               if (is_gimple_assign (stmt)
   11201        39681 :                   && (gimple_assign_rhs_code (stmt) == VIEW_CONVERT_EXPR
   11202        39681 :                       || gimple_assign_rhs_code (stmt) == CONSTRUCTOR))
   11203         4913 :                 def = single_ssa_tree_operand (stmt, SSA_OP_USE);
   11204              :               else
   11205              :                 def = NULL;
   11206        39681 :               gimple_stmt_iterator rgsi = gsi_for_stmt (stmt);
   11207        39681 :               gsi_remove (&rgsi, true);
   11208        39681 :               release_defs (stmt);
   11209              :             }
   11210        39681 :           while (def);
   11211              :         }
   11212              : 
   11213              :   return true;
   11214       129718 : }
   11215              : 
   11216              : /* Generate vector permute statements from a list of loads in DR_CHAIN.
   11217              :    If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
   11218              :    permute statements for the SLP node NODE.  Store the number of vector
   11219              :    permute instructions in *N_PERMS and the number of vector load
   11220              :    instructions in *N_LOADS.  If DCE_CHAIN is true, remove all definitions
   11221              :    that were not needed.
                      :
                      :    This forwards to vect_transform_slp_perm_load_1, using
                      :    SLP_TREE_LOAD_PERMUTATION (NODE) as the permutation and
                      :    dump_enabled_p () to decide about dumping, and returns its
                      :    result.  */
   11222              : 
   11223              : bool
   11224        90312 : vect_transform_slp_perm_load (vec_info *vinfo,
   11225              :                               slp_tree node, const vec<tree> &dr_chain,
   11226              :                               gimple_stmt_iterator *gsi, poly_uint64 vf,
   11227              :                               bool analyze_only, unsigned *n_perms,
   11228              :                               unsigned int *n_loads, bool dce_chain)
   11229              : {
   11230        90312 :   return vect_transform_slp_perm_load_1 (vinfo, node,
   11231        90312 :                                          SLP_TREE_LOAD_PERMUTATION (node),
   11232              :                                          dr_chain, gsi, vf, analyze_only,
   11233              :                                          dump_enabled_p (), n_perms, n_loads,
   11234        90312 :                                          dce_chain);
   11235              : }
   11236              : 
   11237              : /* Produce the next vector result for SLP permutation NODE by adding a vector
   11238              :    statement at GSI.  If MASK_VEC is nonnull, add:
   11239              : 
   11240              :       <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC>
   11241              : 
   11242              :    otherwise add:
   11243              : 
   11244              :       <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF,
   11245              :                                       { N, N+1, N+2, ... }>
   11246              : 
   11247              :    where N == IDENTITY_OFFSET which is either zero or equal to the
   11248              :    number of elements of the result.  In the identity case no VEC_PERM_EXPR is actually emitted: the result is a plain copy, a BIT_FIELD_REF extract or a two-element CONSTRUCTOR, as chosen below.  The new statement is recorded as the next vector def of NODE.  */
   11249              : 
   11250              : static void
   11251        31239 : vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
   11252              :                           slp_tree node, tree first_def, tree second_def,
   11253              :                           tree mask_vec, poly_uint64 identity_offset)
   11254              : {
   11255        31239 :   tree vectype = SLP_TREE_VECTYPE (node);  /* Type of the generated result.  */
   11256              : 
   11257              :   /* ???  We SLP match existing vector element extracts but
   11258              :      allow punning which we need to re-instantiate at uses
   11259              :      but have no good way of explicitly representing.  */
   11260        31239 :   if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)), TYPE_SIZE (vectype))
   11261        31239 :       && !types_compatible_p (TREE_TYPE (first_def), vectype))
   11262              :     {
   11263           14 :       gassign *conv_stmt
   11264           14 :         = gimple_build_assign (make_ssa_name (vectype),
   11265              :                                build1 (VIEW_CONVERT_EXPR, vectype, first_def));
   11266           14 :       vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
   11267           14 :       first_def = gimple_assign_lhs (conv_stmt);  /* Use the converted value from here on.  */
   11268              :     }
   11269        31239 :   gassign *perm_stmt;
   11270        31239 :   tree perm_dest = make_ssa_name (vectype);
   11271        31239 :   if (mask_vec)
   11272              :     {
   11273        27933 :       if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)),  /* NOTE(review): the size test inspects FIRST_DEF while the compatibility test and the conversion below act on SECOND_DEF; confirm this is intended.  */
   11274        27933 :                            TYPE_SIZE (vectype))
   11275        27933 :           && !types_compatible_p (TREE_TYPE (second_def), vectype))
   11276              :         {
   11277            8 :           gassign *conv_stmt
   11278            8 :             = gimple_build_assign (make_ssa_name (vectype),
   11279              :                                    build1 (VIEW_CONVERT_EXPR,
   11280              :                                            vectype, second_def));
   11281            8 :           vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
   11282            8 :           second_def = gimple_assign_lhs (conv_stmt);
   11283              :         }
   11284        27933 :       perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
   11285              :                                        first_def, second_def,
   11286              :                                        mask_vec);
   11287              :     }
   11288              :   else
   11289              :     {
   11290         3306 :       auto def_nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (first_def));  /* Element count of FIRST_DEF's type.  */
   11291         3306 :       unsigned HOST_WIDE_INT vecno;
   11292         3306 :       poly_uint64 eltno;
   11293         3306 :       if (!can_div_trunc_p (poly_uint64 (identity_offset), def_nunits,  /* Presumably yields quotient VECNO and remainder ELTNO of IDENTITY_OFFSET / DEF_NUNITS.  */
   11294              :                             &vecno, &eltno))
   11295              :         gcc_unreachable ();
   11296         3306 :       tree def = vecno & 1 ? second_def : first_def;  /* An odd VECNO selects SECOND_DEF.  */
   11297         3306 :       if (!types_compatible_p (TREE_TYPE (def), vectype))
   11298              :         {
   11299              :           /* For identity permutes we still need to handle the case
   11300              :              of offset extracts or concats.  */
   11301          261 :           unsigned HOST_WIDE_INT c;
   11302          261 :           if (known_le (TYPE_VECTOR_SUBPARTS (vectype), def_nunits))  /* Result has no more elements than DEF: extract a piece of DEF.  */
   11303              :             {
   11304          257 :               unsigned HOST_WIDE_INT elsz
   11305          257 :                 = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (def))));
   11306          514 :               tree lowpart = build3 (BIT_FIELD_REF, vectype, def,
   11307          257 :                                      TYPE_SIZE (vectype),
   11308          257 :                                      bitsize_int (eltno * elsz));  /* Position ELTNO * ELSZ, i.e. ELTNO elements into DEF.  */
   11309          257 :               perm_stmt = gimple_build_assign (perm_dest, lowpart);
   11310              :             }
   11311            4 :           else if (constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype),
   11312            4 :                                         def_nunits, &c) && c == 2)  /* Result has exactly twice the elements: concatenate both defs.  */
   11313              :             {
   11314            4 :               gcc_assert (known_eq (identity_offset, 0U));
   11315            4 :               tree ctor = build_constructor_va (vectype, 2,
   11316              :                                                 NULL_TREE, first_def,
   11317              :                                                 NULL_TREE, second_def);
   11318            4 :               perm_stmt = gimple_build_assign (perm_dest, ctor);
   11319              :             }
   11320              :           else
   11321            0 :             gcc_unreachable ();
   11322              :         }
   11323              :       else
   11324              :         {
   11325              :           /* We need a copy here in case the def was external.  */
   11326         3045 :           gcc_assert (known_eq (eltno, 0U));
   11327         3045 :           perm_stmt = gimple_build_assign (perm_dest, def);
   11328              :         }
   11329              :     }
   11330        31239 :   vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
   11331              :   /* Store the vector statement in NODE.  */
   11332        31239 :   node->push_vec_def (perm_stmt);
   11333        31239 : }
   11334              : 
   11335              : /* Subroutine of vectorizable_slp_permutation.  Check whether the target
   11336              :    can perform permutation PERM on the (1 or 2) input nodes in CHILDREN.
   11337              :    If GSI is nonnull, emit the permutation there.
   11338              : 
   11339              :    When GSI is null, the only purpose of NODE is to give properties
   11340              :    of the result, such as the vector type and number of SLP lanes.
   11341              :    The node does not need to be a VEC_PERM_EXPR.
   11342              : 
   11343              :    If the target supports the operation, return the number of individual
   11344              :    VEC_PERM_EXPRs needed, otherwise return -1.  Print information to the
   11345              :    dump file if DUMP_P is true.  Identity permutes are not counted, and a node with ldst_lanes set always yields 0.  */
   11346              : 
   11347              : static int
   11348       488709 : vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
   11349              :                                 slp_tree node, lane_permutation_t &perm,
   11350              :                                 vec<slp_tree> &children, bool dump_p)
   11351              : {
   11352       488709 :   tree vectype = SLP_TREE_VECTYPE (node);
   11353              : 
   11354              :   /* ???  We currently only support all same vector input types
   11355              :      while the SLP IL should really do a concat + select and thus accept
   11356              :      arbitrary mismatches.  */
   11357       488709 :   slp_tree child;
   11358       488709 :   unsigned i;
   11359       488709 :   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);  /* Element count of the result vector type.  */
   11360       488709 :   bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node));  /* Tentative; cleared below whenever a further requirement fails.  */
   11361              :   /* True if we're permuting a single input of 2N vectors down
   11362              :      to N vectors.  This case doesn't generalize beyond 2 since
   11363              :      VEC_PERM_EXPR only takes 2 inputs.  */
   11364       488709 :   bool pack_p = false;
   11365              :   /* If we're permuting inputs of N vectors each into X*N outputs,
   11366              :      this is the value of X, otherwise it is 1.  */
   11367       488709 :   unsigned int unpack_factor = 1;
   11368       488709 :   tree op_vectype = NULL_TREE;
   11369       490276 :   FOR_EACH_VEC_ELT (children, i, child)  /* Take the vector type of the first child that has one.  */
   11370       490201 :     if (SLP_TREE_VECTYPE (child))
   11371              :       {
   11372              :         op_vectype = SLP_TREE_VECTYPE (child);
   11373              :         break;
   11374              :       }
   11375       488709 :   if (!op_vectype)
   11376           75 :     op_vectype = vectype;  /* No child has a vector type; fall back to the result type.  */
   11377      1061557 :   FOR_EACH_VEC_ELT (children, i, child)
   11378              :     {
   11379       572848 :       if ((SLP_TREE_DEF_TYPE (child) != vect_internal_def
   11380        10464 :            && !vect_maybe_update_slp_op_vectype (child, op_vectype))
   11381       572848 :           || !types_compatible_p (SLP_TREE_VECTYPE (child), op_vectype)
   11382      1145696 :           || !types_compatible_p (TREE_TYPE (vectype), TREE_TYPE (op_vectype)))
   11383              :         {
   11384            0 :           if (dump_p)
   11385            0 :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   11386              :                              "Unsupported vector types in lane permutation\n");
   11387            0 :           return -1;
   11388              :         }
   11389       572848 :       auto op_nunits = TYPE_VECTOR_SUBPARTS (op_vectype);
   11390       572848 :       unsigned int this_unpack_factor;
   11391              :       /* Detect permutations of external, pre-existing vectors.  The external
   11392              :          node's SLP_TREE_LANES stores the total number of units in the vector,
   11393              :          or zero if the vector has variable length.
   11394              : 
   11395              :          We are expected to keep the original VEC_PERM_EXPR for such cases.
   11396              :          There is no repetition to model.  */
   11397       572848 :       if (SLP_TREE_DEF_TYPE (child) == vect_external_def
   11398       572848 :           && SLP_TREE_SCALAR_OPS (child).is_empty ())
   11399              :         repeating_p = false;
   11400              :       /* Check whether the input has twice as many lanes per vector.  */
   11401       564947 :       else if (children.length () == 1
   11402       564947 :                && known_eq (SLP_TREE_LANES (child) * nunits,
   11403              :                             SLP_TREE_LANES (node) * op_nunits * 2))
   11404              :         pack_p = true;
   11405              :       /* Check whether the output has N times as many lanes per vector.  */
   11406       572848 :       else if (constant_multiple_p (SLP_TREE_LANES (node) * op_nunits,
   11407       521147 :                                     SLP_TREE_LANES (child) * nunits,
   11408              :                                     &this_unpack_factor)
   11409       486186 :                && (i == 0 || unpack_factor == this_unpack_factor))  /* Every child must agree on the factor.  */
   11410              :         unpack_factor = this_unpack_factor;
   11411              :       else
   11412              :         repeating_p = false;
   11413              :     }
   11414              : 
   11415       977418 :   gcc_assert (perm.length () == SLP_TREE_LANES (node));
   11416              : 
   11417              :   /* Load-lanes permute.  This permute only acts as a forwarder to
   11418              :      select the correct vector def of the load-lanes load which
   11419              :      has the permuted vectors in its vector defs like
   11420              :      { v0, w0, r0, v1, w1, r1 ... } for a ld3.  All costs are
   11421              :      accounted for in the costing for the actual load so we
   11422              :      return zero here.  */
   11423       488709 :   if (node->ldst_lanes)
   11424              :     {
   11425            0 :       gcc_assert (children.length () == 1);
   11426            0 :       if (!gsi)
   11427              :         /* This is a trivial op always supported.  */
   11428              :         return 0;
   11429            0 :       slp_tree child = children[0];
   11430            0 :       unsigned vec_idx = (SLP_TREE_LANE_PERMUTATION (node)[0].second
   11431            0 :                           / SLP_TREE_LANES (node));
   11432            0 :       unsigned vec_num = SLP_TREE_LANES (child) / SLP_TREE_LANES (node);  /* Stride between the child vector defs picked for successive outputs.  */
   11433            0 :       unsigned nvectors = vect_get_num_copies (vinfo, node);
   11434            0 :       for (unsigned i = 0; i < nvectors; ++i)
   11435              :         {
   11436            0 :           tree def = SLP_TREE_VEC_DEFS (child)[i * vec_num  + vec_idx];
   11437            0 :           node->push_vec_def (def);  /* Forward the child's def; no new statement is built.  */
   11438              :         }
   11439              :       return 0;
   11440              :     }
   11441              : 
   11442              :   /* Set REPEATING_P to true if the permutations are cyclical wrt UNPACK_FACTOR
   11443              :      and if we can generate the vectors in a vector-length agnostic way.
   11444              :      This requires UNPACK_STEP == NUNITS / UNPACK_FACTOR to be known at
   11445              :      compile time.
   11446              : 
   11447              :      The significance of UNPACK_STEP is that, when PACK_P is false,
   11448              :      output vector I operates on a window of UNPACK_STEP elements from each
   11449              :      input, starting at lane UNPACK_STEP * (I % UNPACK_FACTOR).  For example,
   11450              :      when UNPACK_FACTOR is 2, the first output vector operates on lanes
   11451              :      [0, NUNITS / 2 - 1] of each input vector and the second output vector
   11452              :      operates on lanes [NUNITS / 2, NUNITS - 1] of each input vector.
   11453              : 
   11454              :      When REPEATING_P is true, NOUTPUTS holds the total number of outputs
   11455              :      that we actually need to generate.  */
   11456       488709 :   uint64_t noutputs = 0;
   11457       488709 :   poly_uint64 unpack_step = 0;
   11458       488709 :   loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo);
   11459       182184 :   if (!linfo
   11460       527855 :       || !multiple_p (nunits, unpack_factor, &unpack_step)
   11461       181247 :       || !constant_multiple_p (LOOP_VINFO_VECT_FACTOR (linfo)
   11462       181247 :                                * SLP_TREE_LANES (node), nunits, &noutputs))
   11463              :     repeating_p = false;  /* Requires a loop_vec_info, NUNITS being a multiple of UNPACK_FACTOR and a constant NOUTPUTS.  */
   11464              : 
   11465              :   /* We can handle the conditions described for REPEATING_P above for
   11466              :      both variable- and constant-length vectors.  The fallback requires
   11467              :      us to generate every element of every permute vector explicitly,
   11468              :      which is only possible for constant-length permute vectors.
   11469              : 
   11470              :      Set:
   11471              : 
   11472              :      - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
   11473              :        mask vectors that we want to build.
   11474              : 
   11475              :      - NCOPIES to the number of copies of PERM that we need in order
   11476              :        to build the necessary permute mask vectors.  */
   11477       181247 :   uint64_t npatterns;
   11478       181247 :   unsigned nelts_per_pattern;
   11479       181247 :   uint64_t ncopies;
   11480       181247 :   if (repeating_p)
   11481              :     {
   11482              :       /* We need permute mask vectors that have the form:
   11483              : 
   11484              :            { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }
   11485              : 
   11486              :          In other words, the original n-element permute in PERM is
   11487              :          "unrolled" to fill a full vector.  The stepped vector encoding
   11488              :          that we use for permutes requires 3n elements.  */
   11489       142101 :       npatterns = SLP_TREE_LANES (node);
   11490       142101 :       nelts_per_pattern = ncopies = 3;
   11491              :     }
   11492              :   else
   11493              :     {
   11494              :       /* Calculate every element of every permute mask vector explicitly,
   11495              :          instead of relying on the pattern described above.  */
   11496       346608 :       if (!nunits.is_constant (&npatterns)
   11497       346608 :           || !TYPE_VECTOR_SUBPARTS (op_vectype).is_constant ())
   11498              :         {
   11499              :           if (dump_p)
   11500              :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   11501              :                              "unsupported permutation %p on variable-length"
   11502              :                              " vectors\n", (void *) node);
   11503              :           return -1;
   11504              :         }
   11505       346608 :       nelts_per_pattern = ncopies = 1;
   11506       346608 :       if (linfo && !LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))  /* is_constant presumably stores the constant VF in NCOPIES on success.  */
   11507              :         {
   11508              :           if (dump_p)
   11509              :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   11510              :                              "unsupported permutation %p for variable VF\n",
   11511              :                              (void *) node);
   11512              :           return -1;
   11513              :         }
   11514              :       pack_p = false;  /* The explicit fallback uses neither packing nor unpacking.  */
   11515              :       unpack_factor = 1;
   11516              :     }
   11517       488709 :   unsigned olanes = unpack_factor * ncopies * SLP_TREE_LANES (node);  /* Number of VPERM entries built below.  NOTE(review): NCOPIES is uint64_t, so the product is narrowed to unsigned here.  */
   11518       488709 :   gcc_assert (repeating_p || multiple_p (olanes, nunits));
   11519              : 
   11520              :   /* Compute the { { SLP operand, vector index}, lane } permutation sequence
   11521              :      from the { SLP operand, scalar lane } permutation as recorded in the
   11522              :      SLP node as intermediate step.  This part should already work
   11523              :      with SLP children with arbitrary number of lanes.  */
   11524       488709 :   auto_vec<std::pair<std::pair<unsigned, unsigned>, poly_uint64>> vperm;
   11525       488709 :   auto_vec<poly_uint64> active_lane;
   11526       488709 :   vperm.create (olanes);
   11527       488709 :   active_lane.safe_grow_cleared (children.length (), true);
   11528       985645 :   for (unsigned int ui = 0; ui < unpack_factor; ++ui)
   11529              :     {
   11530      2172428 :       for (unsigned j = 0; j < children.length (); ++j)
   11531       589278 :         active_lane[j] = ui * unpack_step;  /* Restart every child at the beginning of window UI.  */
   11532      1394498 :       for (unsigned i = 0; i < ncopies; ++i)
   11533              :         {
   11534      5588748 :           for (unsigned pi = 0; pi < perm.length (); ++pi)
   11535              :             {
   11536      1896812 :               std::pair<unsigned, unsigned> p = perm[pi];
   11537      1896812 :               tree vtype = SLP_TREE_VECTYPE (children[p.first]);
   11538      1896812 :               if (repeating_p)
   11539       827706 :                 vperm.quick_push ({{p.first, 0},
   11540       827706 :                                    p.second + active_lane[p.first]});  /* The vector index stays 0; only the lane advances.  */
   11541              :               else
   11542              :                 {
   11543              :                   /* We checked above that the vectors are constant-length.  */
   11544      1069106 :                   unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype)
   11545      1069106 :                     .to_constant ();
   11546      1069106 :                   unsigned lane = active_lane[p.first].to_constant ();
   11547      1069106 :                   unsigned vi = (lane + p.second) / vnunits;  /* Which vector of the child.  */
   11548      1069106 :                   unsigned vl = (lane + p.second) % vnunits;  /* Which lane within that vector.  */
   11549      1069106 :                   vperm.quick_push ({{p.first, vi}, vl});
   11550              :                 }
   11551              :             }
   11552              :           /* Advance to the next group.  */
   11553      1951336 :           for (unsigned j = 0; j < children.length (); ++j)
   11554      1053774 :             active_lane[j] += SLP_TREE_LANES (children[j]);
   11555              :         }
   11556              :     }
   11557              : 
   11558       488709 :   if (dump_p)
   11559              :     {
   11560         8909 :       dump_printf_loc (MSG_NOTE, vect_location,
   11561              :                        "vectorizing permutation %p", (void *)node);
   11562        32209 :       for (unsigned i = 0; i < perm.length (); ++i)
   11563        23300 :         dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
   11564         8909 :       if (repeating_p)
   11565         7502 :         dump_printf (MSG_NOTE, " (repeat %d)", SLP_TREE_LANES (node));
   11566         8909 :       dump_printf (MSG_NOTE, "\n");
   11567         8909 :       dump_printf_loc (MSG_NOTE, vect_location, "as");
   11568        89301 :       for (unsigned i = 0; i < vperm.length (); ++i)
   11569              :         {
   11570        80392 :           if (i != 0  /* Print a separator at each NPATTERNS (repeating) or vector (explicit) boundary.  */
   11571        80392 :               && (repeating_p
   11572        54232 :                   ? multiple_p (i, npatterns)
   11573        59784 :                   : multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype))))
   11574        24113 :             dump_printf (MSG_NOTE, ",");
   11575        80392 :           dump_printf (MSG_NOTE, " vops%u[%u][",
   11576        80392 :                        vperm[i].first.first, vperm[i].first.second);
   11577        80392 :           dump_dec (MSG_NOTE, vperm[i].second);
   11578        80392 :           dump_printf (MSG_NOTE, "]");
   11579              :         }
   11580         8909 :       dump_printf (MSG_NOTE, "\n");
   11581              :     }
   11582              : 
   11583              :   /* We can only handle two-vector permutes, everything else should
   11584              :      be lowered on the SLP level.  The following is closely inspired
   11585              :      by vect_transform_slp_perm_load and is supposed to eventually
   11586              :      replace it.
   11587              :      ???  As an intermediate step do code-gen in the SLP tree representation
   11588              :      somehow?  */
   11589       488709 :   std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);  /* -1U marks "no input chosen yet".  */
   11590       488709 :   std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
   11591       488709 :   unsigned int index = 0;
   11592       488709 :   poly_uint64 mask_element;
   11593       488709 :   vec_perm_builder mask;
   11594       488709 :   mask.new_vector (nunits, npatterns, nelts_per_pattern);
   11595       488709 :   unsigned int count = mask.encoded_nelts ();
   11596       488709 :   mask.quick_grow (count);
   11597       488709 :   vec_perm_indices indices;
   11598       488709 :   unsigned nperms = 0;
   11599              :   /* When REPEATING_P is true, we only have UNPACK_FACTOR unique permute
   11600              :      vectors to check during analysis, but we need to generate NOUTPUTS
   11601              :      vectors during transformation.  */
   11602       488709 :   unsigned total_nelts = olanes;
   11603       488709 :   unsigned process_nelts = olanes;
   11604       488709 :   if (repeating_p)
   11605              :     {
   11606       142101 :       total_nelts = (total_nelts / unpack_factor) * noutputs;
   11607       142101 :       if (gsi)
   11608         9805 :         process_nelts = total_nelts;
   11609              :     }
   11610       488709 :   unsigned last_ei = (total_nelts - 1) % process_nelts;  /* Position of the very last element within a pass over PROCESS_NELTS elements.  */
   11611      2394807 :   for (unsigned i = 0; i < process_nelts; ++i)
   11612              :     {
   11613              :       /* VI is the input vector index when generating code for REPEATING_P.  */
   11614      1913436 :       unsigned vi = i / olanes * (pack_p ? 2 : 1);
   11615      1913436 :       unsigned ei = i % olanes;
   11616      1913436 :       mask_element = vperm[ei].second;
   11617      1913436 :       if (pack_p)
   11618              :         {
   11619              :           /* In this case, we have N outputs and the single child provides 2N
   11620              :              inputs.  Output X permutes inputs 2X and 2X+1.
   11621              : 
   11622              :              The mask indices are taken directly from the SLP permutation node.
   11623              :              Index X selects from the first vector if (X / NUNITS) % 2 == 0;
   11624              :              X selects from the second vector otherwise.  These conditions
   11625              :              are only known at compile time for constant-length vectors.  */
   11626              :           first_vec = std::make_pair (0, 0);
   11627              :           second_vec = std::make_pair (0, 1);
   11628              :         }
   11629      1744719 :       else if (first_vec.first == -1U
   11630      1744719 :                || first_vec == vperm[ei].first)
   11631      1512510 :         first_vec = vperm[ei].first;
   11632       232209 :       else if (second_vec.first == -1U
   11633       232209 :                || second_vec == vperm[ei].first)
   11634              :         {
   11635       231812 :           second_vec = vperm[ei].first;
   11636       231812 :           mask_element += nunits;  /* Offset the index by NUNITS so that it addresses the second input.  */
   11637              :         }
   11638              :       else
   11639              :         {
   11640          397 :           if (dump_p)
   11641            7 :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   11642              :                              "permutation requires at "
   11643              :                              "least three vectors\n");
   11644          397 :           gcc_assert (!gsi);  /* Failure is only permitted while analyzing.  */
   11645              :           return -1;
   11646              :         }
   11647              : 
   11648      1913039 :       mask[index++] = mask_element;
   11649              : 
   11650      1913039 :       if (index == count)  /* One complete encoded mask has been collected.  */
   11651              :         {
   11652       805465 :           indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
   11653              :                               TYPE_VECTOR_SUBPARTS (op_vectype));
   11654       631258 :           bool identity_p = (indices.series_p (0, 1, mask[0], 1)  /* Presumably: the mask is a unit-stride series starting at MASK[0] ...  */
   11655       965662 :                              && constant_multiple_p (mask[0], nunits));  /* ... and MASK[0] is a constant multiple of NUNITS.  */
   11656       631258 :           machine_mode vmode = TYPE_MODE (vectype);
   11657       631258 :           machine_mode op_vmode = TYPE_MODE (op_vectype);
   11658       631258 :           unsigned HOST_WIDE_INT c;
   11659       631258 :           if ((!identity_p
   11660       587934 :                && !can_vec_perm_const_p (vmode, op_vmode, indices))
   11661       631258 :               || (identity_p  /* An identity permute is accepted only if the result has no more elements than the inputs or exactly twice as many.  */
   11662        43324 :                   && !known_le (nunits,
   11663              :                                 TYPE_VECTOR_SUBPARTS (op_vectype))
   11664         6949 :                   && (!constant_multiple_p (nunits,
   11665            8 :                                             TYPE_VECTOR_SUBPARTS (op_vectype),
   11666            8 :                                             &c) || c != 2)))
   11667              :             {
   11668         6941 :               if (dump_p)
   11669              :                 {
   11670          152 :                   dump_printf_loc (MSG_MISSED_OPTIMIZATION,
   11671              :                                    vect_location,
   11672              :                                    "unsupported vect permute { ");
   11673         1586 :                   for (i = 0; i < count; ++i)  /* Overwrites the enclosing loop's index; harmless because we return right after.  */
   11674              :                     {
   11675         1434 :                       dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
   11676         1434 :                       dump_printf (MSG_MISSED_OPTIMIZATION, " ");
   11677              :                     }
   11678          152 :                   dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
   11679              :                 }
   11680         6941 :               gcc_assert (!gsi);
   11681         7338 :               return -1;
   11682              :             }
   11683              : 
   11684       624317 :           if (!identity_p)
   11685       580993 :             nperms += CEIL (total_nelts, process_nelts) - (ei > last_ei);  /* One permute per pass over PROCESS_NELTS; masks ending beyond LAST_EI are missing from a final partial pass.  */
   11686       624317 :           if (gsi)
   11687              :             {
   11688        31239 :               if (second_vec.first == -1U)
   11689         7055 :                 second_vec = first_vec;  /* Only one input was referenced; use it for both operands.  */
   11690              : 
   11691        31239 :               slp_tree
   11692        31239 :                 first_node = children[first_vec.first],
   11693        31239 :                 second_node = children[second_vec.first];
   11694              : 
   11695        31239 :               tree mask_vec = NULL_TREE;  /* Stays null for identity permutes.  */
   11696        31239 :               if (!identity_p)
   11697        27933 :                 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
   11698              : 
   11699        31239 :               tree first_def
   11700        31239 :                 = vect_get_slp_vect_def (first_node, first_vec.second + vi);
   11701        31239 :               tree second_def
   11702        31239 :                 = vect_get_slp_vect_def (second_node, second_vec.second + vi);
   11703        31239 :               vect_add_slp_permutation (vinfo, gsi, node, first_def,
   11704        31239 :                                         second_def, mask_vec, mask[0]);  /* MASK[0] is passed as the identity offset.  */
   11705              :             }
   11706              : 
   11707              :           index = 0;  /* Start collecting the next mask from scratch.  */
   11708              :           first_vec = std::make_pair (-1U, -1U);
   11709              :           second_vec = std::make_pair (-1U, -1U);
   11710              :         }
   11711              :     }
   11712              : 
   11713       481371 :   return nperms;
   11714       488709 : }
   11715              : 
   11716              : /* Vectorize the SLP permutations in NODE as specified
   11717              :    in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
   11718              :    child number and lane number.
   11719              :    Interleaving of two two-lane two-child SLP subtrees (not supported):
   11720              :      [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
   11721              :    A blend of two four-lane two-child SLP subtrees:
   11722              :      [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
   11723              :    Highpart of a four-lane one-child SLP subtree (not supported):
   11724              :      [ { 0, 2 }, { 0, 3 } ]
   11725              :    Where currently only a subset is supported by the code generation below.
                           The work is delegated to vectorizable_slp_permutation_1, which is
                           handed VINFO, GSI, NODE, its lane permutation and its children.
                           Returns false when that helper reports failure with a negative
                           permute count and true otherwise.  When GSI is null and the count
                           is nonzero, that many vec_perm stmts are recorded in COST_VEC as
                           vect_body cost.  */
   11726              : 
   11727              : bool
   11728       139180 : vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
   11729              :                               slp_tree node, stmt_vector_for_cost *cost_vec)
   11730              : {
   11731       139180 :   tree vectype = SLP_TREE_VECTYPE (node);
   11732       139180 :   lane_permutation_t &perm = SLP_TREE_LANE_PERMUTATION (node);
   11733       139180 :   int nperms = vectorizable_slp_permutation_1 (vinfo, gsi, node, perm,
   11734       139180 :                                                SLP_TREE_CHILDREN (node),
   11735              :                                                dump_enabled_p ());
                          /* A negative count signals an unsupported permutation.  */
   11736       139180 :   if (nperms < 0)
   11737              :     return false;
   11738              : 
                          /* Without an insertion iterator only account for the cost of the
                             permutes that would be needed.  */
   11739       137853 :   if (!gsi && nperms != 0)
   11740       115894 :     record_stmt_cost (cost_vec, nperms, vec_perm, node, vectype, 0, vect_body);
   11741              : 
   11742              :   return true;
   11743              : }
   11744              : 
   11745              : /* Vectorize SLP NODE.
                           Constant and external nodes only get their vector defs created via
                           vect_create_constant_vectors, and only if they have a vector type
                           and no vector defs yet.  For all other nodes this picks the
                           statement iterator SI at which vectorized stmts are to be
                           inserted and then calls vect_transform_stmt with VINFO, the node's
                           representative stmt, SI, NODE and INSTANCE.  */
   11746              : 
   11747              : static void
   11748      1464892 : vect_schedule_slp_node (vec_info *vinfo,
   11749              :                         slp_tree node, slp_instance instance)
   11750              : {
   11751      1464892 :   gimple_stmt_iterator si;
   11752      1464892 :   int i;
   11753      1464892 :   slp_tree child;
   11754              : 
   11755              :   /* Vectorize externals and constants.  */
   11756      1464892 :   if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
   11757      1464892 :       || SLP_TREE_DEF_TYPE (node) == vect_external_def)
   11758              :     {
   11759              :       /* ???  vectorizable_shift can end up using a scalar operand which is
   11760              :          currently denoted as !SLP_TREE_VECTYPE.  No need to vectorize the
   11761              :          node in this case.  */
   11762       496520 :       if (!SLP_TREE_VECTYPE (node))
   11763       496520 :         return;
   11764              : 
   11765              :       /* There are two reasons vector defs might already exist.  The first
   11766              :          is that we are vectorizing an existing vector def.  The second is
   11767              :          when performing BB vectorization shared constant/external nodes
   11768              :          are not split apart during partitioning so during the code-gen
   11769              :          DFS walk we can end up visiting them twice.  */
   11770       489644 :       if (! SLP_TREE_VEC_DEFS (node).exists ())
   11771       488821 :         vect_create_constant_vectors (vinfo, node);
   11772       489644 :       return;
   11773              :     }
   11774              : 
   11775       968372 :   stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
   11776              : 
                          /* An internal node must not have been code generated yet.  Reserve
                             room for its vector defs when it has a vector type.  */
   11777       968372 :   gcc_assert (SLP_TREE_VEC_DEFS (node).is_empty ());
   11778       968372 :   if (SLP_TREE_VECTYPE (node))
   11779       968366 :     SLP_TREE_VEC_DEFS (node).create (vect_get_num_copies (vinfo, node));
   11780              : 
   11781       968372 :   if (!SLP_TREE_PERMUTE_P (node) && STMT_VINFO_DATA_REF (stmt_info))
   11782              :     {
   11783              :       /* Vectorized loads go before the first scalar load to make it
   11784              :          ready early, vectorized stores go before the last scalar
   11785              :          stmt which is where all uses are ready.  */
   11786       708467 :       stmt_vec_info last_stmt_info = NULL;
   11787       708467 :       if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
   11788       165285 :         last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
   11789              :       else /* DR_IS_WRITE */
   11790       543182 :         last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
   11791       708467 :       si = gsi_for_stmt (last_stmt_info->stmt);
   11792       708467 :     }
   11793       259905 :   else if (!SLP_TREE_PERMUTE_P (node)
   11794       243462 :            && (SLP_TREE_TYPE (node) == cycle_phi_info_type
   11795              :                || SLP_TREE_TYPE (node) == induc_vec_info_type
   11796              :                || SLP_TREE_TYPE (node) == phi_info_type))
   11797              :     {
   11798              :       /* For PHI node vectorization we do not use the insertion iterator.  */
   11799        53814 :       si = gsi_none ();
   11800              :     }
   11801              :   else
   11802              :     {
   11803              :       /* Emit other stmts after the children vectorized defs which is
   11804              :          earliest possible.  LAST_STMT ends up as the candidate def stmt
                                 that all other candidates dominate.  SEEN_VECTOR_DEF records
                                 a child without scalar ops whose first vector def
                                 vinfo->lookup_def does not find.  */
   11805              :       gimple *last_stmt = NULL;
   11806              :       bool seen_vector_def = false;
   11807       573176 :       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
   11808       367085 :         if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
   11809              :           {
   11810              :             /* For fold-left reductions we are retaining the scalar
   11811              :                reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
   11812              :                set so the representation isn't perfect.  Resort to the
   11813              :                last scalar def here.  */
   11814       294343 :             if (SLP_TREE_VEC_DEFS (child).is_empty ())
   11815              :               {
   11816          862 :                 gcc_assert (SLP_TREE_TYPE (child) == cycle_phi_info_type);
   11817          862 :                 gphi *phi = as_a <gphi *>
   11818          862 :                               (vect_find_last_scalar_stmt_in_slp (child)->stmt);
   11819          862 :                 if (!last_stmt)
   11820              :                   last_stmt = phi;
   11821          647 :                 else if (vect_stmt_dominates_stmt_p (last_stmt, phi))
   11822              :                   last_stmt = phi;
   11823          636 :                 else if (vect_stmt_dominates_stmt_p (phi, last_stmt))
   11824              :                   ;
   11825              :                 else
   11826            0 :                   gcc_unreachable ();
   11827              :               }
   11828              :             /* We are emitting all vectorized stmts in the same place and
   11829              :                the last one is the last.
   11830              :                ???  Unless we have a load permutation applied and that
   11831              :                figures to re-use an earlier generated load.  */
   11832              :             unsigned j;
   11833              :             tree vdef;
   11834       696343 :             FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
   11835              :               {
   11836       402000 :                 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
   11837       402000 :                 if (!last_stmt)
   11838              :                   last_stmt = vstmt;
   11839       206596 :                 else if (vect_stmt_dominates_stmt_p (last_stmt, vstmt))
   11840              :                   last_stmt = vstmt;
   11841        45172 :                 else if (vect_stmt_dominates_stmt_p (vstmt, last_stmt))
   11842              :                   ;
   11843              :                 else
   11844            0 :                   gcc_unreachable ();
   11845              :               }
   11846              :           }
   11847        72742 :         else if (!SLP_TREE_VECTYPE (child))
   11848              :           {
   11849              :             /* For externals that are left unvectorized look at all scalar
                                       defs.  */
   11850              :             unsigned j;
   11851              :             tree def;
   11852        14631 :             FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
   11853         8391 :               if (TREE_CODE (def) == SSA_NAME
   11854         8391 :                   && !SSA_NAME_IS_DEFAULT_DEF (def))
   11855              :                 {
   11856          295 :                   gimple *stmt = SSA_NAME_DEF_STMT (def);
   11857          295 :                   if (gimple_uid (stmt) == -1u)
   11858              :                     /* If the stmt is not inside the region do not
   11859              :                        use it as possible insertion point.  */
   11860              :                     ;
   11861          285 :                   else if (!last_stmt)
   11862              :                     last_stmt = stmt;
   11863          261 :                   else if (vect_stmt_dominates_stmt_p (last_stmt, stmt))
   11864              :                     last_stmt = stmt;
   11865          159 :                   else if (vect_stmt_dominates_stmt_p (stmt, last_stmt))
   11866              :                     ;
   11867              :                   else
   11868            0 :                     gcc_unreachable ();
   11869              :                 }
   11870              :           }
   11871              :         else
   11872              :           {
   11873              :             /* For externals we have to look at all defs since their
   11874              :                insertion place is decided per vector.  But beware
   11875              :                of pre-existing vectors where we need to make sure
   11876              :                we do not insert before the region boundary.  */
   11877        66502 :             if (SLP_TREE_SCALAR_OPS (child).is_empty ()
   11878          650 :                 && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
   11879              :               seen_vector_def = true;
   11880              :             else
   11881              :               {
   11882              :                 unsigned j;
   11883              :                 tree vdef;
   11884       527887 :                 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
   11885        94417 :                   if (TREE_CODE (vdef) == SSA_NAME
   11886        94417 :                       && !SSA_NAME_IS_DEFAULT_DEF (vdef))
   11887              :                     {
   11888        19631 :                       gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
   11889        19631 :                       if (!last_stmt)
   11890              :                         last_stmt = vstmt;
   11891        10906 :                       else if (vect_stmt_dominates_stmt_p (last_stmt, vstmt))
   11892              :                         last_stmt = vstmt;
   11893         8718 :                       else if (vect_stmt_dominates_stmt_p (vstmt, last_stmt))
   11894              :                         ;
   11895              :                       else
   11896            0 :                         gcc_unreachable ();
   11897              :                     }
   11898              :               }
   11899              :           }
   11900              :       /* This can happen when all children are pre-existing vectors or
   11901              :          constants.  */
   11902       206091 :       if (!last_stmt)
   11903         1723 :         last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
   11904         1723 :       if (!last_stmt)
   11905              :         {
   11906            0 :           gcc_assert (seen_vector_def);
   11907            0 :           si = gsi_after_labels (vinfo->bbs[0]);
   11908              :         }
   11909       206091 :       else if (is_ctrl_altering_stmt (last_stmt))
   11910              :         {
   11911              :           /* We split regions to vectorize at control altering stmts
   11912              :              with a definition so this must be an external which
   11913              :              we can insert at the start of the region.  */
   11914            0 :           si = gsi_after_labels (vinfo->bbs[0]);
   11915              :         }
   11916       206091 :       else if (is_a <bb_vec_info> (vinfo)
   11917        18117 :                && !SLP_TREE_PERMUTE_P (node)
   11918        16655 :                && gimple_bb (last_stmt) != gimple_bb (stmt_info->stmt)
   11919       207430 :                && gimple_could_trap_p (stmt_info->stmt))
   11920              :         {
   11921              :           /* We've constrained possibly trapping operations to all come
   11922              :              from the same basic-block, if vectorized defs would allow earlier
   11923              :              scheduling still force vectorized stmts to the original block.
   11924              :              This is only necessary for BB vectorization since for loop vect
   11925              :              all operations are in a single BB and scalar stmt based
   11926              :              placement doesn't play well with epilogue vectorization.  */
   11927           53 :           gcc_assert (dominated_by_p (CDI_DOMINATORS,
   11928              :                                       gimple_bb (stmt_info->stmt),
   11929              :                                       gimple_bb (last_stmt)));
   11930           53 :           si = gsi_after_labels (gimple_bb (stmt_info->stmt));
   11931              :         }
   11932       206038 :       else if (is_a <gphi *> (last_stmt))
                                /* A PHI def: insert after the labels of its block.  */
   11933        14410 :         si = gsi_after_labels (gimple_bb (last_stmt));
   11934              :       else
   11935              :         {
                                  /* Insert right after LAST_STMT.  */
   11936       191628 :           si = gsi_for_stmt (last_stmt);
   11937       191628 :           gsi_next (&si);
   11938              : 
   11939       191628 :           if (auto loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
   11940              :             {
   11941              :               /* Avoid scheduling stmts to random places in the CFG, any
   11942              :                  stmt dominance check we performed is possibly wrong as UIDs
   11943              :                  are not initialized for all of the function for loop
   11944              :                  vectorization.  Instead append to the loop preheader.  */
   11945       173780 :               if ((LOOP_VINFO_LOOP (loop_vinfo)->header
   11946       173780 :                    != gimple_bb (last_stmt))
   11947       176997 :                   && dominated_by_p (CDI_DOMINATORS,
   11948              :                                      LOOP_VINFO_LOOP (loop_vinfo)->header,
   11949         3217 :                                      gimple_bb (last_stmt)))
   11950         1406 :                 si = gsi_end_bb (loop_preheader_edge
   11951          703 :                                    (LOOP_VINFO_LOOP (loop_vinfo))->src);
   11952              :               /* Avoid scheduling internal defs outside of the loop when
   11953              :                  we might have only implicitly tracked loop mask/len defs.  */
   11954           74 :               if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
   11955       173780 :                   || LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
   11956              :                 {
   11957           74 :                   gimple_stmt_iterator si2
   11958           74 :                     = gsi_after_labels (LOOP_VINFO_LOOP (loop_vinfo)->header);
   11959           74 :                   if ((gsi_end_p (si2)
   11960            0 :                        && (LOOP_VINFO_LOOP (loop_vinfo)->header
   11961            0 :                            != gimple_bb (last_stmt))
   11962            0 :                        && dominated_by_p (CDI_DOMINATORS,
   11963              :                                           LOOP_VINFO_LOOP (loop_vinfo)->header,
   11964            0 :                                           gimple_bb (last_stmt)))
   11965           74 :                       || (!gsi_end_p (si2)
   11966           74 :                           && last_stmt != *si2
   11967           72 :                           && vect_stmt_dominates_stmt_p (last_stmt, *si2)))
   11968            3 :                     si = si2;
   11969              :                 }
   11970              :             }
   11971              :         }
   11972              :     }
   11973              : 
   11974       968372 :   if (dump_enabled_p ())
   11975              :     {
   11976        71382 :       if (stmt_info)
   11977        71329 :         dump_printf_loc (MSG_NOTE, vect_location,
   11978              :                          "------>vectorizing SLP node starting from: %G",
   11979              :                          stmt_info->stmt);
   11980              :       else
   11981              :         {
   11982           53 :           dump_printf_loc (MSG_NOTE, vect_location,
   11983              :                            "------>vectorizing SLP node:\n");
   11984           53 :           vect_print_slp_tree (MSG_NOTE, vect_location, node);
   11985              :         }
   11986              :     }
   11987       968372 :   vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
   11988              : }
   11989              : 
   11990              : /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
   11991              :    For loop vectorization this is done in vectorizable_call, but for SLP
   11992              :    it needs to be deferred until end of vect_schedule_slp, because multiple
   11993              :    SLP instances may refer to the same scalar stmt.
                           Calls without a lhs are replaced by a nop instead.  The walk
                           recurses into the children of NODE; VISITED is the set of nodes
                           already walked and a node found in it is not processed again.
                           Null nodes and nodes that are not vect_internal_def are ignored.  */
   11994              : 
   11995              : static void
   11996       597687 : vect_remove_slp_scalar_calls (vec_info *vinfo,
   11997              :                               slp_tree node, hash_set<slp_tree> &visited)
   11998              : {
   11999       597687 :   gimple *new_stmt;
   12000       597687 :   gimple_stmt_iterator gsi;
   12001       597687 :   int i;
   12002       597687 :   slp_tree child;
   12003       597687 :   tree lhs;
   12004       597687 :   stmt_vec_info stmt_info;
   12005              : 
   12006       597687 :   if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
   12007       187200 :     return;
   12008              : 
                          /* Process each node only once.  */
   12009       453745 :   if (visited.add (node))
   12010              :     return;
   12011              : 
                          /* Handle the children before the scalar stmts of NODE itself.  */
   12012       918687 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
   12013       508200 :     vect_remove_slp_scalar_calls (vinfo, child, visited);
   12014              : 
   12015      1299795 :   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
   12016              :     {
   12017       482953 :       if (!stmt_info)
   12018         3974 :         continue;
                              /* Look at the original stmt and skip it unless it is a call that
                                 still has a basic block.  */
   12019       478979 :       stmt_info = vect_orig_stmt (stmt_info);
   12020       478979 :       gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
   12021         5231 :       if (!stmt || gimple_bb (stmt) == NULL)
   12022       473786 :         continue;
                              /* Build LHS = 0 or, lacking a lhs, a nop as replacement.  */
   12023         5193 :       lhs = gimple_call_lhs (stmt);
   12024         5193 :       if (lhs)
   12025         4585 :         new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
   12026              :       else
   12027          608 :         new_stmt = gimple_build_nop ();
   12028         5193 :       unlink_stmt_vdef (stmt_info->stmt);
   12029         5193 :       gsi = gsi_for_stmt (stmt);
   12030         5193 :       vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
                              /* The lhs is now defined by the replacement stmt.  */
   12031         5193 :       if (lhs)
   12032         4585 :         SSA_NAME_DEF_STMT (lhs) = new_stmt;
   12033              :     }
   12034              : }
   12035              : 
                        /* Likewise, starting the walk at NODE with a fresh visited set.  */
   12036              : static void
   12037        89487 : vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
   12038              : {
   12039        89487 :   hash_set<slp_tree> visited;
   12040        89487 :   vect_remove_slp_scalar_calls (vinfo, node, visited);
   12041        89487 : }
   12042              : 
   12043              : /* Vectorize the instance root.
                           NODE provides the vector defs and INSTANCE the root stmt(s) and
                           the instance kind.  For slp_inst_kind_ctor the first root stmt is
                           replaced by an assignment of the vector def (or of a CONSTRUCTOR
                           combining several vector defs) to its lhs.  For
                           slp_inst_kind_bb_reduc a reduction epilogue is inserted before the
                           first root stmt and its rhs is replaced by the reduced scalar.
                           For slp_inst_kind_gcond the work is done by
                           vectorizable_early_exit, the only user of VINFO here.  Other
                           kinds are not expected.  */
   12044              : 
   12045              : void
   12046        10896 : vectorize_slp_instance_root_stmt (vec_info *vinfo, slp_tree node, slp_instance instance)
   12047              : {
   12048        10896 :   gassign *rstmt = NULL;
   12049              : 
   12050        10896 :   if (instance->kind == slp_inst_kind_ctor)
   12051              :     {
   12052         5206 :       if (SLP_TREE_VEC_DEFS (node).length () == 1)
   12053              :         {
   12054         5169 :           tree vect_lhs = SLP_TREE_VEC_DEFS (node)[0];
   12055         5169 :           tree root_lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
                                  /* View-convert the vector def when its type does not
                                     trivially convert to the type of the root lhs.  */
   12056         5169 :           if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
   12057         5169 :                                           TREE_TYPE (vect_lhs)))
   12058            0 :             vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
   12059              :                                vect_lhs);
   12060         5169 :           rstmt = gimple_build_assign (root_lhs, vect_lhs);
   12061              :         }
   12062              :       else
   12063              :         {
   12064           37 :           gcc_assert (SLP_TREE_VEC_DEFS (node).length () > 1);
   12065           37 :           tree child_def;
   12066           37 :           int j;
   12067           37 :           vec<constructor_elt, va_gc> *v;
   12068           37 :           vec_alloc (v, SLP_TREE_VEC_DEFS (node).length ());
   12069              : 
   12070              :           /* A CTOR can handle V16HI composition from VNx8HI so we
   12071              :              do not need to convert vector elements if the types
   12072              :              do not match.  */
   12073          111 :           FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (node), j, child_def)
   12074           74 :             CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, child_def);
   12075           37 :           tree lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
   12076           37 :           tree rtype
   12077           37 :             = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmts[0]->stmt));
   12078           37 :           tree r_constructor = build_constructor (rtype, v);
   12079           37 :           rstmt = gimple_build_assign (lhs, r_constructor);
   12080              :         }
   12081              :     }
   12082         5690 :   else if (instance->kind == slp_inst_kind_bb_reduc)
   12083              :     {
   12084              :       /* Largely inspired by reduction chain epilogue handling in
   12085              :          vect_create_epilog_for_reduction.  */
   12086         4126 :       vec<tree> vec_defs = vNULL;
   12087         4126 :       vect_get_slp_defs (node, &vec_defs);
   12088         4126 :       enum tree_code reduc_code
   12089         4126 :         = gimple_assign_rhs_code (instance->root_stmts[0]->stmt);
   12090              :       /* ???  We actually have to reflect signs somewhere.  */
   12091         4126 :       if (reduc_code == MINUS_EXPR)
   12092            0 :         reduc_code = PLUS_EXPR;
   12093         4126 :       gimple_seq epilogue = NULL;
   12094              :       /* We may end up with more than one vector result, reduce them
   12095              :          to one vector.  */
   12096         4126 :       tree vec_def = vec_defs[0];
   12097         4126 :       tree vectype = TREE_TYPE (vec_def);
   12098         4126 :       tree compute_vectype = vectype;
                              /* When overflow is undefined for VECTYPE and REDUC_CODE can
                                 overflow, compute in the corresponding unsigned vector type and
                                 view-convert every vector def to it.  */
   12099         4126 :       bool pun_for_overflow_p = (ANY_INTEGRAL_TYPE_P (vectype)
   12100         3927 :                                  && TYPE_OVERFLOW_UNDEFINED (vectype)
   12101         6886 :                                  && operation_can_overflow (reduc_code));
   12102         2619 :       if (pun_for_overflow_p)
   12103              :         {
   12104         2619 :           compute_vectype = unsigned_type_for (vectype);
   12105         2619 :           vec_def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
   12106              :                                   compute_vectype, vec_def);
   12107              :         }
   12108         6514 :       for (unsigned i = 1; i < vec_defs.length (); ++i)
   12109              :         {
   12110         2388 :           tree def = vec_defs[i];
   12111         2388 :           if (pun_for_overflow_p)
   12112         2285 :             def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
   12113              :                                 compute_vectype, def);
   12114         2388 :           vec_def = gimple_build (&epilogue, reduc_code, compute_vectype,
   12115              :                                   vec_def, def);
   12116              :         }
   12117         4126 :       vec_defs.release ();
   12118              :       /* ???  Support other schemes than direct internal fn.  */
   12119         4126 :       internal_fn reduc_fn;
   12120         4126 :       if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
   12121         4126 :           || reduc_fn == IFN_LAST)
   12122            0 :         gcc_unreachable ();
   12123         4126 :       tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
   12124         4126 :                                       TREE_TYPE (compute_vectype), vec_def);
                              /* Combine the remaining scalar defs of the instance with
                                 REDUC_CODE and fold the result into SCALAR_DEF.  */
   12125         4126 :       if (!SLP_INSTANCE_REMAIN_DEFS (instance).is_empty ())
   12126              :         {
   12127         2565 :           tree rem_def = NULL_TREE;
   12128        11907 :           for (auto def : SLP_INSTANCE_REMAIN_DEFS (instance))
   12129              :             {
   12130         9342 :               def = gimple_convert (&epilogue, TREE_TYPE (scalar_def), def);
   12131         9342 :               if (!rem_def)
   12132              :                 rem_def = def;
   12133              :               else
   12134         6777 :                 rem_def = gimple_build (&epilogue, reduc_code,
   12135         6777 :                                         TREE_TYPE (scalar_def),
   12136              :                                         rem_def, def);
   12137              :             }
   12138         2565 :           scalar_def = gimple_build (&epilogue, reduc_code,
   12139         2565 :                                      TREE_TYPE (scalar_def),
   12140              :                                      scalar_def, rem_def);
   12141              :         }
                              /* Convert to the element type of the original vector type, insert
                                 the epilogue before the root stmt and make the result its rhs.  */
   12142         4126 :       scalar_def = gimple_convert (&epilogue,
   12143         4126 :                                    TREE_TYPE (vectype), scalar_def);
   12144         4126 :       gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
   12145         4126 :       gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT);
   12146         4126 :       gimple_assign_set_rhs_from_tree (&rgsi, scalar_def);
   12147         4126 :       update_stmt (gsi_stmt (rgsi));
   12148         4126 :       return;
   12149              :     }
   12150         1564 :   else if (instance->kind == slp_inst_kind_gcond)
   12151              :     {
   12152              :       /* Only support a single root for now as we can't codegen CFG yet and so we
   12153              :          can't support lane > 1 at this time.  */
   12154         1564 :       gcc_assert (instance->root_stmts.length () == 1);
   12155         1564 :       auto root_stmt_info = instance->root_stmts[0];
   12156         1564 :       auto last_stmt = STMT_VINFO_STMT (vect_orig_stmt (root_stmt_info));
   12157         1564 :       gimple_stmt_iterator rgsi = gsi_for_stmt (last_stmt);
   12158         1564 :       gcc_assert (!SLP_TREE_VEC_DEFS (node).is_empty ());
   12159         1564 :       bool res = vectorizable_early_exit (as_a <loop_vec_info> (vinfo),
   12160              :                                           root_stmt_info, &rgsi, node, NULL);
   12161         1564 :       gcc_assert (res);
   12162         1564 :       return;
   12163              :     }
   12164              :   else
   12165            0 :     gcc_unreachable ();
   12166              : 
                          /* Only the CTOR case gets here; replace its root stmt with RSTMT.  */
   12167         5206 :   gcc_assert (rstmt);
   12168              : 
   12169         5206 :   gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
   12170         5206 :   gsi_replace (&rgsi, rstmt, true);
   12171              : }
   12172              : 
   12173              : struct slp_scc_info
   12174              : {
                          /* Per-node state of the SCC collecting DFS walk done by
                             vect_schedule_scc.  */

                          /* Whether the node is currently on the walk's stack of nodes.  */
   12175              :   bool on_stack;
                          /* DFS number assigned when the node is first visited.  */
   12176              :   int dfs;
                          /* Smallest DFS number recorded so far for the node via its children;
                             when it still equals DFS after walking the children the node is
                             scheduled as a singleton or as the entry of an SCC.  */
   12177              :   int lowlink;
   12178              : };
   12179              : 
   12180              : /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs.
                      :    NODE is the node to visit; it must not have an entry in SCC_INFO
                      :    yet.  SCC_INFO records the DFS state of every visited node, MAXDFS
                      :    is the next DFS number to assign and STACK holds the visited
                      :    internal-def nodes that are not scheduled yet.  Nodes are code
                      :    generated by vect_schedule_slp_node; afterwards the vectorized
                      :    PHIs scheduled by this invocation get the arguments coming from
                      :    their internal-def children added.  */
   12181              : 
   12182              : static void
   12183      1464892 : vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
   12184              :                    hash_map<slp_tree, slp_scc_info> &scc_info,
   12185              :                    int &maxdfs, vec<slp_tree> &stack)
   12186              : {
   12187      1464892 :   bool existed_p;
   12188      1464892 :   slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
   12189      1464892 :   gcc_assert (!existed_p);  /* NODE must not have been visited before.  */
   12190      1464892 :   info->dfs = maxdfs;
   12191      1464892 :   info->lowlink = maxdfs;
   12192      1464892 :   maxdfs++;
   12193              : 
   12194              :   /* Leaf.  Nodes that are not internal defs are scheduled right away
                      :      and are never pushed onto STACK.  */
   12195      1464892 :   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
   12196              :     {
   12197       496520 :       info->on_stack = false;
   12198       496520 :       vect_schedule_slp_node (vinfo, node, instance);
   12199      1024465 :       return;
   12200              :     }
   12201              : 
   12202       968372 :   info->on_stack = true;
   12203       968372 :   stack.safe_push (node);
   12204              : 
   12205       968372 :   unsigned i;
   12206       968372 :   slp_tree child;
   12207              :   /* DFS recurse.  NULL children are skipped, children without an entry
                      :      in SCC_INFO are visited now and contribute their LOWLINK, children
                      :      visited earlier contribute their DFS number if still on STACK.  */
   12208      1998077 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
   12209              :     {
   12210      1029705 :       if (!child)
   12211        54894 :         continue;
   12212       974811 :       slp_scc_info *child_info = scc_info.get (child);
   12213       974811 :       if (!child_info)
   12214              :         {
   12215       885754 :           vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
   12216              :           /* Recursion might have re-allocated the node.  The recursive
                      :              call inserts into SCC_INFO, so both entry pointers are
                      :              fetched again before use.  */
   12217       885754 :           info = scc_info.get (node);
   12218       885754 :           child_info = scc_info.get (child);
   12219       885754 :           info->lowlink = MIN (info->lowlink, child_info->lowlink);
   12220              :         }
   12221        89057 :       else if (child_info->on_stack)
   12222        25273 :         info->lowlink = MIN (info->lowlink, child_info->dfs);
   12223              :     }
   12224       968372 :   if (info->lowlink != info->dfs)
   12225              :     return;  /* NODE stays on STACK; it is scheduled by the invocation whose LOWLINK equals its DFS.  */
   12226              : 
   12227       936947 :   auto_vec<slp_tree, 4> phis_to_fixup;  /* PHIs scheduled below that still need arguments added.  */
   12228              : 
   12229              :   /* Singleton.  NODE is the topmost STACK entry.  */
   12230       936947 :   if (stack.last () == node)
   12231              :     {
   12232       913341 :       stack.pop ();
   12233       913341 :       info->on_stack = false;
   12234       913341 :       vect_schedule_slp_node (vinfo, node, instance);
   12235       913341 :       if (!SLP_TREE_PERMUTE_P (node)
   12236       913341 :           && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
   12237        30335 :         phis_to_fixup.quick_push (node);
   12238              :     }
   12239              :   else
   12240              :     {
   12241              :       /* SCC.  Its members are the STACK entries from NODE's position
                      :          (LAST_IDX) up to the top.  */
   12242        23606 :       int last_idx = stack.length () - 1;
   12243        55031 :       while (stack[last_idx] != node)
   12244        31425 :         last_idx--;
   12245              :       /* We can break the cycle at PHIs who have at least one child
   12246              :          code generated.  Then we could re-start the DFS walk until
   12247              :          all nodes in the SCC are covered (we might have new entries
   12248              :          for only back-reachable nodes).  But it's simpler to just
   12249              :          iterate and schedule those that are ready.  A non-PHI
                      :          entry is ready when none of its children is still on
                      :          the stack; a PHI entry is ready as soon as one of its
                      :          children is NULL or no longer on the stack.  */
   12250        23606 :       unsigned todo = stack.length () - last_idx;  /* SCC members left to schedule.  */
   12251        23945 :       do
   12252              :         {
   12253       104705 :           for (int idx = stack.length () - 1; idx >= last_idx; --idx)
   12254              :             {
   12255        56815 :               slp_tree entry = stack[idx];
   12256        56815 :               if (!entry)
   12257          956 :                 continue;
   12258        55859 :               bool phi = (!SLP_TREE_PERMUTE_P (entry)
   12259        55859 :                           && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
   12260        55859 :               bool ready = !phi;
   12261       141335 :               FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
   12262       110326 :                   if (!child)
   12263              :                     {
   12264        22728 :                       gcc_assert (phi);
   12265              :                       ready = true;
   12266              :                       break;
   12267              :                     }
   12268        87598 :                   else if (scc_info.get (child)->on_stack)
   12269              :                     {
   12270        23823 :                       if (!phi)
   12271              :                         {
   12272              :                           ready = false;
   12273              :                           break;
   12274              :                         }
   12275              :                     }
   12276              :                   else
   12277              :                     {
   12278        63775 :                       if (phi)
   12279              :                         {
   12280              :                           ready = true;
   12281              :                           break;
   12282              :                         }
   12283              :                     }
   12284        33131 :               if (ready)
   12285              :                 {
   12286        55031 :                   vect_schedule_slp_node (vinfo, entry, instance);
   12287        55031 :                   scc_info.get (entry)->on_stack = false;
   12288        55031 :                   stack[idx] = NULL;  /* Done; skipped by the !entry check in later passes.  */
   12289        55031 :                   todo--;
   12290        55031 :                   if (phi)
   12291        24052 :                     phis_to_fixup.safe_push (entry);
   12292              :                 }
   12293              :             }
   12294              :         }
   12295        23945 :       while (todo != 0);
   12296              : 
   12297              :       /* Pop the SCC.  */
   12298        23606 :       stack.truncate (last_idx);
   12299              :     }
   12300              : 
   12301              :   /* Now fixup the backedge def of the vectorized PHIs in this SCC.
                      :      For each predecessor edge of the PHI's block the child at the
                      :      edge's destination index is looked at; only children that are
                      :      internal defs have their vector defs added as arguments.  */
   12302              :   slp_tree phi_node;
   12303      1928281 :   FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
   12304              :     {
   12305        54387 :       gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
   12306        54387 :       edge_iterator ei;
   12307        54387 :       edge e;
   12308       171407 :       FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
   12309              :         {
   12310       117020 :           unsigned dest_idx = e->dest_idx;  /* Also indexes the matching SLP child.  */
   12311       117020 :           child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
   12312       117020 :           if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
   12313        65868 :             continue;
   12314        51152 :           unsigned n = SLP_TREE_VEC_DEFS (phi_node).length ();
   12315              :           /* Simply fill all args.
                      :              NOTE(review): the PHI declared inside the loop below
                      :              shadows the representative PHI declared above, so the
                      :              location passed to add_phi_arg is read from the PHI
                      :              defining PHIDEF, unlike in the first order recurrence
                      :              case - confirm this is intended.  */
   12316        51152 :           if (STMT_VINFO_DEF_TYPE (SLP_TREE_REPRESENTATIVE (phi_node))
   12317              :               != vect_first_order_recurrence)
   12318       109985 :             for (unsigned i = 0; i < n; ++i)
   12319              :               {
   12320        58876 :                 tree phidef = SLP_TREE_VEC_DEFS (phi_node)[i];
   12321        58876 :                 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
   12322        58876 :                 add_phi_arg (phi, vect_get_slp_vect_def (child, i),
   12323              :                              e, gimple_phi_arg_location (phi, dest_idx));
   12324              :               }
   12325              :           else
   12326              :             {
   12327              :               /* Unless it is a first order recurrence which needs
   12328              :                  args filled in for both the PHI node and the permutes.
                      :                  The PHI is the definition of the first operand of
                      :                  the stmt defining vector def 0 and receives the
                      :                  child's last vector def.  The stmt defining vector
                      :                  def I gets the child's defs I - 1 (only for I > 0)
                      :                  and I as its first and second operand.  */
   12329           43 :               gimple *perm
   12330           43 :                 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[0]);
   12331           43 :               gimple *rphi = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (perm));
   12332           43 :               add_phi_arg (as_a <gphi *> (rphi),
   12333              :                            vect_get_slp_vect_def (child, n - 1),
   12334              :                            e, gimple_phi_arg_location (phi, dest_idx));
   12335          123 :               for (unsigned i = 0; i < n; ++i)
   12336              :                 {
   12337           80 :                   gimple *perm
   12338           80 :                     = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[i]);
   12339           80 :                   if (i > 0)
   12340           37 :                     gimple_assign_set_rhs1 (perm,
   12341              :                                             vect_get_slp_vect_def (child, i - 1));
   12342           80 :                   gimple_assign_set_rhs2 (perm,
   12343              :                                           vect_get_slp_vect_def (child, i));
   12344           80 :                   update_stmt (perm);
   12345              :                 }
   12346              :             }
   12347              :         }
   12348              :     }
   12349       936947 : }
   12350              : 
   12351              : /* Generate vector code for SLP_INSTANCES in the loop/basic block.
                      :    A first walk schedules the tree of every instance, sharing one
                      :    SCC info map between all instances, and vectorizes the root stmts
                      :    of instances that have any.  A second walk removes scalar calls
                      :    (for loop vectorization only) and the original scalar stmts of
                      :    the stores found in each instance root.  */
   12352              : 
   12353              : void
   12354       540591 : vect_schedule_slp (vec_info *vinfo, const vec<slp_instance> &slp_instances)
   12355              : {
   12356       540591 :   slp_instance instance;
   12357       540591 :   unsigned int i;
   12358              : 
   12359       540591 :   hash_map<slp_tree, slp_scc_info> scc_info;
   12360       540591 :   int maxdfs = 0;
   12361      1119836 :   FOR_EACH_VEC_ELT (slp_instances, i, instance)
   12362              :     {
   12363       579245 :       slp_tree node = SLP_INSTANCE_TREE (instance);
   12364       579245 :       if (dump_enabled_p ())
   12365              :         {
   12366        15999 :           dump_printf_loc (MSG_NOTE, vect_location,
   12367              :                            "Vectorizing SLP tree:\n");
   12368              :           /* ???  Dump all?  Only the first root stmt is printed.  */
   12369        15999 :           if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
   12370          449 :             dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
   12371          449 :                          SLP_INSTANCE_ROOT_STMTS (instance)[0]->stmt);
   12372        15999 :           vect_print_slp_graph (MSG_NOTE, vect_location,
   12373              :                                 SLP_INSTANCE_TREE (instance));
   12374              :         }
   12375              :       /* Schedule the tree of INSTANCE, scheduling SCCs in a way to
   12376              :          have a PHI be the node breaking the cycle.  A root that
                      :          already has an entry in SCC_INFO was visited while
                      :          scheduling an earlier instance and is not walked again.  */
   12377       579245 :       auto_vec<slp_tree> stack;
   12378       579245 :       if (!scc_info.get (node))
   12379       579138 :         vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
   12380              : 
   12381       579245 :       if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
   12382        10896 :         vectorize_slp_instance_root_stmt (vinfo, node, instance);
   12383              : 
   12384       579245 :       if (dump_enabled_p ())
   12385        15999 :         dump_printf_loc (MSG_NOTE, vect_location,
   12386              :                          "vectorizing stmts using SLP.\n");
   12387       579245 :     }
   12388              : 
   12389      1660427 :   FOR_EACH_VEC_ELT (slp_instances, i, instance)
   12390              :     {
   12391       579245 :       slp_tree root = SLP_INSTANCE_TREE (instance);
   12392       579245 :       stmt_vec_info store_info;
   12393       579245 :       unsigned int j;
   12394              : 
   12395              :       /* Remove scalar call stmts.  Do not do this for basic-block
   12396              :          vectorization as not all uses may be vectorized.
   12397              :          ???  Why should this be necessary?  DCE should be able to
   12398              :          remove the stmts itself.
   12399              :          ???  For BB vectorization we can as well remove scalar
   12400              :          stmts starting from the SLP tree root if they have no
   12401              :          uses.  */
   12402       579245 :       if (is_a <loop_vec_info> (vinfo))
   12403        89487 :         vect_remove_slp_scalar_calls (vinfo, root);
   12404              : 
   12405              :       /* Remove vectorized stores original scalar stmts.  The walk over
                      :          the scalar stmts of ROOT stops at the first one that is
                      :          missing, has no data reference or is not a write.  */
   12406      2586703 :       for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
   12407              :         {
   12408      1464276 :           if (!store_info
   12409      1464262 :               || !STMT_VINFO_DATA_REF (store_info)
   12410      1436786 :               || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
   12411              :             break;
   12412              : 
   12413      1428213 :           store_info = vect_orig_stmt (store_info);
   12414              :           /* Free the attached stmt_vec_info and remove the stmt.  */
   12415      1428213 :           vinfo->remove_stmt (store_info);
   12416              : 
   12417              :           /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it
   12418              :              to not crash in vect_free_slp_tree later.  */
   12419      1428213 :           if (SLP_TREE_REPRESENTATIVE (root) == store_info)
   12420       542851 :             SLP_TREE_REPRESENTATIVE (root) = NULL;
   12421              :         }
   12422              :     }
   12423       540591 : }
        

Generated by: LCOV version 2.4-beta

The LCOV profile is generated on an x86_64 machine using the following configure options: configure --disable-bootstrap --enable-coverage=opt --enable-languages=c,c++,fortran,go,jit,lto,rust,m2 --enable-host-shared. The GCC test suite is run with the built compiler.