LCOV - code coverage report
Current view: top level - gcc - tree-vect-slp.cc (source / functions) Coverage Total Hit
Test: gcc.info Lines: 92.4 % 5921 5472
Test Date: 2026-05-30 15:37:04 Functions: 95.1 % 182 173
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /* SLP - Basic Block Vectorization
       2              :    Copyright (C) 2007-2026 Free Software Foundation, Inc.
       3              :    Contributed by Dorit Naishlos <dorit@il.ibm.com>
       4              :    and Ira Rosen <irar@il.ibm.com>
       5              : 
       6              : This file is part of GCC.
       7              : 
       8              : GCC is free software; you can redistribute it and/or modify it under
       9              : the terms of the GNU General Public License as published by the Free
      10              : Software Foundation; either version 3, or (at your option) any later
      11              : version.
      12              : 
      13              : GCC is distributed in the hope that it will be useful, but WITHOUT ANY
      14              : WARRANTY; without even the implied warranty of MERCHANTABILITY or
      15              : FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
      16              : for more details.
      17              : 
      18              : You should have received a copy of the GNU General Public License
      19              : along with GCC; see the file COPYING3.  If not see
      20              : <http://www.gnu.org/licenses/>.  */
      21              : 
      22              : #include "config.h"
      23              : #define INCLUDE_ALGORITHM
      24              : #include "system.h"
      25              : #include "coretypes.h"
      26              : #include "backend.h"
      27              : #include "target.h"
      28              : #include "rtl.h"
      29              : #include "tree.h"
      30              : #include "gimple.h"
      31              : #include "tree-pass.h"
      32              : #include "ssa.h"
      33              : #include "optabs-tree.h"
      34              : #include "insn-config.h"
      35              : #include "recog.h"            /* FIXME: for insn_data */
      36              : #include "fold-const.h"
      37              : #include "stor-layout.h"
      38              : #include "gimple-iterator.h"
      39              : #include "cfgloop.h"
      40              : #include "tree-vectorizer.h"
      41              : #include "langhooks.h"
      42              : #include "gimple-walk.h"
      43              : #include "dbgcnt.h"
      44              : #include "tree-vector-builder.h"
      45              : #include "vec-perm-indices.h"
      46              : #include "gimple-fold.h"
      47              : #include "internal-fn.h"
      48              : #include "dump-context.h"
      49              : #include "cfganal.h"
      50              : #include "tree-eh.h"
      51              : #include "tree-cfg.h"
      52              : #include "alloc-pool.h"
      53              : #include "sreal.h"
      54              : #include "predict.h"
      55              : 
      56              : #define REDUC_GROUP_FIRST_ELEMENT(S) \
      57              :   (gcc_checking_assert (!(S)->dr_aux.dr), (S)->first_element)
      58              : 
      59              : static bool vect_transform_slp_perm_load_1 (vec_info *, slp_tree,
      60              :                                             load_permutation_t &,
      61              :                                             const vec<tree> &,
      62              :                                             gimple_stmt_iterator *,
      63              :                                             poly_uint64, bool, bool,
      64              :                                             unsigned *,
      65              :                                             unsigned * = nullptr,
      66              :                                             bool = false);
      67              : static int vectorizable_slp_permutation_1 (vec_info *, gimple_stmt_iterator *,
      68              :                                            slp_tree, lane_permutation_t &,
      69              :                                            vec<slp_tree> &, bool);
      70              : static void vect_print_slp_tree (dump_flags_t, dump_location_t, slp_tree);
      71              : static bool vect_slp_can_convert_to_external (const vec<stmt_vec_info> &);
      72              : 
      73              : static object_allocator<_slp_tree> *slp_tree_pool;
      74              : static slp_tree slp_first_node;
      75              : 
      76              : void
      77      1119179 : vect_slp_init (void)
      78              : {
      79      1119179 :   slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
      80      1119179 : }
      81              : 
      82              : void
      83      1119179 : vect_slp_fini (void)
      84              : {
      85      1781007 :   while (slp_first_node)
      86       661828 :     delete slp_first_node;
      87      2238358 :   delete slp_tree_pool;
      88      1119179 :   slp_tree_pool = NULL;
      89      1119179 : }
      90              : 
      91              : void *
      92      7724265 : _slp_tree::operator new (size_t n)
      93              : {
      94      7724265 :   gcc_assert (n == sizeof (_slp_tree));
      95      7724265 :   return slp_tree_pool->allocate_raw ();
      96              : }
      97              : 
      98              : void
      99      7724265 : _slp_tree::operator delete (void *node, size_t n)
     100              : {
     101      7724265 :   gcc_assert (n == sizeof (_slp_tree));
     102      7724265 :   slp_tree_pool->remove_raw (node);
     103      7724265 : }
     104              : 
     105              : 
     106              : /* Initialize a SLP node.  */
     107              : 
     108      7724265 : _slp_tree::_slp_tree ()
     109              : {
     110      7724265 :   this->prev_node = NULL;
     111      7724265 :   if (slp_first_node)
     112      6761217 :     slp_first_node->prev_node = this;
     113      7724265 :   this->next_node = slp_first_node;
     114      7724265 :   slp_first_node = this;
     115      7724265 :   SLP_TREE_SCALAR_STMTS (this) = vNULL;
     116      7724265 :   SLP_TREE_SCALAR_OPS (this) = vNULL;
     117      7724265 :   SLP_TREE_LIVE_LANES (this) = vNULL;
     118      7724265 :   SLP_TREE_VEC_DEFS (this) = vNULL;
     119      7724265 :   SLP_TREE_CHILDREN (this) = vNULL;
     120      7724265 :   SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
     121      7724265 :   SLP_TREE_LANE_PERMUTATION (this) = vNULL;
     122      7724265 :   SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
     123      7724265 :   SLP_TREE_CODE (this) = ERROR_MARK;
     124      7724265 :   SLP_TREE_GS_SCALE (this) = 0;
     125      7724265 :   SLP_TREE_GS_BASE (this) = NULL_TREE;
     126      7724265 :   this->ldst_lanes = false;
     127      7724265 :   this->avoid_stlf_fail = false;
     128      7724265 :   SLP_TREE_VECTYPE (this) = NULL_TREE;
     129      7724265 :   SLP_TREE_REPRESENTATIVE (this) = NULL;
     130      7724265 :   this->cycle_info.id = -1;
     131      7724265 :   this->cycle_info.reduc_idx = -1;
     132      7724265 :   SLP_TREE_REF_COUNT (this) = 1;
     133      7724265 :   this->failed = NULL;
     134      7724265 :   this->max_nunits = 1;
     135      7724265 :   this->lanes = 0;
     136      7724265 :   SLP_TREE_TYPE (this) = undef_vec_info_type;
     137      7724265 :   this->data = NULL;
     138      7724265 : }
     139              : 
     140              : /* Tear down a SLP node.  */
     141              : 
     142      7724265 : _slp_tree::~_slp_tree ()
     143              : {
     144      7724265 :   if (this->prev_node)
     145      4671856 :     this->prev_node->next_node = this->next_node;
     146              :   else
     147      3052409 :     slp_first_node = this->next_node;
     148      7724265 :   if (this->next_node)
     149      5824743 :     this->next_node->prev_node = this->prev_node;
     150      7724265 :   SLP_TREE_CHILDREN (this).release ();
     151      7724265 :   SLP_TREE_SCALAR_STMTS (this).release ();
     152      7724265 :   SLP_TREE_SCALAR_OPS (this).release ();
     153      7724265 :   SLP_TREE_LIVE_LANES (this).release ();
     154      7724265 :   SLP_TREE_VEC_DEFS (this).release ();
     155      7724265 :   SLP_TREE_LOAD_PERMUTATION (this).release ();
     156      7724265 :   SLP_TREE_LANE_PERMUTATION (this).release ();
     157      7724265 :   if (this->failed)
     158      2013173 :     free (failed);
     159      7724265 :   if (this->data)
     160      1243347 :     delete this->data;
     161      7724265 : }
     162              : 
     163              : /* Push the single SSA definition in DEF to the vector of vector defs.  */
     164              : 
     165              : void
     166       526592 : _slp_tree::push_vec_def (gimple *def)
     167              : {
     168       526592 :   if (gphi *phi = dyn_cast <gphi *> (def))
     169        58656 :     vec_defs.quick_push (gimple_phi_result (phi));
     170              :   else
     171              :     {
     172       467936 :       def_operand_p defop = single_ssa_def_operand (def, SSA_OP_ALL_DEFS);
     173       467936 :       vec_defs.quick_push (get_def_from_ptr (defop));
     174              :     }
     175       526592 : }
     176              : 
     177              : /* Recursively free the memory allocated for the SLP tree rooted at NODE.  */
     178              : 
     179              : void
     180     14671281 : vect_free_slp_tree (slp_tree node)
     181              : {
     182     14671281 :   int i;
     183     14671281 :   slp_tree child;
     184              : 
     185     14671281 :   if (--SLP_TREE_REF_COUNT (node) != 0)
     186     14671281 :     return;
     187              : 
     188     10976927 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
     189      3914490 :     if (child)
     190      3557140 :       vect_free_slp_tree (child);
     191              : 
     192      7062437 :   delete node;
     193              : }
     194              : 
     195              : /* Return a location suitable for dumpings related to the SLP instance.  */
     196              : 
     197              : dump_user_location_t
     198      3404189 : _slp_instance::location () const
     199              : {
     200      3404189 :   if (!root_stmts.is_empty ())
     201       319218 :     return root_stmts[0]->stmt;
     202              :   else
     203      3084971 :     return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
     204              : }
     205              : 
     206              : 
     207              : /* Free the memory allocated for the SLP instance.  */
     208              : 
     209              : void
     210      1555778 : vect_free_slp_instance (slp_instance instance)
     211              : {
     212      1555778 :   vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
     213      1555778 :   SLP_INSTANCE_LOADS (instance).release ();
     214      1555778 :   SLP_INSTANCE_ROOT_STMTS (instance).release ();
     215      1555778 :   SLP_INSTANCE_REMAIN_DEFS (instance).release ();
     216      1555778 :   instance->subgraph_entries.release ();
     217      1555778 :   instance->cost_vec.release ();
     218      1555778 :   free (instance);
     219      1555778 : }
     220              : 
     221              : 
     222              : /* Create a SLP node with NOPS children with CODE, either VEC_PERM_EXPR
     223              :    for a permute node or else ERROR_MARK.  */
     224              : 
     225              : slp_tree
     226        95248 : vect_create_new_slp_node (unsigned nops, tree_code code)
     227              : {
     228        95248 :   gcc_assert (code == ERROR_MARK || code == VEC_PERM_EXPR);
     229        95248 :   slp_tree node = new _slp_tree;
     230        95248 :   SLP_TREE_SCALAR_STMTS (node) = vNULL;
     231        95248 :   SLP_TREE_CHILDREN (node).create (nops);
     232        95248 :   SLP_TREE_DEF_TYPE (node) = vect_internal_def;
     233        95248 :   SLP_TREE_CODE (node) = code;
     234        95248 :   return node;
     235              : }
     236              : 
     237              : /* Create a SLP node inplace at NODE for SCALAR_STMTS and NOPS children.  */
     238              : 
     239              : static slp_tree
     240      3765227 : vect_create_new_slp_node (slp_tree node,
     241              :                           vec<stmt_vec_info> scalar_stmts, unsigned nops)
     242              : {
     243      3765227 :   SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
     244      3765227 :   SLP_TREE_CHILDREN (node).create (nops);
     245      3765227 :   SLP_TREE_DEF_TYPE (node) = vect_internal_def;
     246      3765227 :   SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
     247      3765227 :   SLP_TREE_LANES (node) = scalar_stmts.length ();
     248      3765227 :   return node;
     249              : }
     250              : 
     251              : /* Create an SLP node for SCALAR_STMTS and NOPS children.  */
     252              : 
     253              : static slp_tree
     254         8037 : vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
     255              : {
     256         8037 :   return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
     257              : }
     258              : 
     259              : /* Create a vect_external_def SLP node inplace at NODE for scalar
     260              :    operands OPS.  */
     261              : 
     262              : static slp_tree
     263      1839969 : vect_create_new_slp_node (slp_tree node, vec<tree> ops)
     264              : {
     265      1839969 :   SLP_TREE_SCALAR_OPS (node) = ops;
     266      1839969 :   SLP_TREE_DEF_TYPE (node) = vect_external_def;
     267            0 :   SLP_TREE_LANES (node) = ops.length ();
     268      1839969 :   return node;
     269              : }
     270              : 
     271              : /* Create a vect_external_def SLP node for scalar operands OPS.  */
     272              : 
     273              : static slp_tree
     274      1839969 : vect_create_new_slp_node (vec<tree> ops)
     275              : {
     276      1839969 :   return vect_create_new_slp_node (new _slp_tree, ops);
     277              : }
     278              : 
     279              : 
     280              : /* This structure is used in creation of an SLP tree.  Each instance
     281              :    corresponds to the same operand in a group of scalar stmts in an SLP
     282              :    node.  */
     283              : typedef struct _slp_oprnd_info
     284              : {
     285              :   /* Def-stmts for the operands.  */
     286              :   vec<stmt_vec_info> def_stmts;
     287              :   /* Operands.  */
     288              :   vec<tree> ops;
     289              :   /* Information about the first statement, its vector def-type, type, the
     290              :      operand itself in case it's constant, and an indication if it's a pattern
     291              :      stmt and gather/scatter info.  */
     292              :   tree first_op_type;
     293              :   enum vect_def_type first_dt;
     294              :   bool any_pattern;
     295              :   bool first_gs_p;
     296              :   gather_scatter_info first_gs_info;
     297              : } *slp_oprnd_info;
     298              : 
     299              : 
     300              : /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
     301              :    operand.  */
     302              : static vec<slp_oprnd_info>
     303      3332559 : vect_create_oprnd_info (int nops, int group_size)
     304              : {
     305      3332559 :   int i;
     306      3332559 :   slp_oprnd_info oprnd_info;
     307      3332559 :   vec<slp_oprnd_info> oprnds_info;
     308              : 
     309      3332559 :   oprnds_info.create (nops);
     310     11954468 :   for (i = 0; i < nops; i++)
     311              :     {
     312      5289350 :       oprnd_info = XNEW (struct _slp_oprnd_info);
     313      5289350 :       oprnd_info->def_stmts.create (group_size);
     314      5289350 :       oprnd_info->ops.create (group_size);
     315      5289350 :       oprnd_info->first_dt = vect_uninitialized_def;
     316      5289350 :       oprnd_info->first_op_type = NULL_TREE;
     317      5289350 :       oprnd_info->any_pattern = false;
     318      5289350 :       oprnd_info->first_gs_p = false;
     319      5289350 :       oprnds_info.quick_push (oprnd_info);
     320              :     }
     321              : 
     322      3332559 :   return oprnds_info;
     323              : }
     324              : 
     325              : 
     326              : /* Free operands info.  */
     327              : 
     328              : static void
     329      3332559 : vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
     330              : {
     331      3332559 :   int i;
     332      3332559 :   slp_oprnd_info oprnd_info;
     333              : 
     334      8621909 :   FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
     335              :     {
     336      5289350 :       oprnd_info->def_stmts.release ();
     337      5289350 :       oprnd_info->ops.release ();
     338      5289350 :       XDELETE (oprnd_info);
     339              :     }
     340              : 
     341      3332559 :   oprnds_info.release ();
     342      3332559 : }
     343              : 
     344              : /* Return the execution frequency of NODE (so that a higher value indicates
     345              :    a "more important" node when optimizing for speed).  */
     346              : 
     347              : static sreal
     348      3489700 : vect_slp_node_weight (slp_tree node)
     349              : {
     350      3489700 :   stmt_vec_info stmt_info = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (node));
     351      3489700 :   basic_block bb = gimple_bb (stmt_info->stmt);
     352      3489700 :   return bb->count.to_sreal_scale (ENTRY_BLOCK_PTR_FOR_FN (cfun)->count);
     353              : }
     354              : 
     355              : /* Return true if STMTS contains a pattern statement.  */
     356              : 
     357              : static bool
     358        22258 : vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
     359              : {
     360        22258 :   stmt_vec_info stmt_info;
     361        22258 :   unsigned int i;
     362        71982 :   FOR_EACH_VEC_ELT (stmts, i, stmt_info)
     363        51960 :     if (stmt_info && is_pattern_stmt_p (stmt_info))
     364              :       return true;
     365              :   return false;
     366              : }
     367              : 
     368              : /* Return true when all lanes in the external or constant NODE have
     369              :    the same value.  */
     370              : 
     371              : static bool
     372       594228 : vect_slp_tree_uniform_p (slp_tree node)
     373              : {
     374       594228 :   gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
     375              :               || SLP_TREE_DEF_TYPE (node) == vect_external_def);
     376              : 
     377              :   /* Pre-exsting vectors.  */
     378      1045727 :   if (SLP_TREE_SCALAR_OPS (node).is_empty ())
     379              :     return false;
     380              : 
     381              :   unsigned i;
     382              :   tree op, first = NULL_TREE;
     383      1361082 :   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
     384      1218353 :     if (!first)
     385              :       first = op;
     386       624125 :     else if (!operand_equal_p (first, op, 0))
     387              :       return false;
     388              : 
     389              :   return true;
     390              : }
     391              : 
     392              : /* Find the place of the data-ref in STMT_INFO in the interleaving chain
     393              :    that starts from FIRST_STMT_INFO.  Return -1 if the data-ref is not a part
     394              :    of the chain.  */
     395              : 
     396              : int
     397       701848 : vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
     398              :                                       stmt_vec_info first_stmt_info)
     399              : {
     400       701848 :   stmt_vec_info next_stmt_info = first_stmt_info;
     401       701848 :   int result = 0;
     402              : 
     403       701848 :   if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
     404              :     return -1;
     405              : 
     406      1753522 :   do
     407              :     {
     408      1753522 :       if (next_stmt_info == stmt_info)
     409              :         return result;
     410      1051674 :       next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
     411      1051674 :       if (next_stmt_info)
     412      1051674 :         result += DR_GROUP_GAP (next_stmt_info);
     413              :     }
     414      1051674 :   while (next_stmt_info);
     415              : 
     416              :   return -1;
     417              : }
     418              : 
     419              : /* Check whether it is possible to load COUNT elements of type ELT_TYPE
     420              :    using the method implemented by duplicate_and_interleave.  Return true
     421              :    if so, returning the number of intermediate vectors in *NVECTORS_OUT
     422              :    (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
     423              :    (if nonnull).  */
     424              : 
     425              : bool
     426            0 : can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
     427              :                                 tree elt_type, unsigned int *nvectors_out,
     428              :                                 tree *vector_type_out,
     429              :                                 tree *permutes)
     430              : {
     431            0 :   tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
     432            0 :   if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
     433            0 :     return false;
     434              : 
     435            0 :   machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
     436            0 :   poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
     437            0 :   unsigned int nvectors = 1;
     438            0 :   for (;;)
     439              :     {
     440            0 :       scalar_int_mode int_mode;
     441            0 :       poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
     442            0 :       if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
     443              :         {
     444              :           /* Get the natural vector type for this SLP group size.  */
     445            0 :           tree int_type = build_nonstandard_integer_type
     446            0 :             (GET_MODE_BITSIZE (int_mode), 1);
     447            0 :           tree vector_type
     448            0 :             = get_vectype_for_scalar_type (vinfo, int_type, count);
     449            0 :           poly_int64 half_nelts;
     450            0 :           if (vector_type
     451            0 :               && VECTOR_MODE_P (TYPE_MODE (vector_type))
     452            0 :               && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
     453              :                            GET_MODE_SIZE (base_vector_mode))
     454            0 :               && multiple_p (GET_MODE_NUNITS (TYPE_MODE (vector_type)),
     455              :                              2, &half_nelts))
     456              :             {
     457              :               /* Try fusing consecutive sequences of COUNT / NVECTORS elements
     458              :                  together into elements of type INT_TYPE and using the result
     459              :                  to build NVECTORS vectors.  */
     460            0 :               poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
     461            0 :               vec_perm_builder sel1 (nelts, 2, 3);
     462            0 :               vec_perm_builder sel2 (nelts, 2, 3);
     463              : 
     464            0 :               for (unsigned int i = 0; i < 3; ++i)
     465              :                 {
     466            0 :                   sel1.quick_push (i);
     467            0 :                   sel1.quick_push (i + nelts);
     468            0 :                   sel2.quick_push (half_nelts + i);
     469            0 :                   sel2.quick_push (half_nelts + i + nelts);
     470              :                 }
     471            0 :               vec_perm_indices indices1 (sel1, 2, nelts);
     472            0 :               vec_perm_indices indices2 (sel2, 2, nelts);
     473            0 :               machine_mode vmode = TYPE_MODE (vector_type);
     474            0 :               if (can_vec_perm_const_p (vmode, vmode, indices1)
     475            0 :                   && can_vec_perm_const_p (vmode, vmode, indices2))
     476              :                 {
     477            0 :                   if (nvectors_out)
     478            0 :                     *nvectors_out = nvectors;
     479            0 :                   if (vector_type_out)
     480            0 :                     *vector_type_out = vector_type;
     481            0 :                   if (permutes)
     482              :                     {
     483            0 :                       permutes[0] = vect_gen_perm_mask_checked (vector_type,
     484              :                                                                 indices1);
     485            0 :                       permutes[1] = vect_gen_perm_mask_checked (vector_type,
     486              :                                                                 indices2);
     487              :                     }
     488            0 :                   return true;
     489              :                 }
     490            0 :             }
     491              :         }
     492            0 :       if (!multiple_p (elt_bytes, 2, &elt_bytes))
     493              :         return false;
     494            0 :       nvectors *= 2;
     495              :       /* We need to be able to fuse COUNT / NVECTORS elements together.  */
     496            0 :       if (!multiple_p (count, nvectors))
     497              :         return false;
     498              :     }
     499              : }
     500              : 
     501              : /* Return true if DTA and DTB match.  */
     502              : 
     503              : static bool
     504     17006516 : vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
     505              : {
     506     17006516 :   return (dta == dtb
     507       350522 :           || ((dta == vect_external_def || dta == vect_constant_def)
     508       217337 :               && (dtb == vect_external_def || dtb == vect_constant_def)));
     509              : }
     510              : 
     511              : #define GATHER_SCATTER_OFFSET (-3)
     512              : 
     513              : /* For most SLP statements, there is a one-to-one mapping between
     514              :    gimple arguments and child nodes.  If that is not true for STMT,
     515              :    return an array that contains:
     516              : 
     517              :    - the number of child nodes, followed by
     518              :    - for each child node, the index of the argument associated with that node.
     519              :      The special index -1 is the first operand of an embedded comparison and
     520              :      the special index -2 is the second operand of an embedded comparison.
     521              :      The special indes -3 is the offset of a gather as analyzed by
     522              :      vect_check_gather_scatter.
     523              : 
     524              :    SWAP is as for vect_get_and_check_slp_defs.  */
     525              : 
     526              : static const int *
     527     24281791 : vect_get_operand_map (const gimple *stmt, bool gather_scatter_p,
     528              :                       unsigned char swap)
     529              : {
     530     24281791 :   static const int no_arg_map[] = { 0 };
     531     24281791 :   static const int arg0_map[] = { 1, 0 };
     532     24281791 :   static const int arg2_map[] = { 1, 2 };
     533     24281791 :   static const int arg2_arg3_map[] = { 2, 2, 3 };
     534     24281791 :   static const int arg2_arg4_map[] = { 2, 2, 4 };
     535     24281791 :   static const int arg2_arg5_arg6_map[] = { 3, 2, 5, 6 };
     536     24281791 :   static const int arg2_arg4_arg5_map[] = { 3, 2, 4, 5 };
     537     24281791 :   static const int arg3_arg2_map[] = { 2, 3, 2 };
     538     24281791 :   static const int op00_map[] = { 1, -1 };
     539     24281791 :   static const int op1_op0_map[] = { 2, 1, 0 };
     540     24281791 :   static const int off_map[] = { 1, GATHER_SCATTER_OFFSET };
     541     24281791 :   static const int off_op0_map[] = { 2, GATHER_SCATTER_OFFSET, 0 };
     542     24281791 :   static const int off_arg2_arg3_map[] = { 3, GATHER_SCATTER_OFFSET, 2, 3 };
     543     24281791 :   static const int off_arg3_arg2_map[] = { 3, GATHER_SCATTER_OFFSET, 3, 2 };
     544     24281791 :   static const int mask_call_maps[6][7] = {
     545              :         { 1, 1, },
     546              :         { 2, 1, 2, },
     547              :         { 3, 1, 2, 3, },
     548              :         { 4, 1, 2, 3, 4, },
     549              :         { 5, 1, 2, 3, 4, 5, },
     550              :         { 6, 1, 2, 3, 4, 5, 6 },
     551              :   };
     552              : 
     553     24281791 :   gcc_checking_assert (!swap
     554              :                        || !is_gimple_assign (stmt)
     555              :                        || TREE_CODE_CLASS
     556              :                             (gimple_assign_rhs_code (stmt)) == tcc_comparison
     557              :                        || commutative_tree_code
     558              :                             (gimple_assign_rhs_code (stmt)));
     559              : 
     560     24281791 :   if (auto assign = dyn_cast<const gassign *> (stmt))
     561              :     {
     562     22823475 :       tree_code code = gimple_assign_rhs_code (assign);
     563     22823475 :       if (code == COND_EXPR
     564     22823475 :           && COMPARISON_CLASS_P (gimple_assign_rhs1 (assign)))
     565            0 :         gcc_unreachable ();
     566     22823475 :       else if ((TREE_CODE_CLASS (code) == tcc_comparison
     567     21485444 :                 || commutative_tree_code (code))
     568     31761078 :                && swap)
     569              :         return op1_op0_map;
     570     22782730 :       else if (code == VIEW_CONVERT_EXPR)
     571              :         return op00_map;
     572     22774579 :       else if (gather_scatter_p)
     573        43349 :         return (TREE_CODE (gimple_assign_lhs (assign)) != SSA_NAME
     574        43349 :                 ? off_op0_map : off_map);
     575              :     }
     576      1458316 :   else if (auto call = dyn_cast<const gcall *> (stmt))
     577              :     {
     578       161573 :       if (gimple_call_internal_p (call))
     579        92080 :         switch (gimple_call_internal_fn (call))
     580              :           {
     581        15940 :           case IFN_MASK_LOAD:
     582        27186 :             return gather_scatter_p ? off_arg2_arg3_map : arg2_arg3_map;
     583              : 
     584              :           case IFN_GATHER_LOAD:
     585              :             return arg2_map;
     586              : 
     587            0 :           case IFN_MASK_GATHER_LOAD:
     588            0 :           case IFN_MASK_LEN_GATHER_LOAD:
     589            0 :             return arg2_arg5_arg6_map;
     590              : 
     591            0 :           case IFN_SCATTER_STORE:
     592            0 :             return arg2_arg4_map;
     593              : 
     594            0 :           case IFN_MASK_SCATTER_STORE:
     595            0 :           case IFN_MASK_LEN_SCATTER_STORE:
     596            0 :             return arg2_arg4_arg5_map;
     597              : 
     598         9481 :           case IFN_MASK_STORE:
     599        17540 :             return gather_scatter_p ? off_arg3_arg2_map : arg3_arg2_map;
     600              : 
     601          988 :           case IFN_MASK_CALL:
     602          988 :             {
     603          988 :               unsigned nargs = gimple_call_num_args (call);
     604          988 :               if (nargs >= 2 && nargs <= 7)
     605          988 :                 return mask_call_maps[nargs-2];
     606              :               else
     607              :                 return nullptr;
     608              :             }
     609              : 
     610          278 :           case IFN_CLZ:
     611          278 :           case IFN_CTZ:
     612          278 :             return arg0_map;
     613              : 
     614         6306 :           case IFN_GOMP_SIMD_LANE:
     615         6306 :             return no_arg_map;
     616              : 
     617              :           default:
     618              :             break;
     619              :           }
     620              :     }
     621              :   return nullptr;
     622              : }
     623              : 
     624              : static const int *
     625     24265806 : vect_get_operand_map (const stmt_vec_info stmt, unsigned char swap = 0)
     626              : {
     627            0 :   return vect_get_operand_map (stmt->stmt, STMT_VINFO_GATHER_SCATTER_P (stmt),
     628            0 :                                swap);
     629              : }
     630              : 
     631              : /* Return the SLP node child index for operand OP of STMT.  */
     632              : 
     633              : int
     634      1375777 : vect_slp_child_index_for_operand (const stmt_vec_info stmt, int op)
     635              : {
     636      1375777 :   const int *opmap = vect_get_operand_map (stmt);
     637      1375777 :   if (!opmap)
     638              :     return op;
     639        21863 :   for (int i = 1; i < 1 + opmap[0]; ++i)
     640        21863 :     if (opmap[i] == op)
     641        12246 :       return i - 1;
     642            0 :   gcc_unreachable ();
     643              : }
     644              : 
     645              : /* Helper class for mapping of GIMPLE operands to SLP children.  */
     646              : /* ???  Add vect_slp_child_index_for_operand here and amend opmaps
     647              :    with the full reverse mapping and indicating the position of the
     648              :    first commutative operand index, eliding the swap_p argument from
     649              :    vect_get_operand_map.  Adjust all consumers.  */
     650              : 
     651              : struct slp_oprnds {
     652              :   slp_oprnds (stmt_vec_info);
     653              :   tree get_op_for_slp_child (stmt_vec_info, unsigned);
     654              :   const int *opmap;
     655              :   const unsigned int num_slp_children;
     656              : };
     657              : 
     658      4414551 : slp_oprnds::slp_oprnds (stmt_vec_info stmt_info)
     659      4414551 :   : opmap (vect_get_operand_map (stmt_info)),
     660      4414551 :     num_slp_children (opmap ? opmap[0] : gimple_num_args (stmt_info->stmt))
     661              : {
     662      4414551 : }
     663              : 
     664              : /* For SLP child number N get the corresponding tree operand from GIMPLE
     665              :    statement described by STMT_INFO.  */
     666              : 
     667              : tree
     668      4867996 : slp_oprnds::get_op_for_slp_child (stmt_vec_info stmt_info, unsigned n)
     669              : {
     670      4867996 :   gcc_assert (n < num_slp_children);
     671      4867996 :   int opno = opmap ? opmap[n + 1] : (int) n;
     672      4867996 :   if (opno == GATHER_SCATTER_OFFSET)
     673            0 :     gcc_unreachable (); // TODO
     674      4867996 :   else if (opno < 0)
     675         1934 :     return TREE_OPERAND (gimple_arg (stmt_info->stmt, 0), -1 - opno);
     676              :   else
     677      4866062 :     return gimple_arg (stmt_info->stmt, opno);
     678              : }
     679              : 
     680              : /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
     681              :    they are of a valid type and that they match the defs of the first stmt of
     682              :    the SLP group (stored in OPRNDS_INFO).  This function tries to match stmts
     683              :    by swapping operands of STMTS[STMT_NUM] when possible.  Non-zero SWAP
     684              :    indicates swap is required for cond_expr stmts.  Specifically, SWAP
     685              :    is 1 if STMT is cond and operands of comparison need to be swapped;
     686              :    SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
     687              : 
     688              :    If there was a fatal error return -1; if the error could be corrected by
     689              :    swapping operands of the parent node of this one, return 1; if everything is
     690              :    ok return 0.  */
     691              : static int
     692     12708576 : vect_get_and_check_slp_defs (vec_info *vinfo, tree vectype, unsigned char swap,
     693              :                              bool *skip_args,
     694              :                              vec<stmt_vec_info> stmts, unsigned stmt_num,
     695              :                              vec<slp_oprnd_info> *oprnds_info)
     696              : {
     697     12708576 :   stmt_vec_info stmt_info = stmts[stmt_num];
     698     12708576 :   tree oprnd;
     699     12708576 :   unsigned int i, number_of_oprnds;
     700     12708576 :   enum vect_def_type dt = vect_uninitialized_def;
     701     12708576 :   slp_oprnd_info oprnd_info;
     702     12708576 :   gather_scatter_info gs_info;
     703     12708576 :   unsigned int gs_op = -1u;
     704     12708576 :   unsigned int commutative_op = -1U;
     705     12708576 :   bool first = stmt_num == 0;
     706              : 
     707     12708576 :   if (!stmt_info)
     708              :     {
     709            0 :       for (auto oi : *oprnds_info)
     710              :         {
     711            0 :           oi->def_stmts.quick_push (NULL);
     712            0 :           oi->ops.quick_push (NULL_TREE);
     713              :         }
     714              :       return 0;
     715              :     }
     716              : 
     717     12708576 :   if (!is_a<gcall *> (stmt_info->stmt)
     718              :       && !is_a<gassign *> (stmt_info->stmt)
     719              :       && !is_a<gphi *> (stmt_info->stmt))
     720              :     return -1;
     721              : 
     722     12708576 :   number_of_oprnds = gimple_num_args (stmt_info->stmt);
     723     12708576 :   const int *map = vect_get_operand_map (stmt_info, swap);
     724     12708576 :   if (map)
     725        75915 :     number_of_oprnds = *map++;
     726     12708576 :   if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
     727              :     {
     728        49322 :       if (gimple_call_internal_p (stmt))
     729              :         {
     730        32584 :           internal_fn ifn = gimple_call_internal_fn (stmt);
     731        32584 :           commutative_op = first_commutative_argument (ifn);
     732        32584 :           if (internal_gather_scatter_fn_p (ifn))
     733              :             {
     734            0 :               vect_describe_gather_scatter_call
     735            0 :                 (stmt_info,
     736            0 :                  first ? &(*oprnds_info)[0]->first_gs_info : &gs_info);
     737            0 :               if (first)
     738            0 :                 (*oprnds_info)[0]->first_gs_p = true;
     739              :               gs_op = 0;
     740              :             }
     741              :         }
     742              :     }
     743     12659254 :   else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
     744              :     {
     745     14778099 :       if (commutative_tree_code (gimple_assign_rhs_code (stmt)))
     746      8367446 :         commutative_op = 0;
     747              :     }
     748              : 
     749     12708576 :   bool swapped = (swap != 0);
     750     12708576 :   bool backedge = false;
     751     12708576 :   enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
     752     35153419 :   for (i = 0; i < number_of_oprnds; i++)
     753              :     {
     754     22446056 :       oprnd_info = (*oprnds_info)[i];
     755     22446056 :       int opno = map ? map[i] : int (i);
     756     22446056 :       if (opno == GATHER_SCATTER_OFFSET)
     757              :         {
     758        22752 :           gcc_assert (STMT_VINFO_GATHER_SCATTER_P (stmt_info));
     759        22752 :           if (!is_a <loop_vec_info> (vinfo)
     760        22752 :               || !vect_check_gather_scatter (stmt_info, vectype,
     761              :                                              as_a <loop_vec_info> (vinfo),
     762              :                                              first ? &oprnd_info->first_gs_info
     763              :                                              : &gs_info))
     764         1213 :             return -1;
     765              : 
     766        22752 :           if (first)
     767              :             {
     768        22501 :               oprnd_info->first_gs_p = true;
     769        22501 :               oprnd = oprnd_info->first_gs_info.offset;
     770              :             }
     771              :           else
     772              :             {
     773          251 :               gs_op = i;
     774          251 :               oprnd = gs_info.offset;
     775              :             }
     776              :         }
     777     22423304 :       else if (opno < 0)
     778         2842 :         oprnd = TREE_OPERAND (gimple_arg (stmt_info->stmt, 0), -1 - opno);
     779              :       else
     780              :         {
     781     22420462 :           oprnd = gimple_arg (stmt_info->stmt, opno);
     782     22420462 :           if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
     783              :             {
     784      1219502 :               edge e = gimple_phi_arg_edge (stmt, opno);
     785      2439004 :               backedge = (is_a <bb_vec_info> (vinfo)
     786      1880031 :                           ? e->flags & EDGE_DFS_BACK
     787       660529 :                           : dominated_by_p (CDI_DOMINATORS, e->src,
     788       660529 :                                             gimple_bb (stmt_info->stmt)));
     789              :             }
     790              :         }
     791              : 
     792     22446056 :       stmt_vec_info def_stmt_info;
     793     22446056 :       if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
     794              :         {
     795          995 :           if (dump_enabled_p ())
     796            0 :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
     797              :                              "Build SLP failed: can't analyze def for %T\n",
     798              :                              oprnd);
     799              : 
     800          995 :           return -1;
     801              :         }
     802              : 
     803     22445061 :       if (skip_args[i])
     804              :         {
     805       526482 :           oprnd_info->def_stmts.quick_push (NULL);
     806       526482 :           oprnd_info->ops.quick_push (NULL_TREE);
     807       526482 :           oprnd_info->first_dt = vect_uninitialized_def;
     808       526482 :           continue;
     809              :         }
     810              : 
     811     21918579 :       oprnd_info->def_stmts.quick_push (def_stmt_info);
     812     21918579 :       oprnd_info->ops.quick_push (oprnd);
     813              : 
     814     21918579 :       if (def_stmt_info
     815     21918579 :           && is_pattern_stmt_p (def_stmt_info))
     816              :         {
     817       396402 :           if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
     818              :               != def_stmt_info)
     819       279971 :             oprnd_info->any_pattern = true;
     820              :           else
     821              :             /* If we promote this to external, use the original stmt def.  */
     822       116431 :             oprnd_info->ops.last ()
     823       232862 :               = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
     824              :         }
     825              : 
     826              :       /* If there's an extern def on a backedge make sure we can
     827              :          code-generate at the region start.
     828              :          ???  This is another case that could be fixed by adjusting
     829              :          how we split the function but at the moment we'd have conflicting
     830              :          goals there.  */
     831     21918579 :       if (backedge
     832       167638 :           && dts[i] == vect_external_def
     833          239 :           && is_a <bb_vec_info> (vinfo)
     834          239 :           && TREE_CODE (oprnd) == SSA_NAME
     835          218 :           && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
     836     21918797 :           && !dominated_by_p (CDI_DOMINATORS, vinfo->bbs[0],
     837          218 :                               gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
     838              :         {
     839          218 :           if (dump_enabled_p ())
     840            0 :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
     841              :                              "Build SLP failed: extern def %T only defined "
     842              :                              "on backedge\n", oprnd);
     843          218 :           return -1;
     844              :         }
     845              : 
     846     21918361 :       if (first)
     847              :         {
     848      4800622 :           tree type = TREE_TYPE (oprnd);
     849      4800622 :           dt = dts[i];
     850              : 
     851              :           /* For the swapping logic below force vect_reduction_def
     852              :              for the reduction op in a SLP reduction group.  */
     853      4800622 :           if (!STMT_VINFO_DATA_REF (stmt_info)
     854      3629700 :               && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
     855         5210 :               && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
     856      4803199 :               && def_stmt_info)
     857         2577 :             dts[i] = dt = vect_reduction_def;
     858              : 
     859              :           /* Check the types of the definition.  */
     860      4800622 :           switch (dt)
     861              :             {
     862      4800622 :             case vect_external_def:
     863      4800622 :             case vect_constant_def:
     864      4800622 :             case vect_internal_def:
     865      4800622 :             case vect_reduction_def:
     866      4800622 :             case vect_double_reduction_def:
     867      4800622 :             case vect_induction_def:
     868      4800622 :             case vect_nested_cycle:
     869      4800622 :             case vect_first_order_recurrence:
     870      4800622 :               break;
     871              : 
     872            0 :             default:
     873              :               /* FORNOW: Not supported.  */
     874            0 :               if (dump_enabled_p ())
     875            0 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
     876              :                                  "Build SLP failed: illegal type of def %T\n",
     877              :                                  oprnd);
     878            0 :               return -1;
     879              :             }
     880              : 
     881      4800622 :           oprnd_info->first_dt = dt;
     882      4800622 :           oprnd_info->first_op_type = type;
     883              :         }
     884              :     }
     885     12707363 :   if (first)
     886              :     return 0;
     887              : 
     888              :   /* Now match the operand definition types to that of the first stmt.  */
     889     26240542 :   for (i = 0; i < number_of_oprnds;)
     890              :     {
     891     17131596 :       if (skip_args[i])
     892              :         {
     893        43202 :           ++i;
     894        43202 :           continue;
     895              :         }
     896              : 
     897     17088394 :       oprnd_info = (*oprnds_info)[i];
     898     17088394 :       dt = dts[i];
     899     17088394 :       stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
     900     17088394 :       oprnd = oprnd_info->ops[stmt_num];
     901     17088394 :       tree type = TREE_TYPE (oprnd);
     902              : 
     903     17088394 :       if (!types_compatible_p (oprnd_info->first_op_type, type))
     904              :         {
     905        87752 :           if (dump_enabled_p ())
     906          109 :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
     907              :                              "Build SLP failed: different operand types\n");
     908        87752 :           return 1;
     909              :         }
     910              : 
     911     17000642 :       if ((gs_op == i) != oprnd_info->first_gs_p)
     912              :         {
     913            0 :           if (dump_enabled_p ())
     914            0 :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
     915              :                              "Build SLP failed: mixed gather and non-gather\n");
     916            0 :           return 1;
     917              :         }
     918     17000642 :       else if (gs_op == i)
     919              :         {
     920          221 :           if (!operand_equal_p (oprnd_info->first_gs_info.base,
     921          221 :                                 gs_info.base))
     922              :             {
     923           16 :               if (dump_enabled_p ())
     924            6 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
     925              :                                  "Build SLP failed: different gather base\n");
     926           16 :               return 1;
     927              :             }
     928          205 :           if (oprnd_info->first_gs_info.scale != gs_info.scale)
     929              :             {
     930            8 :               if (dump_enabled_p ())
     931            2 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
     932              :                                  "Build SLP failed: different gather scale\n");
     933            8 :               return 1;
     934              :             }
     935              :         }
     936              : 
     937              :       /* Not first stmt of the group, check that the def-stmt/s match
     938              :          the def-stmt/s of the first stmt.  Allow different definition
     939              :          types for reduction chains: the first stmt must be a
     940              :          vect_reduction_def (a phi node), and the rest
     941              :          end in the reduction chain.  */
     942     17000618 :       if ((!vect_def_types_match (oprnd_info->first_dt, dt)
     943       293639 :            && !(oprnd_info->first_dt == vect_reduction_def
     944         4535 :                 && !STMT_VINFO_DATA_REF (stmt_info)
     945         4535 :                 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
     946         4509 :                 && def_stmt_info
     947         4509 :                 && !STMT_VINFO_DATA_REF (def_stmt_info)
     948         4509 :                 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
     949              :                     == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
     950     16711488 :           || (!STMT_VINFO_DATA_REF (stmt_info)
     951     15400078 :               && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
     952         9386 :               && ((!def_stmt_info
     953         9217 :                    || STMT_VINFO_DATA_REF (def_stmt_info)
     954        16906 :                    || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
     955              :                        != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
     956         9386 :                   != (oprnd_info->first_dt != vect_reduction_def))))
     957              :         {
     958              :           /* Try swapping operands if we got a mismatch.  For BB
     959              :              vectorization only in case it will clearly improve things.  */
     960       291579 :           if (i == commutative_op && !swapped
     961       289130 :               && (!is_a <bb_vec_info> (vinfo)
     962         4620 :                   || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
     963         4620 :                                              dts[i+1])
     964         1122 :                       && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
     965              :                           || vect_def_types_match
     966          156 :                                ((*oprnds_info)[i+1]->first_dt, dts[i])))))
     967              :             {
     968         2449 :               if (dump_enabled_p ())
     969          152 :                 dump_printf_loc (MSG_NOTE, vect_location,
     970              :                                  "trying swapped operands\n");
     971         2449 :               std::swap (dts[i], dts[i+1]);
     972         2449 :               std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
     973         2449 :                          (*oprnds_info)[i+1]->def_stmts[stmt_num]);
     974         2449 :               std::swap ((*oprnds_info)[i]->ops[stmt_num],
     975         2449 :                          (*oprnds_info)[i+1]->ops[stmt_num]);
     976              :               /* After swapping some operands we lost track whether an
     977              :                  operand has any pattern defs so be conservative here.  */
     978         2449 :               if ((*oprnds_info)[i]->any_pattern
     979         2449 :                   || (*oprnds_info)[i+1]->any_pattern)
     980           36 :                 (*oprnds_info)[i]->any_pattern
     981           18 :                   = (*oprnds_info)[i+1]->any_pattern = true;
     982         2449 :               swapped = true;
     983         2449 :               continue;
     984              :             }
     985              : 
     986       286681 :           if (is_a <bb_vec_info> (vinfo)
     987       271285 :               && !oprnd_info->any_pattern
     988       557728 :               && number_of_oprnds > 1)
     989              :             {
     990              :               /* Now for commutative ops we should see whether we can
     991              :                  make the other operand match.  */
     992       103532 :               if (dump_enabled_p ())
     993          203 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
     994              :                                  "treating operand as external\n");
     995       103532 :               oprnd_info->first_dt = dt = vect_external_def;
     996              :             }
     997              :           else
     998              :             {
     999       183149 :               if (dump_enabled_p ())
    1000          407 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1001              :                                  "Build SLP failed: different types\n");
    1002       183149 :               return 1;
    1003              :             }
    1004              :         }
    1005              : 
    1006              :       /* Make sure to demote the overall operand to external.  */
    1007     16815020 :       if (dt == vect_external_def)
    1008       333818 :         oprnd_info->first_dt = vect_external_def;
    1009              :       /* For a SLP reduction chain we want to duplicate the reduction to
    1010              :          each of the chain members.  That gets us a sane SLP graph (still
    1011              :          the stmts are not 100% correct wrt the initial values).  */
    1012     16481202 :       else if ((dt == vect_internal_def
    1013     16481202 :                 || dt == vect_reduction_def)
    1014     15556904 :                && oprnd_info->first_dt == vect_reduction_def
    1015       100868 :                && !STMT_VINFO_DATA_REF (stmt_info)
    1016       100868 :                && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
    1017         4509 :                && !STMT_VINFO_DATA_REF (def_stmt_info)
    1018     16485711 :                && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
    1019              :                    == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
    1020              :         {
    1021         4509 :           oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
    1022         4509 :           oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
    1023              :         }
    1024              : 
    1025     16815020 :       ++i;
    1026              :     }
    1027              : 
    1028              :   /* Report that operands were swapped; no swapping happens here.  */
    1029      9108946 :   if (swapped)
    1030              :     {
    1031        40814 :       if (dump_enabled_p ())
    1032          438 :         dump_printf_loc (MSG_NOTE, vect_location,
    1033              :                          "swapped operands to match def types in %G",
    1034              :                          stmt_info->stmt);
    1035              :     }
    1036              : 
    1037              :   return 0;
    1038              : }
    1039              : 
    1040              : /* Return true if call statements CALL1 and CALL2 are similar enough
    1041              :    to be combined into the same SLP group.  */
    1042              : 
    1043              : bool
    1044        21243 : compatible_calls_p (gcall *call1, gcall *call2, bool allow_two_operators)
    1045              : {
    1046        21243 :   unsigned int nargs = gimple_call_num_args (call1);
    1047        21243 :   if (nargs != gimple_call_num_args (call2))
    1048              :     return false;
    1049              : 
    1050        19292 :   auto cfn1 = gimple_call_combined_fn (call1);
    1051        19292 :   auto cfn2 = gimple_call_combined_fn (call2);
    1052        19292 :   if (cfn1 != cfn2
    1053            2 :       && (!allow_two_operators
    1054            2 :           || !((cfn1 == CFN_FMA || cfn1 == CFN_FMS)
    1055            2 :                && (cfn2 == CFN_FMA || cfn2 == CFN_FMS))))
    1056              :     return false;
    1057              : 
    1058        19292 :   if (gimple_call_internal_p (call1))
    1059              :     {
    1060         7009 :       if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
    1061         7009 :                                TREE_TYPE (gimple_call_lhs (call2))))
    1062              :         return false;
    1063        14432 :       for (unsigned int i = 0; i < nargs; ++i)
    1064         7423 :         if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
    1065         7423 :                                  TREE_TYPE (gimple_call_arg (call2, i))))
    1066              :           return false;
    1067              :     }
    1068              :   else
    1069              :     {
    1070        12283 :       if (!operand_equal_p (gimple_call_fn (call1),
    1071        12283 :                             gimple_call_fn (call2), 0))
    1072              :         return false;
    1073              : 
    1074        26928 :       if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
    1075              :         return false;
    1076              :     }
    1077              : 
    1078              :   /* Check that any unvectorized arguments are equal.  */
    1079        15985 :   if (const int *map = vect_get_operand_map (call1, false, false))
    1080              :     {
    1081           15 :       unsigned int nkept = *map++;
    1082           15 :       unsigned int mapi = 0;
    1083           57 :       for (unsigned int i = 0; i < nargs; ++i)
    1084           42 :         if (mapi < nkept && map[mapi] == int (i))
    1085           27 :           mapi += 1;
    1086           15 :         else if (!operand_equal_p (gimple_call_arg (call1, i),
    1087           15 :                                    gimple_call_arg (call2, i)))
    1088              :           return false;
    1089              :     }
    1090              : 
    1091              :   return true;
    1092              : }
    1093              : 
    1094              : /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
    1095              :    caller's attempt to find the vector type in STMT_INFO with the narrowest
    1096              :    element type.  Return false if VECTYPE is null or if VINFO is a
    1097              :    bb_vec_info and GROUP_SIZE is not a multiple of the number of units in
    1098              :    VECTYPE.  Otherwise update *MAX_NUNITS via vect_update_max_nunits and
    1099              :    return true.  GROUP_SIZE and MAX_NUNITS are as for vect_build_slp_tree.  */
    1100              : 
    1101              : static bool
    1102      5497333 : vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
    1103              :                         unsigned int group_size,
    1104              :                         tree vectype, poly_uint64 *max_nunits)
    1105              : {
    1106      5497333 :   if (!vectype)
    1107              :     {
    1108         3925 :       if (dump_enabled_p ())
    1109            7 :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1110              :                          "Build SLP failed: unsupported data-type in %G\n",
    1111              :                          stmt_info->stmt);
    1112              :       /* Fatal mismatch.  */
    1113         3925 :       return false;
    1114              :     }
    1115              : 
    1116              :   /* If populating the vector type requires unrolling then fail
    1117              :      before adjusting *max_nunits for basic-block vectorization.  */
    1118      5493408 :   if (is_a <bb_vec_info> (vinfo)
    1119      5493408 :       && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
    1120              :     {
    1121       142018 :       if (dump_enabled_p ())
    1122           34 :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1123              :                          "Build SLP failed: unrolling required "
    1124              :                          "in basic block SLP\n");
    1125              :       /* Fatal mismatch (*max_nunits is left unchanged).  */
    1126       142018 :       return false;
    1127              :     }
    1128              : 
    1129              :   /* In case of multiple types we need to detect the smallest type.  */
    1130      5351390 :   vect_update_max_nunits (max_nunits, vectype);
    1131      5351390 :   return true;
    1132              : }
    1133              : 
    1134              : /* Verify if the scalar stmts STMTS are isomorphic, require data
    1135              :    permutation or are of unsupported types of operation.  Return
    1136              :    true if they are, otherwise return false and indicate in *MATCHES
    1137              :    which stmts are not isomorphic to the first one.  If MATCHES[0]
    1138              :    is false then this indicates the comparison could not be
    1139              :    carried out or the stmts will never be vectorized by SLP.
    1140              : 
    1141              :    Note COND_EXPR is possibly isomorphic to another one after swapping its
    1142              :    operands.  Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
    1143              :    the first stmt by swapping the two operands of comparison; set SWAP[i]
    1144              :    to 2 if stmt I is isomorphic to the first stmt by inverting the code
    1145              :    of comparison.  Take A1 >= B1 ? X1 : Y1 as an example, it can be swapped
    1146              :    to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1.  */
    1147              : 
    1148              : static bool
    1149      5762971 : vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
    1150              :                        vec<stmt_vec_info> stmts, unsigned int group_size,
    1151              :                        poly_uint64 *max_nunits, bool *matches,
    1152              :                        bool *two_operators, tree *node_vectype)
    1153              : {
    1154      5762971 :   unsigned int i;
    1155      5762971 :   stmt_vec_info first_stmt_info = stmts[0];
    1156      5762971 :   code_helper first_stmt_code = ERROR_MARK;
    1157      5762971 :   code_helper alt_stmt_code = ERROR_MARK;
    1158      5762971 :   code_helper first_cond_code = ERROR_MARK;
    1159      5762971 :   bool need_same_oprnds = false;
    1160      5762971 :   tree first_lhs = NULL_TREE;
    1161      5762971 :   tree first_op1 = NULL_TREE;
    1162      5762971 :   stmt_vec_info first_load = NULL, prev_first_load = NULL;
    1163      5762971 :   bool first_stmt_ldst_p = false, first_stmt_ldst_masklen_p = false;
    1164      5762971 :   bool first_stmt_phi_p = false;
    1165      5762971 :   int first_reduc_idx = -1;
    1166      5762971 :   bool maybe_soft_fail = false;
    1167      5762971 :   tree soft_fail_nunits_vectype = NULL_TREE;
    1168              : 
    1169      5762971 :   tree vectype, nunits_vectype;
    1170      5762971 :   if (!vect_get_vector_types_for_stmt (vinfo, first_stmt_info, &vectype,
    1171              :                                        &nunits_vectype, group_size))
    1172              :     {
    1173              :       /* Fatal mismatch.  */
    1174       207234 :       matches[0] = false;
    1175       207234 :       return false;
    1176              :     }
    1177      5555737 :   if (is_a <bb_vec_info> (vinfo)
    1178      5555737 :       && known_le (TYPE_VECTOR_SUBPARTS (vectype), 1U))
    1179              :     {
    1180       358361 :       if (dump_enabled_p ())
    1181          296 :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1182              :                          "Build SLP failed: not using single lane "
    1183              :                          "vector type %T\n", vectype);
    1184       358361 :       matches[0] = false;
    1185       358361 :       return false;
    1186              :     }
    1187              :   /* Record nunits required but continue analysis, producing matches[]
    1188              :      as if nunits was not an issue.  This allows splitting of groups
    1189              :      to happen.  */
    1190      5197376 :   if (nunits_vectype
    1191      5197376 :       && !vect_record_max_nunits (vinfo, first_stmt_info, group_size,
    1192              :                                   nunits_vectype, max_nunits))
    1193              :     {
    1194       142018 :       gcc_assert (is_a <bb_vec_info> (vinfo));
    1195       142018 :       maybe_soft_fail = true;
    1196       142018 :       soft_fail_nunits_vectype = nunits_vectype;
    1197              :     }
    1198              : 
    1199      5197376 :   gcc_assert (vectype || !gimple_get_lhs (first_stmt_info->stmt));
    1200      5197376 :   *node_vectype = vectype;
    1201              : 
    1202              :   /* For every stmt in NODE find its def stmt/s.  */
    1203      5197376 :   stmt_vec_info stmt_info;
    1204     22179446 :   FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    1205              :     {
    1206     17145263 :       bool ldst_p = false;
    1207     17145263 :       bool ldst_masklen_p = false;
    1208     17145263 :       bool phi_p = false;
    1209     17145263 :       code_helper rhs_code = ERROR_MARK;
    1210              : 
    1211     17145263 :       swap[i] = 0;
    1212     17145263 :       matches[i] = false;
    1213     17145263 :       if (!stmt_info)
    1214              :         {
    1215        40246 :           matches[i] = true;
    1216     17022316 :           continue;
    1217              :         }
    1218              : 
    1219     17105017 :       gimple *stmt = stmt_info->stmt;
    1220     17105017 :       if (dump_enabled_p ())
    1221       218452 :         dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
    1222              : 
    1223              :       /* Fail to vectorize statements marked as unvectorizable, throw
    1224              :          or are volatile.  */
    1225     17105017 :       if (!STMT_VINFO_VECTORIZABLE (stmt_info)
    1226     16914600 :           || stmt_can_throw_internal (cfun, stmt)
    1227     33229063 :           || gimple_has_volatile_ops (stmt))
    1228              :         {
    1229       195916 :           if (dump_enabled_p ())
    1230          199 :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1231              :                              "Build SLP failed: unvectorizable statement %G",
    1232              :                              stmt);
    1233              :           /* ???  For BB vectorization we want to commutate operands in a way
    1234              :              to shuffle all unvectorizable defs into one operand and have
    1235              :              the other still vectorized.  The following doesn't reliably
    1236              :              work for this though but it's the easiest we can do here.  */
    1237       195916 :           if (is_a <bb_vec_info> (vinfo) && i != 0)
    1238        64372 :             continue;
    1239              :           /* Fatal mismatch.  */
    1240       131544 :           matches[0] = false;
    1241       131544 :           return false;
    1242              :         }
    1243              : 
    1244     16909101 :       gcall *call_stmt = dyn_cast <gcall *> (stmt);
    1245     16909101 :       tree lhs = gimple_get_lhs (stmt);
    1246     16909101 :       if (lhs == NULL_TREE && !call_stmt)
    1247              :         {
    1248           36 :           if (dump_enabled_p ())
    1249            0 :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1250              :                              "Build SLP failed: not GIMPLE_ASSIGN nor "
    1251              :                              "GIMPLE_CALL %G", stmt);
    1252           36 :           if (is_a <bb_vec_info> (vinfo) && i != 0)
    1253           36 :             continue;
    1254              :           /* Fatal mismatch.  */
    1255            0 :           matches[0] = false;
    1256            0 :           return false;
    1257              :         }
    1258              : 
    1259     16909065 :       if (call_stmt)
    1260              :         {
    1261       102597 :           combined_fn cfn = gimple_call_combined_fn (call_stmt);
    1262       102597 :           if (cfn != CFN_LAST && cfn != CFN_MASK_CALL)
    1263        58630 :             rhs_code = cfn;
    1264              :           else
    1265              :             rhs_code = CALL_EXPR;
    1266              : 
    1267       102597 :           if (cfn == CFN_GATHER_LOAD
    1268       102597 :               || cfn == CFN_SCATTER_STORE)
    1269              :             ldst_p = true;
    1270              :           else if (cfn == CFN_MASK_LOAD
    1271              :                    || cfn == CFN_MASK_GATHER_LOAD
    1272              :                    || cfn == CFN_MASK_LEN_GATHER_LOAD
    1273              :                    || cfn == CFN_MASK_SCATTER_STORE
    1274              :                    || cfn == CFN_MASK_LEN_SCATTER_STORE)
    1275              :             {
    1276              :               ldst_p = true;
    1277              :               ldst_masklen_p = true;
    1278              :             }
    1279              :           else if (cfn == CFN_MASK_STORE)
    1280              :             {
    1281              :               ldst_p = true;
    1282              :               ldst_masklen_p = true;
    1283              :               rhs_code = CFN_MASK_STORE;
    1284              :             }
    1285              :           else if (cfn == CFN_GOMP_SIMD_LANE)
    1286              :             ;
    1287        91063 :           else if ((cfn != CFN_LAST
    1288              :                     && cfn != CFN_MASK_CALL
    1289        47096 :                     && internal_fn_p (cfn)
    1290        36893 :                     && !vectorizable_internal_fn_p (as_internal_fn (cfn)))
    1291        90988 :                    || gimple_call_tail_p (call_stmt)
    1292        90988 :                    || gimple_call_noreturn_p (call_stmt)
    1293       182051 :                    || gimple_call_chain (call_stmt))
    1294              :             {
    1295          424 :               if (dump_enabled_p ())
    1296           13 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1297              :                                  "Build SLP failed: unsupported call type %G",
    1298              :                                  (gimple *) call_stmt);
    1299          424 :               if (is_a <bb_vec_info> (vinfo) && i != 0)
    1300           64 :                 continue;
    1301              :               /* Fatal mismatch.  */
    1302          360 :               matches[0] = false;
    1303          360 :               return false;
    1304              :             }
    1305              :         }
    1306     16806468 :       else if (gimple_code (stmt) == GIMPLE_PHI)
    1307              :         {
    1308              :           rhs_code = ERROR_MARK;
    1309              :           phi_p = true;
    1310              :         }
    1311              :       else
    1312              :         {
    1313     16015914 :           rhs_code = gimple_assign_rhs_code (stmt);
    1314     16015914 :           ldst_p = STMT_VINFO_DATA_REF (stmt_info) != nullptr;
    1315              :         }
    1316              : 
    1317              :       /* Check the operation.  */
    1318     16908641 :       if (i == 0)
    1319              :         {
    1320      5065472 :           first_lhs = lhs;
    1321      5065472 :           first_stmt_code = rhs_code;
    1322      5065472 :           first_stmt_ldst_p = ldst_p;
    1323      5065472 :           first_stmt_ldst_masklen_p = ldst_masklen_p;
    1324      5065472 :           first_stmt_phi_p = phi_p;
    1325      5065472 :           first_reduc_idx = STMT_VINFO_REDUC_IDX (stmt_info);
    1326              : 
    1327              :           /* Shift arguments should be equal in all the packed stmts for a
    1328              :              vector shift with scalar shift operand.  */
    1329      5065472 :           if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
    1330      4929982 :               || rhs_code == LROTATE_EXPR
    1331      9995382 :               || rhs_code == RROTATE_EXPR)
    1332              :             {
    1333              :               /* First see if we have a vector/vector shift.  */
    1334       135945 :               if (!directly_supported_p (rhs_code, vectype, optab_vector))
    1335              :                 {
    1336              :                   /* No vector/vector shift, try for a vector/scalar shift.  */
    1337       123911 :                   if (!directly_supported_p (rhs_code, vectype, optab_scalar))
    1338              :                     {
    1339        11991 :                       if (dump_enabled_p ())
    1340          386 :                         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1341              :                                          "Build SLP failed: "
    1342              :                                          "op not supported by target.\n");
    1343        11991 :                       if (is_a <bb_vec_info> (vinfo) && i != 0)
    1344              :                         continue;
    1345              :                       /* Fatal mismatch.  */
    1346        11991 :                       matches[0] = false;
    1347        11991 :                       return false;
    1348              :                     }
    1349       111920 :                   need_same_oprnds = true;
    1350       111920 :                   first_op1 = gimple_assign_rhs2 (stmt);
    1351              :                 }
    1352              :             }
    1353      4929527 :           else if (rhs_code == WIDEN_LSHIFT_EXPR)
    1354              :             {
    1355            0 :               need_same_oprnds = true;
    1356            0 :               first_op1 = gimple_assign_rhs2 (stmt);
    1357              :             }
    1358      4929527 :           else if (!ldst_p
    1359      4929527 :                    && rhs_code == BIT_FIELD_REF)
    1360              :             {
    1361         5773 :               tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
    1362         5773 :               if (!is_a <bb_vec_info> (vinfo)
    1363         5647 :                   || TREE_CODE (vec) != SSA_NAME
    1364              :                   /* When the element types are not compatible we pun the
    1365              :                      source to the target vectype which requires equal size.  */
    1366        11408 :                   || ((!VECTOR_TYPE_P (TREE_TYPE (vec))
    1367         4912 :                        || !types_compatible_p (TREE_TYPE (vectype),
    1368         4912 :                                                TREE_TYPE (TREE_TYPE (vec))))
    1369         1039 :                       && !operand_equal_p (TYPE_SIZE (vectype),
    1370         1039 :                                            TYPE_SIZE (TREE_TYPE (vec)))))
    1371              :                 {
    1372          781 :                   if (dump_enabled_p ())
    1373            0 :                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1374              :                                      "Build SLP failed: "
    1375              :                                      "BIT_FIELD_REF not supported\n");
    1376              :                   /* Fatal mismatch.  */
    1377          781 :                   matches[0] = false;
    1378          781 :                   return false;
    1379              :                 }
    1380              :             }
    1381      4923754 :           else if (rhs_code == CFN_DIV_POW2)
    1382              :             {
    1383            0 :               need_same_oprnds = true;
    1384            0 :               first_op1 = gimple_call_arg (call_stmt, 1);
    1385              :             }
    1386      4923754 :           else if (rhs_code == CFN_GOMP_SIMD_LANE)
    1387              :             {
    1388         3153 :               need_same_oprnds = true;
    1389         3153 :               first_op1 = gimple_call_arg (call_stmt, 1);
    1390              :             }
    1391              :         }
    1392              :       else
    1393              :         {
    1394     11843169 :           int comm_arg;
    1395     11843522 :           if (first_reduc_idx != STMT_VINFO_REDUC_IDX (stmt_info)
    1396              :               /* For SLP reduction groups the index isn't necessarily
    1397              :                  uniform but only that of the first stmt matters.  */
    1398         2169 :               && !(first_reduc_idx != -1
    1399         2169 :                    && STMT_VINFO_REDUC_IDX (stmt_info) != -1
    1400         2169 :                    && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
    1401     11843169 :               && !(first_reduc_idx != -1
    1402          982 :                    && STMT_VINFO_REDUC_IDX (stmt_info) != -1
    1403          982 :                    && (comm_arg = first_commutative_argument
    1404          982 :                                     (rhs_code, TREE_TYPE (lhs))) >= 0
    1405              :                    && (first_reduc_idx
    1406          773 :                        == 2 * comm_arg + 1 - STMT_VINFO_REDUC_IDX (stmt_info))))
    1407              :             {
    1408          353 :               if (dump_enabled_p ())
    1409              :                 {
    1410           12 :                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1411              :                                    "Build SLP failed: different reduc_idx "
    1412              :                                    "%d instead of %d in %G",
    1413              :                                    STMT_VINFO_REDUC_IDX (stmt_info),
    1414              :                                    first_reduc_idx, stmt);
    1415              :                 }
    1416              :               /* Mismatch.  */
    1417          353 :               continue;
    1418              :             }
    1419     11842816 :           if (!ldst_p
    1420      9291882 :               && first_stmt_code != rhs_code
    1421     13251164 :               && alt_stmt_code == ERROR_MARK)
    1422              :             alt_stmt_code = rhs_code;
    1423     13225959 :           if ((!ldst_p
    1424      9291882 :                && first_stmt_code != rhs_code
    1425      1408348 :                && (first_stmt_code != IMAGPART_EXPR
    1426          129 :                    || rhs_code != REALPART_EXPR)
    1427      1408328 :                && (first_stmt_code != REALPART_EXPR
    1428          526 :                    || rhs_code != IMAGPART_EXPR)
    1429              :                /* Handle mismatches in plus/minus by computing both
    1430              :                   and merging the results.  */
    1431      1408317 :                && !((((first_stmt_code == PLUS_EXPR
    1432      1304554 :                        || first_stmt_code == MINUS_EXPR)
    1433       132550 :                       && (alt_stmt_code == PLUS_EXPR
    1434       123459 :                           || alt_stmt_code == MINUS_EXPR))
    1435      1379078 :                      || ((first_stmt_code == CFN_FMA
    1436      1379076 :                           || first_stmt_code == CFN_FMS)
    1437            2 :                          && (alt_stmt_code == CFN_FMA
    1438            2 :                              || alt_stmt_code == CFN_FMS)))
    1439        29241 :                     && rhs_code == alt_stmt_code)
    1440      1418950 :                && !(first_stmt_code.is_tree_code ()
    1441      1302879 :                     && rhs_code.is_tree_code ()
    1442      1209241 :                     && (TREE_CODE_CLASS (tree_code (first_stmt_code))
    1443              :                         == tcc_comparison)
    1444       127081 :                     && (swap_tree_comparison (tree_code (first_stmt_code))
    1445       127081 :                         == tree_code (rhs_code))
    1446              :                     && (first_reduc_idx == -1
    1447            0 :                         || REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
    1448              :               || (ldst_p
    1449      5101868 :                   && (STMT_VINFO_GROUPED_ACCESS (stmt_info)
    1450      2550934 :                       != STMT_VINFO_GROUPED_ACCESS (first_stmt_info)))
    1451              :               || (ldst_p
    1452      2507529 :                   && (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
    1453      2507529 :                       != STMT_VINFO_GATHER_SCATTER_P (first_stmt_info)))
    1454     10459820 :               || first_stmt_ldst_p != ldst_p
    1455     10459681 :               || (ldst_p && first_stmt_ldst_masklen_p != ldst_masklen_p)
    1456     22302489 :               || first_stmt_phi_p != phi_p)
    1457              :             {
    1458      1383143 :               if (dump_enabled_p ())
    1459              :                 {
    1460         2929 :                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1461              :                                    "Build SLP failed: different operation "
    1462              :                                    "in stmt %G", stmt);
    1463         2929 :                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1464              :                                    "original stmt %G", first_stmt_info->stmt);
    1465              :                 }
    1466              :               /* Mismatch.  */
    1467      1383143 :               continue;
    1468              :             }
    1469              : 
    1470     10462002 :           if (!ldst_p
    1471      7952279 :               && first_stmt_code == BIT_FIELD_REF
    1472     10465391 :               && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
    1473         5718 :                   != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
    1474              :             {
    1475         2329 :               if (dump_enabled_p ())
    1476           40 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1477              :                                  "Build SLP failed: different BIT_FIELD_REF "
    1478              :                                  "arguments in %G", stmt);
    1479              :               /* Mismatch.  */
    1480         2329 :               continue;
    1481              :             }
    1482              : 
    1483     10457344 :           if (call_stmt
    1484        22075 :               && first_stmt_code != CFN_MASK_LOAD
    1485     10478933 :               && first_stmt_code != CFN_MASK_STORE)
    1486              :             {
    1487        21243 :               if (!is_a <gcall *> (stmts[0]->stmt)
    1488        21243 :                   || !compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
    1489              :                                           call_stmt, true))
    1490              :                 {
    1491         5258 :                   if (dump_enabled_p ())
    1492            0 :                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1493              :                                      "Build SLP failed: different calls in %G",
    1494              :                                      stmt);
    1495              :                   /* Mismatch.  */
    1496         5258 :                   continue;
    1497              :                 }
    1498              :             }
    1499              : 
    1500     10265931 :           if ((phi_p || gimple_could_trap_p (stmt_info->stmt))
    1501     11252021 :               && (gimple_bb (first_stmt_info->stmt)
    1502       986090 :                   != gimple_bb (stmt_info->stmt)))
    1503              :             {
    1504        27268 :               if (dump_enabled_p ())
    1505            8 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1506              :                                  "Build SLP failed: different BB for PHI "
    1507              :                                  "or possibly trapping operation in %G", stmt);
    1508              :               /* Mismatch.  */
    1509        27268 :               continue;
    1510              :             }
    1511              : 
    1512     10424818 :           if (need_same_oprnds)
    1513              :             {
    1514        55226 :               tree other_op1 = gimple_arg (stmt, 1);
    1515        55226 :               if (!operand_equal_p (first_op1, other_op1, 0))
    1516              :                 {
    1517         7630 :                   if (dump_enabled_p ())
    1518          123 :                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1519              :                                      "Build SLP failed: different shift "
    1520              :                                      "arguments in %G", stmt);
    1521              :                   /* Mismatch.  */
    1522         7630 :                   continue;
    1523              :                 }
    1524              :             }
    1525              : 
    1526     10417925 :           if (first_lhs
    1527     10417188 :               && lhs
    1528     10417188 :               && !types_compatible_p (TREE_TYPE (lhs), TREE_TYPE (first_lhs)))
    1529              :             {
    1530          737 :               if (dump_enabled_p ())
    1531            6 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1532              :                                  "Build SLP failed: different vector type "
    1533              :                                  "in %G", stmt);
    1534              :               /* Mismatch.  */
    1535          737 :               continue;
    1536              :             }
    1537              :         }
    1538              : 
    1539              :       /* Grouped store or load.  */
    1540     15469151 :       if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    1541              :         {
    1542      3871822 :           gcc_assert (ldst_p);
    1543      3871822 :           if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
    1544              :             {
    1545              :               /* Store.  */
    1546      3035756 :               gcc_assert (rhs_code == CFN_MASK_STORE
    1547              :                           || REFERENCE_CLASS_P (lhs)
    1548              :                           || DECL_P (lhs));
    1549              :             }
    1550              :           else
    1551              :             {
    1552              :               /* Load.  */
    1553       836066 :               first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
    1554       836066 :               if (prev_first_load)
    1555              :                 {
    1556              :                   /* Check that there are no loads from different interleaving
    1557              :                      chains in the same node.  */
    1558       381347 :                   if (prev_first_load != first_load)
    1559              :                     {
    1560        54516 :                       if (dump_enabled_p ())
    1561         1994 :                         dump_printf_loc (MSG_MISSED_OPTIMIZATION,
    1562              :                                          vect_location,
    1563              :                                          "Build SLP failed: different "
    1564              :                                          "interleaving chains in one node %G",
    1565              :                                          stmt);
    1566              :                       /* Mismatch.  */
    1567        54516 :                       continue;
    1568              :                     }
    1569              :                 }
    1570              :               else
    1571              :                 prev_first_load = first_load;
    1572              :            }
    1573              :         }
    1574              :       /* Non-grouped store or load.  */
    1575     11597329 :       else if (ldst_p)
    1576              :         {
    1577       888095 :           if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))
    1578       616978 :               && rhs_code != CFN_GATHER_LOAD
    1579              :               && rhs_code != CFN_MASK_GATHER_LOAD
    1580              :               && rhs_code != CFN_MASK_LEN_GATHER_LOAD
    1581              :               && rhs_code != CFN_SCATTER_STORE
    1582              :               && rhs_code != CFN_MASK_SCATTER_STORE
    1583              :               && rhs_code != CFN_MASK_LEN_SCATTER_STORE
    1584       616978 :               && !STMT_VINFO_GATHER_SCATTER_P (stmt_info)
    1585              :               /* Not grouped loads are handled as externals for BB
    1586              :                  vectorization.  For loop vectorization we can handle
    1587              :                  splats the same we handle single element interleaving.
    1588              :                  Likewise we can handle a collection of invariant refs.  */
    1589      1486222 :               && (is_a <bb_vec_info> (vinfo)
    1590       598127 :                   || (stmt_info != first_stmt_info
    1591        68088 :                   && !(integer_zerop (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)))
    1592          241 :                       && integer_zerop (DR_STEP (STMT_VINFO_DATA_REF
    1593              :                                                          (first_stmt_info)))))))
    1594              :             {
    1595              :               /* Not grouped load.  */
    1596        67606 :               if (dump_enabled_p ())
    1597          145 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1598              :                                  "Build SLP failed: not grouped load %G", stmt);
    1599              : 
    1600        67606 :               if (i != 0)
    1601        67606 :                 continue;
    1602              :               /* Fatal mismatch.  */
    1603            0 :               matches[0] = false;
    1604            0 :               return false;
    1605              :             }
    1606              :         }
    1607              :       /* Not memory operation.  */
    1608              :       else
    1609              :         {
    1610     10709234 :           if (!phi_p
    1611     10041670 :               && rhs_code.is_tree_code ()
    1612      9993049 :               && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_binary
    1613      1520940 :               && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_unary
    1614       940254 :               && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_expression
    1615       878250 :               && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_comparison
    1616        64930 :               && rhs_code != VIEW_CONVERT_EXPR
    1617              :               && rhs_code != CALL_EXPR
    1618              :               && rhs_code != BIT_FIELD_REF
    1619     10709234 :               && rhs_code != SSA_NAME)
    1620              :             {
    1621        18517 :               if (dump_enabled_p ())
    1622            7 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1623              :                                  "Build SLP failed: operation unsupported %G",
    1624              :                                  stmt);
    1625        18517 :               if (is_a <bb_vec_info> (vinfo) && i != 0)
    1626            0 :                 continue;
    1627              :               /* Fatal mismatch.  */
    1628        18517 :               matches[0] = false;
    1629        18517 :               return false;
    1630              :             }
    1631              : 
    1632     10690717 :           if (rhs_code == COND_EXPR)
    1633              :             {
    1634        59121 :               tree cond_expr = gimple_assign_rhs1 (stmt);
    1635        59121 :               enum tree_code cond_code = TREE_CODE (cond_expr);
    1636        59121 :               enum tree_code swap_code = ERROR_MARK;
    1637        59121 :               enum tree_code invert_code = ERROR_MARK;
    1638              : 
    1639        59121 :               if (i == 0)
    1640        49828 :                 first_cond_code = TREE_CODE (cond_expr);
    1641         9293 :               else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
    1642              :                 {
    1643            0 :                   bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
    1644            0 :                   swap_code = swap_tree_comparison (cond_code);
    1645            0 :                   invert_code = invert_tree_comparison (cond_code, honor_nans);
    1646              :                 }
    1647              : 
    1648        59121 :               if (first_cond_code == cond_code)
    1649              :                 ;
    1650              :               /* Isomorphism can be achieved by swapping.  */
    1651            0 :               else if (first_cond_code == swap_code)
    1652            0 :                 swap[i] = 1;
    1653              :               /* Isomorphism can be achieved by inverting.  */
    1654            0 :               else if (first_cond_code == invert_code)
    1655            0 :                 swap[i] = 2;
    1656              :               else
    1657              :                 {
    1658            0 :                   if (dump_enabled_p ())
    1659            0 :                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1660              :                                      "Build SLP failed: different"
    1661              :                                      " operation %G", stmt);
    1662              :                   /* Mismatch.  */
    1663            0 :                   continue;
    1664              :                 }
    1665              :             }
    1666              : 
    1667     10690717 :           if (i != 0
    1668      7910343 :               && first_stmt_code != rhs_code
    1669        68745 :               && first_stmt_code.is_tree_code ()
    1670        68743 :               && rhs_code.is_tree_code ()
    1671        68743 :               && TREE_CODE_CLASS ((tree_code)first_stmt_code) == tcc_comparison
    1672     10730406 :               && (swap_tree_comparison ((tree_code)first_stmt_code)
    1673        39689 :                   == (tree_code)rhs_code))
    1674        39689 :             swap[i] = 1;
    1675              : 
    1676     10690717 :           if (i != 0
    1677      7910343 :               && first_reduc_idx != STMT_VINFO_REDUC_IDX (stmt_info)
    1678         1574 :               && first_reduc_idx != -1
    1679         1574 :               && STMT_VINFO_REDUC_IDX (stmt_info) != -1
    1680         1574 :               && rhs_code.is_tree_code ()
    1681         1566 :               && commutative_tree_code (tree_code (rhs_code))
    1682     10692283 :               && first_reduc_idx == 1 - STMT_VINFO_REDUC_IDX (stmt_info))
    1683         1566 :             swap[i] = 1;
    1684              :         }
    1685              : 
    1686     15328512 :       matches[i] = true;
    1687              :     }
    1688              : 
    1689     20372186 :   for (i = 0; i < group_size; ++i)
    1690     16019815 :     if (!matches[i])
    1691              :       return false;
    1692              : 
    1693              :   /* If we allowed a two-operation SLP node verify the target can cope
    1694              :      with the permute we are going to use.  */
    1695      4352371 :   if (alt_stmt_code != ERROR_MARK
    1696      4352371 :       && (!alt_stmt_code.is_tree_code ()
    1697        53561 :           || (TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_reference
    1698        53561 :               && TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_comparison)))
    1699              :     {
    1700        14463 :       *two_operators = true;
    1701              :     }
    1702              : 
    1703      4352371 :   if (maybe_soft_fail)
    1704              :     {
    1705       141610 :       unsigned HOST_WIDE_INT const_nunits;
    1706       141610 :       if (!TYPE_VECTOR_SUBPARTS
    1707       141610 :             (soft_fail_nunits_vectype).is_constant (&const_nunits)
    1708       141610 :           || const_nunits > group_size)
    1709            0 :         matches[0] = false;
    1710              :       else
    1711              :         {
    1712              :           /* With constant vector elements simulate a mismatch at the
    1713              :              point we need to split.  */
    1714       141610 :           unsigned tail = group_size & (const_nunits - 1);
    1715       141610 :           memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
    1716              :         }
    1717       141610 :       return false;
    1718              :     }
    1719              : 
    1720              :   return true;
    1721              : }
    1722              : 
    1723              : /* Traits for the hash_map to record SLP builds, successful or failed,
    1724              :    for a stmt set.  Note we never remove apart from at destruction time
    1725              :    so we do not need a special value for deleted that differs from empty.  */
    1726              : struct bst_traits
    1727              : {
    1728              :   typedef vec <stmt_vec_info> value_type;
    1729              :   typedef vec <stmt_vec_info> compare_type;
    1730              :   static inline hashval_t hash (value_type);
    1731              :   static inline bool equal (value_type existing, value_type candidate);
    1732    479989088 :   static inline bool is_empty (value_type x) { return !x.exists (); }
    1733    107526258 :   static inline bool is_deleted (value_type x) { return !x.exists (); }
    1734              :   static const bool empty_zero_p = true;
    1735            0 :   static inline void mark_empty (value_type &x) { x.release (); }
    1736              :   static inline void mark_deleted (value_type &x) { x.release (); }
    1737      9231886 :   static inline void remove (value_type &x) { x.release (); }
    1738              : };
    1739              : inline hashval_t
    1740     93669909 : bst_traits::hash (value_type x)
    1741              : {
    1742     93669909 :   inchash::hash h;
    1743    424237028 :   for (unsigned i = 0; i < x.length (); ++i)
    1744    330567119 :     h.add_int (x[i] ? gimple_uid (x[i]->stmt) : -1);
    1745     93669909 :   return h.end ();
    1746              : }
    1747              : inline bool
    1748     81959155 : bst_traits::equal (value_type existing, value_type candidate)
    1749              : {
    1750    245877465 :   if (existing.length () != candidate.length ())
    1751              :     return false;
    1752     83357726 :   for (unsigned i = 0; i < existing.length (); ++i)
    1753     79004451 :     if (existing[i] != candidate[i])
    1754              :       return false;
    1755              :   return true;
    1756              : }
    1757              : 
    1758              : typedef hash_map <vec <stmt_vec_info>, slp_tree,
    1759              :                   simple_hashmap_traits <bst_traits, slp_tree> >
    1760              :   scalar_stmts_to_slp_tree_map_t;
    1761              : 
    1762              : /* Release BST_MAP.  */
    1763              : 
    1764              : static void
    1765      1794282 : release_scalar_stmts_to_slp_tree_map (scalar_stmts_to_slp_tree_map_t *bst_map)
    1766              : {
    1767              :   /* The map keeps a reference on SLP nodes built, release that.  */
    1768     11026168 :   for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
    1769     20258054 :        it != bst_map->end (); ++it)
    1770      9231886 :     if ((*it).second)
    1771      9231886 :       vect_free_slp_tree ((*it).second);
    1772      1794282 :   delete bst_map;
    1773      1794282 : }
    1774              : 
    1775              : /* ???  This was std::pair<std::pair<tree_code, vect_def_type>, tree>
    1776              :    but then vec::insert does memmove and that's not compatible with
    1777              :    std::pair.  */
    1778              : struct chain_op_t
    1779              : {
    1780      3691438 :   chain_op_t (tree_code code_, vect_def_type dt_, tree op_)
    1781      3691438 :       : code (code_), dt (dt_), op (op_) {}
    1782              :   tree_code code;
    1783              :   vect_def_type dt;
    1784              :   tree op;
    1785              : };
    1786              : 
    1787              : /* Comparator for sorting associatable chains.  */
    1788              : 
    1789              : static int
    1790      8248221 : dt_sort_cmp (const void *op1_, const void *op2_, void *)
    1791              : {
    1792      8248221 :   auto *op1 = (const chain_op_t *) op1_;
    1793      8248221 :   auto *op2 = (const chain_op_t *) op2_;
    1794      8248221 :   if (op1->dt != op2->dt)
    1795       942589 :     return (int)op1->dt - (int)op2->dt;
    1796      7305632 :   return (int)op1->code - (int)op2->code;
    1797              : }
    1798              : 
    1799              : /* Linearize the associatable expression chain at START with the
    1800              :    associatable operation CODE (where PLUS_EXPR also allows MINUS_EXPR),
    1801              :    filling CHAIN with the result and using WORKLIST as intermediate storage.
    1802              :    CODE_STMT and ALT_CODE_STMT are filled with the first stmt using CODE
    1803              :    or MINUS_EXPR.  *CHAIN_STMTS if not NULL is filled with all computation
    1804              :    stmts, starting with START.  When ALLOW_ALT_CODE is false, do not
    1805              :    follow into MINUS_EXPR when building a PLUS chain (treat MINUS as leaf).  */
    1806              : 
    1807              : static void
    1808      1671252 : vect_slp_linearize_chain (vec_info *vinfo,
    1809              :                           vec<std::pair<tree_code, gimple *> > &worklist,
    1810              :                           vec<chain_op_t> &chain,
    1811              :                           enum tree_code code, gimple *start,
    1812              :                           gimple *&code_stmt, gimple *&alt_code_stmt,
    1813              :                           vec<gimple *> *chain_stmts,
    1814              :                           bool allow_alt_code = true)
    1815              : {
    1816              :   /* For each lane linearize the addition/subtraction (or other
    1817              :      uniform associatable operation) expression tree.  */
    1818      1671252 :   worklist.safe_push (std::make_pair (code, start));
    1819      3691438 :   while (!worklist.is_empty ())
    1820              :     {
    1821      2020186 :       auto entry = worklist.pop ();
    1822      2020186 :       gassign *stmt = as_a <gassign *> (entry.second);
    1823      2020186 :       enum tree_code in_code = entry.first;
    1824      4040372 :       enum tree_code this_code = gimple_assign_rhs_code (stmt);
    1825              :       /* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE.  */
    1826      2020186 :       if (!code_stmt
    1827      2020186 :           && gimple_assign_rhs_code (stmt) == code)
    1828      1418096 :         code_stmt = stmt;
    1829       602090 :       else if (!alt_code_stmt
    1830       602090 :                && gimple_assign_rhs_code (stmt) == MINUS_EXPR)
    1831       305953 :         alt_code_stmt = stmt;
    1832      2020186 :       if (chain_stmts)
    1833      1947409 :         chain_stmts->safe_push (stmt);
    1834      6060558 :       for (unsigned opnum = 1; opnum <= 2; ++opnum)
    1835              :         {
    1836      4040372 :           tree op = gimple_op (stmt, opnum);
    1837      4040372 :           vect_def_type dt;
    1838      4040372 :           stmt_vec_info def_stmt_info;
    1839      4040372 :           bool res = vect_is_simple_use (op, vinfo, &dt, &def_stmt_info);
    1840      4040372 :           gcc_assert (res);
    1841      4040372 :           if (dt == vect_internal_def
    1842      4040372 :               && is_pattern_stmt_p (def_stmt_info))
    1843         8636 :             op = gimple_get_lhs (def_stmt_info->stmt);
    1844      4040372 :           gimple *use_stmt;
    1845      4040372 :           use_operand_p use_p;
    1846      4040372 :           if (dt == vect_internal_def
    1847      3756986 :               && single_imm_use (op, &use_p, &use_stmt)
    1848      2329185 :               && is_gimple_assign (def_stmt_info->stmt)
    1849      6190318 :               && (gimple_assign_rhs_code (def_stmt_info->stmt) == code
    1850      1801324 :                   || (allow_alt_code
    1851        49385 :                       && code == PLUS_EXPR
    1852        28860 :                       && (gimple_assign_rhs_code (def_stmt_info->stmt)
    1853              :                           == MINUS_EXPR))))
    1854              :             {
    1855       348934 :               tree_code op_def_code = this_code;
    1856       348934 :               if (op_def_code == MINUS_EXPR && opnum == 1)
    1857        51097 :                 op_def_code = PLUS_EXPR;
    1858       348934 :               if (in_code == MINUS_EXPR)
    1859          135 :                 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
    1860       348934 :               worklist.safe_push (std::make_pair (op_def_code,
    1861       348934 :                                                   def_stmt_info->stmt));
    1862              :             }
    1863              :           else
    1864              :             {
    1865      3691438 :               tree_code op_def_code = this_code;
    1866      3691438 :               if (op_def_code == MINUS_EXPR && opnum == 1)
    1867       254973 :                 op_def_code = PLUS_EXPR;
    1868      3691438 :               if (in_code == MINUS_EXPR)
    1869         3997 :                 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
    1870      3691438 :               chain.safe_push (chain_op_t (op_def_code, dt, op));
    1871              :             }
    1872              :         }
    1873              :     }
    1874      1671252 : }
    1875              : 
    1876              : static slp_tree
    1877              : vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
    1878              :                        vec<stmt_vec_info> stmts, unsigned int group_size,
    1879              :                        poly_uint64 *max_nunits,
    1880              :                        bool *matches, unsigned *limit, unsigned *tree_size,
    1881              :                        scalar_stmts_to_slp_tree_map_t *bst_map);
    1882              : 
    1883              : static slp_tree
    1884      6250055 : vect_build_slp_tree (vec_info *vinfo,
    1885              :                      vec<stmt_vec_info> stmts, unsigned int group_size,
    1886              :                      poly_uint64 *max_nunits,
    1887              :                      bool *matches, unsigned *limit, unsigned *tree_size,
    1888              :                      scalar_stmts_to_slp_tree_map_t *bst_map)
    1889              : {
    1890      6250055 :   if (slp_tree *leader = bst_map->get (stmts))
    1891              :     {
    1892       481847 :       if (dump_enabled_p ())
    1893        17132 :         dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
    1894        17132 :                          !(*leader)->failed ? "" : "failed ",
    1895              :                          (void *) *leader);
    1896       481847 :       if (!(*leader)->failed)
    1897              :         {
    1898       434016 :           SLP_TREE_REF_COUNT (*leader)++;
    1899       434016 :           vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
    1900       434016 :           stmts.release ();
    1901       434016 :           return *leader;
    1902              :         }
    1903        47831 :       memcpy (matches, (*leader)->failed, sizeof (bool) * group_size);
    1904        47831 :       return NULL;
    1905              :     }
    1906              : 
    1907              :   /* Single-lane SLP doesn't have the chance of run-away, do not account
    1908              :      it to the limit.  */
    1909      5768208 :   if (stmts.length () > 1)
    1910              :     {
    1911      3183116 :       if (*limit == 0)
    1912              :         {
    1913         1235 :           if (dump_enabled_p ())
    1914           12 :             dump_printf_loc (MSG_NOTE, vect_location,
    1915              :                              "SLP discovery limit exceeded\n");
    1916         1235 :           memset (matches, 0, sizeof (bool) * group_size);
    1917         1235 :           return NULL;
    1918              :         }
    1919      3181881 :       --*limit;
    1920              :     }
    1921              : 
    1922              :   /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
    1923              :      so we can pick up backedge destinations during discovery.  */
    1924      5766973 :   slp_tree res = new _slp_tree;
    1925      5766973 :   SLP_TREE_DEF_TYPE (res) = vect_internal_def;
    1926      5766973 :   SLP_TREE_SCALAR_STMTS (res) = stmts;
    1927      5766973 :   bst_map->put (stmts.copy (), res);
    1928              : 
    1929      5766973 :   if (dump_enabled_p ())
    1930       146007 :     dump_printf_loc (MSG_NOTE, vect_location,
    1931              :                      "starting SLP discovery for node %p\n", (void *) res);
    1932              : 
    1933      5766973 :   poly_uint64 this_max_nunits = 1;
    1934      5766973 :   slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
    1935              :                                         &this_max_nunits,
    1936              :                                         matches, limit, tree_size, bst_map);
    1937      5766973 :   if (!res_)
    1938              :     {
    1939      2013173 :       if (dump_enabled_p ())
    1940         8297 :         dump_printf_loc (MSG_NOTE, vect_location,
    1941              :                          "SLP discovery for node %p failed\n", (void *) res);
    1942              :       /* Mark the node invalid so we can detect those when still in use
    1943              :          as backedge destinations.  */
    1944      2013173 :       SLP_TREE_SCALAR_STMTS (res) = vNULL;
    1945      2013173 :       SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
    1946      2013173 :       res->failed = XNEWVEC (bool, group_size);
    1947      2013173 :       if (flag_checking)
    1948              :         {
    1949              :           unsigned i;
    1950      3567746 :           for (i = 0; i < group_size; ++i)
    1951      3567746 :             if (!matches[i])
    1952              :               break;
    1953      2013173 :           gcc_assert (i < group_size);
    1954              :         }
    1955      2013173 :       memcpy (res->failed, matches, sizeof (bool) * group_size);
    1956              :     }
    1957              :   else
    1958              :     {
    1959      3753800 :       if (dump_enabled_p ())
    1960       137710 :         dump_printf_loc (MSG_NOTE, vect_location,
    1961              :                          "SLP discovery for node %p succeeded\n",
    1962              :                          (void *) res);
    1963      3753800 :       gcc_assert (res_ == res);
    1964      3753800 :       res->max_nunits = this_max_nunits;
    1965      3753800 :       vect_update_max_nunits (max_nunits, this_max_nunits);
    1966              :       /* Keep a reference for the bst_map use.  */
    1967      3753800 :       SLP_TREE_REF_COUNT (res)++;
    1968              :     }
    1969              :   return res_;
    1970              : }
    1971              : 
    1972              : /* Helper for building an associated SLP node chain.  */
    1973              : 
    1974              : static void
    1975          178 : vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype,
    1976              :                                    slp_tree op0, slp_tree op1,
    1977              :                                    stmt_vec_info oper1, stmt_vec_info oper2,
    1978              :                                    vec<std::pair<unsigned, unsigned> > lperm)
    1979              : {
    1980          178 :   unsigned group_size = SLP_TREE_LANES (op1);
    1981              : 
    1982          178 :   slp_tree child1 = new _slp_tree;
    1983          178 :   SLP_TREE_DEF_TYPE (child1) = vect_internal_def;
    1984          178 :   SLP_TREE_VECTYPE (child1) = vectype;
    1985          178 :   SLP_TREE_LANES (child1) = group_size;
    1986          178 :   SLP_TREE_CHILDREN (child1).create (2);
    1987          178 :   SLP_TREE_CHILDREN (child1).quick_push (op0);
    1988          178 :   SLP_TREE_CHILDREN (child1).quick_push (op1);
    1989          178 :   SLP_TREE_REPRESENTATIVE (child1) = oper1;
    1990              : 
    1991          178 :   slp_tree child2 = new _slp_tree;
    1992          178 :   SLP_TREE_DEF_TYPE (child2) = vect_internal_def;
    1993          178 :   SLP_TREE_VECTYPE (child2) = vectype;
    1994          178 :   SLP_TREE_LANES (child2) = group_size;
    1995          178 :   SLP_TREE_CHILDREN (child2).create (2);
    1996          178 :   SLP_TREE_CHILDREN (child2).quick_push (op0);
    1997          178 :   SLP_TREE_REF_COUNT (op0)++;
    1998          178 :   SLP_TREE_CHILDREN (child2).quick_push (op1);
    1999          178 :   SLP_TREE_REF_COUNT (op1)++;
    2000          178 :   SLP_TREE_REPRESENTATIVE (child2) = oper2;
    2001              : 
    2002          178 :   SLP_TREE_DEF_TYPE (perm) = vect_internal_def;
    2003          178 :   SLP_TREE_CODE (perm) = VEC_PERM_EXPR;
    2004          178 :   SLP_TREE_VECTYPE (perm) = vectype;
    2005          178 :   SLP_TREE_LANES (perm) = group_size;
    2006              :   /* ???  We should set this NULL but that's not expected.  */
    2007          178 :   SLP_TREE_REPRESENTATIVE (perm) = oper1;
    2008          178 :   SLP_TREE_LANE_PERMUTATION (perm) = lperm;
    2009          178 :   SLP_TREE_CHILDREN (perm).quick_push (child1);
    2010          178 :   SLP_TREE_CHILDREN (perm).quick_push (child2);
    2011          178 : }
    2012              : 
    2013              : /* Recursively build an SLP tree starting from NODE.
    2014              :    Fail (and return NULL) if def-stmts are not isomorphic, require
    2015              :    data permutation or are of unsupported types of operation; in
    2016              :    that case at least one entry of MATCHES is left false to record
    2017              :    the mismatch.  Otherwise, return NODE, filled in with the
    2018              :    discovered SLP tree.  */
    2019              : 
    2020              : static slp_tree
    2021      5766973 : vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
    2022              :                        vec<stmt_vec_info> stmts, unsigned int group_size,
    2023              :                        poly_uint64 *max_nunits,
    2024              :                        bool *matches, unsigned *limit, unsigned *tree_size,
    2025              :                        scalar_stmts_to_slp_tree_map_t *bst_map)
    2026              : {
    2027      5766973 :   unsigned nops, i, this_tree_size = 0;
    2028      5766973 :   poly_uint64 this_max_nunits = *max_nunits;
    2029              : 
    2030      5766973 :   matches[0] = false;
    2031              : 
    2032      5766973 :   stmt_vec_info stmt_info = stmts[0];
    2033      5766973 :   if (!is_a<gcall *> (stmt_info->stmt)
    2034              :       && !is_a<gassign *> (stmt_info->stmt)
    2035              :       && !is_a<gphi *> (stmt_info->stmt))
    2036              :     return NULL;
    2037              : 
    2038      5766902 :   nops = gimple_num_args (stmt_info->stmt);
    2039      5766902 :   if (const int *map = vect_get_operand_map (stmt_info))
    2040        35108 :     nops = map[0];
    2041              : 
    2042              :   /* If the SLP node is a PHI (induction or reduction), terminate
    2043              :      the recursion.  */
    2044      5766902 :   bool *skip_args = XALLOCAVEC (bool, nops);
    2045      5766902 :   memset (skip_args, 0, sizeof (bool) * nops);
    2046      5766902 :   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
    2047      2781491 :     if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
    2048              :       {
    2049       299977 :         tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
    2050       299977 :         tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
    2051              :                                                     group_size);
    2052       299977 :         if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
    2053              :                                      max_nunits))
    2054              :           return NULL;
    2055              : 
    2056       296052 :         vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
    2057       296052 :         if (def_type == vect_induction_def)
    2058              :           {
    2059              :             /* Induction PHIs are not cycles but walk the initial
    2060              :                value.  Only for inner loops though, for outer loops
    2061              :                we need to pick up the value from the actual PHIs
    2062              :                to more easily support peeling and epilogue vectorization.  */
    2063       190200 :             class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
    2064       190200 :             if (!nested_in_vect_loop_p (loop, stmt_info))
    2065       189376 :               skip_args[loop_preheader_edge (loop)->dest_idx] = true;
    2066              :             else
    2067              :               loop = loop->inner;
    2068       190200 :             skip_args[loop_latch_edge (loop)->dest_idx] = true;
    2069              :           }
    2070       105852 :         else if (def_type == vect_reduction_def
    2071              :                  || def_type == vect_double_reduction_def
    2072              :                  || def_type == vect_nested_cycle
    2073       105852 :                  || def_type == vect_first_order_recurrence)
    2074              :           {
    2075              :             /* Else def types have to match.  */
    2076              :             stmt_vec_info other_info;
    2077              :             bool all_same = true;
    2078       239614 :             FOR_EACH_VEC_ELT (stmts, i, other_info)
    2079              :               {
    2080       135076 :                 if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
    2081      1768716 :                   return NULL;
    2082       135070 :                 if (other_info != stmt_info)
    2083        26185 :                   all_same = false;
    2084              :               }
    2085       104538 :             class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
    2086              :             /* Reduction initial values are not explicitly represented.  */
    2087       104538 :             if (def_type != vect_first_order_recurrence
    2088       104538 :                 && gimple_bb (stmt_info->stmt) == loop->header)
    2089       101393 :               skip_args[loop_preheader_edge (loop)->dest_idx] = true;
    2090              :             /* Reduction chain backedge defs are filled manually.
    2091              :                ???  Need a better way to identify a SLP reduction chain PHI.
    2092              :                Or a better overall way to SLP match those.  */
    2093       104538 :             if (stmts.length () > 1
    2094       104538 :                 && all_same && def_type == vect_reduction_def)
    2095         2311 :               skip_args[loop_latch_edge (loop)->dest_idx] = true;
    2096              :           }
    2097         1308 :         else if (def_type != vect_internal_def)
    2098              :           return NULL;
    2099              :       }
    2100              : 
    2101              : 
    2102      5762971 :   bool two_operators = false;
    2103      5762971 :   unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
    2104      5762971 :   tree vectype = NULL_TREE;
    2105      5762971 :   if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
    2106              :                               &this_max_nunits, matches, &two_operators,
    2107              :                               &vectype))
    2108              :     return NULL;
    2109              : 
    2110              :   /* If the SLP node is a load, terminate the recursion unless masked.  */
    2111      4210761 :   if (STMT_VINFO_DATA_REF (stmt_info)
    2112      2037596 :       && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
    2113              :     {
    2114       901201 :       if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
    2115              :         gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
    2116              :       else
    2117              :         {
    2118       882601 :           *max_nunits = this_max_nunits;
    2119       882601 :           (*tree_size)++;
    2120       882601 :           node = vect_create_new_slp_node (node, stmts, 0);
    2121       882601 :           SLP_TREE_VECTYPE (node) = vectype;
    2122              :           /* And compute the load permutation.  Whether it is actually
    2123              :              a permutation depends on the unrolling factor which is
    2124              :              decided later.  */
    2125       882601 :           vec<unsigned> load_permutation;
    2126       882601 :           int j;
    2127       882601 :           stmt_vec_info load_info;
    2128       882601 :           load_permutation.create (group_size);
    2129       882601 :           stmt_vec_info first_stmt_info
    2130       882601 :             = STMT_VINFO_GROUPED_ACCESS (stmt_info)
    2131       882601 :               ? DR_GROUP_FIRST_ELEMENT (stmt_info) : stmt_info;
    2132       882601 :           bool any_permute = false;
    2133      2126748 :           FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
    2134              :             {
    2135      1244147 :               int load_place;
    2136      1244147 :               if (! load_info)
    2137              :                 {
    2138        39926 :                   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    2139              :                     load_place = j;
    2140              :                   else
    2141              :                     load_place = 0;
    2142              :                 }
    2143      1204221 :               else if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    2144       701848 :                 load_place = vect_get_place_in_interleaving_chain
    2145       701848 :                     (load_info, first_stmt_info);
    2146              :               else
    2147              :                 /* Recognize the splat case as { 0, 0, ... } but make
    2148              :                    sure to use the appropriate refs for collections
    2149              :                    of invariant refs.  */
    2150       502373 :                 load_place = (load_info == stmt_info) ? 0 : j;
    2151       742015 :               gcc_assert (load_place != -1);
    2152      1244147 :               any_permute |= load_place != j;
    2153      1244147 :               load_permutation.quick_push (load_place);
    2154              :             }
    2155              : 
    2156       882601 :           if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    2157              :             {
    2158         3406 :               gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD));
    2159         3406 :               bool has_gaps = false;
    2160         3406 :               if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    2161          209 :                 for (stmt_vec_info si = DR_GROUP_NEXT_ELEMENT (first_stmt_info);
    2162         1346 :                      si; si = DR_GROUP_NEXT_ELEMENT (si))
    2163         1137 :                   if (DR_GROUP_GAP (si) != 1)
    2164          160 :                     has_gaps = true;
    2165              :               /* We cannot handle permuted masked loads directly, see
    2166              :                  PR114375.  We cannot handle strided masked loads or masked
    2167              :                  loads with gaps unless the mask is uniform.  */
    2168         3406 :               if ((STMT_VINFO_GROUPED_ACCESS (stmt_info)
    2169          209 :                    && (DR_GROUP_GAP (first_stmt_info) != 0
    2170          149 :                        || (has_gaps
    2171           55 :                            && STMT_VINFO_SLP_VECT_ONLY (first_stmt_info))))
    2172         6717 :                   || STMT_VINFO_STRIDED_P (stmt_info))
    2173              :                 {
    2174          108 :                   load_permutation.release ();
    2175          108 :                   matches[0] = false;
    2176       879347 :                   return NULL;
    2177              :                 }
    2178              : 
    2179              :               /* For permuted masked loads do an unpermuted masked load of
    2180              :                  the whole group followed by a SLP permute node.  */
    2181         3298 :               if (any_permute
    2182         3298 :                   || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
    2183           84 :                       && DR_GROUP_SIZE (first_stmt_info) != group_size))
    2184              :                 {
    2185              :                   /* Discover the whole unpermuted load.  */
    2186           44 :                   vec<stmt_vec_info> stmts2;
    2187           44 :                   unsigned dr_group_size = STMT_VINFO_GROUPED_ACCESS (stmt_info)
    2188           78 :                       ? DR_GROUP_SIZE (first_stmt_info) : 1;
    2189           44 :                   stmts2.create (dr_group_size);
    2190           44 :                   stmts2.quick_grow_cleared (dr_group_size);
    2191           44 :                   unsigned i = 0;
    2192           44 :                   for (stmt_vec_info si = first_stmt_info;
    2193          594 :                        si; si = DR_GROUP_NEXT_ELEMENT (si))
    2194              :                     {
    2195          550 :                       if (si != first_stmt_info)
    2196         2106 :                         for (unsigned k = 1; k < DR_GROUP_GAP (si); ++k)
    2197         1600 :                           stmts2[i++] = NULL;
    2198          550 :                       stmts2[i++] = si;
    2199              :                     }
    2200           44 :                   bool *matches2 = XALLOCAVEC (bool, dr_group_size);
    2201           44 :                   slp_tree unperm_load
    2202           44 :                     = vect_build_slp_tree (vinfo, stmts2, dr_group_size,
    2203              :                                            &this_max_nunits, matches2, limit,
    2204           44 :                                            &this_tree_size, bst_map);
    2205              :                   /* When we are able to do the full masked load emit that
    2206              :                      followed by 'node' being the desired final permutation.  */
    2207           44 :                   if (unperm_load)
    2208              :                     {
    2209           16 :                       gcc_assert
    2210              :                         (!SLP_TREE_LOAD_PERMUTATION (unperm_load).exists ());
    2211           16 :                       lane_permutation_t lperm;
    2212           16 :                       lperm.create (group_size);
    2213           56 :                       for (unsigned j = 0; j < load_permutation.length (); ++j)
    2214           40 :                         lperm.quick_push
    2215           40 :                           (std::make_pair (0, load_permutation[j]));
    2216           16 :                       SLP_TREE_CODE (node) = VEC_PERM_EXPR;
    2217           16 :                       SLP_TREE_CHILDREN (node).safe_push (unperm_load);
    2218           16 :                       SLP_TREE_LANE_PERMUTATION (node) = lperm;
    2219           16 :                       load_permutation.release ();
    2220           16 :                       return node;
    2221              :                     }
    2222           28 :                   stmts2.release ();
    2223           28 :                   load_permutation.release ();
    2224           28 :                   matches[0] = false;
    2225           28 :                   return NULL;
    2226              :                 }
    2227         3254 :               load_permutation.release ();
    2228              :             }
    2229              :           else
    2230              :             {
    2231       879195 :               if (!any_permute
    2232       766424 :                   && STMT_VINFO_GROUPED_ACCESS (stmt_info)
    2233      1169250 :                   && group_size == DR_GROUP_SIZE (first_stmt_info))
    2234       126548 :                 load_permutation.release ();
    2235       879195 :               SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
    2236       879195 :               return node;
    2237              :             }
    2238              :         }
    2239              :     }
    2240      3309560 :   else if (gimple_assign_single_p (stmt_info->stmt)
    2241      2277250 :            && !gimple_vuse (stmt_info->stmt)
    2242      3317356 :            && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
    2243              :     {
    2244              :       /* vect_build_slp_tree_2 determined all BIT_FIELD_REFs reference
    2245              :          the same SSA name vector of a compatible type to vectype.  */
    2246         2366 :       vec<std::pair<unsigned, unsigned> > lperm = vNULL;
    2247         2366 :       tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
    2248         2366 :       stmt_vec_info estmt_info;
    2249         7440 :       FOR_EACH_VEC_ELT (stmts, i, estmt_info)
    2250              :         {
    2251         5221 :           gassign *estmt = as_a <gassign *> (estmt_info->stmt);
    2252         5221 :           tree bfref = gimple_assign_rhs1 (estmt);
    2253         5221 :           HOST_WIDE_INT lane;
    2254         5221 :           if (!known_eq (bit_field_size (bfref),
    2255              :                          tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
    2256        10295 :               || !constant_multiple_p (bit_field_offset (bfref),
    2257         5074 :                                        bit_field_size (bfref), &lane))
    2258              :             {
    2259          147 :               lperm.release ();
    2260          147 :               matches[0] = false;
    2261          147 :               return NULL;
    2262              :             }
    2263         5074 :           lperm.safe_push (std::make_pair (0, (unsigned)lane));
    2264              :         }
    2265         2219 :       slp_tree vnode = vect_create_new_slp_node (vNULL);
    2266         2219 :       if (operand_equal_p (TYPE_SIZE (vectype), TYPE_SIZE (TREE_TYPE (vec))))
    2267              :         /* ???  We record vectype here but we hide possibly necessary
    2268              :            punning and instead rely on code generation to materialize
    2269              :            VIEW_CONVERT_EXPRs as necessary.  We should instead make
    2270              :            this explicit somehow.  */
    2271          703 :         SLP_TREE_VECTYPE (vnode) = vectype;
    2272              :       else
    2273              :         {
    2274              :           /* For different size but compatible elements we can still
    2275              :              use VEC_PERM_EXPR without punning.  */
    2276         1516 :           gcc_assert (VECTOR_TYPE_P (TREE_TYPE (vec))
    2277              :                       && types_compatible_p (TREE_TYPE (vectype),
    2278              :                                              TREE_TYPE (TREE_TYPE (vec))));
    2279         1516 :           SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
    2280              :         }
    2281         2219 :       auto nunits = TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (vnode));
    2282         2219 :       unsigned HOST_WIDE_INT const_nunits;
    2283         2219 :       if (nunits.is_constant (&const_nunits))
    2284         2219 :         SLP_TREE_LANES (vnode) = const_nunits;
    2285         2219 :       SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
    2286              :       /* We are always building a permutation node even if it is an identity
    2287              :          permute to shield the rest of the vectorizer from the odd node
    2288              :          representing an actual vector without any scalar ops.
    2289              :          ???  We could hide it completely by making the permute node
    2290              :          external?  */
    2291         2219 :       node = vect_create_new_slp_node (node, stmts, 1);
    2292         2219 :       SLP_TREE_CODE (node) = VEC_PERM_EXPR;
    2293         2219 :       SLP_TREE_LANE_PERMUTATION (node) = lperm;
    2294         2219 :       SLP_TREE_VECTYPE (node) = vectype;
    2295         2219 :       SLP_TREE_CHILDREN (node).quick_push (vnode);
    2296         2219 :       return node;
    2297              :     }
    2298              :   /* When discovery reaches an associatable operation see whether we can
    2299              :      improve that to match up lanes in a way superior to the operand
    2300              :      swapping code which at most looks at two defs.
    2301              :      ???  For BB vectorization we cannot do the brute-force search
    2302              :      for matching as we can succeed by means of builds from scalars
    2303              :      and have no good way to "cost" one build against another.  */
    2304      3307194 :   else if (is_a <loop_vec_info> (vinfo)
    2305              :            /* Do not bother for single-lane SLP.  */
    2306      1964804 :            && group_size > 1
    2307              :            /* ???  We don't handle !vect_internal_def defs below.  */
    2308       111580 :            && STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
    2309              :            /* ???  Do not associate a reduction, this will wreck REDUC_IDX
    2310              :               mapping as long as that exists on the stmt_info level.  */
    2311        86213 :            && STMT_VINFO_REDUC_IDX (stmt_info) == -1
    2312        77718 :            && is_gimple_assign (stmt_info->stmt)
    2313        77404 :            && (associative_tree_code (gimple_assign_rhs_code (stmt_info->stmt))
    2314        50840 :                || gimple_assign_rhs_code (stmt_info->stmt) == MINUS_EXPR)
    2315      3335666 :            && ((FLOAT_TYPE_P (vectype) && flag_associative_math)
    2316        16258 :                || (INTEGRAL_TYPE_P (TREE_TYPE (vectype))
    2317        13736 :                    && TYPE_OVERFLOW_WRAPS (TREE_TYPE (vectype)))))
    2318              :     {
    2319              :       /* See if we have a chain of (mixed) adds or subtracts or other
    2320              :          associatable ops.  */
    2321        21439 :       enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
    2322        21439 :       if (code == MINUS_EXPR)
    2323          796 :         code = PLUS_EXPR;
    2324        21439 :       stmt_vec_info other_op_stmt_info = NULL;
    2325        21439 :       stmt_vec_info op_stmt_info = NULL;
    2326        21439 :       unsigned chain_len = 0;
    2327        21439 :       auto_vec<chain_op_t> chain;
    2328        21439 :       auto_vec<std::pair<tree_code, gimple *> > worklist;
    2329        21439 :       auto_vec<vec<chain_op_t> > chains (group_size);
    2330        21439 :       auto_vec<slp_tree, 4> children;
    2331        21439 :       bool hard_fail = true;
    2332        22506 :       for (unsigned lane = 0; lane < group_size; ++lane)
    2333              :         {
    2334        22150 :           if (!stmts[lane])
    2335              :             {
    2336              :               /* ???  Below we require lane zero is present.  */
    2337            0 :               if (lane == 0)
    2338              :                 {
    2339              :                   hard_fail = false;
    2340        21083 :                   break;
    2341              :                 }
    2342            0 :               chains.quick_push (vNULL);
    2343            0 :               continue;
    2344              :             }
    2345              :           /* For each lane linearize the addition/subtraction (or other
    2346              :              uniform associatable operation) expression tree.  */
    2347        22150 :           gimple *op_stmt = NULL, *other_op_stmt = NULL;
    2348        22150 :           vect_slp_linearize_chain (vinfo, worklist, chain, code,
    2349        22150 :                                     stmts[lane]->stmt, op_stmt, other_op_stmt,
    2350              :                                     NULL);
    2351        22150 :           if (!op_stmt_info && op_stmt)
    2352        20860 :             op_stmt_info = vinfo->lookup_stmt (op_stmt);
    2353        22150 :           if (!other_op_stmt_info && other_op_stmt)
    2354          832 :             other_op_stmt_info = vinfo->lookup_stmt (other_op_stmt);
    2355        22150 :           if (chain.length () == 2)
    2356              :             {
    2357              :               /* In a chain of just two elements resort to the regular
    2358              :                  operand swapping scheme.  Likewise, if we run into a
    2359              :                  length mismatch, process regularly as well: as we did
    2360              :                  not process the other lanes we cannot report a good hint
    2361              :                  as to which lanes to try swapping in the parent.  */
    2362              :               hard_fail = false;
    2363              :               break;
    2364              :             }
    2365         1070 :           else if (chain_len == 0)
    2366          396 :             chain_len = chain.length ();
    2367         1348 :           else if (chain.length () != chain_len)
    2368              :             {
    2369              :               /* ???  Here we could slip in magic to compensate with
    2370              :                  neutral operands.  */
    2371            3 :               matches[lane] = false;
    2372            3 :               if (lane != group_size - 1)
    2373            3 :                 matches[0] = false;
    2374              :               break;
    2375              :             }
    2376         1067 :           chains.quick_push (chain.copy ());
    2377         1067 :           chain.truncate (0);
    2378              :         }
    2379        42878 :       if (chains.length () == group_size)
    2380              :         {
    2381              :           /* We cannot yet use SLP_TREE_CODE to communicate the operation.  */
    2382          356 :           if (!op_stmt_info)
    2383              :             {
    2384            3 :               hard_fail = false;
    2385            3 :               goto out;
    2386              :             }
    2387              :           /* Now we have a set of chains with the same length.  */
    2388              :           /* 1. pre-sort according to def_type and operation.  */
    2389         1308 :           for (unsigned lane = 0; lane < group_size; ++lane)
    2390         1910 :             chains[lane].stablesort (dt_sort_cmp, vinfo);
    2391          353 :           if (dump_enabled_p ())
    2392              :             {
    2393          157 :               dump_printf_loc (MSG_NOTE, vect_location,
    2394              :                                "pre-sorted chains of %s\n",
    2395              :                                get_tree_code_name (code));
    2396          685 :               for (unsigned lane = 0; lane < group_size; ++lane)
    2397              :                 {
    2398          528 :                   if (!stmts[lane])
    2399            0 :                     dump_printf (MSG_NOTE, "--");
    2400              :                   else
    2401         2422 :                     for (unsigned opnum = 0; opnum < chain_len; ++opnum)
    2402         3788 :                       dump_printf (MSG_NOTE, "%s %T ",
    2403         1894 :                                    get_tree_code_name (chains[lane][opnum].code),
    2404         1894 :                                    chains[lane][opnum].op);
    2405          528 :                   dump_printf (MSG_NOTE, "\n");
    2406              :                 }
    2407              :             }
    2408              :           /* 2. try to build children nodes, associating as necessary.  */
    2409              :           /* 2a. prepare and perform early checks to avoid eating into
    2410              :              discovery limit unnecessarily.  */
    2411          353 :           vect_def_type *dts = XALLOCAVEC (vect_def_type, chain_len);
    2412         1487 :           for (unsigned n = 0; n < chain_len; ++n)
    2413              :             {
    2414         1134 :               vect_def_type dt = chains[0][n].dt;
    2415         1134 :               unsigned lane;
    2416         4357 :               for (lane = 0; lane < group_size; ++lane)
    2417         6446 :                 if (stmts[lane] && chains[lane][n].dt != dt)
    2418              :                   {
    2419            0 :                     if (dt == vect_constant_def
    2420            0 :                         && chains[lane][n].dt == vect_external_def)
    2421              :                       dt = vect_external_def;
    2422            0 :                     else if (dt == vect_external_def
    2423            0 :                              && chains[lane][n].dt == vect_constant_def)
    2424              :                       ;
    2425              :                     else
    2426              :                       break;
    2427              :                   }
    2428         1134 :               if (lane != group_size)
    2429              :                 {
    2430            0 :                   if (dump_enabled_p ())
    2431            0 :                     dump_printf_loc (MSG_NOTE, vect_location,
    2432              :                                      "giving up on chain due to mismatched "
    2433              :                                      "def types\n");
    2434            0 :                   matches[lane] = false;
    2435            0 :                   if (lane != group_size - 1)
    2436            0 :                     matches[0] = false;
    2437            0 :                   goto out;
    2438              :                 }
    2439         1134 :               dts[n] = dt;
    2440         1134 :               if (dt == vect_constant_def
    2441         1134 :                   || dt == vect_external_def)
    2442              :                 {
    2443              :                   /* Check whether we can build the invariant.  If we can't
    2444              :                      we never will be able to.  */
    2445           93 :                   tree type = TREE_TYPE (chains[0][n].op);
    2446         1134 :                   if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
    2447              :                       && (TREE_CODE (type) == BOOLEAN_TYPE
    2448              :                           || !can_duplicate_and_interleave_p (vinfo, group_size,
    2449              :                                                               type)))
    2450              :                     {
    2451              :                       matches[0] = false;
    2452              :                       goto out;
    2453              :                     }
    2454              :                 }
    2455         1041 :               else if (dt != vect_internal_def)
    2456              :                 {
    2457              :                   /* Not sure, we might need something special.
    2458              :                      gcc.dg/vect/pr96854.c,
    2459              :                      gfortran.dg/vect/fast-math-pr37021.f90
    2460              :                      and gfortran.dg/vect/pr61171.f trigger.  */
    2461              :                   /* Soft-fail for now.  */
    2462            0 :                   hard_fail = false;
    2463            0 :                   goto out;
    2464              :                 }
    2465              :             }
    2466              :           /* 2b. do the actual build.  */
    2467         1429 :           for (unsigned n = 0; n < chain_len; ++n)
    2468              :             {
    2469         1096 :               vect_def_type dt = dts[n];
    2470         1096 :               unsigned lane;
    2471         1096 :               if (dt == vect_constant_def
    2472         1096 :                   || dt == vect_external_def)
    2473              :                 {
    2474           93 :                   vec<tree> ops;
    2475           93 :                   ops.create (group_size);
    2476          461 :                   for (lane = 0; lane < group_size; ++lane)
    2477          275 :                     if (stmts[lane])
    2478          275 :                       ops.quick_push (chains[lane][n].op);
    2479              :                     else
    2480            0 :                       ops.quick_push (NULL_TREE);
    2481           93 :                   slp_tree child = vect_create_new_slp_node (ops);
    2482           93 :                   SLP_TREE_DEF_TYPE (child) = dt;
    2483           93 :                   children.safe_push (child);
    2484              :                 }
    2485              :               else
    2486              :                 {
    2487         1003 :                   vec<stmt_vec_info> op_stmts;
    2488         1003 :                   op_stmts.create (group_size);
    2489         1003 :                   slp_tree child = NULL;
    2490              :                   /* Brute-force our way.  We have to consider a lane
    2491              :                      failing after fixing an earlier fail up in the
    2492              :                      SLP discovery recursion.  So track the current
    2493              :                      permute per lane.  */
    2494         1003 :                   unsigned *perms = XALLOCAVEC (unsigned, group_size);
    2495         1003 :                   memset (perms, 0, sizeof (unsigned) * group_size);
    2496         1097 :                   do
    2497              :                     {
    2498         1097 :                       op_stmts.truncate (0);
    2499         5320 :                       for (lane = 0; lane < group_size; ++lane)
    2500         3126 :                         if (stmts[lane])
    2501         3126 :                           op_stmts.quick_push
    2502         3126 :                             (vinfo->lookup_def (chains[lane][n].op));
    2503              :                         else
    2504            0 :                           op_stmts.quick_push (NULL);
    2505         1097 :                       child = vect_build_slp_tree (vinfo, op_stmts,
    2506              :                                                    group_size, &this_max_nunits,
    2507              :                                                    matches, limit,
    2508              :                                                    &this_tree_size, bst_map);
    2509              :                       /* ???  We're likely getting too many fatal mismatches
    2510              :                          here so maybe we want to ignore them (but then we
    2511              :                          have no idea which lanes fatally mismatched).  */
    2512         1097 :                       if (child || !matches[0])
    2513              :                         break;
    2514              :                       /* Swap another lane we have not yet matched up into
    2515              :                          lanes that did not match.  If we run out of
    2516              :                          permute possibilities for a lane terminate the
    2517              :                          search.  */
    2518          287 :                       bool term = false;
    2519          287 :                       for (lane = 1; lane < group_size; ++lane)
    2520          193 :                         if (!matches[lane])
    2521              :                           {
    2522          165 :                             if (n + perms[lane] + 1 == chain_len)
    2523              :                               {
    2524              :                                 term = true;
    2525              :                                 break;
    2526              :                               }
    2527          146 :                             if (dump_enabled_p ())
    2528          113 :                               dump_printf_loc (MSG_NOTE, vect_location,
    2529              :                                                "swapping operand %d and %d "
    2530              :                                                "of lane %d\n",
    2531              :                                                n, n + perms[lane] + 1, lane);
    2532          292 :                             std::swap (chains[lane][n],
    2533          146 :                                        chains[lane][n + perms[lane] + 1]);
    2534          146 :                             perms[lane]++;
    2535              :                           }
    2536          113 :                       if (term)
    2537              :                         break;
    2538              :                     }
    2539              :                   while (1);
    2540         1003 :                   if (!child)
    2541              :                     {
    2542           20 :                       if (dump_enabled_p ())
    2543           18 :                         dump_printf_loc (MSG_NOTE, vect_location,
    2544              :                                          "failed to match up op %d\n", n);
    2545           20 :                       op_stmts.release ();
    2546           20 :                       if (lane != group_size - 1)
    2547           10 :                         matches[0] = false;
    2548              :                       else
    2549           10 :                         matches[lane] = false;
    2550           20 :                       goto out;
    2551              :                     }
    2552          983 :                   if (dump_enabled_p ())
    2553              :                     {
    2554          421 :                       dump_printf_loc (MSG_NOTE, vect_location,
    2555              :                                        "matched up op %d to\n", n);
    2556          421 :                       vect_print_slp_tree (MSG_NOTE, vect_location, child);
    2557              :                     }
    2558          983 :                   children.safe_push (child);
    2559              :                 }
    2560              :             }
    2561              :           /* 3. build SLP nodes to combine the chain.  */
    2562         1213 :           for (unsigned lane = 0; lane < group_size; ++lane)
    2563         1772 :             if (stmts[lane] && chains[lane][0].code != code)
    2564              :               {
    2565              :                 /* See if there's any alternate all-PLUS entry.  */
    2566              :                 unsigned n;
    2567            6 :                 for (n = 1; n < chain_len; ++n)
    2568              :                   {
    2569           30 :                     for (lane = 0; lane < group_size; ++lane)
    2570           48 :                       if (stmts[lane] && chains[lane][n].code != code)
    2571              :                         break;
    2572            6 :                     if (lane == group_size)
    2573              :                       break;
    2574              :                   }
    2575            6 :                 if (n != chain_len)
    2576              :                   {
    2577              :                     /* Swap that in at first position.  */
    2578            6 :                     std::swap (children[0], children[n]);
    2579           30 :                     for (lane = 0; lane < group_size; ++lane)
    2580           24 :                       if (stmts[lane])
    2581           24 :                         std::swap (chains[lane][0], chains[lane][n]);
    2582              :                   }
    2583              :                 else
    2584              :                   {
    2585              :                     /* ???  When this triggers and we end up with two
    2586              :                        vect_constant/external_def up-front things break (ICE)
    2587              :                        spectacularly finding an insertion place for the
    2588              :                        all-constant op.  We should have a fully
    2589              :                        vect_internal_def operand though(?) so we can swap
    2590              :                        that into first place and then prepend the all-zero
    2591              :                        constant.  */
    2592            0 :                     if (dump_enabled_p ())
    2593            0 :                       dump_printf_loc (MSG_NOTE, vect_location,
    2594              :                                        "inserting constant zero to compensate "
    2595              :                                        "for (partially) negated first "
    2596              :                                        "operand\n");
    2597            0 :                     chain_len++;
    2598            0 :                     for (lane = 0; lane < group_size; ++lane)
    2599            0 :                       if (stmts[lane])
    2600            0 :                         chains[lane].safe_insert
    2601            0 :                           (0, chain_op_t (code, vect_constant_def, NULL_TREE));
    2602            0 :                     vec<tree> zero_ops;
    2603            0 :                     zero_ops.create (group_size);
    2604            0 :                     zero_ops.quick_push (build_zero_cst (TREE_TYPE (vectype)));
    2605            0 :                     for (lane = 1; lane < group_size; ++lane)
    2606            0 :                       if (stmts[lane])
    2607            0 :                         zero_ops.quick_push (zero_ops[0]);
    2608              :                       else
    2609            0 :                         zero_ops.quick_push (NULL_TREE);
    2610            0 :                     slp_tree zero = vect_create_new_slp_node (zero_ops);
    2611            0 :                     SLP_TREE_DEF_TYPE (zero) = vect_constant_def;
    2612            0 :                     children.safe_insert (0, zero);
    2613              :                   }
    2614              :                 break;
    2615              :               }
    2616         1071 :           for (unsigned i = 1; i < children.length (); ++i)
    2617              :             {
    2618          738 :               slp_tree op0 = children[i - 1];
    2619          738 :               slp_tree op1 = children[i];
    2620          738 :               bool this_two_op = false;
    2621         2660 :               for (unsigned lane = 0; lane < group_size; ++lane)
    2622         4200 :                 if (stmts[lane] && chains[lane][i].code != chains[0][i].code)
    2623              :                   {
    2624              :                     this_two_op = true;
    2625              :                     break;
    2626              :                   }
    2627          738 :               slp_tree child;
    2628          738 :               if (i == children.length () - 1)
    2629          333 :                 child = vect_create_new_slp_node (node, stmts, 2);
    2630              :               else
    2631          405 :                 child = vect_create_new_slp_node (2, ERROR_MARK);
    2632          738 :               if (this_two_op)
    2633              :                 {
    2634          178 :                   vec<std::pair<unsigned, unsigned> > lperm;
    2635          178 :                   lperm.create (group_size);
    2636          630 :                   for (unsigned lane = 0; lane < group_size; ++lane)
    2637          904 :                     lperm.quick_push (std::make_pair
    2638          452 :                       (chains[lane][i].code != chains[0][i].code, lane));
    2639          356 :                   vect_slp_build_two_operator_nodes (child, vectype, op0, op1,
    2640          178 :                                                      (chains[0][i].code == code
    2641              :                                                       ? op_stmt_info
    2642              :                                                       : other_op_stmt_info),
    2643          178 :                                                      (chains[0][i].code == code
    2644              :                                                       ? other_op_stmt_info
    2645              :                                                       : op_stmt_info),
    2646              :                                                      lperm);
    2647              :                 }
    2648              :               else
    2649              :                 {
    2650          560 :                   SLP_TREE_DEF_TYPE (child) = vect_internal_def;
    2651          560 :                   SLP_TREE_VECTYPE (child) = vectype;
    2652          560 :                   SLP_TREE_LANES (child) = group_size;
    2653          560 :                   SLP_TREE_CHILDREN (child).quick_push (op0);
    2654          560 :                   SLP_TREE_CHILDREN (child).quick_push (op1);
    2655          560 :                   SLP_TREE_REPRESENTATIVE (child)
    2656         1120 :                     = (chains[0][i].code == code
    2657          560 :                        ? op_stmt_info : other_op_stmt_info);
    2658              :                 }
    2659          738 :               children[i] = child;
    2660              :             }
    2661          333 :           *tree_size += this_tree_size + 1;
    2662          333 :           *max_nunits = this_max_nunits;
    2663         1593 :           while (!chains.is_empty ())
    2664          904 :             chains.pop ().release ();
    2665              :           return node;
    2666              :         }
    2667        21083 : out:
    2668        21106 :       if (dump_enabled_p ())
    2669         2809 :         dump_printf_loc (MSG_NOTE, vect_location,
    2670              :                          "failed to line up SLP graph by re-associating "
    2671              :                          "operations in lanes%s\n",
    2672              :                          !hard_fail ? " trying regular discovery" : "");
    2673        21111 :       while (!children.is_empty ())
    2674            5 :         vect_free_slp_tree (children.pop ());
    2675        21269 :       while (!chains.is_empty ())
    2676          163 :         chains.pop ().release ();
    2677              :       /* Hard-fail, otherwise we might run into quadratic processing of the
    2678              :          chains starting one stmt into the chain again.  */
    2679        21106 :       if (hard_fail)
    2680              :         return NULL;
    2681              :       /* Fall thru to normal processing.  */
    2682        21439 :     }
    2683              : 
    2684              :   /* Get at the operands, verifying they are compatible.  */
    2685      3328692 :   vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
    2686      3328692 :   slp_oprnd_info oprnd_info;
    2687     16036055 :   FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    2688              :     {
    2689     25417152 :       int res = vect_get_and_check_slp_defs (vinfo, vectype,
    2690     12708576 :                                              swap[i], skip_args,
    2691              :                                              stmts, i, &oprnds_info);
    2692     12708576 :       if (res != 0)
    2693       543063 :         matches[(res == -1) ? 0 : i] = false;
    2694     12708576 :       if (!matches[0])
    2695              :         break;
    2696              :     }
    2697     15725209 :   for (i = 0; i < group_size; ++i)
    2698     12609021 :     if (!matches[i])
    2699              :       {
    2700       212504 :         vect_free_oprnd_info (oprnds_info);
    2701       212504 :         return NULL;
    2702              :       }
    2703      9348564 :   swap = NULL;
    2704              : 
    2705      9348564 :   bool has_two_operators_perm = false;
    2706     18697128 :   auto_vec<unsigned> two_op_perm_indices[2];
    2707      3116188 :   vec<stmt_vec_info> two_op_scalar_stmts[2] = {vNULL, vNULL};
    2708              : 
    2709      3130423 :   if (two_operators && oprnds_info.length () == 2 && group_size > 2)
    2710              :     {
    2711         3867 :       unsigned idx = 0;
    2712         3867 :       hash_map<gimple *, unsigned> seen;
    2713         3867 :       vec<slp_oprnd_info> new_oprnds_info
    2714         3867 :         = vect_create_oprnd_info (1, group_size);
    2715         3867 :       bool success = true;
    2716              : 
    2717         3867 :       enum tree_code code = ERROR_MARK;
    2718         3867 :       if (oprnds_info[0]->def_stmts[0]
    2719         3867 :           && is_a<gassign *> (oprnds_info[0]->def_stmts[0]->stmt))
    2720         3809 :         code = gimple_assign_rhs_code (oprnds_info[0]->def_stmts[0]->stmt);
    2721         3867 :       basic_block bb = nullptr;
    2722              : 
    2723         7470 :       for (unsigned j = 0; j < group_size; ++j)
    2724              :         {
    2725        17480 :           FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    2726              :             {
    2727        13877 :               stmt_vec_info stmt_info = oprnd_info->def_stmts[j];
    2728        13877 :               if (!stmt_info
    2729        13654 :                   || !is_a<gassign *> (stmt_info->stmt)
    2730        13651 :                   || gimple_assign_rhs_code (stmt_info->stmt) != code
    2731        24350 :                   || skip_args[i])
    2732              :                 {
    2733              :                   success = false;
    2734         3408 :                   break;
    2735              :                 }
    2736              :               /* Avoid mixing lanes with defs in different basic-blocks.  */
    2737        10473 :               if (!bb)
    2738         3985 :                 bb = gimple_bb (vect_orig_stmt (stmt_info)->stmt);
    2739         8252 :               else if (gimple_bb (vect_orig_stmt (stmt_info)->stmt) != bb)
    2740              :                 {
    2741              :                   success = false;
    2742              :                   break;
    2743              :                 }
    2744              : 
    2745        10469 :               bool exists;
    2746        10469 :               unsigned &stmt_idx
    2747        10469 :                 = seen.get_or_insert (stmt_info->stmt, &exists);
    2748              : 
    2749        10469 :               if (!exists)
    2750              :                 {
    2751         9128 :                   new_oprnds_info[0]->def_stmts.safe_push (stmt_info);
    2752         9128 :                   new_oprnds_info[0]->ops.safe_push (oprnd_info->ops[j]);
    2753         9128 :                   stmt_idx = idx;
    2754         9128 :                   idx++;
    2755              :                 }
    2756              : 
    2757        10469 :               two_op_perm_indices[i].safe_push (stmt_idx);
    2758              :             }
    2759              : 
    2760         7011 :           if (!success)
    2761              :             break;
    2762              :         }
    2763              : 
    2764         3867 :       if (success && idx == group_size)
    2765              :         {
    2766           94 :           if (dump_enabled_p ())
    2767              :             {
    2768            0 :               dump_printf_loc (MSG_NOTE, vect_location,
    2769              :                                "Replace two_operators operands:\n");
    2770              : 
    2771            0 :               FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    2772              :                 {
    2773            0 :                   dump_printf_loc (MSG_NOTE, vect_location,
    2774              :                                    "Operand %u:\n", i);
    2775            0 :                   for (unsigned j = 0; j < group_size; j++)
    2776            0 :                     dump_printf_loc (MSG_NOTE, vect_location, "\tstmt %u %G",
    2777            0 :                                      j, oprnd_info->def_stmts[j]->stmt);
    2778              :                 }
    2779              : 
    2780            0 :               dump_printf_loc (MSG_NOTE, vect_location,
    2781              :                                "With a single operand:\n");
    2782            0 :               for (unsigned j = 0; j < group_size; j++)
    2783            0 :                 dump_printf_loc (MSG_NOTE, vect_location, "\tstmt %u %G",
    2784            0 :                                  j, new_oprnds_info[0]->def_stmts[j]->stmt);
    2785              :             }
    2786              : 
    2787           94 :           two_op_scalar_stmts[0].safe_splice (oprnds_info[0]->def_stmts);
    2788           94 :           two_op_scalar_stmts[1].safe_splice (oprnds_info[1]->def_stmts);
    2789              : 
    2790           94 :           new_oprnds_info[0]->first_op_type = oprnds_info[0]->first_op_type;
    2791           94 :           new_oprnds_info[0]->first_dt = oprnds_info[0]->first_dt;
    2792           94 :           new_oprnds_info[0]->any_pattern = oprnds_info[0]->any_pattern;
    2793           94 :           new_oprnds_info[0]->first_gs_p = oprnds_info[0]->first_gs_p;
    2794           94 :           new_oprnds_info[0]->first_gs_info = oprnds_info[0]->first_gs_info;
    2795              : 
    2796           94 :           vect_free_oprnd_info (oprnds_info);
    2797           94 :           oprnds_info = new_oprnds_info;
    2798           94 :           nops = 1;
    2799           94 :           has_two_operators_perm = true;
    2800              :         }
    2801              :       else
    2802         3773 :         vect_free_oprnd_info (new_oprnds_info);
    2803         3867 :     }
    2804              : 
    2805      6232376 :   auto_vec<slp_tree, 4> children;
    2806              : 
    2807      3116188 :   stmt_info = stmts[0];
    2808              : 
    2809      3116188 :   int reduc_idx = -1;
    2810      3116188 :   int gs_scale = 0;
    2811      3116188 :   tree gs_base = NULL_TREE;
    2812              : 
    2813              :   /* Create SLP_TREE nodes for the definition node/s.  */
    2814      7975929 :   FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    2815              :     {
    2816      4975482 :       slp_tree child = nullptr;
    2817      4975482 :       unsigned int j;
    2818              : 
    2819              :       /* We're skipping certain operands from processing, for example
    2820              :          outer loop reduction initial defs.  */
    2821      4975482 :       if (skip_args[i])
    2822              :         {
    2823       483280 :           children.safe_push (NULL);
    2824      5343021 :           continue;
    2825              :         }
    2826              : 
    2827      4492202 :       if (oprnd_info->first_dt == vect_uninitialized_def)
    2828              :         {
    2829              :           /* COND_EXPR have one too many eventually if the condition
    2830              :              is a SSA name.  */
    2831            0 :           gcc_assert (i == 3 && nops == 4);
    2832            0 :           continue;
    2833              :         }
    2834              : 
    2835      4492202 :       if (oprnd_info->first_gs_p)
    2836              :         {
    2837        22453 :           gs_scale = oprnd_info->first_gs_info.scale;
    2838        22453 :           gs_base = oprnd_info->first_gs_info.base;
    2839              :         }
    2840              : 
    2841      4492202 :       if (is_a <bb_vec_info> (vinfo)
    2842      1578681 :           && oprnd_info->first_dt == vect_internal_def
    2843      5314452 :           && !oprnd_info->any_pattern)
    2844              :         {
    2845              :           /* For BB vectorization, if all defs are the same do not
    2846              :              bother to continue the build along the single-lane
    2847              :              graph but use a splat of the scalar value.  */
    2848       778251 :           stmt_vec_info first_def = oprnd_info->def_stmts[0];
    2849       834862 :           for (j = 1; j < group_size; ++j)
    2850       794073 :             if (oprnd_info->def_stmts[j] != first_def)
    2851              :               break;
    2852       778251 :           if (j == group_size
    2853              :               /* But avoid doing this for loads where we may be
    2854              :                  able to CSE things, unless the stmt is not
    2855              :                  vectorizable.  */
    2856       778251 :               && (!STMT_VINFO_VECTORIZABLE (first_def)
    2857        50111 :                   || !gimple_vuse (first_def->stmt)))
    2858              :             {
    2859        31723 :               if (dump_enabled_p ())
    2860          105 :                 dump_printf_loc (MSG_NOTE, vect_location,
    2861              :                                  "Using a splat of the uniform operand %G",
    2862              :                                  first_def->stmt);
    2863        31723 :               oprnd_info->first_dt = vect_external_def;
    2864              :             }
    2865              :         }
    2866              : 
    2867      4492202 :       if (oprnd_info->first_dt == vect_external_def
    2868      4492202 :           || oprnd_info->first_dt == vect_constant_def)
    2869              :         {
    2870      1472226 :           if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ())
    2871              :             {
    2872              :               tree op0;
    2873              :               tree uniform_val = op0 = oprnd_info->ops[0];
    2874              :               for (j = 1; j < oprnd_info->ops.length (); ++j)
    2875              :                 if (oprnd_info->ops[j]
    2876              :                     && !operand_equal_p (uniform_val, oprnd_info->ops[j]))
    2877              :                   {
    2878              :                     uniform_val = NULL_TREE;
    2879              :                     break;
    2880              :                   }
    2881              :               if (!uniform_val
    2882              :                   && !can_duplicate_and_interleave_p (vinfo,
    2883              :                                                       oprnd_info->ops.length (),
    2884              :                                                       TREE_TYPE (op0)))
    2885              :                 {
    2886              :                   matches[j] = false;
    2887              :                   if (dump_enabled_p ())
    2888              :                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    2889              :                                      "Build SLP failed: invalid type of def "
    2890              :                                      "for variable-length SLP %T\n", op0);
    2891              :                   goto fail;
    2892              :                 }
    2893              :             }
    2894      1472226 :           slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
    2895      1472226 :           SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
    2896      1472226 :           oprnd_info->ops = vNULL;
    2897      1472226 :           children.safe_push (invnode);
    2898      1472226 :           continue;
    2899      1472226 :         }
    2900              : 
    2901              :       /* See which SLP operand a reduction chain continues on.  We want
    2902              :          to chain even PHIs but not backedges.  */
    2903      3019976 :       if (STMT_VINFO_REDUC_DEF (oprnd_info->def_stmts[0])
    2904      3019976 :           || STMT_VINFO_REDUC_IDX (oprnd_info->def_stmts[0]) != -1)
    2905              :         {
    2906       232679 :           if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
    2907              :             {
    2908          756 :               if (oprnd_info->first_dt == vect_double_reduction_def)
    2909          378 :                 reduc_idx = i;
    2910              :             }
    2911       231923 :           else if (is_a <gphi *> (stmt_info->stmt)
    2912       231923 :                    && gimple_phi_num_args
    2913        99465 :                         (as_a <gphi *> (stmt_info->stmt)) != 1)
    2914              :             ;
    2915       132841 :           else if (STMT_VINFO_REDUC_IDX (stmt_info) == -1
    2916          383 :                    && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
    2917              :             ;
    2918       132841 :           else if (reduc_idx == -1)
    2919       124423 :             reduc_idx = i;
    2920              :           else
    2921              :             /* For .COND_* reduction operations the else value can be the
    2922              :                same as one of the operation operands.  The other def
    2923              :                stmts have been moved, so we can't check easily.  Check
    2924              :                it's a call at least.  */
    2925         8418 :             gcc_assert (is_a <gcall *> (stmt_info->stmt));
    2926              :         }
    2927              : 
    2928              :       /* When we have a masked load with uniform mask discover this
    2929              :          as a single-lane mask with a splat permute.  This way we can
    2930              :          recognize this as a masked load-lane by stripping the splat.  */
    2931      3019976 :       if (is_a <gcall *> (STMT_VINFO_STMT (stmt_info))
    2932        57460 :           && gimple_call_internal_p (STMT_VINFO_STMT (stmt_info),
    2933              :                                      IFN_MASK_LOAD)
    2934         6075 :           && STMT_VINFO_GROUPED_ACCESS (stmt_info)
    2935      3020053 :           && ! STMT_VINFO_SLP_VECT_ONLY (DR_GROUP_FIRST_ELEMENT (stmt_info)))
    2936              :         {
    2937           35 :           vec<stmt_vec_info> def_stmts2;
    2938           35 :           def_stmts2.create (1);
    2939           35 :           def_stmts2.quick_push (oprnd_info->def_stmts[0]);
    2940           35 :           child = vect_build_slp_tree (vinfo, def_stmts2, 1,
    2941              :                                        &this_max_nunits,
    2942              :                                        matches, limit,
    2943              :                                        &this_tree_size, bst_map);
    2944           35 :           if (child)
    2945              :             {
    2946           35 :               slp_tree pnode = vect_create_new_slp_node (1, VEC_PERM_EXPR);
    2947           35 :               SLP_TREE_VECTYPE (pnode) = SLP_TREE_VECTYPE (child);
    2948           35 :               SLP_TREE_LANES (pnode) = group_size;
    2949           35 :               SLP_TREE_SCALAR_STMTS (pnode).create (group_size);
    2950           35 :               SLP_TREE_LANE_PERMUTATION (pnode).create (group_size);
    2951          210 :               for (unsigned k = 0; k < group_size; ++k)
    2952              :                 {
    2953          175 :                   SLP_TREE_SCALAR_STMTS (pnode)
    2954          175 :                     .quick_push (oprnd_info->def_stmts[0]);
    2955          175 :                   SLP_TREE_LANE_PERMUTATION (pnode)
    2956          175 :                     .quick_push (std::make_pair (0u, 0u));
    2957              :                 }
    2958           35 :               SLP_TREE_CHILDREN (pnode).quick_push (child);
    2959           35 :               pnode->max_nunits = child->max_nunits;
    2960           35 :               children.safe_push (pnode);
    2961           35 :               oprnd_info->def_stmts = vNULL;
    2962           35 :               continue;
    2963           35 :             }
    2964              :           else
    2965            0 :             def_stmts2.release ();
    2966              :         }
    2967              : 
    2968      3019941 :       if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
    2969              :                                         group_size, &this_max_nunits,
    2970              :                                         matches, limit,
    2971              :                                         &this_tree_size, bst_map)) != NULL)
    2972              :         {
    2973      2534219 :           oprnd_info->def_stmts = vNULL;
    2974      2534219 :           children.safe_push (child);
    2975      2534219 :           continue;
    2976              :         }
    2977              : 
    2978              :       /* If the SLP build for operand zero failed and operand zero
    2979              :          and one can be commuted, try that for the scalar stmts
    2980              :          that failed the match.  */
    2981       485722 :       if (i == 0
    2982              :           /* A first scalar stmt mismatch signals a fatal mismatch.  */
    2983       382900 :           && matches[0]
    2984              :           /* ???  For COND_EXPRs we can swap the comparison operands
    2985              :              as well as the arms under some constraints.  */
    2986       180923 :           && (nops == 2 || nops == 3)
    2987       109657 :           && oprnds_info[1]->first_dt == vect_internal_def
    2988        59997 :           && (is_gimple_assign (stmt_info->stmt)
    2989        11660 :               || is_gimple_call (stmt_info->stmt))
    2990              :           /* Swapping operands for reductions breaks assumptions later on.  */
    2991       534072 :           && STMT_VINFO_REDUC_IDX (stmt_info) == -1)
    2992              :         {
    2993              :           /* See whether we can swap the matching or the non-matching
    2994              :              stmt operands.  */
    2995              :           bool swap_not_matching = true;
    2996        52685 :           do
    2997              :             {
    2998      7059348 :               for (j = 0; j < group_size; ++j)
    2999              :                 {
    3000      7021490 :                   if (matches[j] != !swap_not_matching)
    3001        71026 :                     continue;
    3002      6950464 :                   stmt_vec_info stmt_info = stmts[j];
    3003              :                   /* Verify if we can swap operands of this stmt.  */
    3004      6950464 :                   if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    3005              :                     {
    3006      6950438 :                       tree_code code = gimple_assign_rhs_code (stmt);
    3007      6950438 :                       if (! commutative_tree_code (code)
    3008      6950438 :                           && ! commutative_ternary_tree_code (code))
    3009              :                         {
    3010        14803 :                           if (!swap_not_matching)
    3011         6879 :                             goto fail;
    3012              :                           swap_not_matching = false;
    3013              :                           break;
    3014              :                         }
    3015              :                     }
    3016      7006689 :                   else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
    3017              :                     {
    3018           26 :                       internal_fn fn = (gimple_call_internal_p (call)
    3019           26 :                                         ? gimple_call_internal_fn (call)
    3020              :                                         : IFN_LAST);
    3021           26 :                       if ((! commutative_binary_fn_p (fn)
    3022           26 :                            && ! commutative_ternary_fn_p (fn))
    3023           28 :                           || first_commutative_argument (fn) != 0)
    3024              :                         {
    3025           24 :                           if (!swap_not_matching)
    3026           12 :                             goto fail;
    3027              :                           swap_not_matching = false;
    3028              :                           break;
    3029              :                         }
    3030              :                     }
    3031              :                 }
    3032              :             }
    3033        45794 :           while (j != group_size);
    3034              : 
    3035              :           /* Swap mismatched definition stmts.  */
    3036        37858 :           if (dump_enabled_p ())
    3037          351 :             dump_printf_loc (MSG_NOTE, vect_location,
    3038              :                              "Re-trying with swapped operands of stmts ");
    3039      7035720 :           for (j = 0; j < group_size; ++j)
    3040      6997862 :             if (matches[j] == !swap_not_matching)
    3041              :               {
    3042     13870906 :                 std::swap (oprnds_info[0]->def_stmts[j],
    3043      6935453 :                            oprnds_info[1]->def_stmts[j]);
    3044     13870906 :                 std::swap (oprnds_info[0]->ops[j],
    3045      6935453 :                            oprnds_info[1]->ops[j]);
    3046      6935453 :                 if (dump_enabled_p ())
    3047          956 :                   dump_printf (MSG_NOTE, "%d ", j);
    3048              :               }
    3049        37858 :           if (dump_enabled_p ())
    3050          351 :             dump_printf (MSG_NOTE, "\n");
    3051              :           /* After swapping some operands we lost track whether an
    3052              :              operand has any pattern defs so be conservative here.  */
    3053        72435 :           if (oprnds_info[0]->any_pattern || oprnds_info[1]->any_pattern)
    3054         3340 :             oprnds_info[0]->any_pattern = oprnds_info[1]->any_pattern = true;
    3055              :           /* And try again with scratch 'matches' ... */
    3056        37858 :           bool *tem = XALLOCAVEC (bool, group_size);
    3057        37858 :           if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
    3058              :                                             group_size, &this_max_nunits,
    3059              :                                             tem, limit,
    3060              :                                             &this_tree_size, bst_map)) != NULL)
    3061              :             {
    3062         6559 :               oprnd_info->def_stmts = vNULL;
    3063         6559 :               children.safe_push (child);
    3064         6559 :               continue;
    3065              :             }
    3066              :         }
    3067       479163 : fail:
    3068              : 
    3069              :       /* If the SLP build failed and we analyze a basic-block
    3070              :          simply treat nodes we fail to build as externally defined
    3071              :          (and thus build vectors from the scalar defs).
    3072              :          The cost model will reject outright expensive cases.
    3073              :          ???  This doesn't treat cases where permutation ultimately
    3074              :          fails (or we don't try permutation below).  Ideally we'd
    3075              :          even compute a permutation that will end up with the maximum
    3076              :          SLP tree size...  */
    3077       479163 :       if (is_a <bb_vec_info> (vinfo)
    3078              :           /* ???  Rejecting patterns this way doesn't work.  We'd have to
    3079              :              do extra work to cancel the pattern so the uses see the
    3080              :              scalar version.  */
    3081       398657 :           && !is_pattern_stmt_p (stmt_info)
    3082       853267 :           && !oprnd_info->any_pattern)
    3083              :         {
    3084              :           /* But if there's a leading vector sized set of matching stmts
    3085              :              fail here so we can split the group.  This matches the condition
    3086              :              vect_analyze_slp_instance uses.  */
    3087              :           /* ???  We might want to split here and combine the results to support
    3088              :              multiple vector sizes better.  */
    3089       586838 :           for (j = 0; j < group_size; ++j)
    3090       586838 :             if (!matches[j])
    3091              :               break;
    3092       373841 :           if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype))
    3093       373812 :               && vect_slp_can_convert_to_external (oprnd_info->def_stmts))
    3094              :             {
    3095       363422 :               if (dump_enabled_p ())
    3096          555 :                 dump_printf_loc (MSG_NOTE, vect_location,
    3097              :                                  "Building vector operands from scalars\n");
    3098       363422 :               this_tree_size++;
    3099       363422 :               child = vect_create_new_slp_node (oprnd_info->ops);
    3100       363422 :               children.safe_push (child);
    3101       363422 :               oprnd_info->ops = vNULL;
    3102       363422 :               continue;
    3103              :             }
    3104              :         }
    3105              : 
    3106       115741 :       gcc_assert (child == NULL);
    3107       131839 :       FOR_EACH_VEC_ELT (children, j, child)
    3108        16098 :         if (child)
    3109        16098 :           vect_free_slp_tree (child);
    3110       115741 :       vect_free_oprnd_info (oprnds_info);
    3111       115741 :       return NULL;
    3112              :     }
    3113              : 
    3114      3000447 :   vect_free_oprnd_info (oprnds_info);
    3115              : 
    3116              :   /* If we have all children of a child built up from uniform scalars,
    3117              :      or it does more than one possibly expensive vector construction,
    3118              :      then just throw that away, causing it to be built up from scalars.
    3119              :      The exception is the SLP node for the vector store.  */
    3120      3000447 :   if (is_a <bb_vec_info> (vinfo)
    3121      1100397 :       && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
    3122              :       /* ???  Rejecting patterns this way doesn't work.  We'd have to
    3123              :          do extra work to cancel the pattern so the uses see the
    3124              :          scalar version.  */
    3125      3437427 :       && !is_pattern_stmt_p (stmt_info))
    3126              :     {
    3127              :       slp_tree child;
    3128              :       unsigned j;
    3129              :       bool all_uniform_p = true;
    3130              :       unsigned n_vector_builds = 0;
    3131      1240335 :       FOR_EACH_VEC_ELT (children, j, child)
    3132              :         {
    3133       829304 :           if (!child)
    3134              :             ;
    3135       829304 :           else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
    3136              :             all_uniform_p = false;
    3137       590842 :           else if (!vect_slp_tree_uniform_p (child))
    3138              :             {
    3139       449490 :               all_uniform_p = false;
    3140       449490 :               if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
    3141       415060 :                 n_vector_builds++;
    3142              :             }
    3143              :         }
    3144       411031 :       if (all_uniform_p
    3145       411031 :           || n_vector_builds > 1
    3146       698479 :           || (n_vector_builds == children.length ()
    3147        30307 :               && is_a <gphi *> (stmt_info->stmt)))
    3148              :         {
    3149              :           /* Roll back.  */
    3150       128410 :           matches[0] = false;
    3151       407877 :           FOR_EACH_VEC_ELT (children, j, child)
    3152       279467 :             if (child)
    3153       279467 :               vect_free_slp_tree (child);
    3154              : 
    3155       128410 :           if (dump_enabled_p ())
    3156          177 :             dump_printf_loc (MSG_NOTE, vect_location,
    3157              :                              "Building parent vector operands from "
    3158              :                              "scalars instead\n");
    3159       128410 :           return NULL;
    3160              :         }
    3161              :     }
    3162              : 
    3163      2872037 :   *tree_size += this_tree_size + 1;
    3164      2872037 :   *max_nunits = this_max_nunits;
    3165              : 
    3166      2872037 :   if (two_operators)
    3167              :     {
    3168              :       /* ???  We'd likely want to either cache in bst_map sth like
    3169              :          { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
    3170              :          the true { a+b, a+b, a+b, a+b } ... but there we don't have
    3171              :          explicit stmts to put in so the keying on 'stmts' doesn't
    3172              :          work (but we have the same issue with nodes that use 'ops').  */
    3173              : 
    3174         6841 :       if (has_two_operators_perm)
    3175              :         {
    3176           40 :           slp_tree child = children[0];
    3177           40 :           children.truncate (0);
    3178          120 :           for (i = 0; i < 2; i++)
    3179              :             {
    3180           80 :               slp_tree pnode
    3181           80 :                 = vect_create_new_slp_node (two_op_scalar_stmts[i], 2);
    3182           80 :               SLP_TREE_CODE (pnode) = VEC_PERM_EXPR;
    3183           80 :               SLP_TREE_VECTYPE (pnode) = vectype;
    3184           80 :               SLP_TREE_CHILDREN (pnode).quick_push (child);
    3185           80 :               SLP_TREE_CHILDREN (pnode).quick_push (child);
    3186           80 :               lane_permutation_t& perm = SLP_TREE_LANE_PERMUTATION (pnode);
    3187           80 :               children.safe_push (pnode);
    3188              : 
    3189          656 :               for (unsigned j = 0; j < stmts.length (); j++)
    3190          576 :                 perm.safe_push (std::make_pair (0, two_op_perm_indices[i][j]));
    3191              :             }
    3192              : 
    3193           40 :           SLP_TREE_REF_COUNT (child) += 4;
    3194              :         }
    3195              : 
    3196         6841 :       slp_tree one = new _slp_tree;
    3197         6841 :       slp_tree two = new _slp_tree;
    3198         6841 :       SLP_TREE_DEF_TYPE (one) = vect_internal_def;
    3199         6841 :       SLP_TREE_DEF_TYPE (two) = vect_internal_def;
    3200         6841 :       SLP_TREE_VECTYPE (one) = vectype;
    3201         6841 :       SLP_TREE_VECTYPE (two) = vectype;
    3202         6841 :       SLP_TREE_CHILDREN (one).safe_splice (children);
    3203         6841 :       SLP_TREE_CHILDREN (two).safe_splice (children);
    3204         6841 :       slp_tree child;
    3205        27366 :       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
    3206        13684 :         SLP_TREE_REF_COUNT (child)++;
    3207              : 
    3208              :       /* Here we record the original defs since this
    3209              :          node represents the final lane configuration.  */
    3210         6841 :       node = vect_create_new_slp_node (node, stmts, 2);
    3211         6841 :       SLP_TREE_VECTYPE (node) = vectype;
    3212         6841 :       SLP_TREE_CODE (node) = VEC_PERM_EXPR;
    3213         6841 :       SLP_TREE_CHILDREN (node).quick_push (one);
    3214         6841 :       SLP_TREE_CHILDREN (node).quick_push (two);
    3215         6841 :       enum tree_code code0 = ERROR_MARK;
    3216         6841 :       enum tree_code ocode = ERROR_MARK;
    3217         6841 :       if (gassign *stmt = dyn_cast <gassign *> (stmts[0]->stmt))
    3218         6839 :         code0 = gimple_assign_rhs_code (stmt);
    3219         6841 :       stmt_vec_info ostmt_info;
    3220         6841 :       unsigned j = 0;
    3221        24996 :       FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
    3222              :         {
    3223        18155 :           int op = 0;
    3224        18155 :           if (gassign *ostmt = dyn_cast <gassign *> (ostmt_info->stmt))
    3225              :             {
    3226        18151 :               if (gimple_assign_rhs_code (ostmt) != code0)
    3227              :                 {
    3228         9110 :                   ocode = gimple_assign_rhs_code (ostmt);
    3229              :                   op = 1;
    3230              :                   j = i;
    3231              :                 }
    3232              :             }
    3233              :           else
    3234              :             {
    3235            8 :               if (gimple_call_combined_fn (stmts[0]->stmt)
    3236            4 :                   != gimple_call_combined_fn (ostmt_info->stmt))
    3237              :                 {
    3238            2 :                   op = 1;
    3239            2 :                   j = i;
    3240              :                 }
    3241              :             }
    3242        18155 :           SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (op, i));
    3243              :         }
    3244         6841 :       SLP_TREE_CODE (one) = code0;
    3245         6841 :       SLP_TREE_CODE (two) = ocode;
    3246         6841 :       SLP_TREE_LANES (one) = stmts.length ();
    3247         6841 :       SLP_TREE_LANES (two) = stmts.length ();
    3248         6841 :       SLP_TREE_REPRESENTATIVE (one) = stmts[0];
    3249         6841 :       SLP_TREE_REPRESENTATIVE (two) = stmts[j];
    3250              : 
    3251         6841 :       return node;
    3252              :     }
    3253              : 
    3254      2865196 :   node = vect_create_new_slp_node (node, stmts, nops);
    3255      2865196 :   SLP_TREE_VECTYPE (node) = vectype;
    3256      2865196 :   SLP_TREE_CHILDREN (node).splice (children);
    3257      2865196 :   SLP_TREE_GS_SCALE (node) = gs_scale;
    3258      2865196 :   SLP_TREE_GS_BASE (node) = gs_base;
    3259      2865196 :   if (reduc_idx != -1)
    3260              :     {
    3261       116060 :       gcc_assert (STMT_VINFO_REDUC_IDX (stmt_info) != -1
    3262              :                   || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
    3263              :                   || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def);
    3264       116060 :       SLP_TREE_REDUC_IDX (node) = reduc_idx;
    3265       116060 :       node->cycle_info.id = SLP_TREE_CHILDREN (node)[reduc_idx]->cycle_info.id;
    3266              :     }
    3267              :   /* When reaching the reduction PHI, create a vect_reduc_info.  */
    3268      2749136 :   else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
    3269      2749136 :             || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
    3270      2749136 :            && is_a <gphi *> (STMT_VINFO_STMT (stmt_info)))
    3271              :     {
    3272       101393 :       loop_vec_info loop_vinfo = as_a <loop_vec_info> (vinfo);
    3273       101393 :       gcc_assert (STMT_VINFO_REDUC_IDX (stmt_info) == -1);
    3274       101393 :       node->cycle_info.id = loop_vinfo->reduc_infos.length ();
    3275       101393 :       vect_reduc_info reduc_info = new vect_reduc_info_s ();
    3276       101393 :       loop_vinfo->reduc_infos.safe_push (reduc_info);
    3277       101393 :       stmt_vec_info reduc_phi = stmt_info;
    3278              :       /* ???  For double reductions vect_is_simple_reduction stores the
    3279              :          reduction type and code on the inner loop header PHI.  */
    3280       101393 :       if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
    3281              :         {
    3282          378 :           use_operand_p use_p;
    3283          378 :           gimple *use_stmt;
    3284          378 :           bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
    3285              :                                      &use_p, &use_stmt);
    3286          378 :           gcc_assert (res);
    3287          378 :           reduc_phi = loop_vinfo->lookup_stmt (use_stmt);
    3288              :         }
    3289       101393 :       VECT_REDUC_INFO_DEF_TYPE (reduc_info) = STMT_VINFO_DEF_TYPE (stmt_info);
    3290       101393 :       VECT_REDUC_INFO_TYPE (reduc_info) = STMT_VINFO_REDUC_TYPE (reduc_phi);
    3291       101393 :       VECT_REDUC_INFO_CODE (reduc_info) = STMT_VINFO_REDUC_CODE (reduc_phi);
    3292       101393 :       VECT_REDUC_INFO_FN (reduc_info) = IFN_LAST;
    3293              :     }
    3294              :   return node;
    3295      9348564 : }
    3296              : 
    3297              : /* Dump a single SLP tree NODE.  */
    3298              : 
    3299              : static void
    3300       444694 : vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
    3301              :                      slp_tree node)
    3302              : {
    3303       444694 :   unsigned i, j;
    3304       444694 :   slp_tree child;
    3305       444694 :   stmt_vec_info stmt_info;
    3306       444694 :   tree op;
    3307              : 
    3308       444694 :   dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
    3309       444694 :   dump_user_location_t user_loc = loc.get_user_location ();
    3310       444694 :   dump_printf_loc (metadata, user_loc,
    3311              :                    "node%s %p (max_nunits=" HOST_WIDE_INT_PRINT_UNSIGNED
    3312              :                    ", refcnt=%u)",
    3313       444694 :                    SLP_TREE_DEF_TYPE (node) == vect_external_def
    3314              :                    ? " (external)"
    3315              :                    : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
    3316       429051 :                       ? " (constant)"
    3317              :                       : ""), (void *) node,
    3318       444694 :                    estimated_poly_value (node->max_nunits),
    3319              :                                          SLP_TREE_REF_COUNT (node));
    3320       444694 :   if (SLP_TREE_VECTYPE (node))
    3321       377150 :     dump_printf (metadata, " %T", SLP_TREE_VECTYPE (node));
    3322       444694 :   dump_printf (metadata, "%s",
    3323       444694 :                node->avoid_stlf_fail ? " (avoid-stlf-fail)" : "");
    3324       444694 :   if (node->cycle_info.id != -1 || node->cycle_info.reduc_idx != -1)
    3325        23869 :     dump_printf (metadata, " cycle %d, link %d", node->cycle_info.id,
    3326              :                  node->cycle_info.reduc_idx);
    3327       444694 :   dump_printf (metadata, "\n");
    3328       444694 :   if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
    3329              :     {
    3330       362152 :       if (SLP_TREE_PERMUTE_P (node))
    3331        13668 :         dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
    3332              :       else
    3333       348484 :         dump_printf_loc (metadata, user_loc, "op template: %G",
    3334       348484 :                          SLP_TREE_REPRESENTATIVE (node)->stmt);
    3335              :     }
    3336       444694 :   if (SLP_TREE_SCALAR_STMTS (node).exists ())
    3337       866474 :     FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    3338       512425 :       if (stmt_info)
    3339       507144 :         dump_printf_loc (metadata, user_loc, "\t%sstmt %u %G",
    3340       507144 :                          SLP_TREE_LIVE_LANES (node).contains (i)
    3341       503519 :                          ? "[l*]" : (STMT_VINFO_LIVE_P (stmt_info)
    3342       503519 :                                      ? "[l] " : ""),
    3343              :                          i, stmt_info->stmt);
    3344              :       else
    3345         5281 :         dump_printf_loc (metadata, user_loc, "\tstmt %u ---\n", i);
    3346              :   else
    3347              :     {
    3348        90645 :       dump_printf_loc (metadata, user_loc, "\t{ ");
    3349       199582 :       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
    3350       108937 :         dump_printf (metadata, "%T%s ", op,
    3351       108937 :                      i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
    3352        90645 :       dump_printf (metadata, "}\n");
    3353              :     }
    3354       444694 :   if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
    3355              :     {
    3356        64844 :       dump_printf_loc (metadata, user_loc, "\tload permutation {");
    3357       147798 :       FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
    3358        82954 :         dump_printf (dump_kind, " %u", j);
    3359        64844 :       dump_printf (dump_kind, " }\n");
    3360              :     }
    3361       444694 :   if (SLP_TREE_LANE_PERMUTATION (node).exists ())
    3362              :     {
    3363        13676 :       dump_printf_loc (metadata, user_loc, "\tlane permutation {");
    3364        51245 :       for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
    3365        37569 :         dump_printf (dump_kind, " %u[%u]",
    3366        37569 :                      SLP_TREE_LANE_PERMUTATION (node)[i].first,
    3367        37569 :                      SLP_TREE_LANE_PERMUTATION (node)[i].second);
    3368        13676 :       dump_printf (dump_kind, " }%s\n",
    3369        13676 :                    node->ldst_lanes ? " (load-lanes)" : "");
    3370              :     }
    3371       444694 :   if (SLP_TREE_CHILDREN (node).is_empty ())
    3372       169596 :     return;
    3373       275098 :   dump_printf_loc (metadata, user_loc, "\tchildren");
    3374       725837 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    3375       450739 :     dump_printf (dump_kind, " %p", (void *)child);
    3376       275098 :   dump_printf (dump_kind, "%s\n",
    3377       275098 :                node->ldst_lanes && !SLP_TREE_LANE_PERMUTATION (node).exists ()
    3378              :                ? " (store-lanes)" : "");
    3379              : }
    3380              : 
    3381              : DEBUG_FUNCTION void
    3382            0 : debug (slp_tree node)
    3383              : {
    3384            0 :   debug_dump_context ctx;
    3385            0 :   vect_print_slp_tree (MSG_NOTE,
    3386            0 :                        dump_location_t::from_location_t (UNKNOWN_LOCATION),
    3387              :                        node);
    3388            0 : }
    3389              : 
    3390              : /* Recursive helper for the dot producer below.  */
    3391              : 
    3392              : static void
    3393            0 : dot_slp_tree (FILE *f, slp_tree node, hash_set<slp_tree> &visited)
    3394              : {
    3395            0 :   if (visited.add (node))
    3396              :     return;
    3397              : 
    3398            0 :   fprintf (f, "\"%p\" [label=\"", (void *)node);
    3399            0 :   vect_print_slp_tree (MSG_NOTE,
    3400            0 :                        dump_location_t::from_location_t (UNKNOWN_LOCATION),
    3401              :                        node);
    3402            0 :   fprintf (f, "\"];\n");
    3403              : 
    3404              : 
    3405            0 :   for (slp_tree child : SLP_TREE_CHILDREN (node))
    3406            0 :     fprintf (f, "\"%p\" -> \"%p\";", (void *)node, (void *)child);
    3407              : 
    3408            0 :   for (slp_tree child : SLP_TREE_CHILDREN (node))
    3409            0 :     if (child)
    3410            0 :       dot_slp_tree (f, child, visited);
    3411              : }
    3412              : 
    3413              : DEBUG_FUNCTION void
    3414            0 : dot_slp_tree (const char *fname, slp_tree node)
    3415              : {
    3416            0 :   FILE *f = fopen (fname, "w");
    3417            0 :   fprintf (f, "digraph {\n");
    3418            0 :   fflush (f);
    3419            0 :     {
    3420            0 :       debug_dump_context ctx (f);
    3421            0 :       hash_set<slp_tree> visited;
    3422            0 :       dot_slp_tree (f, node, visited);
    3423            0 :     }
    3424            0 :   fflush (f);
    3425            0 :   fprintf (f, "}\n");
    3426            0 :   fclose (f);
    3427            0 : }
    3428              : 
    3429              : DEBUG_FUNCTION void
    3430            0 : dot_slp_tree (const char *fname, const vec<slp_instance> &slp_instances)
    3431              : {
    3432            0 :   FILE *f = fopen (fname, "w");
    3433            0 :   fprintf (f, "digraph {\n");
    3434            0 :   fflush (f);
    3435            0 :     {
    3436            0 :       debug_dump_context ctx (f);
    3437            0 :       hash_set<slp_tree> visited;
    3438            0 :       for (auto inst : slp_instances)
    3439            0 :         dot_slp_tree (f, SLP_INSTANCE_TREE (inst), visited);
    3440            0 :     }
    3441            0 :   fflush (f);
    3442            0 :   fprintf (f, "}\n");
    3443            0 :   fclose (f);
    3444            0 : }
    3445              : 
    3446              : /* Dump the SLP graph rooted at NODE using flags specified in DUMP_KIND.  */
    3447              : 
    3448              : static void
    3449       483463 : vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
    3450              :                       slp_tree node, hash_set<slp_tree> &visited)
    3451              : {
    3452       483463 :   unsigned i;
    3453       483463 :   slp_tree child;
    3454              : 
    3455       483463 :   if (visited.add (node))
    3456       483463 :     return;
    3457              : 
    3458       444220 :   vect_print_slp_tree (dump_kind, loc, node);
    3459              : 
    3460      1338665 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    3461       450225 :     if (child)
    3462       407530 :       vect_print_slp_graph (dump_kind, loc, child, visited);
    3463              : }
    3464              : 
    3465              : static void
    3466        46615 : vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
    3467              :                       slp_tree entry)
    3468              : {
    3469        46615 :   hash_set<slp_tree> visited;
    3470        46615 :   vect_print_slp_graph (dump_kind, loc, entry, visited);
    3471        46615 : }
    3472              : 
    3473              : DEBUG_FUNCTION void
    3474            0 : debug (slp_instance instance)
    3475              : {
    3476            0 :   debug_dump_context ctx;
    3477            0 :   vect_print_slp_graph (MSG_NOTE,
    3478            0 :                         dump_location_t::from_location_t (UNKNOWN_LOCATION),
    3479              :                         SLP_INSTANCE_TREE (instance));
    3480            0 : }
    3481              : 
    3482              : 
    3483              : /* Compute the set of scalar stmts participating in external nodes.  */
    3484              : 
    3485              : static void
    3486      1571433 : vect_slp_gather_extern_scalar_stmts (vec_info *vinfo, slp_tree node,
    3487              :                                      hash_set<slp_tree> &visited,
    3488              :                                      hash_set<stmt_vec_info> &estmts)
    3489              : {
    3490      1571433 :   if (visited.add (node))
    3491              :     return;
    3492              : 
    3493      1526321 :   if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
    3494              :     {
    3495              :       slp_tree child;
    3496              :       int i;
    3497      1765425 :       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    3498       888331 :         if (child)
    3499       888331 :           vect_slp_gather_extern_scalar_stmts (vinfo, child, visited, estmts);
    3500              :     }
    3501              :   else
    3502      3653787 :     for (tree def : SLP_TREE_SCALAR_OPS (node))
    3503              :       {
    3504      1707648 :         stmt_vec_info def_stmt = vinfo->lookup_def (def);
    3505      1707648 :         if (def_stmt)
    3506       342420 :           estmts.add (def_stmt);
    3507              :       }
    3508              : }
    3509              : 
    3510              : /* Mark the original scalar stmt coverage of the vector SLP graph of VINFO
    3511              :    with STMT_SLP_TYPE == pure_slp.  */
    3512              : 
    3513              : static void
    3514       236000 : vect_bb_slp_mark_stmts_vectorized (bb_vec_info vinfo)
    3515              : {
    3516              :   /* Gather the scalar stmt leaves of the SLP graph to stop the below DFS
    3517              :      walk on.  */
    3518       236000 :   hash_set<stmt_vec_info> scalar_stmts_in_externs;
    3519       236000 :   hash_set<slp_tree> visited;
    3520      1391102 :   for (auto instance : BB_VINFO_SLP_INSTANCES (vinfo))
    3521       683102 :     vect_slp_gather_extern_scalar_stmts (vinfo, SLP_INSTANCE_TREE (instance),
    3522              :                                          visited, scalar_stmts_in_externs);
    3523              : 
    3524              :   /* DFS walk scalar stmts to compute the vectorized coverage indicated
    3525              :      by STMT_SLP_TYPE (stmt) == pure_slp on the original scalar (non-pattern)
    3526              :      stmts.  */
    3527      1391102 :   for (auto instance : BB_VINFO_SLP_INSTANCES (vinfo))
    3528              :     {
    3529       794266 :       for (auto stmt : SLP_INSTANCE_ROOT_STMTS (instance))
    3530        52972 :         if (!scalar_stmts_in_externs.contains (stmt))
    3531        52273 :           STMT_SLP_TYPE (stmt) = pure_slp;
    3532       683102 :       auto_vec<stmt_vec_info> worklist;
    3533      3860298 :       for (auto stmt : SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance)))
    3534              :         {
    3535      1810992 :           stmt = vect_orig_stmt (stmt);
    3536      1810992 :           if (!scalar_stmts_in_externs.contains (stmt)
    3537      1810992 :               && STMT_SLP_TYPE (stmt) != pure_slp)
    3538              :             {
    3539      1801830 :               STMT_SLP_TYPE (stmt) = pure_slp;
    3540      1801830 :               worklist.safe_push (stmt);
    3541              :             }
    3542              :         }
    3543      3607790 :       while (!worklist.is_empty ())
    3544              :         {
    3545      2244253 :           stmt_vec_info stmt = worklist.pop ();
    3546              : 
    3547              :           /* Now walk relevant parts of the SSA use-def graph.  */
    3548      2244253 :           slp_oprnds child_ops (stmt);
    3549      4728751 :           for (unsigned i = 0; i < child_ops.num_slp_children; ++i)
    3550              :             {
    3551      2484498 :               tree op = child_ops.get_op_for_slp_child (stmt, i);
    3552      2484498 :               stmt_vec_info def = vinfo->lookup_def (op);
    3553      2484498 :               if (def
    3554       866683 :                   && !scalar_stmts_in_externs.contains (def)
    3555      3012671 :                   && STMT_SLP_TYPE (def) != pure_slp)
    3556              :                 {
    3557       442423 :                   STMT_SLP_TYPE (def) = pure_slp;
    3558       442423 :                   worklist.safe_push (def);
    3559              :                 }
    3560              :             }
    3561              :         }
    3562       683102 :     }
    3563       236000 : }
    3564              : 
    3565              : /* Mark the statements of the tree rooted at NODE as relevant (vect_used).  */
    3566              : 
    3567              : static void
    3568      2513778 : vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
    3569              : {
    3570      2513778 :   int i;
    3571      2513778 :   stmt_vec_info stmt_info;
    3572      2513778 :   slp_tree child;
    3573              : 
    3574      2513778 :   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    3575              :     return;
    3576              : 
    3577      1505634 :   if (visited.add (node))
    3578              :     return;
    3579              : 
    3580      4478546 :   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    3581      3121232 :     if (stmt_info)
    3582              :       {
    3583      3121232 :         gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
    3584              :                     || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
    3585      3121232 :         STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
    3586              :       }
    3587              : 
    3588      3087640 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    3589      1730326 :     if (child)
    3590      1730326 :       vect_mark_slp_stmts_relevant (child, visited);
    3591              : }
    3592              : 
    3593              : static void
    3594       783452 : vect_mark_slp_stmts_relevant (slp_tree node)
    3595              : {
    3596       783452 :   hash_set<slp_tree> visited;
    3597       783452 :   vect_mark_slp_stmts_relevant (node, visited);
    3598       783452 : }
    3599              : 
    3600              : 
    3601              : /* Gather the load nodes of the SLP graph rooted at NODE into LOADS.
                      :    A node is pushed when it has def type vect_internal_def, is not a
                      :    permute node, and its representative stmt has a data reference
                      :    that is a read.  NULL nodes and nodes already in VISITED are
                      :    skipped; the children of every vect_internal_def node are walked
                      :    recursively.  */
    3602              : 
    3603              : static void
    3604     10616414 : vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
    3605              :                        hash_set<slp_tree> &visited)
    3606              : {
    3607     10616414 :   if (!node || visited.add (node))
    3608      1747318 :     return;
    3609              : 
    3610      8869096 :   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    3611              :     return;
    3612              : 
    3613      6570400 :   if (!SLP_TREE_PERMUTE_P (node))
    3614              :     {
    3615      6363316 :       stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
    3616      6363316 :       if (STMT_VINFO_DATA_REF (stmt_info)
    3617      2760202 :           && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
    3618      1556814 :         loads.safe_push (node);
    3619              :     }
    3620              : 
    3621              :   unsigned i;
    3622              :   slp_tree child;
    3623     14960146 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    3624      8389746 :     vect_gather_slp_loads (loads, child, visited);
    3625              : }
    3626              : 
    3627              : 
    3628              : /* Find the last scalar stmt in NODE: each non-NULL scalar stmt of NODE is
                      :    mapped through vect_orig_stmt and the one that get_later_stmt ranks
                      :    latest is returned.  Returns NULL if NODE has no non-NULL scalar
                      :    stmt.  */
    3629              : 
    3630              : stmt_vec_info
    3631      2742960 : vect_find_last_scalar_stmt_in_slp (slp_tree node)
    3632              : {
    3633      2742960 :   stmt_vec_info last = NULL;
    3634      2742960 :   stmt_vec_info stmt_vinfo;
    3635              : 
    3636      9997907 :   for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
    3637      7254947 :     if (stmt_vinfo)
    3638              :       {
    3639      7254947 :         stmt_vinfo = vect_orig_stmt (stmt_vinfo);
    3640      7254947 :         last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
    3641              :       }
    3642              : 
    3643      2742960 :   return last;
    3644              : }
    3645              : 
    3646              : /* Find the first stmt in NODE.  Each non-NULL scalar stmt of NODE is mapped
                      :    through vect_orig_stmt; a candidate replaces the current choice
                      :    whenever get_later_stmt reports the current choice as the later of
                      :    the two.  Returns NULL if NODE has no non-NULL scalar stmt.  */
    3647              : 
    3648              : stmt_vec_info
    3649       535435 : vect_find_first_scalar_stmt_in_slp (slp_tree node)
    3650              : {
    3651       535435 :   stmt_vec_info first = NULL;
    3652       535435 :   stmt_vec_info stmt_vinfo;
    3653              : 
    3654      1814265 :   for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
    3655      1278830 :     if (stmt_vinfo)
    3656              :       {
    3657      1276136 :         stmt_vinfo = vect_orig_stmt (stmt_vinfo);
    3658      1276136 :         if (!first
    3659      1276136 :             || get_later_stmt (stmt_vinfo, first) == first)
    3660              :           first = stmt_vinfo;
    3661              :       }
    3662              : 
    3663       535435 :   return first;
    3664              : }
    3665              : 
    3666              : /* Splits a group of stores, currently beginning at FIRST_VINFO, into
    3667              :    two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
    3668              :    (also containing the first GROUP1_SIZE stmts, since stores are
    3669              :    consecutive), the second containing the remainder.
    3670              :    Return the first stmt in the second group.  FIRST_VINFO must be the
                      :    first element of its group and both resulting groups must be
                      :    non-empty; every element after FIRST_VINFO is asserted to have a
                      :    DR_GROUP_GAP of 1 before the gaps of the two group heads are
                      :    updated.  */
    3671              : 
    3672              : static stmt_vec_info
    3673       157571 : vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
    3674              : {
    3675       157571 :   gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
    3676       157571 :   gcc_assert (group1_size > 0);
    3677       157571 :   int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
    3678       157571 :   gcc_assert (group2_size > 0);
    3679       157571 :   DR_GROUP_SIZE (first_vinfo) = group1_size;
    3680              : 
    3681       157571 :   stmt_vec_info stmt_info = first_vinfo;
    3682       527855 :   for (unsigned i = group1_size; i > 1; i--)
    3683              :     {
    3684       370284 :       stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
    3685       370284 :       gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
    3686              :     }
    3687              :   /* STMT_INFO is now the last element of the first group.  */
    3688       157571 :   stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
    3689       157571 :   DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
    3690              : 
    3691       157571 :   DR_GROUP_SIZE (group2) = group2_size;
    3692       440602 :   for (stmt_info = group2; stmt_info;
    3693       283031 :        stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
    3694              :     {
    3695       283031 :       DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
    3696       283031 :       gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
    3697              :     }
    3698              : 
    3699              :   /* For the second group, the DR_GROUP_GAP is that before the original group,
    3700              :      plus skipping over the first vector.  */
    3701       157571 :   DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
    3702              : 
    3703              :   /* DR_GROUP_GAP of the first group now has to skip over the second group too.  */
    3704       157571 :   DR_GROUP_GAP (first_vinfo) += group2_size;
    3705              : 
    3706       157571 :   if (dump_enabled_p ())
    3707           61 :     dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
    3708              :                      group1_size, group2_size);
    3709              : 
    3710       157571 :   return group2;
    3711              : }
    3712              : 
    3713              : /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
    3714              :    statements and a vector of NUNITS elements: the common multiple of
                      :    NUNITS and GROUP_SIZE (as computed by common_multiple) divided
                      :    exactly by GROUP_SIZE.  */
    3715              : 
    3716              : static poly_uint64
    3717      4159928 : calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
    3718              : {
    3719      4159928 :   return exact_div (common_multiple (nunits, group_size), group_size);
    3720              : }
    3721              : 
    3722              : /* Helper that checks to see if a node is a load node, i.e. ROOT is not a
                      :    permute node, has def type vect_internal_def, and its representative
                      :    stmt is a grouped access whose data reference is a read.  */
    3723              : 
    3724              : static inline bool
    3725          108 : vect_is_slp_load_node  (slp_tree root)
    3726              : {
    3727          108 :   return (!SLP_TREE_PERMUTE_P (root)
    3728          108 :           && SLP_TREE_DEF_TYPE (root) == vect_internal_def
    3729          102 :           && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
    3730          172 :           && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root))));
    3731              : }
    3732              : 
    3733              : 
    3734              : /* Helper function of optimize_load_redistribution that performs the operation
    3735              :    recursively.  Returns the node built by vect_build_slp_tree to replace
                      :    the permute node ROOT, or NULL if ROOT is not replaced; in the
                      :    latter case the children of a vect_internal_def ROOT are processed
                      :    recursively and replaced in place where a replacement was built.
                      :    LOAD_MAP caches the result computed for each node and is consulted
                      :    first.  */
    3736              : 
    3737              : static slp_tree
    3738        20434 : optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
    3739              :                                 vec_info *vinfo, unsigned int group_size,
    3740              :                                 hash_map<slp_tree, slp_tree> *load_map,
    3741              :                                 slp_tree root)
    3742              : {
    3743        20434 :   if (slp_tree *leader = load_map->get (root))
    3744         3669 :     return *leader;
    3745              : 
    3746        16765 :   slp_tree node;
    3747        16765 :   unsigned i;
    3748              : 
    3749              :   /* For now, we don't know anything about externals so do not do anything.  */
    3750        16765 :   if (!root || SLP_TREE_DEF_TYPE (root) != vect_internal_def)
    3751              :     return NULL;
    3752        12385 :   else if (SLP_TREE_PERMUTE_P (root))
    3753              :     {
    3754              :       /* First convert this node into a load node and add it to the leaves
    3755              :          list and flatten the permute from a lane to a load one.  If it's
    3756              :          unneeded it will be elided later.  */
    3757           76 :       vec<stmt_vec_info> stmts;
    3758           76 :       stmts.create (SLP_TREE_LANES (root));
    3759           76 :       lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
    3760          140 :       for (unsigned j = 0; j < lane_perm.length (); j++)
    3761              :         {
    3762          108 :           std::pair<unsigned, unsigned> perm = lane_perm[j];
    3763          108 :           node = SLP_TREE_CHILDREN (root)[perm.first];
    3764              : 
    3765          108 :           if (!vect_is_slp_load_node (node)
    3766          108 :               || SLP_TREE_CHILDREN (node).exists ())
    3767              :             {
    3768           44 :               stmts.release ();
    3769           44 :               goto next;
    3770              :             }
    3771              : 
    3772           64 :           stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
    3773              :         }
    3774              : 
    3775           32 :       if (dump_enabled_p ())
    3776            0 :         dump_printf_loc (MSG_NOTE, vect_location,
    3777              :                          "converting stmts on permute node %p\n",
    3778              :                          (void *) root);
    3779              : 
    3780           32 :       bool *matches = XALLOCAVEC (bool, group_size);
    3781           32 :       poly_uint64 max_nunits = 1;
    3782           32 :       unsigned tree_size = 0, limit = 1;
    3783           32 :       node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
    3784              :                                   matches, &limit, &tree_size, bst_map);
    3785           32 :       if (!node)
    3786            0 :         stmts.release ();
    3787              : 
    3788           32 :       load_map->put (root, node);
    3789           32 :       return node;
    3790              :     }
    3791              : 
    3792        12309 : next:
    3793        12353 :   load_map->put (root, NULL);
    3794              : 
    3795        29030 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
    3796              :     {
    3797        16677 :       slp_tree value
    3798        16677 :         = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
    3799              :                                           node);
    3800        16677 :       if (value)
    3801              :         {
    3802           32 :           SLP_TREE_REF_COUNT (value)++;
    3803           32 :           SLP_TREE_CHILDREN (root)[i] = value;
    3804              :           /* ???  We know the original leaves of the replaced nodes will
    3805              :              be referenced by bst_map, only the permutes created by
    3806              :              pattern matching are not.  */
    3807           32 :           if (SLP_TREE_REF_COUNT (node) == 1)
    3808           32 :             load_map->remove (node);
    3809           32 :           vect_free_slp_tree (node);
    3810              :         }
    3811              :     }
    3812              : 
    3813              :   return NULL;
    3814              : }
    3815              : 
    3816              : /* Temporary workaround for loads not being CSEd during SLP build.  This
    3817              :    function will traverse the SLP tree rooted in ROOT and find VEC_PERM
    3818              :    nodes that blend vectors from multiple nodes that all read from the
    3819              :    same DR such that the final operation is equal to a permuted load.  Such
    3820              :    nodes are then directly converted into loads themselves.  The nodes are
    3821              :    CSEd using BST_MAP; LOAD_MAP caches the result found for each node.  */
    3822              : 
    3823              : static void
    3824         2851 : optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
    3825              :                               vec_info *vinfo, unsigned int group_size,
    3826              :                               hash_map<slp_tree, slp_tree> *load_map,
    3827              :                               slp_tree root)
    3828              : {
    3829         2851 :   slp_tree node;
    3830         2851 :   unsigned i;
    3831              : 
    3832         6608 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
    3833              :     {
    3834         3757 :       slp_tree value
    3835         3757 :         = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
    3836              :                                           node);
    3837         3757 :       if (value)
    3838              :         {
    3839            0 :           SLP_TREE_REF_COUNT (value)++;
    3840            0 :           SLP_TREE_CHILDREN (root)[i] = value;
    3841              :           /* ???  We know the original leaves of the replaced nodes will
    3842              :              be referenced by bst_map, only the permutes created by
    3843              :              pattern matching are not.  */
    3844            0 :           if (SLP_TREE_REF_COUNT (node) == 1)
    3845            0 :             load_map->remove (node);
    3846            0 :           vect_free_slp_tree (node);
    3847              :         }
    3848              :     }
    3849         2851 : }
    3850              : 
    3851              : /* Helper function of vect_match_slp_patterns.
    3852              : 
    3853              :    Attempts to match patterns against the slp tree rooted in REF_NODE using
    3854              :    VINFO.  Patterns are matched in post-order traversal.
    3855              : 
    3856              :    If matching is successful the value in REF_NODE is updated.  Returns true
    3857              :    if any pattern matched at this node or in its children, false otherwise
                      :    (including for NULL nodes and nodes already in VISITED).  PERM_CACHE
                      :    and COMPAT_CACHE are handed to every pattern recognizer.  */
    3858              : 
    3859              : static bool
    3860      6122731 : vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
    3861              :                            slp_tree_to_load_perm_map_t *perm_cache,
    3862              :                            slp_compat_nodes_map_t *compat_cache,
    3863              :                            hash_set<slp_tree> *visited)
    3864              : {
    3865      6122731 :   unsigned i;
    3866      6122731 :   slp_tree node = *ref_node;
    3867      6122731 :   bool found_p = false;
    3868      6122731 :   if (!node || visited->add (node))
    3869       874984 :     return false;
    3870              : 
    3871              :   slp_tree child;
    3872      9817627 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    3873      4569880 :     found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
    3874              :                                           vinfo, perm_cache, compat_cache,
    3875              :                                           visited);
    3876              : 
    3877     15743241 :   for (unsigned x = 0; x < num__slp_patterns; x++)
    3878              :     {
    3879     10495494 :       vect_pattern *pattern
    3880     10495494 :         = slp_patterns[x] (perm_cache, compat_cache, ref_node);
    3881     10495494 :       if (pattern)
    3882              :         {
    3883         1171 :           pattern->build (vinfo);
    3884         1171 :           delete pattern;
    3885         1171 :           found_p = true;
    3886              :         }
    3887              :     }
    3888              : 
    3889              :   return found_p;
    3890              : }
    3891              : 
    3892              : /* Applies pattern matching to the SLP tree of INSTANCE using vec_info VINFO.
    3893              :    VISITED, PERM_CACHE and COMPAT_CACHE are passed on to the recursive helper.
    3894              : 
    3895              :    Returns true if any pattern matched; the tree of INSTANCE is modified in
    3896              :    place.  Patterns are tried in order and multiple patterns may match.  */
    3897              : 
    3898              : static bool
    3899      1552851 : vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
    3900              :                          hash_set<slp_tree> *visited,
    3901              :                          slp_tree_to_load_perm_map_t *perm_cache,
    3902              :                          slp_compat_nodes_map_t *compat_cache)
    3903              : {
    3904      1552851 :   DUMP_VECT_SCOPE ("vect_match_slp_patterns");
    3905      1552851 :   slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
    3906              : 
    3907      1552851 :   if (dump_enabled_p ())
    3908        30482 :     dump_printf_loc (MSG_NOTE, vect_location,
    3909              :                      "Analyzing SLP tree %p for patterns\n",
    3910        30482 :                      (void *) SLP_INSTANCE_TREE (instance));
    3911              : 
    3912      1552851 :   return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, compat_cache,
    3913      1552851 :                                     visited);
    3914              : }
    3915              : 
    3916              : /* STMT_INFO is a store group of size GROUP_SIZE that we are considering
    3917              :    vectorizing with VECTYPE that might be NULL.  MASKED_P indicates whether
    3918              :    the stores are masked.  NEW_GROUP_SIZE is the size of one of the two
                      :    groups a split would create, the other having
                      :    GROUP_SIZE - NEW_GROUP_SIZE elements.  If VECTYPE is NULL it is
                      :    derived from the scalar type of STMT_INFO's data reference, and
                      :    false is returned when no vector type can be found.
    3919              :    Return true if we could use IFN_STORE_LANES instead and if that appears
    3920              :    to be the better approach.  */
    3921              : 
    3922              : static bool
    3923         6016 : vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
    3924              :                                tree vectype, bool masked_p,
    3925              :                                unsigned int group_size,
    3926              :                                unsigned int new_group_size)
    3927              : {
    3928         6016 :   if (!vectype)
    3929              :     {
    3930         6016 :       tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
    3931         6016 :       vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
    3932              :     }
    3933         6016 :   if (!vectype)
    3934              :     return false;
    3935              :   /* Allow the split if one of the two new groups would operate on full
    3936              :      vectors *within* rather than across one scalar loop iteration.
    3937              :      This is purely a heuristic, but it should work well for group
    3938              :      sizes of 3 and 4, where the possible splits are:
    3939              : 
    3940              :        3->2+1:  OK if the vector has exactly two elements
    3941              :        4->2+2:  Likewise
    3942              :        4->3+1:  Less clear-cut.  */
    3943         6016 :   if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
    3944         3395 :       || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
    3945         2644 :     return false;
    3946         3372 :   return vect_store_lanes_supported (vectype, group_size, masked_p) != IFN_LAST;
    3947              : }
    3948              : 
    3949              : /* Analyze an SLP instance starting from a group of grouped stores.  Call
    3950              :    vect_build_slp_tree to build a tree of packed stmts if possible.
    3951              :    Return FALSE if it's impossible to SLP any stmt in the loop.
                      :    (This is a forward declaration only.)  */
    3952              : 
    3953              : static bool
    3954              : vect_analyze_slp_instance (vec_info *vinfo,
    3955              :                            scalar_stmts_to_slp_tree_map_t *bst_map,
    3956              :                            stmt_vec_info stmt_info, slp_instance_kind kind,
    3957              :                            unsigned max_tree_size, unsigned *limit,
    3958              :                            bool force_single_lane);
    3959              : 
    3960              : /* Build an interleaving scheme for the store sources RHS_NODES from
    3961              :    SCALAR_STMTS.  Returns a new node for SCALAR_STMTS with one VEC_PERM
                      :    child per child of RHS_NODES[0]; each such permute merges the
                      :    corresponding child of every RHS node and is then reduced, by
                      :    inserting intermediate two-input permutes, until it has at most
                      :    two inputs.  MAX_NUNITS is recorded on every node created and the
                      :    vector type is taken from RHS_NODES[0].  */
    3962              : 
    3963              : static slp_tree
    3964         7914 : vect_build_slp_store_interleaving (vec<slp_tree> &rhs_nodes,
    3965              :                                    vec<stmt_vec_info> &scalar_stmts,
    3966              :                                    poly_uint64 max_nunits)
    3967              : {
    3968         7914 :   unsigned int group_size = scalar_stmts.length ();
    3969        15828 :   slp_tree node = vect_create_new_slp_node (scalar_stmts,
    3970         7914 :                                             SLP_TREE_CHILDREN
    3971              :                                               (rhs_nodes[0]).length ());
    3972         7914 :   SLP_TREE_VECTYPE (node) = SLP_TREE_VECTYPE (rhs_nodes[0]);
    3973         7914 :   node->max_nunits = max_nunits;
    3974         7914 :   for (unsigned l = 0;
    3975        15855 :        l < SLP_TREE_CHILDREN (rhs_nodes[0]).length (); ++l)
    3976              :     {
    3977              :       /* And a permute merging all RHS SLP trees.  */
    3978         7941 :       slp_tree perm = vect_create_new_slp_node (rhs_nodes.length (),
    3979         7941 :                                                 VEC_PERM_EXPR);
    3980         7941 :       SLP_TREE_CHILDREN (node).quick_push (perm);
    3981         7941 :       SLP_TREE_LANE_PERMUTATION (perm).create (group_size);
    3982         7941 :       SLP_TREE_VECTYPE (perm) = SLP_TREE_VECTYPE (node);
    3983         7941 :       perm->max_nunits = max_nunits;
    3984         7941 :       SLP_TREE_LANES (perm) = group_size;
    3985              :       /* ???  We should set this NULL but that's not expected.  */
    3986         7941 :       SLP_TREE_REPRESENTATIVE (perm)
    3987         7941 :         = SLP_TREE_REPRESENTATIVE (SLP_TREE_CHILDREN (rhs_nodes[0])[l]);
    3988        30916 :       for (unsigned j = 0; j < rhs_nodes.length (); ++j)
    3989              :         {
    3990        22975 :           SLP_TREE_CHILDREN (perm)
    3991        22975 :             .quick_push (SLP_TREE_CHILDREN (rhs_nodes[j])[l]);
    3992        22975 :           SLP_TREE_CHILDREN (rhs_nodes[j])[l]->refcnt++;
    3993        22975 :           for (unsigned k = 0;
    3994        48282 :                k < SLP_TREE_SCALAR_STMTS (rhs_nodes[j]).length (); ++k)
    3995              :             {
    3996              :               /* ???  We should populate SLP_TREE_SCALAR_STMTS
    3997              :                  or SLP_TREE_SCALAR_OPS but then we might have
    3998              :                  a mix of both in our children.  */
    3999        25307 :               SLP_TREE_LANE_PERMUTATION (perm)
    4000        25307 :                 .quick_push (std::make_pair (j, k));
    4001              :             }
    4002              :         }
    4003              : 
    4004              :       /* Now we have a single permute node but we cannot code-generate
    4005              :          the case with more than two inputs.
    4006              :          Perform pairwise reduction, reducing the two inputs
    4007              :          with the least number of lanes to one and then repeat until
    4008              :          we end up with two inputs.  That scheme makes sure we end
    4009              :          up with permutes satisfying the restriction of requiring at
    4010              :          most two vector inputs to produce a single vector output
    4011              :          when the number of lanes is even.  */
    4012        15034 :       while (SLP_TREE_CHILDREN (perm).length () > 2)
    4013              :         {
    4014              :           /* When we have three equal sized groups left the pairwise
    4015              :              reduction does not result in a scheme that avoids using
    4016              :              three vectors.  Instead merge the first two groups
    4017              :              to the final size with do-not-care elements (chosen
    4018              :              from the first group) and then merge with the third.
    4019              :                   { A0, B0,  x, A1, B1,  x, ... }
    4020              :                -> { A0, B0, C0, A1, B1, C1, ... }
    4021              :              This handles group size of three (and at least
    4022              :              power-of-two multiples of that).  */
    4023         7093 :           if (SLP_TREE_CHILDREN (perm).length () == 3
    4024         3269 :               && (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[0])
    4025         3269 :                   == SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[1]))
    4026         7093 :               && (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[0])
    4027         2451 :                   == SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[2])))
    4028              :             {
    4029         2145 :               int ai = 0;
    4030         2145 :               int bi = 1;
    4031         2145 :               slp_tree a = SLP_TREE_CHILDREN (perm)[ai];
    4032         2145 :               slp_tree b = SLP_TREE_CHILDREN (perm)[bi];
    4033         2145 :               unsigned n = SLP_TREE_LANES (perm);
    4034              : 
    4035         2145 :               slp_tree permab = vect_create_new_slp_node (2, VEC_PERM_EXPR);
    4036         2145 :               SLP_TREE_LANES (permab) = n;
    4037         2145 :               SLP_TREE_LANE_PERMUTATION (permab).create (n);
    4038         2145 :               SLP_TREE_VECTYPE (permab) = SLP_TREE_VECTYPE (perm);
    4039         2145 :               permab->max_nunits = max_nunits;
    4040              :               /* ???  Should be NULL but that's not expected.  */
    4041         2145 :               SLP_TREE_REPRESENTATIVE (permab) = SLP_TREE_REPRESENTATIVE (perm);
    4042         2145 :               SLP_TREE_CHILDREN (permab).quick_push (a);
    4043         4304 :               for (unsigned k = 0; k < SLP_TREE_LANES (a); ++k)
    4044         2159 :                 SLP_TREE_LANE_PERMUTATION (permab)
    4045         2159 :                   .quick_push (std::make_pair (0, k));
    4046         2145 :               SLP_TREE_CHILDREN (permab).quick_push (b);
    4047         4304 :               for (unsigned k = 0; k < SLP_TREE_LANES (b); ++k)
    4048         2159 :                 SLP_TREE_LANE_PERMUTATION (permab)
    4049         2159 :                   .quick_push (std::make_pair (1, k));
    4050              :               /* Push the do-not-care lanes.  */
    4051         4304 :               for (unsigned k = 0; k < SLP_TREE_LANES (a); ++k)
    4052         2159 :                 SLP_TREE_LANE_PERMUTATION (permab)
    4053         2159 :                   .quick_push (std::make_pair (0, k));
    4054              : 
    4055              :               /* Put the merged node into 'perm', in place of a.  */
    4056         2145 :               SLP_TREE_CHILDREN (perm)[ai] = permab;
    4057              :               /* Adjust the references to b in the permutation
    4058              :                  of perm and to the later children which we'll
    4059              :                  remove.  */
    4060         8622 :               for (unsigned k = 0; k < SLP_TREE_LANES (perm); ++k)
    4061              :                 {
    4062         6477 :                   std::pair<unsigned, unsigned> &p
    4063         6477 :                     = SLP_TREE_LANE_PERMUTATION (perm)[k];
    4064         6477 :                   if (p.first == (unsigned) bi)
    4065              :                     {
    4066         2159 :                       p.first = ai;
    4067         2159 :                       p.second += SLP_TREE_LANES (a);
    4068              :                     }
    4069         4318 :                   else if (p.first > (unsigned) bi)
    4070         2159 :                     p.first--;
    4071              :                 }
    4072         2145 :               SLP_TREE_CHILDREN (perm).ordered_remove (bi);
    4073         2145 :               break;
    4074              :             }
    4075              : 
    4076              :           /* Pick the two nodes with the least number of lanes,
    4077              :              prefer the earliest candidate and maintain ai < bi.  */
    4078              :           int ai = -1;
    4079              :           int bi = -1;
    4080        45078 :           for (unsigned ci = 0; ci < SLP_TREE_CHILDREN (perm).length (); ++ci)
    4081              :             {
    4082        40130 :               if (ai == -1)
    4083         4948 :                 ai = ci;
    4084        35182 :               else if (bi == -1)
    4085         4948 :                 bi = ci;
    4086        30234 :               else if ((SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ci])
    4087        30234 :                         < SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ai]))
    4088        30234 :                        || (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ci])
    4089        24904 :                            < SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[bi])))
    4090              :                 {
    4091        11548 :                   if (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ai])
    4092         5774 :                       <= SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[bi]))
    4093         2687 :                     bi = ci;
    4094              :                   else
    4095              :                     {
    4096         3087 :                       ai = bi;
    4097         3087 :                       bi = ci;
    4098              :                     }
    4099              :                 }
    4100              :             }
    4101              : 
    4102              :           /* Produce a merge of nodes ai and bi.  */
    4103         4948 :           slp_tree a = SLP_TREE_CHILDREN (perm)[ai];
    4104         4948 :           slp_tree b = SLP_TREE_CHILDREN (perm)[bi];
    4105         4948 :           unsigned n = SLP_TREE_LANES (a) + SLP_TREE_LANES (b);
    4106         4948 :           slp_tree permab = vect_create_new_slp_node (2, VEC_PERM_EXPR);
    4107         4948 :           SLP_TREE_LANES (permab) = n;
    4108         4948 :           SLP_TREE_LANE_PERMUTATION (permab).create (n);
    4109         4948 :           SLP_TREE_VECTYPE (permab) = SLP_TREE_VECTYPE (perm);
    4110         4948 :           permab->max_nunits = max_nunits;
    4111              :           /* ???  Should be NULL but that's not expected.  */
    4112         4948 :           SLP_TREE_REPRESENTATIVE (permab) = SLP_TREE_REPRESENTATIVE (perm);
    4113         4948 :           SLP_TREE_CHILDREN (permab).quick_push (a);
    4114        13096 :           for (unsigned k = 0; k < SLP_TREE_LANES (a); ++k)
    4115         8148 :             SLP_TREE_LANE_PERMUTATION (permab)
    4116         8148 :               .quick_push (std::make_pair (0, k));
    4117         4948 :           SLP_TREE_CHILDREN (permab).quick_push (b);
    4118        12420 :           for (unsigned k = 0; k < SLP_TREE_LANES (b); ++k)
    4119         7472 :             SLP_TREE_LANE_PERMUTATION (permab)
    4120         7472 :               .quick_push (std::make_pair (1, k));
    4121              : 
    4122              :           /* Put the merged node into 'perm', in place of a.  */
    4123         4948 :           SLP_TREE_CHILDREN (perm)[ai] = permab;
    4124              :           /* Adjust the references to b in the permutation
    4125              :              of perm and to the later children which we'll
    4126              :              remove.  */
    4127        72097 :           for (unsigned k = 0; k < SLP_TREE_LANES (perm); ++k)
    4128              :             {
    4129        67149 :               std::pair<unsigned, unsigned> &p
    4130        67149 :                 = SLP_TREE_LANE_PERMUTATION (perm)[k];
    4131        67149 :               if (p.first == (unsigned) bi)
    4132              :                 {
    4133         7472 :                   p.first = ai;
    4134         7472 :                   p.second += SLP_TREE_LANES (a);
    4135              :                 }
    4136        59677 :               else if (p.first > (unsigned) bi)
    4137        25082 :                 p.first--;
    4138              :             }
    4139         4948 :           SLP_TREE_CHILDREN (perm).ordered_remove (bi);
    4140              :         }
    4141              :     }
    4142              : 
    4143         7914 :   return node;
    4144              : }
    4145              : 
    4146              : /* Analyze an SLP instance starting from SCALAR_STMTS which are a group
    4147              :    of KIND.  Return true if an instance was created and pushed to VINFO->slp_instances, false otherwise.  SCALAR_STMTS is owned by this
    4148              :    function; REMAIN and ROOT_STMT_INFOS are recorded in the new instance on success and their ownership is transferred back to
    4149              :    the caller upon failure.  MAX_TREE_SIZE is only used for dumping here.  */
    4150              : 
    4151              : static bool
    4152      1899245 : vect_build_slp_instance (vec_info *vinfo,
    4153              :                          slp_instance_kind kind,
    4154              :                          vec<stmt_vec_info> &scalar_stmts,
    4155              :                          vec<stmt_vec_info> &root_stmt_infos,
    4156              :                          vec<tree> &remain,
    4157              :                          unsigned max_tree_size, unsigned *limit,
    4158              :                          scalar_stmts_to_slp_tree_map_t *bst_map,
    4159              :                          bool force_single_lane)
    4160              : {
    4161              :   /* If there's no discovery budget left (*LIMIT is zero) bail out early, releasing SCALAR_STMTS.  */
    4162      1899245 :   if (*limit == 0)
    4163              :     {
    4164        27238 :       scalar_stmts.release ();
    4165        27238 :       return false;
    4166              :     }
    4167              : 
    4168      1872007 :   if (kind == slp_inst_kind_ctor)
    4169              :     {
    4170        12854 :       if (dump_enabled_p ())
    4171           86 :         dump_printf_loc (MSG_NOTE, vect_location,
    4172              :                          "Analyzing vectorizable constructor: %G\n",
    4173           43 :                          root_stmt_infos[0]->stmt);
    4174              :     }
    4175      1859153 :   else if (kind == slp_inst_kind_gcond)
    4176              :     {
    4177       277650 :       if (dump_enabled_p ())
    4178         5696 :         dump_printf_loc (MSG_NOTE, vect_location,
    4179              :                          "Analyzing vectorizable control flow: %G",
    4180         2848 :                          root_stmt_infos[0]->stmt);
    4181              :     }
    4182              : 
    4183      1872007 :   if (dump_enabled_p ())
    4184              :     {
    4185        25550 :       dump_printf_loc (MSG_NOTE, vect_location,
    4186              :                        "Starting SLP discovery for\n");
    4187        54536 :       for (unsigned i = 0; i < scalar_stmts.length (); ++i)
    4188        57972 :         dump_printf_loc (MSG_NOTE, vect_location,
    4189        28986 :                          "  %G", scalar_stmts[i]->stmt);
    4190              :     }
    4191              : 
    4192              :   /* Build the tree for the SLP instance.  MATCHES has one entry per scalar stmt.  */
    4193      1872007 :   unsigned int group_size = scalar_stmts.length ();
    4194      1872007 :   bool *matches = XALLOCAVEC (bool, group_size);
    4195      1872007 :   poly_uint64 max_nunits = 1;
    4196      1872007 :   unsigned tree_size = 0;
    4197              :   /* With FORCE_SINGLE_LANE and more than one stmt discovery is skipped; NODE stays NULL and we take the failure path below.  */
    4198      1872007 :   slp_tree node = NULL;
    4199      1872007 :   if (group_size > 1 && force_single_lane)
    4200              :     {
    4201            0 :       matches[0] = true;
    4202            0 :       matches[1] = false;
    4203              :     }
    4204              :   else
    4205      1872007 :     node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
    4206              :                                 &max_nunits, matches, limit,
    4207              :                                 &tree_size, bst_map);
    4208      1872007 :   if (node != NULL)
    4209              :     {
    4210              :       /* Calculate the unrolling factor based on the smallest type.  */
    4211       764838 :       poly_uint64 unrolling_factor
    4212       764838 :         = calculate_unrolling_factor (max_nunits, group_size);
    4213              : 
    4214       764838 :       if (maybe_ne (unrolling_factor, 1U)
    4215       764838 :           && is_a <bb_vec_info> (vinfo))
    4216              :         {
    4217            0 :           unsigned HOST_WIDE_INT const_max_nunits;
    4218            0 :           if (!max_nunits.is_constant (&const_max_nunits)
    4219            0 :               || const_max_nunits > group_size)
    4220              :             {
    4221            0 :               if (dump_enabled_p ())
    4222            0 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    4223              :                                  "Build SLP failed: store group "
    4224              :                                  "size not a multiple of the vector size "
    4225              :                                  "in basic block SLP\n");
    4226            0 :               vect_free_slp_tree (node);
    4227            0 :               return false;
    4228              :             }
    4229              :           /* Fatal mismatch.  Mark the first lane past the last full multiple of MAX_NUNITS as not matching, free the node and fall through to the failure path below.  */
    4230            0 :           if (dump_enabled_p ())
    4231            0 :             dump_printf_loc (MSG_NOTE, vect_location,
    4232              :                              "SLP discovery succeeded but node needs "
    4233              :                              "splitting\n");
    4234            0 :           memset (matches, true, group_size);
    4235            0 :           matches[group_size / const_max_nunits * const_max_nunits] = false;
    4236            0 :           vect_free_slp_tree (node);
    4237              :         }
    4238              :       else
    4239              :         {
    4240              :           /* Create a new SLP instance.  */
    4241       764838 :           slp_instance new_instance = XNEW (class _slp_instance);
    4242       764838 :           SLP_INSTANCE_TREE (new_instance) = node;
    4243       764838 :           SLP_INSTANCE_LOADS (new_instance) = vNULL;
    4244       764838 :           SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
    4245       764838 :           SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
    4246       764838 :           SLP_INSTANCE_KIND (new_instance) = kind;
    4247       764838 :           new_instance->reduc_phis = NULL;
    4248       764838 :           new_instance->cost_vec = vNULL;
    4249       764838 :           new_instance->subgraph_entries = vNULL;
    4250              : 
    4251       764838 :           if (dump_enabled_p ())
    4252        22489 :             dump_printf_loc (MSG_NOTE, vect_location,
    4253              :                              "SLP size %u vs. limit %u.\n",
    4254              :                              tree_size, max_tree_size);
    4255              : 
    4256       764838 :           vinfo->slp_instances.safe_push (new_instance);
    4257              : 
    4258              :           /* ???  We've replaced the old SLP_INSTANCE_GROUP_SIZE with
    4259              :              the number of scalar stmts in the root in a few places.
    4260              :              Verify that assumption holds.  */
    4261      1529676 :           gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
    4262              :                         .length () == group_size);
    4263              : 
    4264       764838 :           if (dump_enabled_p ())
    4265              :             {
    4266        22489 :               if (kind == slp_inst_kind_reduc_group)
    4267         1455 :                 dump_printf_loc (MSG_NOTE, vect_location,
    4268              :                                  "SLP discovery of size %d reduction group "
    4269              :                                  "succeeded\n", group_size);
    4270        22489 :               dump_printf_loc (MSG_NOTE, vect_location,
    4271              :                                "Final SLP tree for instance %p:\n",
    4272              :                                (void *) new_instance);
    4273        22489 :               vect_print_slp_graph (MSG_NOTE, vect_location,
    4274              :                                     SLP_INSTANCE_TREE (new_instance));
    4275              :             }
    4276              : 
    4277       764838 :           return true;
    4278              :         }
    4279              :     }
    4280              :   /* Failed to SLP.  */
    4281              : 
    4282              :   /* While we arrive here even with slp_inst_kind_store we should only
    4283              :      do so for group_size == 1.  The code to split store groups is only in
    4284              :      vect_analyze_slp_instance now.  */
    4285      1107169 :   gcc_assert (kind != slp_inst_kind_store || group_size == 1);
    4286              : 
    4287              :   /* Free the allocated memory.  */
    4288      1107169 :   scalar_stmts.release ();
    4289              : 
    4290              :   /* Report the failure in the dump file.  */
    4291      1107169 :   if (dump_enabled_p ())
    4292         3061 :     dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
    4293              :   return false;
    4294              : }
    4295              : 
    4296              : /* Analyze an SLP instance starting from the start of a reduction chain.
    4297              :    Call vect_build_slp_tree to build a tree of packed stmts if possible.
    4298              :    Return FALSE if SLP build fails.  */
    4299              : 
    4300              : static bool
    4301        63364 : vect_analyze_slp_reduc_chain (loop_vec_info vinfo,
    4302              :                               scalar_stmts_to_slp_tree_map_t *bst_map,
    4303              :                               stmt_vec_info scalar_stmt,
    4304              :                               unsigned max_tree_size, unsigned *limit)
    4305              : {
    4306        63364 :   vec<stmt_vec_info> scalar_stmts = vNULL;
    4307              : 
    4308        63364 :   bool fail = false;
    4309              :   /* ???  We could leave operation code checking to SLP discovery.  */
    4310        63364 :   code_helper code = STMT_VINFO_REDUC_CODE (STMT_VINFO_REDUC_DEF
    4311              :                                               (vect_orig_stmt (scalar_stmt)));
    4312        63364 :   bool first = true;
    4313        63364 :   stmt_vec_info next_stmt = scalar_stmt;
    4314        71551 :   do
    4315              :     {
    4316        71551 :       stmt_vec_info stmt = next_stmt;
    4317        71551 :       gimple_match_op op;
    4318        71551 :       if (!gimple_extract_op (STMT_VINFO_STMT (stmt), &op))
    4319            0 :         gcc_unreachable ();
    4320       143102 :       tree reduc_def = gimple_arg (STMT_VINFO_STMT (stmt),
    4321        71551 :                                    STMT_VINFO_REDUC_IDX (stmt));
    4322        71551 :       next_stmt = vect_stmt_to_vectorize (vinfo->lookup_def (reduc_def));
    4323        71551 :       gcc_assert (is_a <gphi *> (STMT_VINFO_STMT (next_stmt))
    4324              :                   || STMT_VINFO_REDUC_IDX (next_stmt) != -1);
    4325        77095 :       if (!gimple_extract_op (STMT_VINFO_STMT (vect_orig_stmt (stmt)), &op))
    4326            0 :         gcc_unreachable ();
    4327        71551 :       if (CONVERT_EXPR_CODE_P (op.code)
    4328         3421 :           && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0]))
    4329        74960 :           && (first
    4330         1692 :               || is_a <gphi *> (STMT_VINFO_STMT (next_stmt))))
    4331              :         ;
    4332        68146 :       else if (code != op.code)
    4333              :         {
    4334         2559 :           fail = true;
    4335         2559 :           break;
    4336              :         }
    4337              :       else
    4338        65587 :         scalar_stmts.safe_push (stmt);
    4339        68992 :       first = false;
    4340              :     }
    4341        68992 :   while (!is_a <gphi *> (STMT_VINFO_STMT (next_stmt)));
    4342        63364 :   if (fail)
    4343         2559 :     return false;
    4344              : 
    4345              :   /* Remember a stmt with the actual reduction operation.  */
    4346        60805 :   stmt_vec_info reduc_scalar_stmt = scalar_stmts[0];
    4347              : 
    4348              :   /* When the SSA def chain through reduc-idx does not form a natural
    4349              :      reduction chain try to linearize an associative operation manually.  */
    4350        60805 :   if (scalar_stmts.length () == 1
    4351        58186 :       && code.is_tree_code ()
    4352        52128 :       && associative_tree_code ((tree_code)code)
    4353              :       /* We may not associate if a fold-left reduction is required.  */
    4354       112068 :       && !needs_fold_left_reduction_p (TREE_TYPE (gimple_get_lhs
    4355              :                                                     (reduc_scalar_stmt->stmt)),
    4356              :                                        code))
    4357              :     {
    4358        49134 :       auto_vec<chain_op_t> chain;
    4359        49134 :       auto_vec<std::pair<tree_code, gimple *> > worklist;
    4360        49134 :       gimple *op_stmt = NULL, *other_op_stmt = NULL;
    4361        49134 :       if (is_a <gassign *> (scalar_stmts[0]->stmt)
    4362              :           /* We cannot linearize an operation that vect_slp_linearize_chain
    4363              :              would not put on its worklist.  */
    4364        49134 :           && gimple_assign_rhs_code (scalar_stmts[0]->stmt) == (tree_code)code)
    4365              :         {
    4366        48487 :           vect_slp_linearize_chain (vinfo, worklist, chain, (tree_code)code,
    4367        48487 :                                     scalar_stmts[0]->stmt, op_stmt,
    4368              :                                     other_op_stmt,
    4369              :                                     NULL);
    4370              : 
    4371        48487 :           scalar_stmts.truncate (0);
    4372        48487 :           stmt_vec_info tail = NULL;
    4373       242680 :           for (auto el : chain)
    4374              :             {
    4375        97530 :               if (el.dt == vect_external_def
    4376        97530 :                   || el.dt == vect_constant_def
    4377        97530 :                   || el.code != (tree_code) code)
    4378              :                 {
    4379          311 :                   scalar_stmts.release ();
    4380          311 :                   return false;
    4381              :                 }
    4382        97219 :               stmt_vec_info stmt = vinfo->lookup_def (el.op);
    4383        97219 :               if (STMT_VINFO_REDUC_IDX (stmt) != -1
    4384        95675 :                   || STMT_VINFO_REDUC_DEF (stmt))
    4385              :                 {
    4386        48374 :                   gcc_assert (tail == NULL);
    4387        48374 :                   tail = stmt;
    4388        48374 :                   continue;
    4389              :                 }
    4390        48845 :               scalar_stmts.safe_push (stmt);
    4391              :             }
    4392        48176 :           gcc_assert (tail);
    4393              :         }
    4394              : 
    4395              :       /* When this linearization didn't produce a chain see if stripping
    4396              :          a wrapping sign conversion produces one.  */
    4397        48823 :       if (scalar_stmts.length () == 1
    4398        48823 :           && (code == PLUS_EXPR || code == MULT_EXPR || code == BIT_IOR_EXPR
    4399              :               || code == BIT_AND_EXPR || code == BIT_XOR_EXPR))
    4400              :         {
    4401        47093 :           gimple *stmt = scalar_stmts[0]->stmt;
    4402        47093 :           if (!is_gimple_assign (stmt)
    4403        46034 :               || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt))
    4404         4498 :               || TREE_CODE (gimple_assign_rhs1 (stmt)) != SSA_NAME
    4405        51591 :               || !tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
    4406         4498 :                                          TREE_TYPE (gimple_assign_rhs1 (stmt))))
    4407              :             {
    4408        45341 :               scalar_stmts.release ();
    4409        45341 :               return false;
    4410              :             }
    4411         1752 :           stmt = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (stmt));
    4412         1752 :           if (!is_gimple_assign (stmt)
    4413         1752 :               || gimple_assign_rhs_code (stmt) != (tree_code)code)
    4414              :             {
    4415         1733 :               scalar_stmts.release ();
    4416         1733 :               return false;
    4417              :             }
    4418           19 :           chain.truncate (0);
    4419           19 :           vect_slp_linearize_chain (vinfo, worklist, chain, (tree_code)code,
    4420              :                                     stmt, op_stmt, other_op_stmt, NULL);
    4421              : 
    4422           19 :           scalar_stmts.truncate (0);
    4423           19 :           stmt_vec_info tail = NULL;
    4424           93 :           for (auto el : chain)
    4425              :             {
    4426           44 :               if (el.dt == vect_external_def
    4427           44 :                   || el.dt == vect_constant_def
    4428           44 :                   || el.code != (tree_code) code)
    4429              :                 {
    4430            8 :                   scalar_stmts.release ();
    4431            8 :                   return false;
    4432              :                 }
    4433           36 :               stmt_vec_info stmt = vinfo->lookup_def (el.op);
    4434           36 :               if (STMT_VINFO_REDUC_IDX (stmt) != -1
    4435           36 :                   || STMT_VINFO_REDUC_DEF (stmt))
    4436              :                 {
    4437            0 :                   gcc_assert (tail == NULL);
    4438            0 :                   tail = stmt;
    4439            0 :                   continue;
    4440              :                 }
    4441           36 :               scalar_stmts.safe_push (stmt);
    4442              :             }
    4443              :           /* Unlike the above this does not include the reduction SSA
    4444              :              cycle.  */
    4445           11 :           gcc_assert (!tail);
    4446              :         }
    4447              : 
    4448         1741 :       if (scalar_stmts.length () < 2)
    4449              :         {
    4450         1622 :           scalar_stmts.release ();
    4451         1622 :           return false;
    4452              :         }
    4453              : 
    4454          119 :       if (dump_enabled_p ())
    4455              :         {
    4456           34 :           dump_printf_loc (MSG_NOTE, vect_location,
    4457              :                            "Starting SLP discovery of reduction chain for\n");
    4458          140 :           for (unsigned i = 0; i < scalar_stmts.length (); ++i)
    4459          212 :             dump_printf_loc (MSG_NOTE, vect_location,
    4460          106 :                              "  %G", scalar_stmts[i]->stmt);
    4461              :         }
    4462              : 
    4463          119 :       unsigned int group_size = scalar_stmts.length ();
    4464          119 :       bool *matches = XALLOCAVEC (bool, group_size);
    4465          119 :       poly_uint64 max_nunits = 1;
    4466          119 :       unsigned tree_size = 0;
    4467          119 :       slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
    4468              :                                            &max_nunits, matches, limit,
    4469          119 :                                            &tree_size, bst_map);
    4470          119 :       if (!node)
    4471              :         {
    4472           47 :           scalar_stmts.release ();
    4473           47 :           return false;
    4474              :         }
    4475              : 
    4476           72 :       unsigned cycle_id = vinfo->reduc_infos.length ();
    4477           72 :       vect_reduc_info reduc_info = new vect_reduc_info_s ();
    4478           72 :       vinfo->reduc_infos.safe_push (reduc_info);
    4479           72 :       VECT_REDUC_INFO_DEF_TYPE (reduc_info) = STMT_VINFO_DEF_TYPE (next_stmt);
    4480           72 :       VECT_REDUC_INFO_TYPE (reduc_info) = STMT_VINFO_REDUC_TYPE (next_stmt);
    4481           72 :       VECT_REDUC_INFO_CODE (reduc_info) = STMT_VINFO_REDUC_CODE (next_stmt);
    4482           72 :       VECT_REDUC_INFO_FN (reduc_info) = IFN_LAST;
    4483           72 :       reduc_info->is_reduc_chain = true;
    4484              : 
    4485              :       /* Build the node for the PHI and possibly the conversions.  */
    4486           72 :       slp_tree phis = vect_create_new_slp_node (2, ERROR_MARK);
    4487           72 :       SLP_TREE_REPRESENTATIVE (phis) = next_stmt;
    4488           72 :       phis->cycle_info.id = cycle_id;
    4489           72 :       SLP_TREE_LANES (phis) = group_size;
    4490           72 :       if (reduc_scalar_stmt == scalar_stmt)
    4491           68 :         SLP_TREE_VECTYPE (phis) = SLP_TREE_VECTYPE (node);
    4492              :       else
    4493            4 :         SLP_TREE_VECTYPE (phis)
    4494            4 :           = signed_or_unsigned_type_for (TYPE_UNSIGNED
    4495              :                                            (TREE_TYPE (gimple_get_lhs
    4496              :                                                          (scalar_stmt->stmt))),
    4497              :                                          SLP_TREE_VECTYPE (node));
    4498              :       /* ???  vect_cse_slp_nodes cannot cope with cycles without any
    4499              :          SLP_TREE_SCALAR_STMTS.  */
    4500           72 :       SLP_TREE_SCALAR_STMTS (phis).create (group_size);
    4501          375 :       for (unsigned i = 0; i < group_size; ++i)
    4502          303 :         SLP_TREE_SCALAR_STMTS (phis).quick_push (next_stmt);
    4503              : 
    4504           72 :       slp_tree op_input = phis;
    4505           72 :       if (reduc_scalar_stmt != scalar_stmt)
    4506              :         {
    4507            4 :           slp_tree conv = vect_create_new_slp_node (1, ERROR_MARK);
    4508            4 :           SLP_TREE_REPRESENTATIVE (conv)
    4509            4 :             = vinfo->lookup_def (gimple_arg (reduc_scalar_stmt->stmt,
    4510            4 :                                              STMT_VINFO_REDUC_IDX
    4511              :                                                (reduc_scalar_stmt)));
    4512            4 :           SLP_TREE_CHILDREN (conv).quick_push (phis);
    4513            4 :           conv->cycle_info.id = cycle_id;
    4514            4 :           SLP_TREE_REDUC_IDX (conv) = 0;
    4515            4 :           SLP_TREE_LANES (conv) = group_size;
    4516            4 :           SLP_TREE_VECTYPE (conv) = SLP_TREE_VECTYPE (node);
    4517            4 :           SLP_TREE_SCALAR_STMTS (conv) = vNULL;
    4518            4 :           op_input = conv;
    4519              :         }
    4520              : 
    4521           72 :       slp_tree reduc = vect_create_new_slp_node (2, ERROR_MARK);
    4522           72 :       SLP_TREE_REPRESENTATIVE (reduc) = reduc_scalar_stmt;
    4523           72 :       SLP_TREE_CHILDREN (reduc).quick_push (op_input);
    4524           72 :       SLP_TREE_CHILDREN (reduc).quick_push (node);
    4525           72 :       reduc->cycle_info.id = cycle_id;
    4526           72 :       SLP_TREE_REDUC_IDX (reduc) = 0;
    4527           72 :       SLP_TREE_LANES (reduc) = group_size;
    4528           72 :       SLP_TREE_VECTYPE (reduc) = SLP_TREE_VECTYPE (node);
    4529              :       /* ???  For the reduction epilogue we need a live lane.  */
    4530           72 :       SLP_TREE_SCALAR_STMTS (reduc).create (group_size);
    4531           72 :       SLP_TREE_SCALAR_STMTS (reduc).quick_push (reduc_scalar_stmt);
    4532          303 :       for (unsigned i = 1; i < group_size; ++i)
    4533          231 :         SLP_TREE_SCALAR_STMTS (reduc).quick_push (NULL);
    4534              : 
    4535           72 :       if (reduc_scalar_stmt != scalar_stmt)
    4536              :         {
    4537            4 :           slp_tree conv = vect_create_new_slp_node (1, ERROR_MARK);
    4538            4 :           SLP_TREE_REPRESENTATIVE (conv) = scalar_stmt;
    4539            4 :           SLP_TREE_CHILDREN (conv).quick_push (reduc);
    4540            4 :           conv->cycle_info.id = cycle_id;
    4541            4 :           SLP_TREE_REDUC_IDX (conv) = 0;
    4542            4 :           SLP_TREE_LANES (conv) = group_size;
    4543            4 :           SLP_TREE_VECTYPE (conv) = SLP_TREE_VECTYPE (phis);
    4544              :           /* ???  For the reduction epilogue we need a live lane.  */
    4545            4 :           SLP_TREE_SCALAR_STMTS (conv).create (group_size);
    4546            4 :           SLP_TREE_SCALAR_STMTS (conv).quick_push (scalar_stmt);
    4547            8 :           for (unsigned i = 1; i < group_size; ++i)
    4548            4 :             SLP_TREE_SCALAR_STMTS (conv).quick_push (NULL);
    4549            4 :           reduc = conv;
    4550              :         }
    4551              : 
    4552           72 :       edge le = loop_latch_edge (LOOP_VINFO_LOOP (vinfo));
    4553           72 :       SLP_TREE_CHILDREN (phis).quick_push (NULL);
    4554           72 :       SLP_TREE_CHILDREN (phis).quick_push (NULL);
    4555           72 :       SLP_TREE_CHILDREN (phis)[le->dest_idx] = reduc;
    4556           72 :       SLP_TREE_REF_COUNT (reduc)++;
    4557              : 
    4558              :       /* Create a new SLP instance.  */
    4559           72 :       slp_instance new_instance = XNEW (class _slp_instance);
    4560           72 :       SLP_INSTANCE_TREE (new_instance) = reduc;
    4561           72 :       SLP_INSTANCE_LOADS (new_instance) = vNULL;
    4562           72 :       SLP_INSTANCE_ROOT_STMTS (new_instance) = vNULL;
    4563           72 :       SLP_INSTANCE_REMAIN_DEFS (new_instance) = vNULL;
    4564           72 :       SLP_INSTANCE_KIND (new_instance) = slp_inst_kind_reduc_chain;
    4565           72 :       new_instance->reduc_phis = NULL;
    4566           72 :       new_instance->cost_vec = vNULL;
    4567           72 :       new_instance->subgraph_entries = vNULL;
    4568              : 
    4569           72 :       vinfo->slp_instances.safe_push (new_instance);
    4570              : 
    4571           72 :       if (dump_enabled_p ())
    4572              :         {
    4573           24 :           dump_printf_loc (MSG_NOTE, vect_location,
    4574              :                            "Final SLP tree for instance %p:\n",
    4575              :                            (void *) new_instance);
    4576           24 :           vect_print_slp_graph (MSG_NOTE, vect_location,
    4577              :                                 SLP_INSTANCE_TREE (new_instance));
    4578              :         }
    4579              : 
    4580           72 :       return true;
    4581        49134 :     }
    4582              : 
    4583        11671 :   if (scalar_stmts.length () <= 1)
    4584              :     {
    4585         9052 :       scalar_stmts.release ();
    4586         9052 :       return false;
    4587              :     }
    4588              : 
    4589         2619 :   scalar_stmts.reverse ();
    4590         2619 :   stmt_vec_info reduc_phi_info = next_stmt;
    4591              : 
    4592              :   /* Build the tree for the SLP instance.  */
    4593         2619 :   vec<stmt_vec_info> root_stmt_infos = vNULL;
    4594         2619 :   vec<tree> remain = vNULL;
    4595              : 
    4596         2619 :   if (dump_enabled_p ())
    4597              :     {
    4598          180 :       dump_printf_loc (MSG_NOTE, vect_location,
    4599              :                        "Starting SLP discovery of reduction chain for\n");
    4600          966 :       for (unsigned i = 0; i < scalar_stmts.length (); ++i)
    4601         1572 :         dump_printf_loc (MSG_NOTE, vect_location,
    4602          786 :                          "  %G", scalar_stmts[i]->stmt);
    4603              :     }
    4604              : 
    4605              :   /* Build the tree for the SLP instance.  */
    4606         2619 :   unsigned int group_size = scalar_stmts.length ();
    4607         2619 :   bool *matches = XALLOCAVEC (bool, group_size);
    4608         2619 :   poly_uint64 max_nunits = 1;
    4609         2619 :   unsigned tree_size = 0;
    4610              : 
    4611              :   /* ???  We need this only for SLP discovery.  */
    4612        10014 :   for (unsigned i = 0; i < scalar_stmts.length (); ++i)
    4613         7395 :     REDUC_GROUP_FIRST_ELEMENT (scalar_stmts[i]) = scalar_stmts[0];
    4614              : 
    4615         2619 :   slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
    4616              :                                        &max_nunits, matches, limit,
    4617         2619 :                                        &tree_size, bst_map);
    4618              : 
    4619        10014 :   for (unsigned i = 0; i < scalar_stmts.length (); ++i)
    4620         7395 :     REDUC_GROUP_FIRST_ELEMENT (scalar_stmts[i]) = NULL;
    4621              : 
    4622         2619 :   if (node != NULL)
    4623              :     {
    4624              :       /* Create a new SLP instance.  */
    4625         2286 :       slp_instance new_instance = XNEW (class _slp_instance);
    4626         2286 :       SLP_INSTANCE_TREE (new_instance) = node;
    4627         2286 :       SLP_INSTANCE_LOADS (new_instance) = vNULL;
    4628         2286 :       SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
    4629         2286 :       SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
    4630         2286 :       SLP_INSTANCE_KIND (new_instance) = slp_inst_kind_reduc_chain;
    4631         2286 :       new_instance->reduc_phis = NULL;
    4632         2286 :       new_instance->cost_vec = vNULL;
    4633         2286 :       new_instance->subgraph_entries = vNULL;
    4634              : 
    4635         2286 :       vect_reduc_info reduc_info = info_for_reduction (vinfo, node);
    4636         2286 :       reduc_info->is_reduc_chain = true;
    4637              : 
    4638         2286 :       if (dump_enabled_p ())
    4639          135 :         dump_printf_loc (MSG_NOTE, vect_location,
    4640              :                          "SLP size %u vs. limit %u.\n",
    4641              :                          tree_size, max_tree_size);
    4642              : 
    4643              :       /* Fixup SLP reduction chains.  If this is a reduction chain with
    4644              :          a conversion in front amend the SLP tree with a node for that.  */
    4645         2286 :       gimple *scalar_def = STMT_VINFO_REDUC_DEF (reduc_phi_info)->stmt;
    4646         2286 :       if (is_gimple_assign (scalar_def)
    4647         2286 :           && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (scalar_def)))
    4648              :         {
    4649           43 :           stmt_vec_info conv_info = vect_stmt_to_vectorize
    4650           43 :                                         (STMT_VINFO_REDUC_DEF (reduc_phi_info));
    4651           43 :           scalar_stmts = vNULL;
    4652           43 :           scalar_stmts.create (group_size);
    4653          135 :           for (unsigned i = 0; i < group_size; ++i)
    4654           92 :             scalar_stmts.quick_push (conv_info);
    4655           43 :           slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
    4656           43 :           SLP_TREE_VECTYPE (conv)
    4657           43 :             = get_vectype_for_scalar_type (vinfo,
    4658           43 :                                            TREE_TYPE
    4659              :                                              (gimple_assign_lhs (scalar_def)),
    4660              :                                            group_size);
    4661           43 :           SLP_TREE_REDUC_IDX (conv) = 0;
    4662           43 :           conv->cycle_info.id = node->cycle_info.id;
    4663           43 :           SLP_TREE_CHILDREN (conv).quick_push (node);
    4664           43 :           SLP_INSTANCE_TREE (new_instance) = conv;
    4665              :         }
    4666              :       /* Fill the backedge child of the PHI SLP node.  The
    4667              :          general matching code cannot find it because the
    4668              :          scalar code does not reflect how we vectorize the
    4669              :          reduction.  */
    4670         2286 :       use_operand_p use_p;
    4671         2286 :       imm_use_iterator imm_iter;
    4672         2286 :       class loop *loop = LOOP_VINFO_LOOP (vinfo);
    4673        11023 :       FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
    4674              :                              gimple_get_lhs (scalar_def))
    4675              :         /* There are exactly two non-debug uses, the reduction
    4676              :            PHI and the loop-closed PHI node.  */
    4677         6451 :         if (!is_gimple_debug (USE_STMT (use_p))
    4678         6451 :             && gimple_bb (USE_STMT (use_p)) == loop->header)
    4679              :           {
    4680         2286 :             auto_vec<stmt_vec_info, 64> phis (group_size);
    4681         2286 :             stmt_vec_info phi_info = vinfo->lookup_stmt (USE_STMT (use_p));
    4682         8842 :             for (unsigned i = 0; i < group_size; ++i)
    4683         6556 :               phis.quick_push (phi_info);
    4684         2286 :             slp_tree *phi_node = bst_map->get (phis);
    4685         2286 :             unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
    4686         4572 :             SLP_TREE_CHILDREN (*phi_node)[dest_idx]
    4687         2286 :               = SLP_INSTANCE_TREE (new_instance);
    4688         2286 :             SLP_INSTANCE_TREE (new_instance)->refcnt++;
    4689         2286 :           }
    4690              : 
    4691         2286 :       vinfo->slp_instances.safe_push (new_instance);
    4692              : 
    4693              :       /* ???  We've replaced the old SLP_INSTANCE_GROUP_SIZE with
    4694              :          the number of scalar stmts in the root in a few places.
    4695              :          Verify that assumption holds.  */
    4696         4572 :       gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
    4697              :                   .length () == group_size);
    4698              : 
    4699         2286 :       if (dump_enabled_p ())
    4700              :         {
    4701          135 :           dump_printf_loc (MSG_NOTE, vect_location,
    4702              :                            "Final SLP tree for instance %p:\n",
    4703              :                            (void *) new_instance);
    4704          135 :           vect_print_slp_graph (MSG_NOTE, vect_location,
    4705              :                                 SLP_INSTANCE_TREE (new_instance));
    4706              :         }
    4707              : 
    4708         2286 :       return true;
    4709              :     }
    4710              : 
    4711              :   /* Failed to SLP.  */
    4712          333 :   scalar_stmts.release ();
    4713          333 :   if (dump_enabled_p ())
    4714           45 :     dump_printf_loc (MSG_NOTE, vect_location,
    4715              :                      "SLP discovery of reduction chain failed\n");
    4716              :   return false;
    4717              : }
    4718              : 
    4719              : /* Run SLP discovery for the single reduction stmt SCALAR_STMT, first as a
    4720              :    reduction chain where permitted.  Return true if successful.  */
    4721              : 
    4722              : static bool
    4723        89261 : vect_analyze_slp_reduction (loop_vec_info vinfo,
    4724              :                             stmt_vec_info scalar_stmt,
    4725              :                             unsigned max_tree_size, unsigned *limit,
    4726              :                             scalar_stmts_to_slp_tree_map_t *bst_map,
    4727              :                             bool force_single_lane)
    4728              : {
    4729        89261 :   slp_instance_kind kind = slp_inst_kind_reduc_group;
    4730              : 
    4731              :   /* Try to gather a reduction chain first.  Only attempt that while there is
    4732              :      budget left, chain analysis may build multi-lane trees consuming *LIMIT.  */
    4733        89261 :   if (! force_single_lane
    4734        63634 :       && *limit != 0
    4735        63634 :       && STMT_VINFO_DEF_TYPE (scalar_stmt) == vect_reduction_def
    4736       152625 :       && vect_analyze_slp_reduc_chain (vinfo, bst_map, scalar_stmt,
    4737              :                                        max_tree_size, limit))
    4738              :     return true;
    4739              : 
    4740        86903 :   vec<stmt_vec_info> scalar_stmts;
    4741        86903 :   scalar_stmts.create (1);
    4742        86903 :   scalar_stmts.quick_push (scalar_stmt);
    4743              : 
    4744        86903 :   if (dump_enabled_p ())
    4745              :     {
    4746         3483 :       dump_printf_loc (MSG_NOTE, vect_location,
    4747              :                        "Starting SLP discovery for\n");
    4748         6966 :       for (unsigned i = 0; i < scalar_stmts.length (); ++i)
    4749         6966 :         dump_printf_loc (MSG_NOTE, vect_location,
    4750         3483 :                          "  %G", scalar_stmts[i]->stmt);
    4751              :     }
    4752              : 
    4753              :   /* Build a single-lane SLP tree with SCALAR_STMT as its only lane.  */
    4754        86903 :   unsigned int group_size = scalar_stmts.length ();
    4755        86903 :   bool *matches = XALLOCAVEC (bool, group_size);
    4756        86903 :   poly_uint64 max_nunits = 1;
    4757        86903 :   unsigned tree_size = 0;
    4758              : 
    4759        86903 :   slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
    4760              :                                        &max_nunits, matches, limit,
    4761              :                                        &tree_size, bst_map);
    4762        86903 :   if (node != NULL)
    4763              :     {
    4764              :       /* Wrap NODE in a new SLP instance without root stmts or remain defs.  */
    4765        83895 :       slp_instance new_instance = XNEW (class _slp_instance);
    4766        83895 :       SLP_INSTANCE_TREE (new_instance) = node;
    4767        83895 :       SLP_INSTANCE_LOADS (new_instance) = vNULL;
    4768        83895 :       SLP_INSTANCE_ROOT_STMTS (new_instance) = vNULL;
    4769        83895 :       SLP_INSTANCE_REMAIN_DEFS (new_instance) = vNULL;
    4770        83895 :       SLP_INSTANCE_KIND (new_instance) = kind;
    4771        83895 :       new_instance->reduc_phis = NULL;
    4772        83895 :       new_instance->cost_vec = vNULL;
    4773        83895 :       new_instance->subgraph_entries = vNULL;
    4774              : 
    4775        83895 :       if (dump_enabled_p ())
    4776         3363 :         dump_printf_loc (MSG_NOTE, vect_location,
    4777              :                          "SLP size %u vs. limit %u.\n",
    4778              :                          tree_size, max_tree_size);
    4779              : 
    4780        83895 :       vinfo->slp_instances.safe_push (new_instance);
    4781              : 
    4782              :       /* ???  We've replaced the old SLP_INSTANCE_GROUP_SIZE with
    4783              :          the number of scalar stmts in the root in a few places.
    4784              :          Verify that assumption holds.  */
    4785       167790 :       gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
    4786              :                   .length () == group_size);
    4787              : 
    4788        83895 :       if (dump_enabled_p ())
    4789              :         {
    4790         3363 :           dump_printf_loc (MSG_NOTE, vect_location,
    4791              :                            "Final SLP tree for instance %p:\n",
    4792              :                            (void *) new_instance);
    4793         3363 :           vect_print_slp_graph (MSG_NOTE, vect_location,
    4794              :                                 SLP_INSTANCE_TREE (new_instance));
    4795              :         }
    4796              : 
    4797        83895 :       return true;
    4798              :     }
    4799              :   /* Discovery failed, no SLP node was built.  */
    4800              : 
    4801              :   /* Release the scalar stmt vector created above.  */
    4802         3008 :   scalar_stmts.release ();
    4803              : 
    4804              :   /* Report the failure to the dump file and the caller.  */
    4805         3008 :   if (dump_enabled_p ())
    4806          120 :     dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
    4807              :   return false;
    4808              : }
    4809              : 
    4810              : /* Try to build one SLP reduction group from SCALAR_STMTS.  If successful add
    4811              :    a SLP instance for it and return true, otherwise return false and have
    4812              :    *MATCHES populated.  A NULL MATCHES is replaced by a scratch buffer.  */
    4813              : 
    4814              : static bool
    4815        26957 : vect_analyze_slp_reduction_group (loop_vec_info loop_vinfo,
    4816              :                                   vec<stmt_vec_info> scalar_stmts,
    4817              :                                   scalar_stmts_to_slp_tree_map_t *bst_map,
    4818              :                                   unsigned max_tree_size, unsigned *limit,
    4819              :                                   bool *matches)
    4820              : {
    4821              :   /* Run SLP discovery with one lane per stmt in SCALAR_STMTS.  */
    4822        26957 :   unsigned int group_size = scalar_stmts.length ();
    4823        26957 :   if (!matches)
    4824        11205 :     matches = XALLOCAVEC (bool, group_size);
    4825        26957 :   poly_uint64 max_nunits = 1;
    4826        26957 :   unsigned tree_size = 0;
    4827        26957 :   slp_tree node = vect_build_slp_tree (loop_vinfo, scalar_stmts,
    4828              :                                        group_size,
    4829              :                                        &max_nunits, matches, limit,
    4830              :                                        &tree_size, bst_map);
    4831        26957 :   if (!node)
    4832              :     return false;
    4833              : 
    4834              :   /* Wrap NODE in a new instance of kind slp_inst_kind_reduc_group.  */
    4835        12245 :   slp_instance new_instance = XNEW (class _slp_instance);
    4836        12245 :   SLP_INSTANCE_TREE (new_instance) = node;
    4837        12245 :   SLP_INSTANCE_LOADS (new_instance) = vNULL;
    4838        12245 :   SLP_INSTANCE_ROOT_STMTS (new_instance) = vNULL;
    4839        12245 :   SLP_INSTANCE_REMAIN_DEFS (new_instance) = vNULL;
    4840        12245 :   SLP_INSTANCE_KIND (new_instance) = slp_inst_kind_reduc_group;
    4841        12245 :   new_instance->reduc_phis = NULL;
    4842        12245 :   new_instance->cost_vec = vNULL;
    4843        12245 :   new_instance->subgraph_entries = vNULL;
    4844              : 
    4845        12245 :   if (dump_enabled_p ())
    4846          579 :     dump_printf_loc (MSG_NOTE, vect_location,
    4847              :                      "SLP size %u vs. limit %u.\n",
    4848              :                      tree_size, max_tree_size);
    4849              : 
    4850        12245 :   loop_vinfo->slp_instances.safe_push (new_instance);
    4851              : 
    4852              :   /* ???  We've replaced the old SLP_INSTANCE_GROUP_SIZE with
    4853              :      the number of scalar stmts in the root in a few places.
    4854              :      Verify that assumption holds.  */
    4855        24490 :   gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
    4856              :               .length () == group_size);
    4857              : 
    4858        12245 :   if (dump_enabled_p ())
    4859              :     {
    4860          579 :       dump_printf_loc (MSG_NOTE, vect_location,
    4861              :                        "SLP discovery of size %d reduction group "
    4862              :                        "succeeded\n", group_size);
    4863          579 :       dump_printf_loc (MSG_NOTE, vect_location,
    4864              :                        "Final SLP tree for instance %p:\n",
    4865              :                        (void *) new_instance);
    4866          579 :       vect_print_slp_graph (MSG_NOTE, vect_location,
    4867              :                             SLP_INSTANCE_TREE (new_instance));
    4868              :     }
    4869              : 
    4870              :   return true;
    4871              : }
    4872              : 
    4873              : /* Analyze the reductions recorded in LOOP_VINFO and populate SLP instances
    4874              :    accordingly.  Returns false if SLP discovery of a single reduction fails.  */
    4875              : 
    4876              : static bool
    4877       491685 : vect_analyze_slp_reductions (loop_vec_info loop_vinfo,
    4878              :                              unsigned max_tree_size, unsigned *limit,
    4879              :                              scalar_stmts_to_slp_tree_map_t *bst_map,
    4880              :                              bool force_single_lane)
    4881              : {
    4882       557348 :   if (loop_vinfo->reductions.is_empty ())
    4883              :     return true;
    4884              : 
    4885              :   /* Collect the reduction statements we can try to combine into
    4886              :      a multi-lane SLP reduction group.  */
    4887        73065 :   vec<stmt_vec_info> scalar_stmts;
    4888        73065 :   scalar_stmts.create (loop_vinfo->reductions.length ());
    4889       324286 :   for (auto next_info : loop_vinfo->reductions)
    4890              :     {
    4891       105091 :       next_info = vect_stmt_to_vectorize (next_info);
    4892       105091 :       if ((STMT_VINFO_RELEVANT_P (next_info)
    4893           14 :            || STMT_VINFO_LIVE_P (next_info))
    4894              :           /* ???  Make sure we didn't skip a conversion around a
    4895              :              reduction path.  In that case we'd have to reverse
    4896              :              engineer that conversion stmt following the chain using
    4897              :              reduc_idx and from the PHI using reduc_def.  */
    4898       105077 :           && (STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def
    4899       105077 :               || (STMT_VINFO_DEF_TYPE (next_info)
    4900              :                   == vect_double_reduction_def)))
    4901              :         {
    4902              :           /* Do not discover SLP reductions combining lane-reducing
    4903              :              ops, that will fail later.  */
    4904       105077 :           if (!force_single_lane
    4905       105077 :               && !lane_reducing_stmt_p (STMT_VINFO_STMT (next_info)))
    4906        78761 :             scalar_stmts.quick_push (next_info);
    4907              :           /* Lane-reducing ops and forced single-lane mode are discovered now.  */
    4908        26316 :           else if (! vect_analyze_slp_reduction (loop_vinfo, next_info,
    4909              :                                                  max_tree_size, limit,
    4910              :                                                  bst_map,
    4911              :                                                  force_single_lane))
    4912              :             {
    4913            0 :               scalar_stmts.release ();
    4914            0 :               return false;
    4915              :             }
    4916              :         }
    4917              :     }
    4918              : 
    4919        73065 :   if (scalar_stmts.length () > 1)
    4920              :     {
    4921              :       /* First try all collected stmts as one reduction group.  */
    4922         4572 :       unsigned int group_size = scalar_stmts.length ();
    4923         4572 :       bool *matches = XALLOCAVEC (bool, group_size);
    4924         4572 :       if (vect_analyze_slp_reduction_group (loop_vinfo, scalar_stmts, bst_map,
    4925              :                                             max_tree_size, limit, matches))
    4926         4419 :         return true;
    4927              : 
    4928              :       /* When analysis as a single SLP reduction group failed try to
    4929              :          form sub-groups by collecting matching lanes.  Do not recurse
    4930              :          that on failure (to limit compile-time costs), but recurse
    4931              :          for the initial non-matching parts.  Everything not covered
    4932              :          by a sub-group gets single-reduction treatment.  */
    4933         3496 :       vec<stmt_vec_info> cands = vNULL;
    4934        11358 :       while (matches[0])
    4935              :         {
    4936        11205 :           cands.truncate (0);
    4937        11205 :           cands.reserve (group_size, true);
    4938        88267 :           for (unsigned i = 0; i < group_size; ++i)
    4939        77062 :             if (matches[i])
    4940        19538 :               cands.quick_push (scalar_stmts[i]);
    4941              : 
    4942              :           /* Try the lanes flagged in matches[] as a sub-group of their own.  */
    4943        11205 :           if (vect_analyze_slp_reduction_group (loop_vinfo, cands, bst_map,
    4944              :                                                 max_tree_size, limit, NULL))
    4945         7851 :             cands = vNULL;
    4946              :           else
    4947              :             {
    4948              :               /* The sub-group failed, discover its members one by one.  */
    4949        20489 :               for (auto stmt_info : cands)
    4950        10452 :                 if (! vect_analyze_slp_reduction (loop_vinfo,
    4951              :                                                   vect_stmt_to_vectorize
    4952              :                                                     (stmt_info),
    4953              :                                                   max_tree_size, limit,
    4954              :                                                   bst_map, force_single_lane))
    4955              :                   {
    4956           25 :                     scalar_stmts.release ();
    4957           25 :                     cands.release ();
    4958           25 :                     return false;
    4959              :                   }
    4960              :             }
    4961              :           /* Remove the handled stmts from scalar_stmts and try again,
    4962              :              possibly repeating the above with updated matches[].  */
    4963              :           unsigned j = 0;
    4964        88172 :           for (unsigned i = 0; i < group_size; ++i)
    4965        76992 :             if (!matches[i])
    4966              :               {
    4967        57494 :                 scalar_stmts[j] = scalar_stmts[i];
    4968        57494 :                 ++j;
    4969              :               }
    4970        11180 :           scalar_stmts.truncate (j);
    4971        11180 :           group_size = scalar_stmts.length ();
    4972        11180 :           if (vect_analyze_slp_reduction_group (loop_vinfo, scalar_stmts,
    4973              :                                                 bst_map, max_tree_size, limit,
    4974              :                                                 matches))
    4975              :             return true;
    4976              :         }
    4977              :     }
    4978              :   /* Whatever is still in scalar_stmts gets single-lane discovery.  */
    4979       255448 :   for (auto stmt_info : scalar_stmts)
    4980        52493 :     if (! vect_analyze_slp_reduction (loop_vinfo,
    4981              :                                       vect_stmt_to_vectorize (stmt_info),
    4982              :                                       max_tree_size, limit,
    4983              :                                       bst_map, force_single_lane))
    4984              :       {
    4985         2983 :         scalar_stmts.release ();
    4986         2983 :         return false;
    4987              :       }
    4988              : 
    4989        65663 :   scalar_stmts.release ();
    4990        65663 :   return true;
    4991              : }
    4992              : 
    4993              : /* Analyze an SLP instance starting from a group of grouped stores.  Call
    4994              :    vect_build_slp_tree to build a tree of packed stmts if possible.
    4995              :    Return FALSE if it's impossible to SLP any stmt in the group.  */
    4996              : 
    4997              : static bool
    4998      1100259 : vect_analyze_slp_instance (vec_info *vinfo,
    4999              :                            scalar_stmts_to_slp_tree_map_t *bst_map,
    5000              :                            stmt_vec_info stmt_info,
    5001              :                            slp_instance_kind kind,
    5002              :                            unsigned max_tree_size, unsigned *limit,
    5003              :                            bool force_single_lane)
    5004              : {
    5005      1100259 :   vec<stmt_vec_info> scalar_stmts;
    5006              : 
    5007      1100259 :   if (is_a <bb_vec_info> (vinfo))
    5008      1070962 :     vect_location = stmt_info->stmt;
    5009              : 
    5010      1100259 :   gcc_assert (kind == slp_inst_kind_store);
    5011              : 
    5012              :   /* Collect the stores and store them in scalar_stmts.  */
    5013      1100259 :   scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
    5014      1100259 :   stmt_vec_info next_info = stmt_info;
    5015      5469793 :   while (next_info)
    5016              :     {
    5017      3269275 :       scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
    5018      3269275 :       next_info = DR_GROUP_NEXT_ELEMENT (next_info);
    5019              :     }
    5020              : 
    5021      1100259 :   vec<stmt_vec_info> root_stmt_infos = vNULL;
    5022      1100259 :   vec<tree> remain = vNULL;
    5023              : 
    5024              :   /* Build the tree for the SLP instance.  */
    5025              : 
    5026              :   /* If there's no budget left bail out early.  */
    5027      1100259 :   if (*limit == 0)
    5028              :     return false;
    5029              : 
    5030      1100236 :   if (dump_enabled_p ())
    5031              :     {
    5032         4131 :       dump_printf_loc (MSG_NOTE, vect_location,
    5033              :                        "Starting SLP discovery for\n");
    5034        23829 :       for (unsigned i = 0; i < scalar_stmts.length (); ++i)
    5035        39396 :         dump_printf_loc (MSG_NOTE, vect_location,
    5036        19698 :                          "  %G", scalar_stmts[i]->stmt);
    5037              :     }
    5038              : 
    5039              :   /* Build the tree for the SLP instance.  */
    5040      1100236 :   unsigned int group_size = scalar_stmts.length ();
    5041      1100236 :   bool *matches = XALLOCAVEC (bool, group_size);
    5042      1100236 :   poly_uint64 max_nunits = 1;
    5043      1100236 :   unsigned tree_size = 0;
    5044      1100236 :   unsigned i;
    5045              : 
    5046      1100236 :   slp_tree node = NULL;
    5047      1100236 :   if (group_size > 1 && force_single_lane)
    5048              :     {
    5049         1689 :       matches[0] = true;
    5050         1689 :       matches[1] = false;
    5051              :     }
    5052              :   else
    5053      1098547 :     node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
    5054              :                                 &max_nunits, matches, limit,
    5055              :                                 &tree_size, bst_map);
    5056      1100236 :   if (node != NULL)
    5057              :     {
    5058              :       /* Calculate the unrolling factor based on the smallest type.  */
    5059       684528 :       poly_uint64 unrolling_factor
    5060       684528 :         = calculate_unrolling_factor (max_nunits, group_size);
    5061              : 
    5062       684528 :       if (maybe_ne (unrolling_factor, 1U)
    5063       684528 :           && is_a <bb_vec_info> (vinfo))
    5064              :         {
    5065            0 :           unsigned HOST_WIDE_INT const_max_nunits;
    5066            0 :           if (!max_nunits.is_constant (&const_max_nunits)
    5067            0 :               || const_max_nunits > group_size)
    5068              :             {
    5069            0 :               if (dump_enabled_p ())
    5070            0 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    5071              :                                  "Build SLP failed: store group "
    5072              :                                  "size not a multiple of the vector size "
    5073              :                                  "in basic block SLP\n");
    5074            0 :               vect_free_slp_tree (node);
    5075            0 :               return false;
    5076              :             }
    5077              :           /* Fatal mismatch.  */
    5078            0 :           if (dump_enabled_p ())
    5079            0 :             dump_printf_loc (MSG_NOTE, vect_location,
    5080              :                              "SLP discovery succeeded but node needs "
    5081              :                              "splitting\n");
    5082            0 :           memset (matches, true, group_size);
    5083            0 :           matches[group_size / const_max_nunits * const_max_nunits] = false;
    5084            0 :           vect_free_slp_tree (node);
    5085              :         }
    5086              :       else
    5087              :         {
    5088              :           /* Create a new SLP instance.  */
    5089       684528 :           slp_instance new_instance = XNEW (class _slp_instance);
    5090       684528 :           SLP_INSTANCE_TREE (new_instance) = node;
    5091       684528 :           SLP_INSTANCE_LOADS (new_instance) = vNULL;
    5092       684528 :           SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
    5093       684528 :           SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
    5094       684528 :           SLP_INSTANCE_KIND (new_instance) = kind;
    5095       684528 :           new_instance->reduc_phis = NULL;
    5096       684528 :           new_instance->cost_vec = vNULL;
    5097       684528 :           new_instance->subgraph_entries = vNULL;
    5098              : 
    5099       684528 :           if (dump_enabled_p ())
    5100         3147 :             dump_printf_loc (MSG_NOTE, vect_location,
    5101              :                              "SLP size %u vs. limit %u.\n",
    5102              :                              tree_size, max_tree_size);
    5103              : 
    5104       684528 :           vinfo->slp_instances.safe_push (new_instance);
    5105              : 
    5106              :           /* ???  We've replaced the old SLP_INSTANCE_GROUP_SIZE with
    5107              :              the number of scalar stmts in the root in a few places.
    5108              :              Verify that assumption holds.  */
    5109      1369056 :           gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
    5110              :                         .length () == group_size);
    5111              : 
    5112       684528 :           if (dump_enabled_p ())
    5113              :             {
    5114         3147 :               dump_printf_loc (MSG_NOTE, vect_location,
    5115              :                                "Final SLP tree for instance %p:\n",
    5116              :                                (void *) new_instance);
    5117         3147 :               vect_print_slp_graph (MSG_NOTE, vect_location,
    5118              :                                     SLP_INSTANCE_TREE (new_instance));
    5119              :             }
    5120              : 
    5121       684528 :           return true;
    5122              :         }
    5123              :     }
    5124              :   /* Failed to SLP.  */
    5125              : 
    5126              :   /* Try to break the group up into pieces.  */
    5127       415708 :   if (*limit > 0 && kind == slp_inst_kind_store)
    5128              :     {
    5129              :       /* ???  We could delay all the actual splitting of store-groups
    5130              :          until after SLP discovery of the original group completed.
    5131              :          Then we can recurse to vect_build_slp_instance directly.  */
    5132      1086898 :       for (i = 0; i < group_size; i++)
    5133      1086898 :         if (!matches[i])
    5134              :           break;
    5135              : 
    5136              :       /* For basic block SLP, try to break the group up into multiples of
    5137              :          a vector size.  */
    5138       415707 :       if (is_a <bb_vec_info> (vinfo)
    5139       415707 :           && (i > 1 && i < group_size))
    5140              :         {
    5141              :           /* Free the allocated memory.  */
    5142       155113 :           scalar_stmts.release ();
    5143              : 
    5144       155113 :           tree scalar_type
    5145       155113 :             = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
    5146       310226 :           tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
    5147       155113 :                                                       1 << floor_log2 (i));
    5148       155113 :           unsigned HOST_WIDE_INT const_nunits;
    5149       155113 :           if (vectype
    5150       155113 :               && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
    5151              :             {
    5152              :               /* Split into two groups at the first vector boundary.  */
    5153       155113 :               gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
    5154       155113 :               unsigned group1_size = i & ~(const_nunits - 1);
    5155              : 
    5156       155113 :               if (dump_enabled_p ())
    5157           59 :                 dump_printf_loc (MSG_NOTE, vect_location,
    5158              :                                  "Splitting SLP group at stmt %u\n", i);
    5159       155113 :               stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
    5160              :                                                                group1_size);
    5161       155113 :               bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
    5162              :                                                     kind, max_tree_size,
    5163              :                                                     limit, false);
    5164              :               /* Split the rest at the failure point and possibly
    5165              :                  re-analyze the remaining matching part if it has
    5166              :                  at least two lanes.  */
    5167       155113 :               if (group1_size < i
    5168         5363 :                   && (i + 1 < group_size
    5169         2937 :                       || i - group1_size > 1))
    5170              :                 {
    5171         2458 :                   stmt_vec_info rest2 = rest;
    5172         2458 :                   rest = vect_split_slp_store_group (rest, i - group1_size);
    5173         2458 :                   if (i - group1_size > 1)
    5174           61 :                     res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
    5175              :                                                       kind, max_tree_size,
    5176              :                                                       limit, false);
    5177              :                 }
    5178              :               /* Re-analyze the non-matching tail if it has at least
    5179              :                  two lanes.  */
    5180       155113 :               if (i + 1 < group_size)
    5181        22086 :                 res |= vect_analyze_slp_instance (vinfo, bst_map,
    5182              :                                                   rest, kind, max_tree_size,
    5183              :                                                   limit, false);
    5184       155113 :               return res;
    5185              :             }
    5186              :         }
    5187              : 
    5188              :       /* For loop vectorization split the RHS into arbitrary pieces of
    5189              :          size >= 1.  */
    5190       260594 :       else if (is_a <loop_vec_info> (vinfo)
    5191       260594 :                && (group_size != 1 && i < group_size))
    5192              :         {
    5193         8175 :           gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
    5194           28 :           bool masked_p = call
    5195           28 :               && gimple_call_internal_p (call)
    5196           28 :               && internal_fn_mask_index (gimple_call_internal_fn (call)) != -1;
    5197              :           /* There are targets that cannot do even/odd interleaving schemes
    5198              :              so they absolutely need to use load/store-lanes.  For now
    5199              :              force single-lane SLP for them - they would be happy with
    5200              :              uniform power-of-two lanes (but depending on element size),
    5201              :              but even if we can use 'i' as indicator we would need to
    5202              :              backtrack when later lanes fail to discover with the same
    5203              :              granularity.  We cannot turn any of strided or scatter store
    5204              :              into store-lanes.  */
    5205              :           /* ???  If this is not in sync with what get_load_store_type
    5206              :              later decides the SLP representation is not good for other
    5207              :              store vectorization methods.  */
    5208         8175 :           bool want_store_lanes
    5209         8175 :             = (! STMT_VINFO_GATHER_SCATTER_P (stmt_info)
    5210         8175 :                && ! STMT_VINFO_STRIDED_P (stmt_info)
    5211         6100 :                && ! STMT_VINFO_SLP_VECT_ONLY (stmt_info)
    5212         6096 :                && compare_step_with_zero (vinfo, stmt_info) > 0
    5213        14191 :                && vect_slp_prefer_store_lanes_p (vinfo, stmt_info, NULL_TREE,
    5214        16350 :                                                  masked_p, group_size, i));
    5215         8175 :           if (want_store_lanes || force_single_lane)
    5216              :             i = 1;
    5217              : 
    5218              :           /* A fatal discovery fail doesn't always mean single-lane SLP
    5219              :              isn't a possibility, so try.  */
    5220         6486 :           if (i == 0)
    5221              :             i = 1;
    5222              : 
    5223         8175 :           if (dump_enabled_p ())
    5224          883 :             dump_printf_loc (MSG_NOTE, vect_location,
    5225              :                              "Splitting SLP group at stmt %u\n", i);
    5226              : 
    5227              :           /* Analyze the stored values and pinch them together with
    5228              :              a permute node so we can preserve the whole store group.  */
    5229         8175 :           auto_vec<slp_tree> rhs_nodes;
    5230         8175 :           poly_uint64 max_nunits = 1;
    5231              : 
    5232         8175 :           unsigned int rhs_common_nlanes = 0;
    5233         8175 :           unsigned int start = 0, end = i;
    5234        36622 :           while (start < group_size)
    5235              :             {
    5236        28708 :               gcc_assert (end - start >= 1);
    5237        28708 :               vec<stmt_vec_info> substmts;
    5238        28708 :               substmts.create (end - start);
    5239        89549 :               for (unsigned j = start; j < end; ++j)
    5240        60841 :                 substmts.quick_push (scalar_stmts[j]);
    5241        28708 :               max_nunits = 1;
    5242        28708 :               node = vect_build_slp_tree (vinfo, substmts, end - start,
    5243              :                                           &max_nunits,
    5244              :                                           matches, limit, &tree_size, bst_map);
    5245        28708 :               if (node)
    5246              :                 {
    5247        22920 :                   rhs_nodes.safe_push (node);
    5248        22920 :                   vect_update_max_nunits (&max_nunits, node->max_nunits);
    5249        22920 :                   if (start == 0)
    5250         7920 :                     rhs_common_nlanes = SLP_TREE_LANES (node);
    5251        15000 :                   else if (rhs_common_nlanes != SLP_TREE_LANES (node))
    5252         1375 :                     rhs_common_nlanes = 0;
    5253        22920 :                   start = end;
    5254        22920 :                   if (want_store_lanes || force_single_lane)
    5255         5084 :                     end = start + 1;
    5256              :                   else
    5257              :                     end = group_size;
    5258              :                 }
    5259              :               else
    5260              :                 {
    5261         5788 :                   substmts.release ();
    5262         5788 :                   if (end - start == 1)
    5263              :                     {
    5264              :                       /* Single-lane discovery failed.  Free resources.  */
    5265          281 :                       for (auto node : rhs_nodes)
    5266            8 :                         vect_free_slp_tree (node);
    5267          261 :                       scalar_stmts.release ();
    5268          261 :                       if (dump_enabled_p ())
    5269           39 :                         dump_printf_loc (MSG_NOTE, vect_location,
    5270              :                                          "SLP discovery failed\n");
    5271          261 :                       return false;
    5272              :                     }
    5273              : 
    5274              :                   /* ???  It really happens that we soft-fail SLP
    5275              :                      build at a mismatch but the matching part hard-fails
    5276              :                      later.  As we know we arrived here with a group
    5277              :                      larger than one try a group of size one!  */
    5278         5527 :                   if (!matches[0])
    5279           44 :                     end = start + 1;
    5280              :                   else
    5281        12065 :                     for (unsigned j = start; j < end; j++)
    5282        12065 :                       if (!matches[j - start])
    5283              :                         {
    5284              :                           end = j;
    5285              :                           break;
    5286              :                         }
    5287              :                 }
    5288              :             }
    5289              : 
    5290              :           /* Now re-assess whether we want store lanes in case the
    5291              :              discovery ended up producing all single-lane RHSs.  */
    5292         7914 :           if (! want_store_lanes
    5293         7914 :               && rhs_common_nlanes == 1
    5294         6857 :               && ! STMT_VINFO_GATHER_SCATTER_P (stmt_info)
    5295         6857 :               && ! STMT_VINFO_STRIDED_P (stmt_info)
    5296         5156 :               && ! STMT_VINFO_SLP_VECT_ONLY (stmt_info)
    5297         5153 :               && compare_step_with_zero (vinfo, stmt_info) > 0
    5298        13012 :               && (vect_store_lanes_supported (SLP_TREE_VECTYPE (rhs_nodes[0]),
    5299              :                                               group_size, masked_p)
    5300              :                   != IFN_LAST))
    5301              :             want_store_lanes = true;
    5302              : 
    5303              :           /* Now we assume we can build the root SLP node from all stores.  */
    5304         7914 :           if (want_store_lanes)
    5305              :             {
    5306              :               /* For store-lanes feed the store node with all RHS nodes
    5307              :                  in order.  */
    5308            0 :               node = vect_create_new_slp_node (scalar_stmts,
    5309            0 :                                                SLP_TREE_CHILDREN
    5310              :                                                  (rhs_nodes[0]).length ());
    5311            0 :               SLP_TREE_VECTYPE (node) = SLP_TREE_VECTYPE (rhs_nodes[0]);
    5312            0 :               node->max_nunits = max_nunits;
    5313            0 :               node->ldst_lanes = true;
    5314            0 :               SLP_TREE_CHILDREN (node)
    5315            0 :                 .reserve_exact (SLP_TREE_CHILDREN (rhs_nodes[0]).length ()
    5316            0 :                                 + rhs_nodes.length () - 1);
    5317              :               /* First store value and possibly mask.  */
    5318            0 :               SLP_TREE_CHILDREN (node)
    5319            0 :                 .splice (SLP_TREE_CHILDREN (rhs_nodes[0]));
    5320              :               /* Rest of the store values.  All mask nodes are the same,
    5321              :                  this should be guaranteed by dataref group discovery.  */
    5322            0 :               for (unsigned j = 1; j < rhs_nodes.length (); ++j)
    5323            0 :                 SLP_TREE_CHILDREN (node)
    5324            0 :                   .quick_push (SLP_TREE_CHILDREN (rhs_nodes[j])[0]);
    5325            0 :               for (slp_tree child : SLP_TREE_CHILDREN (node))
    5326            0 :                 child->refcnt++;
    5327              :             }
    5328              :           else
    5329         7914 :             node = vect_build_slp_store_interleaving (rhs_nodes, scalar_stmts,
    5330              :                                                       max_nunits);
    5331              : 
    5332        30826 :           while (!rhs_nodes.is_empty ())
    5333        22912 :             vect_free_slp_tree (rhs_nodes.pop ());
    5334              : 
    5335              :           /* Create a new SLP instance.  */
    5336         7914 :           slp_instance new_instance = XNEW (class _slp_instance);
    5337         7914 :           SLP_INSTANCE_TREE (new_instance) = node;
    5338         7914 :           SLP_INSTANCE_LOADS (new_instance) = vNULL;
    5339         7914 :           SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
    5340         7914 :           SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
    5341         7914 :           SLP_INSTANCE_KIND (new_instance) = kind;
    5342         7914 :           new_instance->reduc_phis = NULL;
    5343         7914 :           new_instance->cost_vec = vNULL;
    5344         7914 :           new_instance->subgraph_entries = vNULL;
    5345              : 
    5346         7914 :           if (dump_enabled_p ())
    5347          844 :             dump_printf_loc (MSG_NOTE, vect_location,
    5348              :                              "SLP size %u vs. limit %u.\n",
    5349              :                              tree_size, max_tree_size);
    5350              : 
    5351         7914 :           vinfo->slp_instances.safe_push (new_instance);
    5352              : 
    5353              :           /* ???  We've replaced the old SLP_INSTANCE_GROUP_SIZE with
    5354              :              the number of scalar stmts in the root in a few places.
    5355              :              Verify that assumption holds.  */
    5356        15828 :           gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
    5357              :                         .length () == group_size);
    5358              : 
    5359         7914 :           if (dump_enabled_p ())
    5360              :             {
    5361          844 :               dump_printf_loc (MSG_NOTE, vect_location,
    5362              :                                "Final SLP tree for instance %p:\n",
    5363              :                                (void *) new_instance);
    5364          844 :               vect_print_slp_graph (MSG_NOTE, vect_location,
    5365              :                                     SLP_INSTANCE_TREE (new_instance));
    5366              :             }
    5367         7914 :           return true;
    5368         8175 :         }
    5369              :       else
    5370              :         /* Free the allocated memory.  */
    5371       252419 :         scalar_stmts.release ();
    5372              : 
    5373              :       /* Even though the first vector did not all match, we might be able to SLP
    5374              :          (some) of the remainder.  FORNOW ignore this possibility.  */
    5375              :     }
    5376              :   else
    5377              :     /* Free the allocated memory.  */
    5378            1 :     scalar_stmts.release ();
    5379              : 
    5380              :   /* Failed to SLP.  */
    5381       252420 :   if (dump_enabled_p ())
    5382           42 :     dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
    5383              :   return false;
    5384              : }
    5385              : 
    5386              : /* qsort comparator ordering SLP load nodes A_ and B_ (slp_tree pointers).  */
    5387              : 
    5388              : static int
    5389      2642523 : vllp_cmp (const void *a_, const void *b_)
    5390              : {
    5391      2642523 :   const slp_tree a = *(const slp_tree *)a_;
    5392      2642523 :   const slp_tree b = *(const slp_tree *)b_;
    5393      2642523 :   stmt_vec_info a0 = SLP_TREE_SCALAR_STMTS (a)[0];
    5394      2642523 :   stmt_vec_info b0 = SLP_TREE_SCALAR_STMTS (b)[0];
    5395      2642523 :   if (STMT_VINFO_GROUPED_ACCESS (a0)
    5396      1537931 :       && STMT_VINFO_GROUPED_ACCESS (b0)
    5397      4118996 :       && DR_GROUP_FIRST_ELEMENT (a0) == DR_GROUP_FIRST_ELEMENT (b0))
    5398              :     {
    5399              :       /* Same group, order by number of lanes used, most lanes first.  */
    5400       343995 :       if (SLP_TREE_LANES (a) < SLP_TREE_LANES (b))
    5401              :         return 1;
    5402       335212 :       else if (SLP_TREE_LANES (a) > SLP_TREE_LANES (b))
    5403              :         return -1;
    5404              :       else
    5405              :         {
    5406              :           /* Try to order loads using the same lanes together, unpermuted
    5407              :              loads first, breaking the tie with the lane that first differs.  */
    5408       325678 :           if (!SLP_TREE_LOAD_PERMUTATION (a).exists ()
    5409       325678 :               && !SLP_TREE_LOAD_PERMUTATION (b).exists ())
    5410              :             return 0;
    5411       325678 :           else if (SLP_TREE_LOAD_PERMUTATION (a).exists ()
    5412       325678 :                    && !SLP_TREE_LOAD_PERMUTATION (b).exists ())
    5413              :             return 1;
    5414       321635 :           else if (!SLP_TREE_LOAD_PERMUTATION (a).exists ()
    5415       321635 :                    && SLP_TREE_LOAD_PERMUTATION (b).exists ())
    5416              :             return -1;
    5417              :           else
    5418              :             {
    5419       314249 :               for (unsigned i = 0; i < SLP_TREE_LANES (a); ++i)
    5420       314249 :                 if (SLP_TREE_LOAD_PERMUTATION (a)[i]
    5421       314249 :                     != SLP_TREE_LOAD_PERMUTATION (b)[i])
    5422              :                   {
    5423              :                     /* An in-order lane sorts first, matching the above case
    5424              :                        for no permutation, else the smaller lane number does.  */
    5425       312937 :                     if (SLP_TREE_LOAD_PERMUTATION (a)[i] == i)
    5426              :                       return -1;
    5427       191758 :                     else if (SLP_TREE_LOAD_PERMUTATION (b)[i] == i)
    5428              :                       return 1;
    5429       100787 :                     else if (SLP_TREE_LOAD_PERMUTATION (a)[i]
    5430       100787 :                              < SLP_TREE_LOAD_PERMUTATION (b)[i])
    5431              :                       return -1;
    5432              :                     else
    5433              :                       return 1;
    5434              :                   }
    5435              :               return 0;
    5436              :             }
    5437              :         }
    5438              :     }
    5439              :   else /* Different groups or non-groups.  */
    5440              :     {
    5441              :       /* Order groups by their first element to keep them together.  */
    5442      2298528 :       if (STMT_VINFO_GROUPED_ACCESS (a0))
    5443      2298528 :         a0 = DR_GROUP_FIRST_ELEMENT (a0);
    5444      2298528 :       if (STMT_VINFO_GROUPED_ACCESS (b0))
    5445      2298528 :         b0 = DR_GROUP_FIRST_ELEMENT (b0);
    5446      2298528 :       if (a0 == b0)
    5447              :         return 0;
    5448              :       /* Break the tie using the statement UID.  */
    5449      2298408 :       else if (gimple_uid (STMT_VINFO_STMT (a0))
    5450      2298408 :                < gimple_uid (STMT_VINFO_STMT (b0)))
    5451              :         return -1;
    5452              :       else
    5453              :         {
    5454      1020907 :           gcc_assert (gimple_uid (STMT_VINFO_STMT (a0))
    5455              :                       != gimple_uid (STMT_VINFO_STMT (b0)));
    5456              :           return 1;
    5457              :         }
    5458              :     }
    5459              : }
    5460              : 
    5461              : /* Return whether the load permutation of NODE is consecutive starting
    5462              :    with value START_VAL in the first element.  If START_VAL is UINT_MAX
    5463              :    the first element's value is used.  False if the permutation is empty.  */
    5464              : 
    5465              : bool
    5466       623055 : vect_load_perm_consecutive_p (slp_tree node, unsigned start_val)
    5467              : {
    5468       623055 :   load_permutation_t perm = SLP_TREE_LOAD_PERMUTATION (node);
    5469              : 
    5470       623055 :   if (!perm.exists () || !perm.length ())
    5471              :     return false;
    5472              : 
    5473       623055 :   if (start_val == UINT_MAX)
    5474        79184 :     start_val = perm[0];
    5475              : 
    5476      1230186 :   for (unsigned int i = 0; i < perm.length (); i++)
    5477       630332 :     if (perm[i] != start_val + (unsigned int) i)
    5478              :       return false;
    5479              : 
    5480              :   return true;
    5481              : }
    5482              : 
    5483              : /* Process the set of LOADS that are all from the same dataref group.  */
    5484              : 
    5485              : static void
    5486       161048 : vect_lower_load_permutations (loop_vec_info loop_vinfo,
    5487              :                               scalar_stmts_to_slp_tree_map_t *bst_map,
    5488              :                               const array_slice<slp_tree> &loads,
    5489              :                               bool force_single_lane)
    5490              : {
    5491              :   /* We at this point want to lower without a fixed VF or vector
    5492              :      size in mind which means we cannot actually compute whether we
    5493              :      need three or more vectors for a load permutation yet.  So always
    5494              :      lower.  */
    5495       161048 :   stmt_vec_info first
    5496       161048 :     = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (loads[0])[0]);
    5497       161048 :   unsigned group_lanes = DR_GROUP_SIZE (first);
    5498              : 
    5499              :   /* Verify if all load permutations can be implemented with a suitably
    5500              :      large element load-lanes operation.  */
    5501       161048 :   unsigned ld_lanes_lanes = SLP_TREE_LANES (loads[0]);
    5502       161048 :   if (STMT_VINFO_STRIDED_P (first)
    5503       158755 :       || compare_step_with_zero (loop_vinfo, first) <= 0
    5504       156097 :       || exact_log2 (ld_lanes_lanes) == -1
    5505              :       /* ???  For now only support the single-lane case as there is
    5506              :          missing support on the store-lane side and code generation
    5507              :          isn't up to the task yet.  */
    5508       153312 :       || ld_lanes_lanes != 1
    5509       303410 :       || vect_load_lanes_supported (SLP_TREE_VECTYPE (loads[0]),
    5510              :                                     group_lanes / ld_lanes_lanes,
    5511              :                                     false) == IFN_LAST)
    5512              :     ld_lanes_lanes = 0;
    5513              :   else
    5514              :     /* Verify the loads access the same number of lanes aligned to
    5515              :        ld_lanes_lanes.  */
    5516            0 :     for (slp_tree load : loads)
    5517              :       {
    5518            0 :         if (SLP_TREE_LANES (load) != ld_lanes_lanes)
    5519              :           {
    5520              :             ld_lanes_lanes = 0;
    5521              :             break;
    5522              :           }
    5523            0 :         unsigned first = SLP_TREE_LOAD_PERMUTATION (load)[0];
    5524            0 :         if (first % ld_lanes_lanes != 0)
    5525              :           {
    5526              :             ld_lanes_lanes = 0;
    5527              :             break;
    5528              :           }
    5529            0 :         if (!vect_load_perm_consecutive_p (load))
    5530              :           {
    5531              :             ld_lanes_lanes = 0;
    5532              :             break;
    5533              :           }
    5534              :       }
    5535              : 
    5536              :   /* Only a power-of-two number of lanes matches interleaving with N levels.
    5537              :      ???  An even number of lanes could be reduced to 1<<ceil_log2(N)-1 lanes
    5538              :      at each step.  */
    5539       261768 :   if (ld_lanes_lanes == 0 && exact_log2 (group_lanes) == -1 && group_lanes != 3)
    5540              :     return;
    5541              : 
    5542       264142 :   for (slp_tree load : loads)
    5543              :     {
    5544              :       /* Leave masked or gather loads alone for now.  */
    5545       186462 :       if (!SLP_TREE_CHILDREN (load).is_empty ())
    5546        60028 :         continue;
    5547              : 
    5548              :       /* For single-element interleaving spanning multiple vectors avoid
    5549              :          lowering, we want to use VMAT_ELEMENTWISE later.  */
    5550       186456 :       if (ld_lanes_lanes == 0
    5551       186456 :           && SLP_TREE_LANES (load) == 1
    5552       167228 :           && !DR_GROUP_NEXT_ELEMENT (first)
    5553       265901 :           && maybe_gt (group_lanes,
    5554              :                        TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (load))))
    5555        51246 :         return;
    5556              : 
    5557              :       /* We want to pattern-match special cases here and keep those
    5558              :          alone.  Candidates are splats and load-lane.  */
    5559              : 
    5560              :       /* We need to lower only loads of less than half of the groups
    5561              :          lanes, including duplicate lanes.  Note this leaves nodes
    5562              :          with a non-1:1 load permutation around instead of canonicalizing
    5563              :          those into a load and a permute node.  Removing this early
    5564              :          check would do such canonicalization.  */
    5565       135210 :       if (SLP_TREE_LANES (load) >= (group_lanes + 1) / 2
    5566        56454 :           && ld_lanes_lanes == 0)
    5567        56454 :         continue;
    5568              : 
    5569              :       /* Build the permute to get the original load permutation order.  */
    5570        78756 :       bool contiguous = vect_load_perm_consecutive_p (load);
    5571        78756 :       lane_permutation_t final_perm;
    5572        78756 :       final_perm.create (SLP_TREE_LANES (load));
    5573       158426 :       for (unsigned i = 0; i < SLP_TREE_LANES (load); ++i)
    5574       159340 :         final_perm.quick_push (
    5575        79670 :           std::make_pair (0, SLP_TREE_LOAD_PERMUTATION (load)[i]));
    5576              : 
    5577              :       /* When the load permutation accesses a contiguous unpermuted,
    5578              :          power-of-two aligned and sized chunk leave the load alone.
    5579              :          We can likely (re-)load it more efficiently rather than
    5580              :          extracting it from the larger load.
    5581              :          ???  Long-term some of the lowering should move to where
    5582              :          the vector types involved are fixed.  */
    5583        82324 :       if (!force_single_lane
    5584        78756 :           && ld_lanes_lanes == 0
    5585        53099 :           && contiguous
    5586        52856 :           && (SLP_TREE_LANES (load) > 1 || loads.size () == 1)
    5587         6563 :           && pow2p_hwi (SLP_TREE_LANES (load))
    5588         6527 :           && pow2p_hwi (group_lanes)
    5589         3568 :           && SLP_TREE_LOAD_PERMUTATION (load)[0] % SLP_TREE_LANES (load) == 0
    5590        82324 :           && group_lanes % SLP_TREE_LANES (load) == 0)
    5591              :         {
    5592         3568 :           final_perm.release ();
    5593         3568 :           continue;
    5594              :         }
    5595              : 
    5596              :       /* First build (and possibly re-use) a load node for the
    5597              :          unpermuted group.  Gaps in the middle and on the end are
    5598              :          represented with NULL stmts.  */
    5599        75188 :       vec<stmt_vec_info> stmts;
    5600        75188 :       stmts.create (group_lanes);
    5601       267221 :       for (stmt_vec_info s = first; s; s = DR_GROUP_NEXT_ELEMENT (s))
    5602              :         {
    5603       192033 :           if (s != first)
    5604       121598 :             for (unsigned i = 1; i < DR_GROUP_GAP (s); ++i)
    5605         4753 :               stmts.quick_push (NULL);
    5606       192033 :           stmts.quick_push (s);
    5607              :         }
    5608       137098 :       for (unsigned i = 0; i < DR_GROUP_GAP (first); ++i)
    5609        61910 :         stmts.quick_push (NULL);
    5610        75188 :       poly_uint64 max_nunits = 1;
    5611        75188 :       bool *matches = XALLOCAVEC (bool, group_lanes);
    5612        75188 :       unsigned limit = 1;
    5613        75188 :       unsigned tree_size = 0;
    5614        75188 :       slp_tree l0 = vect_build_slp_tree (loop_vinfo, stmts,
    5615              :                                          group_lanes,
    5616              :                                          &max_nunits, matches, &limit,
    5617        75188 :                                          &tree_size, bst_map);
    5618        75188 :       gcc_assert (!SLP_TREE_LOAD_PERMUTATION (l0).exists ());
    5619              : 
    5620        75188 :       if (ld_lanes_lanes != 0)
    5621              :         {
    5622              :           /* ???  If this is not in sync with what get_load_store_type
    5623              :              later decides the SLP representation is not good for other
    5624              :              store vectorization methods.  */
    5625            0 :           l0->ldst_lanes = true;
    5626            0 :           load->ldst_lanes = true;
    5627              :         }
    5628              : 
    5629       233230 :       while (1)
    5630              :         {
    5631       154209 :           unsigned group_lanes = SLP_TREE_LANES (l0);
    5632       154209 :           if (ld_lanes_lanes != 0
    5633       154209 :               || SLP_TREE_LANES (load) >= (group_lanes + 1) / 2)
    5634              :             break;
    5635              : 
    5636              :           /* Try to lower by reducing the group to half its size using an
    5637              :              interleaving scheme.  For this try to compute whether all
    5638              :              elements needed for this load are in even or odd elements of
    5639              :              an even/odd decomposition with N consecutive elements.
    5640              :              Thus { e, e, o, o, e, e, o, o } would be an even/odd decomposition
    5641              :              with N == 2.  */
    5642              :           /* ???  Only an even number of lanes can be handled this way, but the
    5643              :              fallback below could work for any number.  We have to make sure
    5644              :              to round up in that case.  */
    5645        79021 :           gcc_assert ((group_lanes & 1) == 0 || group_lanes == 3);
    5646        11009 :           unsigned even = 0, odd = 0;
    5647        11009 :           if ((group_lanes & 1) == 0)
    5648              :             {
    5649        11009 :               even = (1 << ceil_log2 (group_lanes)) - 1;
    5650        11009 :               odd = even;
    5651        44713 :               for (auto l : final_perm)
    5652              :                 {
    5653        11686 :                   even &= ~l.second;
    5654        11686 :                   odd &= l.second;
    5655              :                 }
    5656              :             }
    5657              : 
    5658              :           /* Now build an even or odd extraction from the unpermuted load.  */
    5659        79021 :           lane_permutation_t perm;
    5660        79021 :           perm.create ((group_lanes + 1) / 2);
    5661        79021 :           unsigned even_level = even ? 1 << ctz_hwi (even) : 0;
    5662        79021 :           unsigned odd_level = odd ? 1 << ctz_hwi (odd) : 0;
    5663        79021 :           if (even_level
    5664        10092 :               && group_lanes % (2 * even_level) == 0
    5665              :               /* ???  When code generating permutes we do not try to pun
    5666              :                  to larger component modes so level != 1 isn't a natural
    5667              :                  even/odd extract.  Prefer one if possible.  */
    5668        10092 :               && (even_level == 1 || !odd_level || odd_level != 1))
    5669              :             {
    5670              :               /* { 0, 1, ... 4, 5 ..., } */
    5671        36375 :               for (unsigned i = 0; i < group_lanes / 2 / even_level; ++i)
    5672        57438 :                 for (unsigned j = 0; j < even_level; ++j)
    5673        28892 :                   perm.quick_push (std::make_pair (0, 2 * i * even_level + j));
    5674              :             }
    5675        68929 :           else if (odd_level)
    5676              :             {
    5677              :               /* { ..., 2, 3, ... 6, 7 } */
    5678         3150 :               gcc_assert (group_lanes % (2 * odd_level) == 0);
    5679        13714 :               for (unsigned i = 0; i < group_lanes / 2 / odd_level; ++i)
    5680        21182 :                 for (unsigned j = 0; j < odd_level; ++j)
    5681        10618 :                   perm.quick_push
    5682        10618 :                     (std::make_pair (0, (2 * i + 1) * odd_level + j));
    5683              :             }
    5684              :           else
    5685              :             {
    5686              :               /* As fallback extract all used lanes and fill to half the
    5687              :                  group size by repeating the last element.
    5688              :                  ???  This is quite a bad strategy for re-use - we could
    5689              :                  brute force our way to find more optimal filling lanes to
    5690              :                  maximize re-use when looking at all loads from the group.  */
    5691        68042 :               auto_bitmap l;
    5692       272224 :               for (auto p : final_perm)
    5693        68098 :                 bitmap_set_bit (l, p.second);
    5694        68042 :               unsigned i = 0;
    5695        68042 :               bitmap_iterator bi;
    5696       136140 :               EXECUTE_IF_SET_IN_BITMAP (l, 0, i, bi)
    5697        68098 :                   perm.quick_push (std::make_pair (0, i));
    5698       272320 :               while (perm.length () < (group_lanes + 1) / 2)
    5699        68118 :                 perm.quick_push (perm.last ());
    5700        68042 :             }
    5701              : 
    5702              :           /* Update final_perm with the intermediate permute.  */
    5703       158719 :           for (unsigned i = 0; i < final_perm.length (); ++i)
    5704              :             {
    5705        79698 :               unsigned l = final_perm[i].second;
    5706        79698 :               unsigned j;
    5707        88146 :               for (j = 0; j < perm.length (); ++j)
    5708        88146 :                 if (perm[j].second == l)
    5709              :                   {
    5710        79698 :                     final_perm[i].second = j;
    5711        79698 :                     break;
    5712              :                   }
    5713        79698 :               gcc_assert (j < perm.length ());
    5714              :             }
    5715              : 
    5716              :           /* And create scalar stmts.  */
    5717        79021 :           vec<stmt_vec_info> perm_stmts;
    5718        79021 :           perm_stmts.create (perm.length ());
    5719       254747 :           for (unsigned i = 0; i < perm.length (); ++i)
    5720       175726 :             perm_stmts.quick_push (SLP_TREE_SCALAR_STMTS (l0)[perm[i].second]);
    5721              : 
    5722        79021 :           slp_tree p = vect_create_new_slp_node (1, VEC_PERM_EXPR);
    5723        79021 :           SLP_TREE_CHILDREN (p).quick_push (l0);
    5724        79021 :           SLP_TREE_LANE_PERMUTATION (p) = perm;
    5725        79021 :           SLP_TREE_VECTYPE (p) = SLP_TREE_VECTYPE (load);
    5726        79021 :           SLP_TREE_LANES (p) = perm.length ();
    5727        79021 :           SLP_TREE_REPRESENTATIVE (p) = SLP_TREE_REPRESENTATIVE (load);
    5728              :           /* ???  As we have scalar stmts for this intermediate permute we
    5729              :              could CSE it via bst_map but we do not want to pick up
    5730              :              another SLP node with a load permutation.  We instead should
    5731              :              have a "local" CSE map here.  */
    5732        79021 :           SLP_TREE_SCALAR_STMTS (p) = perm_stmts;
    5733              : 
    5734              :           /* We now have a node for (group_lanes + 1) / 2 lanes.  */
    5735        79021 :           l0 = p;
    5736        79021 :         }
    5737              : 
    5738              :       /* And finally from the ordered reduction node create the
    5739              :          permute to shuffle the lanes into the original load-permutation
    5740              :          order.  We replace the original load node with this.  */
    5741        75188 :       SLP_TREE_CODE (load) = VEC_PERM_EXPR;
    5742        75188 :       SLP_TREE_LOAD_PERMUTATION (load).release ();
    5743        75188 :       SLP_TREE_LANE_PERMUTATION (load) = final_perm;
    5744        75188 :       SLP_TREE_CHILDREN (load).create (1);
    5745        75188 :       SLP_TREE_CHILDREN (load).quick_push (l0);
    5746              :     }
    5747              : }
    5748              : 
    5749              : /* Transform SLP loads in the SLP graph created by SLP discovery to
    5750              :    group loads from the same group and lower load permutations that
    5751              :    are unlikely to be supported into a series of permutes.  The loads of
    5752              :    all SLP instances in LOOP_VINFO are processed one dataref group at a
    5753              :    time.  With only single-lane SLP instances this should result in a
    5754              :    series of permute nodes emulating an interleaving scheme.  */
    5755              : 
    5756              : static void
    5757       473922 : vect_lower_load_permutations (loop_vec_info loop_vinfo,
    5758              :                               scalar_stmts_to_slp_tree_map_t *bst_map,
    5759              :                               bool force_single_lane)
    5760              : {
    5761              :   /* Gather the loads of all instances and sort them with vllp_cmp.  */
    5762       473922 :   hash_set<slp_tree> visited;
    5763       473922 :   auto_vec<slp_tree> loads;
    5764      2180441 :   for (auto inst : loop_vinfo->slp_instances)
    5765       760609 :     vect_gather_slp_loads (loads, SLP_INSTANCE_TREE (inst), visited);
    5766       473922 :   if (loads.is_empty ())
    5767        89883 :     return;
    5768       384039 :   loads.qsort (vllp_cmp);
    5769              : 
    5770              :   /* Process each dataref group separately; the last one after the loop.  */
    5771       384039 :   unsigned firsti = 0;
    5772       718994 :   for (unsigned i = 1; i < loads.length (); ++i)
    5773              :     {
    5774       334955 :       slp_tree first = loads[firsti];
    5775       334955 :       slp_tree next = loads[i];
    5776       334955 :       stmt_vec_info a0 = SLP_TREE_SCALAR_STMTS (first)[0];
    5777       334955 :       stmt_vec_info b0 = SLP_TREE_SCALAR_STMTS (next)[0];
    5778       334955 :       if (STMT_VINFO_GROUPED_ACCESS (a0)
    5779       157621 :           && STMT_VINFO_GROUPED_ACCESS (b0)
    5780       479513 :           && DR_GROUP_FIRST_ELEMENT (a0) == DR_GROUP_FIRST_ELEMENT (b0))
    5781        62679 :         continue;
    5782              :       /* loads[i] ends the run: loads[firsti] to loads[i - 1] are one or
    5783              :          multiple SLP loads of the same group.  Non-grouped runs are skipped.  */
    5784       272276 :       if (STMT_VINFO_GROUPED_ACCESS (a0))
    5785        94942 :         vect_lower_load_permutations (loop_vinfo, bst_map,
    5786        94942 :                                       make_array_slice (&loads[firsti],
    5787              :                                                         i - firsti),
    5788              :                                       force_single_lane);
    5789              :       firsti = i;
    5790              :     }
    5791       768078 :   if (firsti < loads.length ()
    5792       768078 :       && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (loads[firsti])[0]))
    5793        66106 :     vect_lower_load_permutations (loop_vinfo, bst_map,
    5794        66106 :                                   make_array_slice (&loads[firsti],
    5795        66106 :                                                     loads.length () - firsti),
    5796              :                                   force_single_lane);
    5797       473922 : }
    5798              : 
    5799              : /* Check if there are stmts in the loop that can be vectorized using SLP.
    5800              :    Build SLP trees of packed scalar stmts if SLP is possible.  */
    5801              : 
    5802              : opt_result
    5803      1113267 : vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size,
    5804              :                   bool force_single_lane)
    5805              : {
    5806      1113267 :   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
    5807      1113267 :   unsigned int i;
    5808      1113267 :   stmt_vec_info first_element;
    5809      1113267 :   slp_instance instance;
    5810              : 
    5811      1113267 :   DUMP_VECT_SCOPE ("vect_analyze_slp");
    5812              : 
    5813      1113267 :   unsigned limit = max_tree_size;
    5814              : 
    5815      1113267 :   scalar_stmts_to_slp_tree_map_t *bst_map
    5816      1113267 :     = new scalar_stmts_to_slp_tree_map_t ();
    5817              : 
    5818              :   /* Find SLP sequences starting from groups of grouped stores.  */
    5819      3149264 :   FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
    5820       922999 :     if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
    5821              :                                      slp_inst_kind_store, max_tree_size, &limit,
    5822              :                                      force_single_lane)
    5823       922999 :         && loop_vinfo)
    5824              :       {
    5825          269 :         release_scalar_stmts_to_slp_tree_map (bst_map);
    5826          269 :         return opt_result::failure_at (vect_location, "SLP build failed.\n");
    5827              :       }
    5828              : 
    5829              :   /* For loops also start SLP discovery from non-grouped stores.  */
    5830      1112998 :   if (loop_vinfo)
    5831              :     {
    5832              :       data_reference_p dr;
    5833      1630458 :       FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr)
    5834      1138773 :         if (DR_IS_WRITE (dr))
    5835              :           {
    5836       370755 :             stmt_vec_info stmt_info = vinfo->lookup_dr (dr)->stmt;
    5837              :             /* Grouped stores are already handled above.  */
    5838       370755 :             if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    5839        99638 :               continue;
    5840       271117 :             vec<stmt_vec_info> stmts;
    5841       271117 :             vec<stmt_vec_info> roots = vNULL;
    5842       271117 :             vec<tree> remain = vNULL;
    5843       271117 :             stmts.create (1);
    5844       271117 :             stmts.quick_push (stmt_info);
    5845       271117 :             if (! vect_build_slp_instance (vinfo, slp_inst_kind_store,
    5846              :                                            stmts, roots, remain, max_tree_size,
    5847              :                                            &limit, bst_map, force_single_lane))
    5848              :               {
    5849         6930 :                 release_scalar_stmts_to_slp_tree_map (bst_map);
    5850         6930 :                 return opt_result::failure_at (vect_location,
    5851              :                                                "SLP build failed.\n");
    5852              :               }
    5853              :           }
    5854              : 
    5855              :       stmt_vec_info stmt_info;
    5856       491725 :       FOR_EACH_VEC_ELT (LOOP_VINFO_ALTERNATE_DEFS (loop_vinfo), i, stmt_info)
    5857              :         {
    5858           20 :           vec<stmt_vec_info> stmts;
    5859           20 :           vec<stmt_vec_info> roots = vNULL;
    5860           20 :           vec<tree> remain = vNULL;
    5861           20 :           stmts.create (1);
    5862           20 :           stmts.quick_push (stmt_info);
    5863           20 :           if (! vect_build_slp_instance (vinfo, slp_inst_kind_store,
    5864              :                                          stmts, roots, remain, max_tree_size,
    5865              :                                          &limit, bst_map, force_single_lane))
    5866              :             {
    5867            0 :               release_scalar_stmts_to_slp_tree_map (bst_map);
    5868            0 :               return opt_result::failure_at (vect_location,
    5869              :                                              "SLP build failed.\n");
    5870              :             }
    5871              :         }
    5872              :     }
    5873              : 
    5874      1106068 :   if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
    5875              :     {
    5876      1857585 :       for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
    5877              :         {
    5878      1243202 :           vect_location = bb_vinfo->roots[i].roots[0]->stmt;
    5879              :           /* Apply patterns.  */
    5880      3884056 :           for (unsigned j = 0; j < bb_vinfo->roots[i].stmts.length (); ++j)
    5881      5281708 :             bb_vinfo->roots[i].stmts[j]
    5882      2723176 :               = vect_stmt_to_vectorize (bb_vinfo->roots[i].stmts[j]);
    5883      1243202 :           if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
    5884      1243202 :                                        bb_vinfo->roots[i].stmts,
    5885      1243202 :                                        bb_vinfo->roots[i].roots,
    5886      1243202 :                                        bb_vinfo->roots[i].remain,
    5887              :                                        max_tree_size, &limit, bst_map, false))
    5888              :             {
    5889       128828 :               bb_vinfo->roots[i].roots = vNULL;
    5890       128828 :               bb_vinfo->roots[i].remain = vNULL;
    5891              :             }
    5892      1243202 :           bb_vinfo->roots[i].stmts = vNULL;
    5893              :         }
    5894              :     }
    5895              : 
    5896      1106068 :   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
    5897              :     {
    5898              :       /* Find SLP sequences starting from groups of reductions.  */
    5899       491685 :       if (!vect_analyze_slp_reductions (loop_vinfo, max_tree_size, &limit,
    5900              :                                         bst_map, force_single_lane))
    5901              :         {
    5902         3008 :           release_scalar_stmts_to_slp_tree_map (bst_map);
    5903         3008 :           return opt_result::failure_at (vect_location, "SLP build failed.\n");
    5904              :         }
    5905              : 
    5906              :       /* Make sure to vectorize only-live stmts, usually inductions.  */
    5907      2200801 :       for (edge e : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
    5908      1423481 :         for (auto gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi);
    5909       679370 :              gsi_next (&gsi))
    5910              :           {
    5911       688711 :             gphi *lc_phi = *gsi;
    5912       688711 :             tree def = gimple_phi_arg_def_from_edge (lc_phi, e);
    5913       688711 :             stmt_vec_info stmt_info;
    5914       688711 :             if (TREE_CODE (def) == SSA_NAME
    5915       576843 :                 && !virtual_operand_p (def)
    5916       299961 :                 && (stmt_info = loop_vinfo->lookup_def (def))
    5917       268605 :                 && ((stmt_info = vect_stmt_to_vectorize (stmt_info)), true)
    5918       268605 :                 && STMT_VINFO_RELEVANT (stmt_info) == vect_used_only_live
    5919       208767 :                 && STMT_VINFO_LIVE_P (stmt_info)
    5920       208767 :                 && !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info))
    5921       796054 :                 && STMT_VINFO_REDUC_IDX (stmt_info) == -1)
    5922              :               {
    5923       107256 :                 vec<stmt_vec_info> stmts;
    5924       107256 :                 vec<stmt_vec_info> roots = vNULL;
    5925       107256 :                 vec<tree> remain = vNULL;
    5926       107256 :                 stmts.create (1);
    5927       107256 :                 stmts.quick_push (vect_stmt_to_vectorize (stmt_info));
    5928       107256 :                 if (! vect_build_slp_instance (vinfo,
    5929              :                                                slp_inst_kind_reduc_group,
    5930              :                                                stmts, roots, remain,
    5931              :                                                max_tree_size, &limit,
    5932              :                                                bst_map, force_single_lane))
    5933              :                   {
    5934         9341 :                     release_scalar_stmts_to_slp_tree_map (bst_map);
    5935         9341 :                     return opt_result::failure_at (vect_location,
    5936              :                                                    "SLP build failed.\n");
    5937              :                   }
    5938              :               }
    5939         9341 :           }
    5940              : 
    5941              :       /* Find SLP sequences starting from gconds.  */
    5942      1190700 :       for (auto cond : LOOP_VINFO_LOOP_CONDS (loop_vinfo))
    5943              :         {
    5944       279302 :           auto cond_info = loop_vinfo->lookup_stmt (cond);
    5945              : 
    5946       279302 :           cond_info = vect_stmt_to_vectorize (cond_info);
    5947       279302 :           vec<stmt_vec_info> roots = vNULL;
    5948       279302 :           roots.safe_push (cond_info);
    5949       279302 :           gimple *stmt = STMT_VINFO_STMT (cond_info);
    5950       279302 :           tree args0 = gimple_cond_lhs (stmt);
    5951       279302 :           tree args1 = gimple_cond_rhs (stmt);
    5952              : 
    5953              :           /* These should be enforced by cond lowering, but if it failed
    5954              :              bail.  */
    5955       279302 :           if (gimple_cond_code (stmt) != NE_EXPR
    5956       278218 :               || TREE_TYPE (args0) != boolean_type_node
    5957       556952 :               || !integer_zerop (args1))
    5958              :             {
    5959         1652 :               roots.release ();
    5960         1652 :               release_scalar_stmts_to_slp_tree_map (bst_map);
    5961         1652 :               return opt_result::failure_at (vect_location,
    5962              :                                              "SLP build failed.\n");
    5963              :             }
    5964              : 
    5965              :           /* An argument without a loop def will be codegened from vectorizing the
    5966              :              root gcond itself.  As such we don't need to try to build an SLP tree
    5967              :              from them.  It's highly likely that the resulting SLP tree here if both
    5968              :              arguments have a def will be incompatible, but we rely on it being split
    5969              :              later on.  */
    5970       277650 :           auto varg = loop_vinfo->lookup_def (args0);
    5971       277650 :           vec<stmt_vec_info> stmts;
    5972       277650 :           vec<tree> remain = vNULL;
    5973       277650 :           stmts.create (1);
    5974       277650 :           stmts.quick_push (vect_stmt_to_vectorize (varg));
    5975              : 
    5976       277650 :           if (! vect_build_slp_instance (vinfo, slp_inst_kind_gcond,
    5977              :                                          stmts, roots, remain,
    5978              :                                          max_tree_size, &limit,
    5979              :                                          bst_map, force_single_lane))
    5980              :             {
    5981         3762 :               roots.release ();
    5982         3762 :               release_scalar_stmts_to_slp_tree_map (bst_map);
    5983         3762 :               return opt_result::failure_at (vect_location,
    5984              :                                              "SLP build failed.\n");
    5985              :             }
    5986              :         }
    5987              :     }
    5988              : 
    5989      1088305 :   hash_set<slp_tree> visited_patterns;
    5990      1088305 :   slp_tree_to_load_perm_map_t perm_cache;
    5991      1088305 :   slp_compat_nodes_map_t compat_cache;
    5992              : 
    5993              :   /* See if any patterns can be found in the SLP tree.  */
    5994      1088305 :   bool pattern_found = false;
    5995      3729461 :   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
    5996      1552851 :     pattern_found |= vect_match_slp_patterns (instance, vinfo,
    5997              :                                               &visited_patterns, &perm_cache,
    5998              :                                               &compat_cache);
    5999              : 
    6000              :   /* If any were found optimize permutations of loads.  */
    6001      1088305 :   if (pattern_found)
    6002              :     {
    6003          285 :       hash_map<slp_tree, slp_tree> load_map;
    6004         3421 :       FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
    6005              :         {
    6006         2851 :           slp_tree root = SLP_INSTANCE_TREE (instance);
    6007         2851 :           optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
    6008              :                                         &load_map, root);
    6009              :         }
    6010          285 :     }
    6011              : 
    6012              :   /* Check whether we should force some SLP instances to use load/store-lanes
    6013              :      and do so by forcing SLP re-discovery with single lanes.  We used
    6014              :      to cancel SLP when this applied to all instances in a loop but now
    6015              :      we decide this per SLP instance.  It's important to do this only
    6016              :      after SLP pattern recognition.  */
    6017      1088305 :   if (is_a <loop_vec_info> (vinfo))
    6018      1234531 :     FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
    6019       760609 :       if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
    6020       291398 :           && !SLP_INSTANCE_TREE (instance)->ldst_lanes)
    6021              :         {
    6022       291398 :           slp_tree slp_root = SLP_INSTANCE_TREE (instance);
    6023       291398 :           unsigned int group_size = SLP_TREE_LANES (slp_root);
    6024       291398 :           tree vectype = SLP_TREE_VECTYPE (slp_root);
    6025              : 
    6026       291398 :           stmt_vec_info rep_info = SLP_TREE_REPRESENTATIVE (slp_root);
    6027       291398 :           gimple *rep = STMT_VINFO_STMT (rep_info);
    6028       291398 :           bool masked = (is_gimple_call (rep)
    6029         2556 :                          && gimple_call_internal_p (rep)
    6030       293934 :                          && internal_fn_mask_index
    6031         2536 :                               (gimple_call_internal_fn (rep)) != -1);
    6032       291378 :           if (!STMT_VINFO_GROUPED_ACCESS (rep_info)
    6033        28993 :               || slp_root->ldst_lanes
    6034       320391 :               || (vect_store_lanes_supported (vectype, group_size, masked)
    6035              :                   == IFN_LAST))
    6036       291398 :             continue;
    6037              : 
    6038            0 :           auto_vec<slp_tree> loads;
    6039            0 :           hash_set<slp_tree> visited;
    6040            0 :           vect_gather_slp_loads (loads, slp_root, visited);
    6041              : 
    6042              :           /* Check whether any load in the SLP instance is possibly
    6043              :              permuted.  */
    6044            0 :           bool loads_permuted = false;
    6045            0 :           slp_tree load_node;
    6046            0 :           unsigned j;
    6047            0 :           FOR_EACH_VEC_ELT (loads, j, load_node)
    6048              :             {
    6049            0 :               if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
    6050            0 :                 continue;
    6051              :               unsigned k;
    6052              :               stmt_vec_info load_info;
    6053            0 :               FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), k, load_info)
    6054            0 :                 if (SLP_TREE_LOAD_PERMUTATION (load_node)[k] != k)
    6055              :                   {
    6056              :                     loads_permuted = true;
    6057              :                     break;
    6058              :                   }
    6059              :             }
    6060              : 
    6061              :           /* If the loads and stores can use load/store-lanes force re-discovery
    6062              :              with single lanes.  */
    6063            0 :           if (loads_permuted)
    6064              :             {
    6065            0 :               bool can_use_lanes = true;
    6066              :               bool prefer_load_lanes = false;
    6067            0 :               FOR_EACH_VEC_ELT (loads, j, load_node)
    6068            0 :                 if (STMT_VINFO_GROUPED_ACCESS
    6069              :                       (SLP_TREE_REPRESENTATIVE (load_node)))
    6070              :                   {
    6071            0 :                     stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
    6072              :                         (SLP_TREE_REPRESENTATIVE (load_node));
    6073            0 :                     rep = STMT_VINFO_STMT (stmt_vinfo);
    6074            0 :                     masked = (is_gimple_call (rep)
    6075            0 :                               && gimple_call_internal_p (rep)
    6076            0 :                               && internal_fn_mask_index
    6077            0 :                                    (gimple_call_internal_fn (rep)));
    6078              :                     /* Use SLP for strided accesses (or if we can't
    6079              :                        load-lanes).  */
    6080            0 :                     if (STMT_VINFO_STRIDED_P (stmt_vinfo)
    6081            0 :                         || compare_step_with_zero (vinfo, stmt_vinfo) <= 0
    6082            0 :                         || vect_load_lanes_supported
    6083            0 :                              (SLP_TREE_VECTYPE (load_node),
    6084            0 :                               DR_GROUP_SIZE (stmt_vinfo), masked) == IFN_LAST
    6085              :                         /* ???  During SLP re-discovery with a single lane
    6086              :                            a masked grouped load will appear permuted and
    6087              :                            discovery will fail.  We have to rework this
    6088              :                            on the discovery side - for now avoid ICEing.  */
    6089            0 :                         || masked)
    6090              :                       {
    6091              :                         can_use_lanes = false;
    6092              :                         break;
    6093              :                       }
    6094              :                     /* Make sure that the target would prefer store-lanes
    6095              :                        for at least one of the loads.
    6096              : 
    6097              :                        ??? Perhaps we should instead require this for
    6098              :                        all loads?  */
    6099            0 :                     prefer_load_lanes
    6100              :                       = (prefer_load_lanes
    6101            0 :                          || SLP_TREE_LANES (load_node) == group_size
    6102            0 :                          || (vect_slp_prefer_store_lanes_p
    6103            0 :                              (vinfo, stmt_vinfo,
    6104              :                               SLP_TREE_VECTYPE (load_node), masked,
    6105              :                               group_size, SLP_TREE_LANES (load_node))));
    6106              :                   }
    6107              : 
    6108            0 :               if (can_use_lanes && prefer_load_lanes)
    6109              :                 {
    6110            0 :                   if (dump_enabled_p ())
    6111            0 :                     dump_printf_loc (MSG_NOTE, vect_location,
    6112              :                                      "SLP instance %p can use load/store-lanes,"
    6113              :                                      " re-discovering with single-lanes\n",
    6114              :                                      (void *) instance);
    6115              : 
    6116            0 :                   stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (slp_root);
    6117              : 
    6118            0 :                   vect_free_slp_instance (instance);
    6119            0 :                   limit = max_tree_size;
    6120            0 :                   bool res = vect_analyze_slp_instance (vinfo, bst_map,
    6121              :                                                         stmt_info,
    6122              :                                                         slp_inst_kind_store,
    6123              :                                                         max_tree_size, &limit,
    6124              :                                                         true);
    6125            0 :                   gcc_assert (res);
    6126            0 :                   auto new_inst = LOOP_VINFO_SLP_INSTANCES (vinfo).pop ();
    6127            0 :                   LOOP_VINFO_SLP_INSTANCES (vinfo)[i] = new_inst;
    6128              :                 }
    6129              :             }
    6130            0 :         }
    6131              : 
    6132              :   /* When we end up with load permutations that we cannot possibly handle,
    6133              :      like those requiring three vector inputs, lower them using interleaving
    6134              :      like schemes.  */
    6135      1088305 :   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
    6136              :     {
    6137       473922 :       vect_lower_load_permutations (loop_vinfo, bst_map, force_single_lane);
    6138       473922 :       if (dump_enabled_p ())
    6139              :         {
    6140        19971 :           dump_printf_loc (MSG_NOTE, vect_location,
    6141              :                            "SLP graph after lowering permutations:\n");
    6142        19971 :           hash_set<slp_tree> visited;
    6143        89069 :           FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
    6144        29181 :             vect_print_slp_graph (MSG_NOTE, vect_location,
    6145              :                                   SLP_INSTANCE_TREE (instance), visited);
    6146        19971 :         }
    6147              :     }
    6148              : 
    6149      1088305 :   release_scalar_stmts_to_slp_tree_map (bst_map);
    6150              : 
    6151      1088305 :   if (pattern_found && dump_enabled_p ())
    6152              :     {
    6153           18 :       dump_printf_loc (MSG_NOTE, vect_location,
    6154              :                        "Pattern matched SLP tree\n");
    6155           18 :       hash_set<slp_tree> visited;
    6156           90 :       FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
    6157           36 :         vect_print_slp_graph (MSG_NOTE, vect_location,
    6158              :                               SLP_INSTANCE_TREE (instance), visited);
    6159           18 :     }
    6160              : 
    6161      1088305 :   return opt_result::success ();
    6162      1088305 : }
    6163              : 
    6164              : /* Estimates the cost of inserting layout changes into the SLP graph.
    6165              :    An impossible insertion is represented by a DEPTH of sreal::max ().  */
    6166              : 
    6167              : struct slpg_layout_cost
    6168              : {
    6169     10629581 :   slpg_layout_cost () = default;
    6170              :   slpg_layout_cost (sreal, bool);
    6171              : 
    6172       499113 :   static slpg_layout_cost impossible () { return { sreal::max (), 0 }; }
    6173      5510603 :   bool is_possible () const { return depth != sreal::max (); }
    6174              : 
    6175              :   bool operator== (const slpg_layout_cost &) const;
    6176              :   bool operator!= (const slpg_layout_cost &) const;
    6177              : 
    6178              :   bool is_better_than (const slpg_layout_cost &, bool) const;
    6179              : 
    6180              :   void add_parallel_cost (const slpg_layout_cost &);
    6181              :   void add_serial_cost (const slpg_layout_cost &);
    6182              :   void split (unsigned int);
    6183              : 
    6184              :   /* The longest sequence of layout changes needed during any traversal
    6185              :      of the partition dag, weighted by execution frequency.
    6186              : 
    6187              :      This is the most important metric when optimizing for speed, since
    6188              :      it helps to ensure that we keep the number of operations on
    6189              :      critical paths to a minimum.  */
    6190              :   sreal depth = 0;
    6191              : 
    6192              :   /* An estimate of the total number of operations needed.  It is weighted by
    6193              :      execution frequency when optimizing for speed but not when optimizing for
    6194              :      size.  In order to avoid double-counting, a node with a fanout of N will
    6195              :      distribute 1/N of its total cost to each successor.
    6196              : 
    6197              :      This is the most important metric when optimizing for size, since
    6198              :      it helps to keep the total number of operations to a minimum.  */
    6199              :   sreal total = 0;
    6200              : };
    6201              : 
    6202              : /* Construct costs for a node with execution-frequency weight WEIGHT (higher
    6203              :    means more frequent).  DEPTH is always WEIGHT.  If IS_FOR_SIZE (optimizing
    6204              :    for size, not speed), TOTAL is 1 for any positive WEIGHT, else WEIGHT.  */
    6205              : 
    6206      1301522 : slpg_layout_cost::slpg_layout_cost (sreal weight, bool is_for_size)
    6207      1302390 :   : depth (weight), total (is_for_size && weight > 0 ? 1 : weight)
    6208              : {
    6209      1301522 : }
    6210              : 
    6211              : bool
    6212            0 : slpg_layout_cost::operator== (const slpg_layout_cost &other) const
    6213              : {  /* Equal only if both DEPTH and TOTAL match.  */
    6214            0 :   return depth == other.depth && total == other.total;
    6215              : }
    6216              : 
    6217              : bool
    6218            0 : slpg_layout_cost::operator!= (const slpg_layout_cost &other) const
    6219              : {  /* Defined as the negation of operator==.  */
    6220            0 :   return !operator== (other);
    6221              : }
    6222              : 
    6223              : /* Return true if these costs are better (lower) than OTHER.  If IS_FOR_SIZE
    6224              :    compare TOTAL first and DEPTH on ties; otherwise DEPTH first, then TOTAL.  */
    6225              : 
    6226              : bool
    6227       321743 : slpg_layout_cost::is_better_than (const slpg_layout_cost &other,
    6228              :                                   bool is_for_size) const
    6229              : {
    6230       321743 :   if (is_for_size)
    6231              :     {
    6232          382 :       if (total != other.total)
    6233          155 :         return total < other.total;
    6234          227 :       return depth < other.depth;
    6235              :     }
    6236              :   else
    6237              :     {
    6238       321361 :       if (depth != other.depth)
    6239       137023 :         return depth < other.depth;
    6240       184338 :       return total < other.total;
    6241              :     }
    6242              : }
    6243              : 
    6244              : /* Account for something with cost INPUT_COST happening in parallel with the
    6245              :    current costs: DEPTH becomes the maximum of the two, TOTAL their sum.  */
    6246              : 
    6247              : void
    6248       385614 : slpg_layout_cost::add_parallel_cost (const slpg_layout_cost &input_cost)
    6249              : {
    6250       385614 :   depth = std::max (depth, input_cost.depth);
    6251       385614 :   total += input_cost.total;
    6252       385614 : }
    6253              : 
    6254              : /* Increase the costs to account for something with cost OTHER happening
    6255              :    in series with the current costs: both DEPTH and TOTAL are summed.  */
    6256              : 
    6257              : void
    6258      1560665 : slpg_layout_cost::add_serial_cost (const slpg_layout_cost &other)
    6259              : {
    6260      1560665 :   depth += other.depth;
    6261      1560665 :   total += other.total;
    6262      1560665 : }
    6263              : 
    6264              : /* Divide TOTAL among TIMES successors or predecessors (no-op if TIMES <= 1).  */
    6265              : 
    6266              : void
    6267      1297356 : slpg_layout_cost::split (unsigned int times)
    6268              : {
    6269      1297356 :   if (times > 1)
    6270       568326 :     total /= times;
    6271      1297356 : }
    6272              : 
    6273              : /* Information about one node (vertex) of the SLP graph, for use during
    6274              :    vect_optimize_slp_pass.  Only NODE is set on construction.  */
    6275              : 
    6276              : struct slpg_vertex
    6277              : {
    6278      9950475 :   slpg_vertex (slp_tree node_) : node (node_) {}
    6279              : 
    6280              :   /* The SLP tree node that this vertex describes.  */
    6281              :   slp_tree node;
    6282              : 
    6283              :   /* Which partition the node belongs to, or -1 if none.  Nodes outside of
    6284              :      partitions are flexible; they can have whichever layout consumers
    6285              :      want them to have.  */
    6286              :   int partition = -1;
    6287              : 
    6288              :   /* The number of nodes that directly use the result of this one
    6289              :      (i.e. the number of nodes that count this one as a child).  */
    6290              :   unsigned int out_degree = 0;
    6291              : 
    6292              :   /* The execution frequency of the node.  */
    6293              :   sreal weight = 0;
    6294              : 
    6295              :   /* The total execution frequency of all nodes that directly use the
    6296              :      result of this one.  */
    6297              :   sreal out_weight = 0;
    6298              : };
    6299              : 
    6300              : /* Information about one partition of the SLP graph, for use during
    6301              :    vect_optimize_slp_pass.  */
    6302              : 
    6303              : struct slpg_partition_info
    6304              : {
    6305              :   /* The nodes in the partition occupy the half-open index range
    6306              :      [NODE_BEGIN, NODE_END) of m_partitioned_nodes.  */
    6307              :   unsigned int node_begin = 0;
    6308              :   unsigned int node_end = 0;
    6309              : 
    6310              :   /* Which layout we've chosen to use for this partition, or -1 if
    6311              :      we haven't picked one yet.  */
    6312              :   int layout = -1;
    6313              : 
    6314              :   /* The number of predecessors and successors in the partition dag.
    6315              :      The predecessors always have lower partition numbers and the
    6316              :      successors always have higher partition numbers.
    6317              : 
    6318              :      Note that the directions of these edges are not necessarily the
    6319              :      same as in the data flow graph.  For example, if an SCC has separate
    6320              :      partitions for an inner loop and an outer loop, the inner loop's
    6321              :      partition will have at least two incoming edges from the outer loop's
    6322              :      partition: one for a live-in value and one for a live-out value.
    6323              :      In data flow terms, one of these edges would also be from the outer loop
    6324              :      to the inner loop, but the other would be in the opposite direction.  */
    6325              :   unsigned int in_degree = 0;
    6326              :   unsigned int out_degree = 0;
    6327              : };
    6328              : 
    6329              : /* Information about the costs of using a particular layout for a
    6330              :    particular partition.  An impossible combination is recorded by
    6331              :    making INTERNAL_COST impossible.  */
    6332              : 
    6333              : struct slpg_partition_layout_costs
    6334              : {
    6335      1571894 :   bool is_possible () const { return internal_cost.is_possible (); }
    6336        55736 :   void mark_impossible () { internal_cost = slpg_layout_cost::impossible (); }
    6337              : 
    6338              :   /* The costs inherited from predecessor partitions.  */
    6339              :   slpg_layout_cost in_cost;
    6340              : 
    6341              :   /* The inherent cost of the layout within the node itself.  For example,
    6342              :      this is nonzero for a load if choosing a particular layout would require
    6343              :      the load to permute the loaded elements.  It is nonzero for a
    6344              :      VEC_PERM_EXPR if the permutation cannot be eliminated or converted
    6345              :      to full-vector moves.  */
    6346              :   slpg_layout_cost internal_cost;
    6347              : 
    6348              :   /* The costs inherited from successor partitions.  */
    6349              :   slpg_layout_cost out_cost;
    6350              : };
    6351              : 
    6352              : /* This class tries to optimize the layout of vectors in order to avoid
    6353              :    unnecessary shuffling.  At the moment, the set of possible layouts is
    6354              :    restricted to bijective permutations.
    6355              : 
    6356              :    The goal of the pass depends on whether we're optimizing for size or
    6357              :    for speed.  When optimizing for size, the goal is to reduce the overall
    6358              :    number of layout changes (including layout changes implied by things
    6359              :    like load permutations).  When optimizing for speed, the goal is to
    6360              :    reduce the maximum latency attributable to layout changes on any
    6361              :    non-cyclical path through the data flow graph.
    6362              : 
    6363              :    For example, when optimizing a loop nest for speed, we will prefer
    6364              :    to make layout changes outside of a loop rather than inside of a loop,
    6365              :    and will prefer to make layout changes in parallel rather than serially,
    6366              :    even if that increases the overall number of layout changes.
    6367              : 
    6368              :    The high-level procedure is:
    6369              : 
    6370              :    (1) Build a graph in which edges go from uses (parents) to definitions
    6371              :        (children).
    6372              : 
    6373              :    (2) Divide the graph into a dag of strongly-connected components (SCCs).
    6374              : 
    6375              :    (3) When optimizing for speed, partition the nodes in each SCC based
    6376              :        on their containing cfg loop.  When optimizing for size, treat
    6377              :        each SCC as a single partition.
    6378              : 
    6379              :        This gives us a dag of partitions.  The goal is now to assign a
    6380              :        layout to each partition.
    6381              : 
    6382              :    (4) Construct a set of vector layouts that are worth considering.
    6383              :        Record which nodes must keep their current layout.
    6384              : 
    6385              :    (5) Perform a forward walk over the partition dag (from loads to stores)
    6386              :        accumulating the "forward" cost of using each layout.  When visiting
    6387              :        each partition, assign a tentative choice of layout to the partition
    6388              :        and use that choice when calculating the cost of using a different
    6389              :        layout in successor partitions.
    6390              : 
    6391              :    (6) Perform a backward walk over the partition dag (from stores to loads),
    6392              :        accumulating the "backward" cost of using each layout.  When visiting
    6393              :        each partition, make a final choice of layout for that partition based
    6394              :        on the accumulated forward costs (from (5)) and backward costs
    6395              :        (from (6)).
    6396              : 
    6397              :    (7) Apply the chosen layouts to the SLP graph.
    6398              : 
    6399              :    For example, consider the SLP statements:
    6400              : 
    6401              :    S1:      a_1 = load
    6402              :        loop:
    6403              :    S2:      a_2 = PHI<a_1, a_3>
    6404              :    S3:      b_1 = load
    6405              :    S4:      a_3 = a_2 + b_1
    6406              :        exit:
    6407              :    S5:      a_4 = PHI<a_3>
    6408              :    S6:      store a_4
    6409              : 
    6410              :    S2 and S4 form an SCC and are part of the same loop.  Every other
    6411              :    statement is in a singleton SCC.  In this example there is a one-to-one
    6412              :    mapping between SCCs and partitions and the partition dag looks like this:
    6413              : 
    6414              :         S1     S3
    6415              :          \     /
    6416              :           S2+S4
    6417              :             |
    6418              :            S5
    6419              :             |
    6420              :            S6
    6421              : 
    6422              :    S2, S3 and S4 will have a higher execution frequency than the other
    6423              :    statements, so when optimizing for speed, the goal is to avoid any
    6424              :    layout changes:
    6425              : 
    6426              :    - within S3
    6427              :    - within S2+S4
    6428              :    - on the S3->S2+S4 edge
    6429              : 
    6430              :    For example, if S3 was originally a reversing load, the goal of the
    6431              :    pass is to make it an unreversed load and change the layout on the
    6432              :    S1->S2+S4 and S2+S4->S5 edges to compensate.  (Changing the layout
    6433              :    on S1->S2+S4 and S5->S6 would also be acceptable.)
    6434              : 
    6435              :    The difference between SCCs and partitions becomes important if we
    6436              :    add an outer loop:
    6437              : 
    6438              :    S1:      a_1 = ...
    6439              :        loop1:
    6440              :    S2:      a_2 = PHI<a_1, a_6>
    6441              :    S3:      b_1 = load
    6442              :    S4:      a_3 = a_2 + b_1
    6443              :        loop2:
    6444              :    S5:      a_4 = PHI<a_3, a_5>
    6445              :    S6:      c_1 = load
    6446              :    S7:      a_5 = a_4 + c_1
    6447              :        exit2:
    6448              :    S8:      a_6 = PHI<a_5>
    6449              :    S9:      store a_6
    6450              :        exit1:
    6451              : 
    6452              :    Here, S2, S4, S5, S7 and S8 form a single SCC.  However, when optimizing
    6453              :    for speed, we usually do not want restrictions in the outer loop to "infect"
    6454              :    the decision for the inner loop.  For example, if an outer-loop node
    6455              :    in the SCC contains a statement with a fixed layout, that should not
    6456              :    prevent the inner loop from using a different layout.  Conversely,
    6457              :    the inner loop should not dictate a layout to the outer loop: if the
    6458              :    outer loop does a lot of computation, then it may not be efficient to
    6459              :    do all of that computation in the inner loop's preferred layout.
    6460              : 
    6461              :    So when optimizing for speed, we partition the SCC into S2+S4+S8 (outer)
    6462              :    and S5+S7 (inner).  We also try to arrange partitions so that:
    6463              : 
    6464              :    - the partition for an outer loop comes before the partition for
    6465              :      an inner loop
    6466              : 
    6467              :    - if a sibling loop A dominates a sibling loop B, A's partition
    6468              :      comes before B's
    6469              : 
    6470              :    This gives the following partition dag for the example above:
    6471              : 
    6472              :         S1        S3
    6473              :          \        /
    6474              :           S2+S4+S8   S6
    6475              :            |   \\    /
    6476              :            |    S5+S7
    6477              :            |
    6478              :           S9
    6479              : 
    6480              :    There are two edges from S2+S4+S8 to S5+S7: one for the edge S4->S5 and
    6481              :    one for a reversal of the edge S7->S8.
    6482              : 
    6483              :    The backward walk picks a layout for S5+S7 before S2+S4+S8.  The choice
    6484              :    for S2+S4+S8 therefore has to balance the cost of using the outer loop's
    6485              :    preferred layout against the cost of changing the layout on entry to the
    6486              :    inner loop (S4->S5) and on exit from the inner loop (S7->S8 reversed).
    6487              : 
    6488              :    Although this works well when optimizing for speed, it has the downside
    6489              :    when optimizing for size that the choice of layout for S5+S7 is completely
    6490              :    independent of S9, which lessens the chance of reducing the overall number
    6491              :    of permutations.  We therefore do not partition SCCs when optimizing
    6492              :    for size.
    6493              : 
    6494              :    To give a concrete example of the difference between optimizing
    6495              :    for size and speed, consider:
    6496              : 
    6497              :    a[0] = (b[1] << c[3]) - d[1];
    6498              :    a[1] = (b[0] << c[2]) - d[0];
    6499              :    a[2] = (b[3] << c[1]) - d[3];
    6500              :    a[3] = (b[2] << c[0]) - d[2];
    6501              : 
    6502              :    There are three different layouts here: one for a, one for b and d,
    6503              :    and one for c.  When optimizing for speed it is better to permute each
    6504              :    of b, c and d into the order required by a, since those permutations
    6505              :    happen in parallel.  But when optimizing for size, it is better to:
    6506              : 
    6507              :    - permute c into the same order as b
    6508              :    - do the arithmetic
    6509              :    - permute the result into the order required by a
    6510              : 
    6511              :    This gives 2 permutations rather than 3.  */
    6512              : 
    6513              : class vect_optimize_slp_pass
    6514              : {
    6515              : public:
    6516       681015 :   vect_optimize_slp_pass (vec_info *vinfo) : m_vinfo (vinfo) {}
    6517              :   void run ();
    6518              : 
    6519              : private:
    6520              :   /* Graph building.  */
    6521              :   struct loop *containing_loop (slp_tree);
    6522              :   bool is_cfg_latch_edge (graph_edge *);
    6523              :   void build_vertices (hash_set<slp_tree> &, slp_tree);
    6524              :   void build_vertices ();
    6525              :   void build_graph ();
    6526              : 
    6527              :   /* Partitioning.  */
    6528              :   void create_partitions ();
    6529              :   template<typename T> void for_each_partition_edge (unsigned int, T);
    6530              : 
    6531              :   /* Layout selection.  */
    6532              :   bool is_compatible_layout (slp_tree, unsigned int);
    6533              :   bool is_compatible_layout (const slpg_partition_info &, unsigned int);
    6534              :   int change_layout_cost (slp_tree, unsigned int, unsigned int);
    6535              :   slpg_partition_layout_costs &partition_layout_costs (unsigned int,
    6536              :                                                        unsigned int);
    6537              :   void change_vec_perm_layout (slp_tree, lane_permutation_t &,
    6538              :                                int, unsigned int);
    6539              :   int internal_node_cost (slp_tree, int, unsigned int);
    6540              :   void start_choosing_layouts ();
    6541              :   bool legitimize ();
    6542              : 
    6543              :   /* Cost propagation.  */
    6544              :   slpg_layout_cost edge_layout_cost (graph_edge *, unsigned int,
    6545              :                                      unsigned int, unsigned int);
    6546              :   slpg_layout_cost total_in_cost (unsigned int);
    6547              :   slpg_layout_cost forward_cost (graph_edge *, unsigned int, unsigned int);
    6548              :   slpg_layout_cost backward_cost (graph_edge *, unsigned int, unsigned int);
    6549              :   void forward_pass ();
    6550              :   void backward_pass ();
    6551              : 
    6552              :   /* Rematerialization.  */
    6553              :   slp_tree get_result_with_layout (slp_tree, unsigned int);
    6554              :   void materialize ();
    6555              : 
    6556              :   /* Clean-up.  */
    6557              :   void remove_redundant_permutations ();
    6558              : 
    6559              :   /* Masked load lanes discovery.  */
    6560              :   void decide_masked_load_lanes ();
    6561              : 
    6562              :   void dump ();
    6563              : 
    6564              :   vec_info *m_vinfo;
    6565              : 
    6566              :   /* True if we should optimize the graph for size, false if we should
    6567              :      optimize it for speed; build_vertices clears it on seeing a block that
    6568              :      is optimized for speed.  (This decision isn't easy to make locally.)  */
    6569              :   bool m_optimize_size;
    6570              : 
    6571              :   /* A graph of all SLP nodes, with edges leading from uses to definitions.
    6572              :      In other words, a node's predecessors are its slp_tree parents and
    6573              :      a node's successors are its slp_tree children.  */
    6574              :   graph *m_slpg = nullptr;
    6575              : 
    6576              :   /* The vertices of M_SLPG, indexed by slp_tree::vertex.  */
    6577              :   auto_vec<slpg_vertex> m_vertices;
    6578              : 
    6579              :   /* The list of all leaves of M_SLPG, such as external definitions, constants,
    6580              :      and loads.  */
    6581              :   auto_vec<int> m_leafs;
    6582              : 
    6583              :   /* This array has one entry for every vector layout that we're considering.
    6584              :      Element 0 is null and indicates "no change".  Other entries describe
    6585              :      permutations that are inherent in the current graph and that we would
    6586              :      like to reverse if possible.
    6587              : 
    6588              :      For example, a permutation { 1, 2, 3, 0 } means that something has
    6589              :      effectively been permuted in that way, such as a load group
    6590              :      { a[1], a[2], a[3], a[0] } (viewed as a permutation of a[0:3]).
    6591              :      We'd then like to apply the reverse permutation { 3, 0, 1, 2 }
    6592              :      in order to put things "back" in order.  */
    6593              :   auto_vec<vec<unsigned> > m_perms;
    6594              : 
    6595              :   /* A partitioning of the nodes for which a layout must be chosen.
    6596              :      Each partition represents an <SCC, cfg loop> pair; that is,
    6597              :      nodes in different SCCs belong to different partitions, and nodes
    6598              :      within an SCC can be further partitioned according to a containing
    6599              :      cfg loop.  Partition <SCC1, L1> comes before <SCC2, L2> if:
    6600              : 
    6601              :      - SCC1 != SCC2 and SCC1 is a predecessor of SCC2 in a forward walk
    6602              :        from leaves (such as loads) to roots (such as stores).
    6603              : 
    6604              :      - SCC1 == SCC2 and L1's header strictly dominates L2's header.  */
    6605              :   auto_vec<slpg_partition_info> m_partitions;
    6606              : 
    6607              :   /* The list of all nodes for which a layout must be chosen.  Nodes for
    6608              :      partition P come before the nodes for partition P+1.  Nodes within a
    6609              :      partition are in reverse postorder.  */
    6610              :   auto_vec<unsigned int> m_partitioned_nodes;
    6611              : 
    6612              :   /* Index P * num-layouts + L contains the cost of using layout L
    6613              :      for partition P.  */
    6614              :   auto_vec<slpg_partition_layout_costs> m_partition_layout_costs;
    6615              : 
    6616              :   /* Index N * num-layouts + L, if nonnull, is a node that provides the
    6617              :      original output of node N adjusted to have layout L.  */
    6618              :   auto_vec<slp_tree> m_node_layouts;
    6619              : };
    6620              : 
    6621              : /* Fill the vertices and leafs vector with all nodes in the SLP graph.
    6622              :    Also record whether we should optimize anything for speed rather
    6623              :    than size.  */
    6624              : 
    6625              : void
    6626     10761481 : vect_optimize_slp_pass::build_vertices (hash_set<slp_tree> &visited,
    6627              :                                         slp_tree node)
    6628              : {
    6629     10761481 :   unsigned i;
    6630     10761481 :   slp_tree child;
    6631              : 
    6632     10761481 :   if (visited.add (node))
    6633     10761481 :     return;
    6634              : 
    6635      9950475 :   if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
    6636              :     {
    6637      7832928 :       basic_block bb = gimple_bb (vect_orig_stmt (rep)->stmt);
    6638      6979966 :       if (optimize_bb_for_speed_p (bb))
    6639      6860194 :         m_optimize_size = false;
    6640              :     }
    6641              : 
    6642      9950475 :   node->vertex = m_vertices.length ();
    6643      9950475 :   m_vertices.safe_push (slpg_vertex (node));
    6644              : 
    6645      9950475 :   bool leaf = true;
    6646      9950475 :   bool force_leaf = false;
    6647     18643354 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    6648      8692879 :     if (child)
    6649              :       {
    6650      7829363 :         leaf = false;
    6651      7829363 :         build_vertices (visited, child);
    6652              :       }
    6653              :     else
    6654              :       force_leaf = true;
    6655              :   /* Since SLP discovery works along use-def edges all cycles have an
    6656              :      entry - but there's the exception of cycles where we do not handle
    6657              :      the entry explicitly (but with a NULL SLP node), like some reductions
    6658              :      and inductions.  Force those SLP PHIs to act as leafs to make them
    6659              :      backwards reachable.  */
    6660      9950475 :   if (leaf || force_leaf)
    6661      4913728 :     m_leafs.safe_push (node->vertex);
    6662              : }
    6663              : 
    6664              : /* Fill the vertices and leafs vector with all nodes in the SLP graph.  */
    6665              : 
    6666              : void
    6667      1362030 : vect_optimize_slp_pass::build_vertices ()
    6668              : {
    6669      1362030 :   hash_set<slp_tree> visited;
    6670      1362030 :   unsigned i;
    6671      1362030 :   slp_instance instance;
    6672      1362030 :   m_vertices.truncate (0);
    6673      1362030 :   m_leafs.truncate (0);
    6674      7018208 :   FOR_EACH_VEC_ELT (m_vinfo->slp_instances, i, instance)
    6675      2932118 :     build_vertices (visited, SLP_INSTANCE_TREE (instance));
    6676      1362030 : }
    6677              : 
    6678              : /* Apply (reverse) bijective PERM to VEC.  */
    6679              : 
    6680              : template <class T>
    6681              : static void
    6682       207947 : vect_slp_permute (vec<unsigned> perm,
    6683              :                   vec<T> &vec, bool reverse)
    6684              : {
    6685       207947 :   auto_vec<T, 64> saved;
    6686       207947 :   saved.create (vec.length ());
    6687       674387 :   for (unsigned i = 0; i < vec.length (); ++i)
    6688       466440 :     saved.quick_push (vec[i]);
    6689              : 
    6690       207947 :   if (reverse)
    6691              :     {
    6692      1338182 :       for (unsigned i = 0; i < vec.length (); ++i)
    6693       465112 :         vec[perm[i]] = saved[i];
    6694       672463 :       for (unsigned i = 0; i < vec.length (); ++i)
    6695       822797 :         gcc_assert (vec[perm[i]] == saved[i]);
    6696              :     }
    6697              :   else
    6698              :     {
    6699         3848 :       for (unsigned i = 0; i < vec.length (); ++i)
    6700         1328 :         vec[i] = saved[perm[i]];
    6701       209275 :       for (unsigned i = 0; i < vec.length (); ++i)
    6702         1992 :         gcc_assert (vec[i] == saved[perm[i]]);
    6703              :     }
    6704       207947 : }
    6705              : 
    6706              : /* Return the cfg loop that contains NODE.  */
    6707              : 
    6708              : struct loop *
    6709      3892470 : vect_optimize_slp_pass::containing_loop (slp_tree node)
    6710              : {
    6711      3892470 :   stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
    6712      3892470 :   if (!rep)
    6713         5295 :     return ENTRY_BLOCK_PTR_FOR_FN (cfun)->loop_father;
    6714      4326835 :   return gimple_bb (vect_orig_stmt (rep)->stmt)->loop_father;
    6715              : }
    6716              : 
    6717              : /* Return true if UD (an edge from a use to a definition) is associated
    6718              :    with a loop latch edge in the cfg.  */
    6719              : 
    6720              : bool
    6721      7829363 : vect_optimize_slp_pass::is_cfg_latch_edge (graph_edge *ud)
    6722              : {
    6723      7829363 :   slp_tree use = m_vertices[ud->src].node;
    6724      7829363 :   slp_tree def = m_vertices[ud->dest].node;
    6725      7829363 :   if ((SLP_TREE_DEF_TYPE (use) != vect_internal_def
    6726      7829363 :        || SLP_TREE_PERMUTE_P (use))
    6727      7518800 :       || SLP_TREE_DEF_TYPE (def) != vect_internal_def)
    6728              :     return false;
    6729              : 
    6730      4561306 :   stmt_vec_info use_rep = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (use));
    6731      4561306 :   return (is_a<gphi *> (use_rep->stmt)
    6732       377372 :           && bb_loop_header_p (gimple_bb (use_rep->stmt))
    6733      4772270 :           && containing_loop (def) == containing_loop (use));
    6734              : }
    6735              : 
    6736              : /* Build the graph.  Mark edges that correspond to cfg loop latch edges with
    6737              :    a nonnull data field.  */
    6738              : 
    6739              : void
    6740      1362030 : vect_optimize_slp_pass::build_graph ()
    6741              : {
    6742      1362030 :   m_optimize_size = true;
    6743      1362030 :   build_vertices ();
    6744              : 
    6745      2724060 :   m_slpg = new_graph (m_vertices.length ());
    6746     14036565 :   for (slpg_vertex &v : m_vertices)
    6747     29760840 :     for (slp_tree child : SLP_TREE_CHILDREN (v.node))
    6748      8692879 :       if (child)
    6749              :         {
    6750      7829363 :           graph_edge *ud = add_edge (m_slpg, v.node->vertex, child->vertex);
    6751      7829363 :           if (is_cfg_latch_edge (ud))
    6752       202120 :             ud->data = this;
    6753              :         }
    6754      1362030 : }
    6755              : 
    6756              : /* Return true if E corresponds to a loop latch edge in the cfg.  */
    6757              : 
    6758              : static bool
    6759      4015456 : skip_cfg_latch_edges (graph_edge *e)
    6760              : {
    6761      4015456 :   return e->data;
    6762              : }
    6763              : 
    6764              : /* Create the node partitions.  */
    6765              : 
    6766              : void
    6767       681015 : vect_optimize_slp_pass::create_partitions ()
    6768              : {
    6769              :   /* Calculate a postorder of the graph, ignoring edges that correspond
    6770              :      to natural latch edges in the cfg.  Reading the vector from the end
    6771              :      to the beginning gives the reverse postorder.  */
    6772       681015 :   auto_vec<int> initial_rpo;
    6773      1362030 :   graphds_dfs (m_slpg, &m_leafs[0], m_leafs.length (), &initial_rpo,
    6774              :                false, NULL, skip_cfg_latch_edges);
    6775      2043045 :   gcc_assert (initial_rpo.length () == m_vertices.length ());
    6776              : 
    6777              :   /* Calculate the strongly connected components of the graph.  */
    6778       681015 :   auto_vec<int> scc_grouping;
    6779       681015 :   unsigned int num_sccs = graphds_scc (m_slpg, NULL, NULL, &scc_grouping);
    6780              : 
    6781              :   /* Create a new index order in which all nodes from the same SCC are
    6782              :      consecutive.  Use scc_pos to record the index of the first node in
    6783              :      each SCC.  */
    6784       681015 :   auto_vec<unsigned int> scc_pos (num_sccs);
    6785       681015 :   int last_component = -1;
    6786       681015 :   unsigned int node_count = 0;
    6787      7017999 :   for (unsigned int node_i : scc_grouping)
    6788              :     {
    6789      4974954 :       if (last_component != m_slpg->vertices[node_i].component)
    6790              :         {
    6791      4848031 :           last_component = m_slpg->vertices[node_i].component;
    6792      9696062 :           gcc_assert (last_component == int (scc_pos.length ()));
    6793      4848031 :           scc_pos.quick_push (node_count);
    6794              :         }
    6795      4974954 :       node_count += 1;
    6796              :     }
    6797      1362030 :   gcc_assert (node_count == initial_rpo.length ()
    6798              :               && last_component + 1 == int (num_sccs));
    6799              : 
    6800              :   /* Use m_partitioned_nodes to group nodes into SCC order, with the nodes
    6801              :      inside each SCC following the RPO we calculated above.  The fact that
    6802              :      we ignored natural latch edges when calculating the RPO should ensure
    6803              :      that, for natural loop nests:
    6804              : 
    6805              :      - the first node that we encounter in a cfg loop is the loop header phi
    6806              :      - the loop header phis are in dominance order
    6807              : 
    6808              :      Arranging for this is an optimization (see below) rather than a
    6809              :      correctness issue.  Unnatural loops with a tangled mess of backedges
    6810              :      will still work correctly, but might give poorer results.
    6811              : 
    6812              :      Also update scc_pos so that it gives 1 + the index of the last node
    6813              :      in the SCC.  */
    6814       681015 :   m_partitioned_nodes.safe_grow (node_count);
    6815      6336984 :   for (unsigned int old_i = initial_rpo.length (); old_i-- > 0;)
    6816              :     {
    6817      4974954 :       unsigned int node_i = initial_rpo[old_i];
    6818      4974954 :       unsigned int new_i = scc_pos[m_slpg->vertices[node_i].component]++;
    6819      4974954 :       m_partitioned_nodes[new_i] = node_i;
    6820              :     }
    6821              : 
    6822              :   /* When optimizing for speed, partition each SCC based on the containing
    6823              :      cfg loop. The order we constructed above should ensure that, for natural
    6824              :      cfg loops, we'll create sub-SCC partitions for outer loops before
    6825              :      the corresponding sub-SCC partitions for inner loops.  Similarly,
    6826              :      when one sibling loop A dominates another sibling loop B, we should
    6827              :      create a sub-SCC partition for A before a sub-SCC partition for B.
    6828              : 
    6829              :      As above, nothing depends for correctness on whether this achieves
    6830              :      a natural nesting, but we should get better results when it does.  */
    6831      1362030 :   m_partitions.reserve (m_vertices.length ());
    6832       681015 :   unsigned int next_partition_i = 0;
    6833       681015 :   hash_map<struct loop *, int> loop_partitions;
    6834       681015 :   unsigned int rpo_begin = 0;
    6835       681015 :   unsigned int num_partitioned_nodes = 0;
    6836      6891076 :   for (unsigned int rpo_end : scc_pos)
    6837              :     {
    6838      4848031 :       loop_partitions.empty ();
    6839              :       unsigned int partition_i = next_partition_i;
    6840      9822985 :       for (unsigned int rpo_i = rpo_begin; rpo_i < rpo_end; ++rpo_i)
    6841              :         {
    6842              :           /* Handle externals and constants optimistically throughout.
    6843              :              But treat existing vectors as fixed since we do not handle
    6844              :              permuting them.  */
    6845      4974954 :           unsigned int node_i = m_partitioned_nodes[rpo_i];
    6846      4974954 :           auto &vertex = m_vertices[node_i];
    6847      4974954 :           if ((SLP_TREE_DEF_TYPE (vertex.node) == vect_external_def
    6848       506256 :                && !SLP_TREE_VEC_DEFS (vertex.node).exists ())
    6849      4977173 :               || SLP_TREE_DEF_TYPE (vertex.node) == vect_constant_def)
    6850      1479959 :             vertex.partition = -1;
    6851              :           else
    6852              :             {
    6853      3494995 :               bool existed;
    6854      3494995 :               if (m_optimize_size)
    6855        24453 :                 existed = next_partition_i > partition_i;
    6856              :               else
    6857              :                 {
    6858      3470542 :                   struct loop *loop = containing_loop (vertex.node);
    6859      3470542 :                   auto &entry = loop_partitions.get_or_insert (loop, &existed);
    6860      3470542 :                   if (!existed)
    6861      3344644 :                     entry = next_partition_i;
    6862      3470542 :                   partition_i = entry;
    6863              :                 }
    6864      3494995 :               if (!existed)
    6865              :                 {
    6866      3369019 :                   m_partitions.quick_push (slpg_partition_info ());
    6867      3369019 :                   next_partition_i += 1;
    6868              :                 }
    6869      3494995 :               vertex.partition = partition_i;
    6870      3494995 :               num_partitioned_nodes += 1;
    6871      3494995 :               m_partitions[partition_i].node_end += 1;
    6872              :             }
    6873              :         }
    6874      4848031 :       rpo_begin = rpo_end;
    6875              :     }
    6876              : 
    6877              :   /* Assign ranges of consecutive node indices to each partition,
    6878              :      in partition order.  Start with node_end being the same as
    6879              :      node_begin so that the next loop can use it as a counter.  */
    6880       681015 :   unsigned int node_begin = 0;
    6881      5412064 :   for (auto &partition : m_partitions)
    6882              :     {
    6883      3369019 :       partition.node_begin = node_begin;
    6884      3369019 :       node_begin += partition.node_end;
    6885      3369019 :       partition.node_end = partition.node_begin;
    6886              :     }
    6887       681015 :   gcc_assert (node_begin == num_partitioned_nodes);
    6888              : 
    6889              :   /* Finally build the list of nodes in partition order.  */
    6890       681015 :   m_partitioned_nodes.truncate (num_partitioned_nodes);
    6891      5655969 :   for (unsigned int node_i = 0; node_i < m_vertices.length (); ++node_i)
    6892              :     {
    6893      4974954 :       int partition_i = m_vertices[node_i].partition;
    6894      4974954 :       if (partition_i >= 0)
    6895              :         {
    6896      3494995 :           unsigned int order_i = m_partitions[partition_i].node_end++;
    6897      3494995 :           m_partitioned_nodes[order_i] = node_i;
    6898              :         }
    6899              :     }
    6900       681015 : }
    6901              : 
    6902              : /* Look for edges from earlier partitions into node NODE_I and edges from
    6903              :    node NODE_I into later partitions.  Call:
    6904              : 
    6905              :       FN (ud, other_node_i)
    6906              : 
    6907              :    for each such use-to-def edge ud, where other_node_i is the node at the
    6908              :    other end of the edge.  */
    6909              : 
    6910              : template<typename T>
    6911              : void
    6912      3937338 : vect_optimize_slp_pass::for_each_partition_edge (unsigned int node_i, T fn)
    6913              : {
    6914      3937338 :   int partition_i = m_vertices[node_i].partition;
    6915      3937338 :   for (graph_edge *pred = m_slpg->vertices[node_i].pred;
    6916      6830708 :        pred; pred = pred->pred_next)
    6917              :     {
    6918      2893370 :       int src_partition_i = m_vertices[pred->src].partition;
    6919      2893370 :       if (src_partition_i >= 0 && src_partition_i != partition_i)
    6920      2572159 :         fn (pred, pred->src);
    6921              :     }
    6922      3937338 :   for (graph_edge *succ = m_slpg->vertices[node_i].succ;
    6923      8465300 :        succ; succ = succ->succ_next)
    6924              :     {
    6925      4527962 :       int dest_partition_i = m_vertices[succ->dest].partition;
    6926      4527962 :       if (dest_partition_i >= 0 && dest_partition_i != partition_i)
    6927      2599922 :         fn (succ, succ->dest);
    6928              :     }
    6929      3937338 : }
    6930              : 
    6931              : /* Return true if layout LAYOUT_I is compatible with the number of SLP lanes
    6932              :    that NODE would operate on.  This test is independent of NODE's actual
    6933              :    operation.  */
    6934              : 
    6935              : bool
    6936      1769406 : vect_optimize_slp_pass::is_compatible_layout (slp_tree node,
    6937              :                                               unsigned int layout_i)
    6938              : {
    6939      1769406 :   if (layout_i == 0)
    6940              :     return true;
    6941              : 
    6942      1012492 :   if (SLP_TREE_LANES (node) != m_perms[layout_i].length ())
    6943        14802 :     return false;
    6944              : 
    6945              :   return true;
    6946              : }
    6947              : 
    6948              : /* Return true if layout LAYOUT_I is compatible with the number of SLP lanes
    6949              :    that NODE would operate on for each NODE in PARTITION.
    6950              :    This test is independent of NODE's actual operations.  */
    6951              : 
    6952              : bool
    6953        17791 : vect_optimize_slp_pass::is_compatible_layout (const slpg_partition_info
    6954              :                                                 &partition,
    6955              :                                               unsigned int layout_i)
    6956              : {
    6957        35854 :   for (unsigned int order_i = partition.node_begin;
    6958        35854 :        order_i < partition.node_end; ++order_i)
    6959              :     {
    6960        18129 :       unsigned int node_i = m_partitioned_nodes[order_i];
    6961        18129 :       auto &vertex = m_vertices[node_i];
    6962              : 
    6963              :       /* The layout is incompatible if it is individually incompatible
    6964              :          with any node in the partition.  */
    6965        18129 :       if (!is_compatible_layout (vertex.node, layout_i))
    6966              :         return false;
    6967              :     }
    6968              :   return true;
    6969              : }
    6970              : 
    6971              : /* Return the cost (in arbitrary units) of going from layout FROM_LAYOUT_I
    6972              :    to layout TO_LAYOUT_I for a node like NODE.  Return -1 if either of the
    6973              :    layouts is incompatible with NODE or if the change is not possible for
    6974              :    some other reason.
    6975              : 
    6976              :    The properties taken from NODE include the number of lanes and the
    6977              :    vector type.  The actual operation doesn't matter.  */
    6978              : 
    6979              : int
    6980       756986 : vect_optimize_slp_pass::change_layout_cost (slp_tree node,
    6981              :                                             unsigned int from_layout_i,
    6982              :                                             unsigned int to_layout_i)
    6983              : {
    6984       756986 :   if (!is_compatible_layout (node, from_layout_i)
    6985       756986 :       || !is_compatible_layout (node, to_layout_i))
    6986          563 :     return -1;
    6987              : 
    6988       756423 :   if (from_layout_i == to_layout_i)
    6989              :     return 0;
    6990              : 
    6991       320840 :   auto_vec<slp_tree, 1> children (1);
    6992       320840 :   children.quick_push (node);
    6993       320840 :   auto_lane_permutation_t perm (SLP_TREE_LANES (node));
    6994       320840 :   if (from_layout_i > 0)
    6995       900928 :     for (unsigned int i : m_perms[from_layout_i])
    6996       393811 :       perm.quick_push ({ 0, i });
    6997              :   else
    6998       488218 :     for (unsigned int i = 0; i < SLP_TREE_LANES (node); ++i)
    6999       336417 :       perm.quick_push ({ 0, i });
    7000       320840 :   if (to_layout_i > 0)
    7001       152228 :     vect_slp_permute (m_perms[to_layout_i], perm, true);
    7002       320840 :   auto count = vectorizable_slp_permutation_1 (m_vinfo, nullptr, node, perm,
    7003              :                                                children, false);
    7004       320840 :   if (count >= 0)
    7005       316687 :     return MAX (count, 1);
    7006              : 
    7007              :   /* ??? In principle we could try changing via layout 0, giving two
    7008              :      layout changes rather than 1.  Doing that would require
    7009              :      corresponding support in get_result_with_layout.  */
    7010              :   return -1;
    7011       320840 : }
    7012              : 
    7013              : /* Return the costs of assigning layout LAYOUT_I to partition PARTITION_I.  */
    7014              : 
    7015              : inline slpg_partition_layout_costs &
    7016      1083115 : vect_optimize_slp_pass::partition_layout_costs (unsigned int partition_i,
    7017              :                                                 unsigned int layout_i)
    7018              : {
    7019      2166230 :   return m_partition_layout_costs[partition_i * m_perms.length () + layout_i];
    7020              : }
    7021              : 
    7022              : /* Change PERM in one of two ways:
    7023              : 
    7024              :    - if IN_LAYOUT_I < 0, accept input operand I in the layout that has been
    7025              :      chosen for child I of NODE.
    7026              : 
    7027              :    - if IN_LAYOUT_I >= 0, accept all input operands with that layout.
    7028              : 
    7029              :    In both cases, arrange for the output to have layout OUT_LAYOUT_I.  */
    7030              : 
    7031              : void
    7032        30650 : vect_optimize_slp_pass::
    7033              : change_vec_perm_layout (slp_tree node, lane_permutation_t &perm,
    7034              :                         int in_layout_i, unsigned int out_layout_i)
    7035              : {
    7036       178116 :   for (auto &entry : perm)
    7037              :     {
    7038        86166 :       int this_in_layout_i = in_layout_i;
    7039        86166 :       if (this_in_layout_i < 0)
    7040              :         {
    7041        59911 :           slp_tree in_node = SLP_TREE_CHILDREN (node)[entry.first];
    7042        59911 :           unsigned int in_partition_i = m_vertices[in_node->vertex].partition;
    7043        59911 :           if (in_partition_i == -1u)
    7044          329 :             continue;
    7045        59582 :           this_in_layout_i = m_partitions[in_partition_i].layout;
    7046              :         }
    7047        85837 :       if (this_in_layout_i > 0)
    7048        19151 :         entry.second = m_perms[this_in_layout_i][entry.second];
    7049              :     }
    7050        30650 :   if (out_layout_i > 0)
    7051         7153 :     vect_slp_permute (m_perms[out_layout_i], perm, true);
    7052        30650 : }
    7053              : 
    7054              : /* Check whether the target allows NODE to be rearranged so that the node's
    7055              :    output has layout OUT_LAYOUT_I.  Return the cost of the change if so,
    7056              :    in the same arbitrary units as for change_layout_cost.  Return -1 otherwise.
    7057              : 
    7058              :    If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I < 0, also check whether
    7059              :    NODE can adapt to the layout changes that have (perhaps provisionally)
    7060              :    been chosen for NODE's children, so that no extra permutations are
    7061              :    needed on either the input or the output of NODE.
    7062              : 
    7063              :    If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I >= 0, instead assume
    7064              :    that all inputs will be forced into layout IN_LAYOUT_I beforehand.
    7065              : 
    7066              :    IN_LAYOUT_I has no meaning for other types of node.
    7067              : 
    7068              :    Keeping the node as-is is always valid.  If the target doesn't appear
    7069              :    to support the node as-is, but might realistically support other layouts,
    7070              :    then layout 0 instead has the cost of a worst-case permutation.  On the
    7071              :    one hand, this ensures that every node has at least one valid layout,
    7072              :    avoiding what would otherwise be an awkward special case.  On the other,
    7073              :    it still encourages the pass to change an invalid pre-existing layout
    7074              :    choice into a valid one.  */
    7075              : 
    7076              : int
    7077       233603 : vect_optimize_slp_pass::internal_node_cost (slp_tree node, int in_layout_i,
    7078              :                                             unsigned int out_layout_i)
    7079              : {
    7080       233603 :   const int fallback_cost = 1;
    7081              : 
    7082       233603 :   if (SLP_TREE_PERMUTE_P (node))
    7083              :     {
    7084        25506 :       auto_lane_permutation_t tmp_perm;
    7085        25506 :       tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
    7086              : 
    7087              :       /* Check that the child nodes support the chosen layout.  Checking
    7088              :          the first child is enough, since any second child would have the
    7089              :          same shape.  */
    7090        25506 :       auto first_child = SLP_TREE_CHILDREN (node)[0];
    7091        25506 :       if (in_layout_i > 0
    7092        25506 :           && !is_compatible_layout (first_child, in_layout_i))
    7093              :         return -1;
    7094              : 
    7095        24947 :       change_vec_perm_layout (node, tmp_perm, in_layout_i, out_layout_i);
    7096        49894 :       int count = vectorizable_slp_permutation_1 (m_vinfo, nullptr,
    7097              :                                                   node, tmp_perm,
    7098        24947 :                                                   SLP_TREE_CHILDREN (node),
    7099              :                                                   false);
    7100        24947 :       if (count < 0)
    7101              :         {
    7102         1498 :           if (in_layout_i == 0 && out_layout_i == 0)
    7103              :             {
    7104              :               /* Use the fallback cost if the node could in principle support
    7105              :                  some nonzero layout for both the inputs and the outputs.
    7106              :                  Otherwise assume that the node will be rejected later
    7107              :                  and rebuilt from scalars.  */
    7108          363 :               if (SLP_TREE_LANES (node) == SLP_TREE_LANES (first_child))
    7109              :                 return fallback_cost;
    7110          293 :               return 0;
    7111              :             }
    7112              :           return -1;
    7113              :         }
    7114              : 
    7115              :       /* We currently have no way of telling whether the new layout is cheaper
    7116              :          or more expensive than the old one.  But at least in principle,
    7117              :          it should be worth making zero permutations (whole-vector shuffles)
    7118              :          cheaper than real permutations, in case the pass is able to remove
    7119              :          the latter.  */
    7120        23449 :       return count == 0 ? 0 : 1;
    7121        25506 :     }
    7122              : 
    7123       208097 :   stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
    7124       208097 :   if (rep
    7125       207165 :       && STMT_VINFO_DATA_REF (rep)
    7126        64004 :       && DR_IS_READ (STMT_VINFO_DATA_REF (rep))
    7127       254673 :       && SLP_TREE_LOAD_PERMUTATION (node).exists ())
    7128              :     {
    7129        39514 :       auto_load_permutation_t tmp_perm;
    7130        39514 :       tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
    7131        39514 :       if (out_layout_i > 0)
    7132        13543 :         vect_slp_permute (m_perms[out_layout_i], tmp_perm, true);
    7133              : 
    7134        39514 :       poly_uint64 vf = 1;
    7135        39514 :       if (auto loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo))
    7136        12066 :         vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
    7137        39514 :       unsigned int n_perms;
    7138        39514 :       if (!vect_transform_slp_perm_load_1 (m_vinfo, node, tmp_perm, vNULL,
    7139              :                                            nullptr, vf, true, false, &n_perms))
    7140              :         {
    7141         1492 :           auto rep = SLP_TREE_REPRESENTATIVE (node);
    7142         1492 :           if (out_layout_i == 0)
    7143              :             {
    7144              :               /* Use the fallback cost if the load is an N-to-N permutation.
    7145              :                  Otherwise assume that the node will be rejected later
    7146              :                  and rebuilt from scalars.  */
    7147         1089 :               if (STMT_VINFO_GROUPED_ACCESS (rep)
    7148         2178 :                   && (DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (rep))
    7149         1089 :                       == SLP_TREE_LANES (node)))
    7150          593 :                 return fallback_cost;
    7151              :               return 0;
    7152              :             }
    7153              :           return -1;
    7154              :         }
    7155              : 
    7156              :       /* See the comment above the corresponding VEC_PERM_EXPR handling.  */
    7157        38022 :       return n_perms == 0 ? 0 : 1;
    7158        39514 :     }
    7159              : 
    7160              :   return 0;
    7161              : }
    7162              : 
    7163              : /* Decide which element layouts we should consider using.  Calculate the
    7164              :    weights associated with inserting layout changes on partition edges.
    7165              :    Also mark partitions that cannot change layout, by setting their
    7166              :    layout to zero.
                      :
                      :    The function works in three steps:
                      :
                      :    - Walk every partitioned node and derive candidate layouts from
                      :      load permutations and from single-input VEC_PERM_EXPRs.  Each
                      :      distinct bijective permutation gets its own index in m_perms
                      :      (index 0 meaning "no change"); no new layouts are added once
                      :      m_perms holds param_vect_max_layout_candidates entries.
                      :
                      :    - Size m_partition_layout_costs to hold one entry per
                      :      (partition, layout) pair and force layout 0 on the root
                      :      partitions of constructor instances and of reduction chains
                      :      that need a fold-left reduction.
                      :
                      :    - Walk the nodes again to record vertex weights, force layout 0
                      :      on partitions containing nodes that cannot be permuted, and
                      :      count the in/out degree of each partition.  */
    7167              : 
    7168              : void
    7169       681015 : vect_optimize_slp_pass::start_choosing_layouts ()
    7170              : {
    7171              :   /* Used to assign unique permutation indices: maps each distinct
                      :      permutation vector to its index in m_perms.  */
    7172       681015 :   using perm_hash = unbounded_hashmap_traits<
    7173              :     vec_free_hash_base<int_hash_base<unsigned>>,
    7174              :     int_hash<int, -1, -2>
    7175              :   >;
    7176       681015 :   hash_map<vec<unsigned>, int, perm_hash> layout_ids;
    7177              : 
    7178              :   /* Layout 0 is "no change".  */
    7179       681015 :   m_perms.safe_push (vNULL);
    7180              : 
    7181              :   /* Create layouts from existing permutations.  */
    7182       681015 :   auto_load_permutation_t tmp_perm;
    7183      5538040 :   for (unsigned int node_i : m_partitioned_nodes)
    7184              :     {
    7185              :       /* Leaves also double as entries to the reverse graph.  Allow the
    7186              :          layout of those to be changed.  Nodes without successor edges
                      :          start out with layout 0 here; the permutation handling below
                      :          may overwrite that.  */
    7187      3494995 :       auto &vertex = m_vertices[node_i];
    7188      3494995 :       auto &partition = m_partitions[vertex.partition];
    7189      3494995 :       if (!m_slpg->vertices[node_i].succ)
    7190       886667 :         partition.layout = 0;
    7191              : 
    7192              :       /* Loads and VEC_PERM_EXPRs are the only things generating permutes.  */
    7193      3494995 :       slp_tree node = vertex.node;
    7194      3494995 :       stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
    7195      3494995 :       slp_tree child;
    7196      3494995 :       unsigned HOST_WIDE_INT imin, imax = 0;
    7197      3494995 :       bool any_permute = false;
    7198      3494995 :       tmp_perm.truncate (0);
    7199      3494995 :       if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
    7200              :         {
    7201              :           /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the node
    7202              :              unpermuted, record a layout that reverses this permutation.
    7203              : 
    7204              :              We would need more work to cope with loads that are internally
    7205              :              permuted and also have inputs (such as masks for
    7206              :              IFN_MASK_LOADs).  */
    7207       596312 :           gcc_assert (partition.layout == 0 && !m_slpg->vertices[node_i].succ);
    7208       596312 :           if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
    7209              :             {
    7210       423197 :               partition.layout = -1;
    7211      3477716 :               continue;
    7212              :             }
    7213       173115 :           dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
    7214       173115 :           imin = DR_GROUP_SIZE (dr_stmt) + 1;
    7215       173115 :           tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
    7216              :         }
    7217      5679334 :       else if (SLP_TREE_PERMUTE_P (node)
    7218       136514 :                && SLP_TREE_CHILDREN (node).length () == 1
    7219       118032 :                && (child = SLP_TREE_CHILDREN (node)[0])
    7220      3016715 :                && (TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (child))
    7221       118032 :                    .is_constant (&imin)))
    7222              :         {
    7223              :           /* If the child has the same vector size as this node,
    7224              :              reversing the permutation can make the permutation a no-op.
    7225              :              In other cases it can change a true permutation into a
    7226              :              full-vector extract.  */
    7227       118032 :           tmp_perm.reserve (SLP_TREE_LANES (node));
    7228       316665 :           for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
    7229       198633 :             tmp_perm.quick_push (SLP_TREE_LANE_PERMUTATION (node)[j].second);
    7230              :         }
    7231              :       else
    7232      2780651 :         continue;
    7233              : 
    7234       768326 :       for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
    7235              :         {
    7236       477179 :           unsigned idx = tmp_perm[j];
    7237       477179 :           imin = MIN (imin, idx);
    7238       477179 :           imax = MAX (imax, idx);
    7239       477179 :           if (idx - tmp_perm[0] != j)
    7240       139073 :             any_permute = true;
    7241              :         }
    7242              :       /* If the span doesn't match we'd disrupt VF computation, avoid
    7243              :          that for now.  */
    7244       291147 :       if (imax - imin + 1 != SLP_TREE_LANES (node))
    7245        82561 :         continue;
    7246              :       /* If there's no permute no need to split one out.  In this case
    7247              :          we can consider turning a load into a permuted load, if that
    7248              :          turns out to be cheaper than alternatives.  */
    7249       208586 :       if (!any_permute)
    7250              :         {
    7251       191169 :           partition.layout = -1;
    7252       191169 :           continue;
    7253              :         }
    7254              : 
    7255              :       /* For now only handle true permutes, like
    7256              :          vect_attempt_slp_rearrange_stmts did.  This allows us to be lazy
    7257              :          when permuting constants and invariants keeping the permute
    7258              :          bijective.  The bitmap below records which of the indices
                      :          IMIN .. IMIN + lanes - 1 are used; the node is skipped if
                      :          any of them is missing.  */
    7259        17417 :       auto_sbitmap load_index (SLP_TREE_LANES (node));
    7260        17417 :       bitmap_clear (load_index);
    7261        66465 :       for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
    7262        49048 :         bitmap_set_bit (load_index, tmp_perm[j] - imin);
    7263              :       unsigned j;
    7264        65781 :       for (j = 0; j < SLP_TREE_LANES (node); ++j)
    7265        48502 :         if (!bitmap_bit_p (load_index, j))
    7266              :           break;
    7267        17417 :       if (j != SLP_TREE_LANES (node))
    7268          138 :         continue;
    7269              : 
    7270        17279 :       vec<unsigned> perm = vNULL;
    7271        17279 :       perm.safe_grow (SLP_TREE_LANES (node), true);
    7272        65542 :       for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
    7273        48263 :         perm[j] = tmp_perm[j] - imin;
    7274              : 
    7275        34558 :       if (int (m_perms.length ()) >= param_vect_max_layout_candidates)
    7276              :         {
    7277              :           /* Continue to use existing layouts, but don't add any more.  */
    7278            0 :           int *entry = layout_ids.get (perm);
    7279            0 :           partition.layout = entry ? *entry : 0;
    7280            0 :           perm.release ();
    7281              :         }
    7282              :       else
    7283              :         {
    7284        17279 :           bool existed;
    7285        17279 :           int &layout_i = layout_ids.get_or_insert (perm, &existed);
    7286        17279 :           if (existed)
    7287         6233 :             perm.release ();
    7288              :           else
    7289              :             {
    7290        11046 :               layout_i = m_perms.length ();
    7291        11046 :               m_perms.safe_push (perm);
    7292              :             }
    7293        17279 :           partition.layout = layout_i;
    7294              :         }
    7295        17417 :     }
    7296              : 
    7297              :   /* Initially assume that every layout is possible and has zero cost
    7298              :      in every partition.  */
    7299       681015 :   m_partition_layout_costs.safe_grow_cleared (m_partitions.length ()
    7300      1362030 :                                               * m_perms.length ());
    7301              : 
    7302              :   /* We have to mark outgoing permutations facing non-associating-reduction
    7303              :      graph entries that are not represented as to be materialized.
    7304              :      slp_inst_kind_bb_reduc currently only covers associatable reductions.
                      :      Concretely, force layout 0 on the root partition of constructor
                      :      instances and of reduction chains that need a fold-left
                      :      reduction.  */
    7305      3509104 :   for (slp_instance instance : m_vinfo->slp_instances)
    7306      1466059 :     if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor)
    7307              :       {
    7308         6350 :         unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
    7309         6350 :         m_partitions[m_vertices[node_i].partition].layout = 0;
    7310              :       }
    7311      1459709 :     else if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_reduc_chain)
    7312              :       {
    7313         2255 :         stmt_vec_info stmt_info
    7314         2255 :           = SLP_TREE_REPRESENTATIVE (SLP_INSTANCE_TREE (instance));
    7315         2255 :         vect_reduc_info reduc_info
    7316         2255 :           = info_for_reduction (as_a <loop_vec_info> (m_vinfo),
    7317              :                                 SLP_INSTANCE_TREE (instance));
    7318         2255 :         if (needs_fold_left_reduction_p (TREE_TYPE
    7319              :                                            (gimple_get_lhs (stmt_info->stmt)),
    7320              :                                          VECT_REDUC_INFO_CODE (reduc_info)))
    7321              :           {
    7322           97 :             unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
    7323           97 :             m_partitions[m_vertices[node_i].partition].layout = 0;
    7324              :           }
    7325              :       }
    7326              : 
    7327              :   /* Check which layouts each node and partition can handle.  Calculate the
    7328              :      weights associated with inserting layout changes on edges.  */
    7329      5538040 :   for (unsigned int node_i : m_partitioned_nodes)
    7330              :     {
    7331      3494995 :       auto &vertex = m_vertices[node_i];
    7332      3494995 :       auto &partition = m_partitions[vertex.partition];
    7333      3494995 :       slp_tree node = vertex.node;
    7334              : 
    7335      3494995 :       if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
    7336              :         {
    7337      3489700 :           vertex.weight = vect_slp_node_weight (node);
    7338              : 
    7339              :           /* We do not handle stores with a permutation, so all
    7340              :              incoming permutations must have been materialized.
    7341              : 
    7342              :              We also don't handle masked grouped loads, which lack a
    7343              :              permutation vector.  In this case the memory locations
    7344              :              form an implicit second input to the loads, on top of the
    7345              :              explicit mask input, and the memory input's layout cannot
    7346              :              be changed.
    7347              : 
    7348              :              On the other hand, we do support permuting gather loads and
    7349              :              masked gather loads, where each scalar load is independent
    7350              :              of the others.  This can be useful if the address/index input
    7351              :              benefits from permutation.  */
    7352      3489700 :           if (STMT_VINFO_DATA_REF (rep)
    7353      1761443 :               && STMT_VINFO_GROUPED_ACCESS (rep)
    7354      4583495 :               && !SLP_TREE_LOAD_PERMUTATION (node).exists ())
    7355       920680 :             partition.layout = 0;
    7356              : 
    7357              :           /* We cannot change the layout of an operation whose lanes are
    7358              :              not independent of each other.  Note this is an explicit
    7359              :              negative list since that's much shorter than the respective
    7360              :              positive one but it's critical to keep maintaining it.  */
    7361      3489700 :           if (is_gimple_call (STMT_VINFO_STMT (rep)))
    7362        31722 :             switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep)))
    7363              :               {
    7364         1155 :               case CFN_COMPLEX_ADD_ROT90:
    7365         1155 :               case CFN_COMPLEX_ADD_ROT270:
    7366         1155 :               case CFN_COMPLEX_MUL:
    7367         1155 :               case CFN_COMPLEX_MUL_CONJ:
    7368         1155 :               case CFN_VEC_ADDSUB:
    7369         1155 :               case CFN_VEC_FMADDSUB:
    7370         1155 :               case CFN_VEC_FMSUBADD:
    7371         1155 :                 partition.layout = 0;
    7372              :               default:;
    7373              :               }
    7374              :         }
    7375              : 
    7376      7882313 :       auto process_edge = [&](graph_edge *ud, unsigned int other_node_i)
    7377              :         {
    7378      4387318 :           auto &other_vertex = m_vertices[other_node_i];
    7379              : 
    7380              :           /* Count the number of edges from earlier partitions and the number
    7381              :              of edges to later partitions.  */
    7382      4387318 :           if (other_vertex.partition < vertex.partition)
    7383      2193659 :             partition.in_degree += 1;
    7384              :           else
    7385      2193659 :             partition.out_degree += 1;
    7386              : 
    7387              :           /* If the current node uses the result of OTHER_NODE_I, accumulate
    7388              :              the effects of that.  */
    7389      4387318 :           if (ud->src == int (node_i))
    7390              :             {
    7391      2193659 :               other_vertex.out_weight += vertex.weight;
    7392      2193659 :               other_vertex.out_degree += 1;
    7393              :             }
    7394      7882313 :         };
    7395      3494995 :       for_each_partition_edge (node_i, process_edge);
    7396              :     }
    7397       681015 : }
    7398              : 
    7399              : /* Return the incoming costs for node NODE_I, assuming that each input keeps
    7400              :    its current (provisional) choice of layout.  The inputs do not necessarily
    7401              :    have the same layout as each other.
                      :
                      :    Only edges whose other end lies in an earlier partition contribute.
                      :    For each such edge, the other partition's input cost and internal
                      :    cost (for the layout it currently holds) are combined serially,
                      :    split by that partition's out_degree, and then accumulated into
                      :    the result as a parallel cost.  */
    7402              : 
    7403              : slpg_layout_cost
    7404         3183 : vect_optimize_slp_pass::total_in_cost (unsigned int node_i)
    7405              : {
    7406         3183 :   auto &vertex = m_vertices[node_i];
    7407         3183 :   slpg_layout_cost cost;
    7408        11635 :   auto add_cost = [&](graph_edge *, unsigned int other_node_i)
    7409              :     {
    7410         8452 :       auto &other_vertex = m_vertices[other_node_i];
    7411         8452 :       if (other_vertex.partition < vertex.partition)
    7412              :         {
    7413         5357 :           auto &other_partition = m_partitions[other_vertex.partition];
    7414        10714 :           auto &other_costs = partition_layout_costs (other_vertex.partition,
    7415         5357 :                                                       other_partition.layout);
    7416         5357 :           slpg_layout_cost this_cost = other_costs.in_cost;
    7417         5357 :           this_cost.add_serial_cost (other_costs.internal_cost);
    7418         5357 :           this_cost.split (other_partition.out_degree);
    7419         5357 :           cost.add_parallel_cost (this_cost);
    7420              :         }
    7421        11635 :     };
    7422         3183 :   for_each_partition_edge (node_i, add_cost);
    7423         3183 :   return cost;
    7424              : }
    7425              : 
    7426              : /* Return the cost of switching between layout LAYOUT1_I (at node NODE1_I)
    7427              :    and layout LAYOUT2_I on cross-partition use-to-def edge UD.  Return
    7428              :    slpg_layout_cost::impossible () if the change isn't possible.
                      :
                      :    UD->dest is the definition and UD->src is the use.  NODE1_I says
                      :    which end of the edge LAYOUT1_I belongs to; LAYOUT2_I belongs to
                      :    the other end.  The change is impossible when change_layout_cost
                      :    returns a negative factor for the definition node.  */
    7429              : 
    7430              : slpg_layout_cost
    7431       756986 : vect_optimize_slp_pass::
    7432              : edge_layout_cost (graph_edge *ud, unsigned int node1_i, unsigned int layout1_i,
    7433              :                   unsigned int layout2_i)
    7434              : {
    7435       756986 :   auto &def_vertex = m_vertices[ud->dest];
    7436       756986 :   auto &use_vertex = m_vertices[ud->src];
    7437       756986 :   auto def_layout_i = ud->dest == int (node1_i) ? layout1_i : layout2_i;
    7438       756986 :   auto use_layout_i = ud->dest == int (node1_i) ? layout2_i : layout1_i;
    7439       756986 :   auto factor = change_layout_cost (def_vertex.node, def_layout_i,
    7440              :                                     use_layout_i);
    7441       756986 :   if (factor < 0)
    7442         4716 :     return slpg_layout_cost::impossible ();
    7443              : 
    7444              :   /* We have a choice of putting the layout change at the site of the
    7445              :      definition or at the site of the use.  Prefer the former when
    7446              :      optimizing for size or when the execution frequency of the
    7447              :      definition is no greater than the combined execution frequencies of
    7448              :      the uses.  When putting the layout change at the site of the definition,
    7449              :      divvy up the cost among all consumers.  */
    7450       752270 :   if (m_optimize_size || def_vertex.weight <= def_vertex.out_weight)
    7451              :     {
    7452       734850 :       slpg_layout_cost cost = { def_vertex.weight * factor, m_optimize_size };
    7453       734850 :       cost.split (def_vertex.out_degree);
    7454       734850 :       return cost;
    7455              :     }
    7456        17420 :   return { use_vertex.weight * factor, m_optimize_size };
    7457              : }
    7458              : 
    7459              : /* UD represents a use-def link between FROM_NODE_I and a node in a later
    7460              :    partition; FROM_NODE_I could be the definition node or the use node.
    7461              :    The node at the other end of the link wants to use layout TO_LAYOUT_I.
    7462              :    Return the cost of any necessary fix-ups on edge UD, or return
    7463              :    slpg_layout_cost::impossible () if the change isn't possible.
    7464              : 
    7465              :    At this point, FROM_NODE_I's partition has chosen the cheapest
    7466              :    layout based on the information available so far, but this choice
    7467              :    is only provisional.
                      :
                      :    Two alternatives are costed and the better one is returned: either
                      :    FROM_NODE_I's partition keeps its current layout and the change
                      :    happens on UD, or the partition itself switches to TO_LAYOUT_I.
                      :    If the change on UD is impossible while the partition's current
                      :    layout is 0, the result is impossible and the second alternative
                      :    is not considered.  */
    7468              : 
    7469              : slpg_layout_cost
    7470       199137 : vect_optimize_slp_pass::forward_cost (graph_edge *ud, unsigned int from_node_i,
    7471              :                                       unsigned int to_layout_i)
    7472              : {
    7473       199137 :   auto &from_vertex = m_vertices[from_node_i];
    7474       199137 :   unsigned int from_partition_i = from_vertex.partition;
    7475       199137 :   slpg_partition_info &from_partition = m_partitions[from_partition_i];
    7476       199137 :   gcc_assert (from_partition.layout >= 0);
    7477              : 
    7478              :   /* First calculate the cost on the assumption that FROM_PARTITION sticks
    7479              :      with its current layout preference.  */
    7480       199137 :   slpg_layout_cost cost = slpg_layout_cost::impossible ();
    7481       199137 :   auto edge_cost = edge_layout_cost (ud, from_node_i,
    7482       199137 :                                      from_partition.layout, to_layout_i);
    7483       199137 :   if (edge_cost.is_possible ())
    7484              :     {
    7485       393344 :       auto &from_costs = partition_layout_costs (from_partition_i,
    7486       196672 :                                                  from_partition.layout);
    7487       196672 :       cost = from_costs.in_cost;
    7488       196672 :       cost.add_serial_cost (from_costs.internal_cost);
    7489       196672 :       cost.split (from_partition.out_degree);
    7490       196672 :       cost.add_serial_cost (edge_cost);
    7491              :     }
    7492         2465 :   else if (from_partition.layout == 0)
    7493              :     /* We must allow the source partition to have layout 0 as a fallback,
    7494              :        in case all other options turn out to be impossible.  */
    7495         2465 :     return cost;
    7496              : 
    7497              :   /* Take the minimum of that cost and the cost that applies if
    7498              :      FROM_PARTITION instead switches to TO_LAYOUT_I.  */
    7499       196672 :   auto &direct_layout_costs = partition_layout_costs (from_partition_i,
    7500              :                                                       to_layout_i);
    7501       196672 :   if (direct_layout_costs.is_possible ())
    7502              :     {
    7503       176892 :       slpg_layout_cost direct_cost = direct_layout_costs.in_cost;
    7504       176892 :       direct_cost.add_serial_cost (direct_layout_costs.internal_cost);
    7505       176892 :       direct_cost.split (from_partition.out_degree);
    7506       176892 :       if (!cost.is_possible ()
    7507       176892 :           || direct_cost.is_better_than (cost, m_optimize_size))
    7508        45073 :         cost = direct_cost;
    7509              :     }
    7510              : 
    7511       196672 :   return cost;
    7512              : }
    7513              : 
    7514              : /* UD represents a use-def link between TO_NODE_I and a node in an earlier
    7515              :    partition; TO_NODE_I could be the definition node or the use node.
    7516              :    The node at the other end of the link wants to use layout FROM_LAYOUT_I;
    7517              :    return the cost of any necessary fix-ups on edge UD, or
    7518              :    slpg_layout_cost::impossible () if the choice cannot be made.
    7519              : 
    7520              :    At this point, TO_NODE_I's partition has a fixed choice of layout.
                      :
                      :    If TO_NODE_I is the use on UD and is a VEC_PERM_EXPR, first ask
                      :    internal_node_cost whether the node can cope with the input having
                      :    layout FROM_LAYOUT_I.  Otherwise, or if that query returns a
                      :    negative factor, use the cost of a layout change on UD.  */
    7521              : 
    7522              : slpg_layout_cost
    7523       183585 : vect_optimize_slp_pass::backward_cost (graph_edge *ud, unsigned int to_node_i,
    7524              :                                        unsigned int from_layout_i)
    7525              : {
    7526       183585 :   auto &to_vertex = m_vertices[to_node_i];
    7527       183585 :   unsigned int to_partition_i = to_vertex.partition;
    7528       183585 :   slpg_partition_info &to_partition = m_partitions[to_partition_i];
    7529       183585 :   gcc_assert (to_partition.layout >= 0);
    7530              : 
    7531              :   /* If TO_NODE_I is a VEC_PERM_EXPR consumer, see whether it can be
    7532              :      adjusted for this input having layout FROM_LAYOUT_I.  Assume that
    7533              :      any other inputs keep their current choice of layout.  The layout
                      :      of the definition's partition is temporarily set to FROM_LAYOUT_I
                      :      while internal_node_cost is queried and is restored afterwards.  */
    7534       183585 :   auto &to_costs = partition_layout_costs (to_partition_i,
    7535              :                                            to_partition.layout);
    7536       183585 :   if (ud->src == int (to_node_i)
    7537       183383 :       && SLP_TREE_PERMUTE_P (to_vertex.node))
    7538              :     {
    7539         9507 :       auto &from_partition = m_partitions[m_vertices[ud->dest].partition];
    7540         9507 :       auto old_layout = from_partition.layout;
    7541         9507 :       from_partition.layout = from_layout_i;
    7542        19014 :       int factor = internal_node_cost (to_vertex.node, -1,
    7543         9507 :                                        to_partition.layout);
    7544         9507 :       from_partition.layout = old_layout;
    7545         9507 :       if (factor >= 0)
    7546              :         {
    7547         8881 :           slpg_layout_cost cost = to_costs.out_cost;
    7548        17762 :           cost.add_serial_cost ({ to_vertex.weight * factor,
    7549         8881 :                                   m_optimize_size });
    7550         8881 :           cost.split (to_partition.in_degree);
    7551         8881 :           return cost;
    7552              :         }
    7553              :     }
    7554              : 
    7555              :   /* Compute the cost if we insert any necessary layout change on edge UD.  */
    7556       174704 :   auto edge_cost = edge_layout_cost (ud, to_node_i,
    7557       174704 :                                      to_partition.layout, from_layout_i);
    7558       174704 :   if (edge_cost.is_possible ())
    7559              :     {
    7560       174704 :       slpg_layout_cost cost = to_costs.out_cost;
    7561       174704 :       cost.add_serial_cost (to_costs.internal_cost);
    7562       174704 :       cost.split (to_partition.in_degree);
    7563       174704 :       cost.add_serial_cost (edge_cost);
    7564       174704 :       return cost;
    7565              :     }
    7566              : 
    7567            0 :   return slpg_layout_cost::impossible ();
    7568              : }
    7569              : 
    7570              : /* Make a forward pass through the partitions, accumulating input costs.
    7571              :    Make a tentative (provisional) choice of layout for each partition,
    7572              :    ensuring that this choice still allows later partitions to keep
    7573              :    their original layout.  */
    7574              : 
    7575              : void
    7576         5690 : vect_optimize_slp_pass::forward_pass ()
    7577              : {
    7578       125452 :   for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
    7579              :        ++partition_i)
    7580              :     {
    7581       119762 :       auto &partition = m_partitions[partition_i];
    7582              : 
    7583              :       /* If the partition consists of a single VEC_PERM_EXPR, precompute
    7584              :          the incoming cost that would apply if every predecessor partition
    7585              :          keeps its current layout.  This is used within the loop below.  */
    7586       119762 :       slpg_layout_cost in_cost;
    7587       119762 :       slp_tree single_node = nullptr;
    7588       119762 :       if (partition.node_end == partition.node_begin + 1)
    7589              :         {
    7590       113591 :           unsigned int node_i = m_partitioned_nodes[partition.node_begin];
    7591       113591 :           single_node = m_vertices[node_i].node;
    7592       113591 :           if (SLP_TREE_PERMUTE_P (single_node))
    7593         3183 :             in_cost = total_in_cost (node_i);
    7594              :         }
    7595              : 
    7596              :       /* Go through the possible layouts.  Decide which ones are valid
    7597              :          for this partition and record which of the valid layouts has
    7598              :          the lowest cost.  */
    7599       119762 :       unsigned int min_layout_i = 0;
    7600       119762 :       slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
    7601       365139 :       for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
    7602              :         {
    7603       245377 :           auto &layout_costs = partition_layout_costs (partition_i, layout_i);
    7604       245377 :           if (!layout_costs.is_possible ())
    7605        55736 :             continue;
    7606              : 
    7607              :           /* If the recorded layout is already 0 then the layout cannot
    7608              :              change.  */
    7609       245377 :           if (partition.layout == 0 && layout_i != 0)
    7610              :             {
    7611        39012 :               layout_costs.mark_impossible ();
    7612        39012 :               continue;
    7613              :             }
    7614              : 
    7615       206365 :           bool is_possible = true;
    7616       423897 :           for (unsigned int order_i = partition.node_begin;
    7617       423897 :                order_i < partition.node_end; ++order_i)
    7618              :             {
    7619       232178 :               unsigned int node_i = m_partitioned_nodes[order_i];
    7620       232178 :               auto &vertex = m_vertices[node_i];
    7621              : 
    7622              :               /* Reject the layout if it is individually incompatible
    7623              :                  with any node in the partition.  */
    7624       232178 :               if (!is_compatible_layout (vertex.node, layout_i))
    7625              :                 {
    7626        13614 :                   is_possible = false;
    7627        14646 :                   break;
    7628              :                 }
    7629              : 
    7630       604299 :               auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
    7631              :                 {
    7632       385735 :                   auto &other_vertex = m_vertices[other_node_i];
    7633       385735 :                   if (other_vertex.partition < vertex.partition)
    7634              :                     {
    7635              :                       /* Accumulate the incoming costs from earlier
    7636              :                          partitions, plus the cost of any layout changes
    7637              :                          on UD itself.  */
    7638       199137 :                       auto cost = forward_cost (ud, other_node_i, layout_i);
    7639       199137 :                       if (!cost.is_possible ())
    7640         2465 :                         is_possible = false;
    7641              :                       else
    7642       196672 :                         layout_costs.in_cost.add_parallel_cost (cost);
    7643              :                     }
    7644              :                   else
    7645              :                     /* Reject the layout if it would make layout 0 impossible
    7646              :                        for later partitions.  This amounts to testing that the
    7647              :                        target supports reversing the layout change on edges
    7648              :                        to later partitions.
    7649              : 
    7650              :                        In principle, it might be possible to push a layout
    7651              :                        change all the way down a graph, so that it never
    7652              :                        needs to be reversed and so that the target doesn't
    7653              :                        need to support the reverse operation.  But it would
    7654              :                        be awkward to bail out if we hit a partition that
    7655              :                        does not support the new layout, especially since
    7656              :                        we are not dealing with a lattice.  */
    7657       186598 :                     is_possible &= edge_layout_cost (ud, other_node_i, 0,
    7658       186598 :                                                      layout_i).is_possible ();
    7659       604299 :                 };
    7660       218564 :               for_each_partition_edge (node_i, add_cost);
    7661              : 
    7662              :               /* Accumulate the cost of using LAYOUT_I within NODE,
    7663              :                  both for the inputs and the outputs.  */
    7664       218564 :               int factor = internal_node_cost (vertex.node, layout_i,
    7665              :                                                layout_i);
    7666       218564 :               if (factor < 0)
    7667              :                 {
    7668         1032 :                   is_possible = false;
    7669         1032 :                   break;
    7670              :                 }
    7671       217532 :               else if (factor)
    7672        36165 :                 layout_costs.internal_cost.add_serial_cost
    7673        36165 :                   ({ vertex.weight * factor, m_optimize_size });
    7674              :             }
    7675       206365 :           if (!is_possible)
    7676              :             {
    7677        16724 :               layout_costs.mark_impossible ();
    7678        16724 :               continue;
    7679              :             }
    7680              : 
    7681              :           /* Combine the incoming and partition-internal costs.  */
    7682       189641 :           slpg_layout_cost combined_cost = layout_costs.in_cost;
    7683       189641 :           combined_cost.add_serial_cost (layout_costs.internal_cost);
    7684              : 
    7685              :           /* If this partition consists of a single VEC_PERM_EXPR, see
    7686              :              if the VEC_PERM_EXPR can be changed to support output layout
    7687              :              LAYOUT_I while keeping all the provisional choices of input
    7688              :              layout.  */
    7689       189641 :           if (single_node && SLP_TREE_PERMUTE_P (single_node))
    7690              :             {
    7691         5532 :               int factor = internal_node_cost (single_node, -1, layout_i);
    7692         5532 :               if (factor >= 0)
    7693              :                 {
    7694         5093 :                   auto weight = m_vertices[single_node->vertex].weight;
    7695         5093 :                   slpg_layout_cost internal_cost
    7696         5093 :                     = { weight * factor, m_optimize_size };
    7697              : 
    7698         5093 :                   slpg_layout_cost alt_cost = in_cost;
    7699         5093 :                   alt_cost.add_serial_cost (internal_cost);
    7700         5093 :                   if (alt_cost.is_better_than (combined_cost, m_optimize_size))
    7701              :                     {
    7702         1604 :                       combined_cost = alt_cost;
    7703         1604 :                       layout_costs.in_cost = in_cost;
    7704         1604 :                       layout_costs.internal_cost = internal_cost;
    7705              :                     }
    7706              :                 }
    7707              :             }
    7708              : 
    7709              :           /* Record the layout with the lowest cost.  Prefer layout 0 in
    7710              :              the event of a tie between it and another layout.  */
    7711       189641 :           if (!min_layout_cost.is_possible ()
    7712        69879 :               || combined_cost.is_better_than (min_layout_cost,
    7713        69879 :                                                m_optimize_size))
    7714              :             {
    7715       134500 :               min_layout_i = layout_i;
    7716       134500 :               min_layout_cost = combined_cost;
    7717              :             }
    7718              :         }
    7719              : 
    7720              :       /* This loop's handling of earlier partitions should ensure that
    7721              :          choosing the original layout for the current partition is no
    7722              :          less valid than it was in the original graph, even with the
    7723              :          provisional layout choices for those earlier partitions.  */
    7724       119762 :       gcc_assert (min_layout_cost.is_possible ());
    7725       119762 :       partition.layout = min_layout_i;
    7726              :     }
    7727         5690 : }
    7728              : 
    7729              : /* Make a backward pass through the partitions (last to first), accumulating output costs from later partitions.
    7730              :    Make a final choice of layout for each partition: the still-possible layout with the lowest in + internal + out cost.  */
    7731              : 
    7732              : void
    7733         5690 : vect_optimize_slp_pass::backward_pass ()
    7734              : {
    7735       131142 :   for (unsigned int partition_i = m_partitions.length (); partition_i-- > 0;)
    7736              :     {
    7737       119762 :       auto &partition = m_partitions[partition_i];
    7738              : 
    7739       119762 :       unsigned int min_layout_i = 0;
    7740       119762 :       slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
    7741       365139 :       for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
    7742              :         {
    7743       245377 :           auto &layout_costs = partition_layout_costs (partition_i, layout_i);
    7744       245377 :           if (!layout_costs.is_possible ())
    7745        55736 :             continue;
    7746              : 
    7747              :           /* Accumulate the costs from successor partitions, noting whether any edge rules LAYOUT_I out.  */
    7748       189641 :           bool is_possible = true;
    7749       405064 :           for (unsigned int order_i = partition.node_begin;
    7750       405064 :                order_i < partition.node_end; ++order_i)
    7751              :             {
    7752       215423 :               unsigned int node_i = m_partitioned_nodes[order_i];
    7753       215423 :               auto &vertex = m_vertices[node_i];
    7754       595555 :               auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
    7755              :                 {
    7756       380132 :                   auto &other_vertex = m_vertices[other_node_i];
    7757       380132 :                   auto &other_partition = m_partitions[other_vertex.partition];
    7758       380132 :                   if (other_vertex.partition > vertex.partition)
    7759              :                     {
    7760              :                       /* OTHER_NODE_I belongs to a later partition.  Accumulate
    7761              :                          the incoming costs from that partition, plus the cost
    7762              :                          of any layout changes on UD itself.  */
    7763       183585 :                       auto cost = backward_cost (ud, other_node_i, layout_i);
    7764       183585 :                       if (!cost.is_possible ())
    7765            0 :                         is_possible = false;
    7766              :                       else
    7767       183585 :                         layout_costs.out_cost.add_parallel_cost (cost);
    7768              :                     }
    7769              :                   else
    7770              :                     /* Make sure that earlier partitions can (if necessary
    7771              :                        or beneficial) keep the layout that they chose in
    7772              :                        the forward pass, i.e. the one currently recorded in
    7773              :                        OTHER_PARTITION.  This ensures that there is at least one valid choice of layout.  */
    7774       196547 :                     is_possible &= edge_layout_cost (ud, other_node_i,
    7775       196547 :                                                      other_partition.layout,
    7776       196547 :                                                      layout_i).is_possible ();
    7777       595555 :                 };
    7778       215423 :               for_each_partition_edge (node_i, add_cost);
    7779              :             }
    7780       189641 :           if (!is_possible)
    7781              :             {
    7782            0 :               layout_costs.mark_impossible ();
    7783            0 :               continue;
    7784              :             }
    7785              : 
    7786              :           /* Locally combine the costs from the forward pass (in_cost and
    7787              :              internal_cost) with the out_cost gathered above.  (This combined cost
    7788              :              is not passed on, since that would lead to double counting.)  */
    7789       189641 :           slpg_layout_cost combined_cost = layout_costs.in_cost;
    7790       189641 :           combined_cost.add_serial_cost (layout_costs.internal_cost);
    7791       189641 :           combined_cost.add_serial_cost (layout_costs.out_cost);
    7792              : 
    7793              :           /* Record the layout with the lowest cost.  Layouts are visited in increasing order; prefer layout 0 in
    7794              :              the event of a tie between it and another layout.  */
    7795       189641 :           if (!min_layout_cost.is_possible ()
    7796        69879 :               || combined_cost.is_better_than (min_layout_cost,
    7797        69879 :                                                m_optimize_size))
    7798              :             {
    7799       127848 :               min_layout_i = layout_i;
    7800       127848 :               min_layout_cost = combined_cost;
    7801              :             }
    7802              :         }
    7803              : 
    7804       119762 :       gcc_assert (min_layout_cost.is_possible ());  /* Some layout must have survived.  */
    7805       119762 :       partition.layout = min_layout_i;
    7806              :     }
    7807         5690 : }
    7808              : 
    7809              : /* Return a node that applies layout TO_LAYOUT_I to the original form of NODE.
    7810              :    NODE already has the layout that was selected for its partition.  Results that reach the end of the function are cached in m_node_layouts.  */
    7811              : 
    7812              : slp_tree
    7813       166411 : vect_optimize_slp_pass::get_result_with_layout (slp_tree node,
    7814              :                                                 unsigned int to_layout_i)
    7815              : {
    7816       166411 :   unsigned int result_i = node->vertex * m_perms.length () + to_layout_i;
    7817       166411 :   slp_tree result = m_node_layouts[result_i];
    7818       166411 :   if (result)
    7819              :     return result;
    7820              : 
    7821       165919 :   if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
    7822       165919 :       || (SLP_TREE_DEF_TYPE (node) == vect_external_def
    7823              :           /* We can't permute vector defs in place.  */
    7824        20220 :           && SLP_TREE_VEC_DEFS (node).is_empty ()))
    7825              :     {
    7826              :       /* If the vector is uniform or unchanged, there's nothing to do.  Otherwise build a new node from a permuted copy of the scalar operands.  */
    7827        38143 :       if (to_layout_i == 0 || vect_slp_tree_uniform_p (node))
    7828              :         result = node;
    7829              :       else
    7830              :         {
    7831         2009 :           auto scalar_ops = SLP_TREE_SCALAR_OPS (node).copy ();
    7832         2009 :           vect_slp_permute (m_perms[to_layout_i], scalar_ops, true);  /* Permute before the ops are handed to the new node.  */
    7833         2009 :           result = vect_create_new_slp_node (scalar_ops);
    7834              :         }
    7835              :     }
    7836              :   else
    7837              :     {
    7838       127776 :       unsigned int partition_i = m_vertices[node->vertex].partition;
    7839       127776 :       unsigned int from_layout_i = m_partitions[partition_i].layout;
    7840       127776 :       if (from_layout_i == to_layout_i)
    7841       127207 :         return node;
    7842              : 
    7843              :       /* If NODE is itself a VEC_PERM_EXPR, try to create a parallel
    7844              :          permutation instead of a serial one.  Leave the new permutation
    7845              :          in TMP_PERM on success; TMP_PERM is left empty otherwise.  */
    7846          569 :       auto_lane_permutation_t tmp_perm;
    7847          569 :       unsigned int num_inputs = 1;
    7848          569 :       if (SLP_TREE_PERMUTE_P (node))
    7849              :         {
    7850            7 :           tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
    7851            7 :           if (from_layout_i != 0)
    7852            7 :             vect_slp_permute (m_perms[from_layout_i], tmp_perm, false);
    7853            7 :           if (to_layout_i != 0)
    7854            4 :             vect_slp_permute (m_perms[to_layout_i], tmp_perm, true);
    7855            7 :           if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
    7856              :                                               tmp_perm,
    7857            7 :                                               SLP_TREE_CHILDREN (node),
    7858              :                                               false) >= 0)
    7859            7 :             num_inputs = SLP_TREE_CHILDREN (node).length ();
    7860              :           else
    7861            0 :             tmp_perm.truncate (0);
    7862              :         }
    7863              : 
    7864          569 :       if (dump_enabled_p ())
    7865              :         {
    7866           68 :           if (tmp_perm.length () > 0)
    7867            6 :             dump_printf_loc (MSG_NOTE, vect_location,
    7868              :                              "duplicating permutation node %p with"
    7869              :                              " layout %d\n",
    7870              :                              (void *) node, to_layout_i);
    7871              :           else
    7872           62 :             dump_printf_loc (MSG_NOTE, vect_location,
    7873              :                              "inserting permutation node in place of %p\n",
    7874              :                              (void *) node);
    7875              :         }
    7876              : 
    7877          569 :       unsigned int num_lanes = SLP_TREE_LANES (node);
    7878          569 :       result = vect_create_new_slp_node (num_inputs, VEC_PERM_EXPR);
    7879          569 :       if (SLP_TREE_SCALAR_STMTS (node).length ())
    7880              :         {
    7881          568 :           auto &stmts = SLP_TREE_SCALAR_STMTS (result);
    7882          568 :           stmts.safe_splice (SLP_TREE_SCALAR_STMTS (node));
    7883          568 :           if (from_layout_i != 0)
    7884          298 :             vect_slp_permute (m_perms[from_layout_i], stmts, false);
    7885          568 :           if (to_layout_i != 0)
    7886          274 :             vect_slp_permute (m_perms[to_layout_i], stmts, true);
    7887              :         }
    7888          569 :       SLP_TREE_REPRESENTATIVE (result) = SLP_TREE_REPRESENTATIVE (node);
    7889          569 :       SLP_TREE_LANES (result) = num_lanes;
    7890          569 :       SLP_TREE_VECTYPE (result) = SLP_TREE_VECTYPE (node);
    7891          569 :       result->vertex = -1;
    7892              : 
    7893          569 :       auto &lane_perm = SLP_TREE_LANE_PERMUTATION (result);
    7894          569 :       if (tmp_perm.length ())
    7895              :         {
    7896            7 :           lane_perm.safe_splice (tmp_perm);
    7897            7 :           SLP_TREE_CHILDREN (result).safe_splice (SLP_TREE_CHILDREN (node));
    7898              :         }
    7899              :       else
    7900              :         {
    7901          562 :           lane_perm.create (num_lanes);
    7902         1750 :           for (unsigned j = 0; j < num_lanes; ++j)
    7903         1188 :             lane_perm.quick_push ({ 0, j });
    7904          562 :           if (from_layout_i != 0)
    7905          291 :             vect_slp_permute (m_perms[from_layout_i], lane_perm, false);
    7906          562 :           if (to_layout_i != 0)
    7907          271 :             vect_slp_permute (m_perms[to_layout_i], lane_perm, true);
    7908          562 :           SLP_TREE_CHILDREN (result).safe_push (node);
    7909              :         }
    7910         2280 :       for (slp_tree child : SLP_TREE_CHILDREN (result))  /* RESULT now references each of its children.  */
    7911          573 :         child->refcnt++;
    7912          569 :     }
    7913        38712 :   m_node_layouts[result_i] = result;
    7914        38712 :   return result;
    7915              : }
    7916              : 
    7917              : /* Apply the chosen vector layouts to the SLP graph: first update each partitioned node itself, then relayout the children of nodes that still need it.  */
    7918              : 
    7919              : void
    7920        10629 : vect_optimize_slp_pass::materialize ()
    7921              : {
    7922              :   /* We no longer need the costs, so avoid having two O(N * P) arrays
    7923              :      live at the same time.  */
    7924        10629 :   m_partition_layout_costs.release ();
    7925        31887 :   m_node_layouts.safe_grow_cleared (m_vertices.length () * m_perms.length ());
    7926              : 
    7927        21258 :   auto_sbitmap fully_folded (m_vertices.length ());
    7928        10629 :   bitmap_clear (fully_folded);
    7929       174264 :   for (unsigned int node_i : m_partitioned_nodes)
    7930              :     {
    7931       142377 :       auto &vertex = m_vertices[node_i];
    7932       142377 :       slp_tree node = vertex.node;
    7933       142377 :       int layout_i = m_partitions[vertex.partition].layout;
    7934       142377 :       gcc_assert (layout_i >= 0);
    7935              : 
    7936              :       /* Rearrange the scalar statements to match the chosen layout (layout 0 needs no change).  */
    7937       142377 :       if (layout_i > 0)
    7938        15986 :         vect_slp_permute (m_perms[layout_i],
    7939        15986 :                           SLP_TREE_SCALAR_STMTS (node), true);
    7940              : 
    7941              :       /* Update load and lane permutations.  */
    7942       142377 :       if (SLP_TREE_PERMUTE_P (node))
    7943              :         {
    7944              :           /* First try to absorb the input vector layouts, working on a scratch
    7945              :              copy of the lane permutation.  If that fails, force the inputs to
    7946              :              have layout LAYOUT_I too.  We checked that that was possible before
    7947              :              deciding to use nonzero output layouts.  (Note that at this stage we don't
    7948              :              really have any guarantee that the target supports the original VEC_PERM_EXPR.)  */
    7949         5340 :           auto &perm = SLP_TREE_LANE_PERMUTATION (node);
    7950         5340 :           auto_lane_permutation_t tmp_perm;
    7951         5340 :           tmp_perm.safe_splice (perm);
    7952         5340 :           change_vec_perm_layout (node, tmp_perm, -1, layout_i);
    7953         5340 :           if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
    7954              :                                               tmp_perm,
    7955         5340 :                                               SLP_TREE_CHILDREN (node),
    7956              :                                               false) >= 0)
    7957              :             {
    7958         4977 :               if (dump_enabled_p ()
    7959         5897 :                   && !std::equal (tmp_perm.begin (), tmp_perm.end (),
    7960              :                                   perm.begin ()))
    7961           58 :                 dump_printf_loc (MSG_NOTE, vect_location,
    7962              :                                  "absorbing input layouts into %p\n",
    7963              :                                  (void *) node);
    7964        28034 :               std::copy (tmp_perm.begin (), tmp_perm.end (), perm.begin ());  /* Commit the scratch permutation.  */
    7965         4977 :               bitmap_set_bit (fully_folded, node_i);
    7966              :             }
    7967              :           else
    7968              :             {
    7969              :               /* Reported as MSG_NOTE, not MSG_MISSED, because it would make no sense to users.  */
    7970          363 :               if (dump_enabled_p ())
    7971           46 :                 dump_printf_loc (MSG_NOTE, vect_location,
    7972              :                                  "failed to absorb input layouts into %p\n",
    7973              :                                  (void *) node);
    7974          363 :               change_vec_perm_layout (nullptr, perm, layout_i, layout_i);
    7975              :             }
    7976         5340 :         }
    7977              :       else
    7978              :         {
    7979       137037 :           gcc_assert (!SLP_TREE_LANE_PERMUTATION (node).exists ());
    7980       137037 :           auto &load_perm = SLP_TREE_LOAD_PERMUTATION (node);
    7981       137037 :           if (layout_i > 0)
    7982              :             /* ???  When we handle non-bijective permutes the idea
    7983              :                is that we can force the load-permutation to be
    7984              :                { min, min + 1, min + 2, ... max }.  But then the
    7985              :                scalar defs might no longer match the lane content
    7986              :                which means wrong-code with live lane vectorization.
    7987              :                So we possibly have to have NULL entries for those.  */
    7988        15883 :             vect_slp_permute (m_perms[layout_i], load_perm, true);
    7989              :         }
    7990              :     }
    7991              : 
    7992              :   /* Do this before any nodes disappear (children may be freed in the loop below), since it involves a walk
    7993              :      over the leaves.  */
    7994        10629 :   remove_redundant_permutations ();
    7995              : 
    7996              :   /* Replace each child with a version laid out the way its parent's partition expects.  */
    7997       174264 :   for (unsigned int node_i : m_partitioned_nodes)
    7998              :     {
    7999              :       /* Skip VEC_PERM_EXPR nodes that absorbed their input layouts above; their children stay as they are.  */
    8000       142377 :       if (bitmap_bit_p (fully_folded, node_i))
    8001         4977 :         continue;
    8002              : 
    8003       137400 :       auto &vertex = m_vertices[node_i];
    8004       137400 :       int in_layout_i = m_partitions[vertex.partition].layout;
    8005       137400 :       gcc_assert (in_layout_i >= 0);
    8006              : 
    8007              :       unsigned j;
    8008              :       slp_tree child;
    8009       412812 :       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (vertex.node), j, child)
    8010              :         {
    8011       172369 :           if (!child)
    8012         5958 :             continue;
    8013              : 
    8014       166411 :           slp_tree new_child = get_result_with_layout (child, in_layout_i);
    8015       166411 :           if (new_child != child)
    8016              :             {
    8017         2813 :               vect_free_slp_tree (child);  /* Drop this slot's use of the old child.  */
    8018         2813 :               SLP_TREE_CHILDREN (vertex.node)[j] = new_child;
    8019         2813 :               new_child->refcnt += 1;
    8020              :             }
    8021              :         }
    8022              :     }
    8023        10629 : }
    8024              : 
    8025              : /* Elide load permutations that are not necessary, by walking the leaves in m_leafs.  Such permutations might
    8026              :    be pre-existing, rather than created by the layout optimizations.  */
    8027              : 
    8028              : void
    8029       681015 : vect_optimize_slp_pass::remove_redundant_permutations ()
    8030              : {
    8031      4499909 :   for (unsigned int node_i : m_leafs)
    8032              :     {
    8033      2456864 :       slp_tree node = m_vertices[node_i].node;
    8034      2456864 :       if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
    8035      1860552 :         continue;
    8036              : 
    8037              :       /* In basic block vectorization we allow any subchain of an interleaving
    8038              :          chain: each statement after the first must be the next group element, with a gap of 1.
    8039              :          FORNOW: not in loop SLP because of realignment complications.  */
    8040       596312 :       if (is_a <bb_vec_info> (m_vinfo))
    8041              :         {
    8042       159613 :           bool subchain_p = true;
    8043              :           stmt_vec_info next_load_info = NULL;
    8044              :           stmt_vec_info load_info;
    8045              :           unsigned j;
    8046       159613 :           FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
    8047              :             {
    8048       129714 :               if (j != 0
    8049       129714 :                   && (next_load_info != load_info
    8050        61955 :                       || ! load_info
    8051        61955 :                       || DR_GROUP_GAP (load_info) != 1))
    8052              :                 {
    8053              :                   subchain_p = false;
    8054              :                   break;
    8055              :                 }
    8056       107172 :               next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
    8057              :             }
    8058        52441 :           if (subchain_p)
    8059              :             {
    8060        29899 :               SLP_TREE_LOAD_PERMUTATION (node).release ();
    8061        29899 :               continue;
    8062              :             }
    8063              :         }
    8064              :       else
    8065              :         {
    8066       543871 :           loop_vec_info loop_vinfo = as_a<loop_vec_info> (m_vinfo);
    8067       543871 :           bool this_load_permuted = !vect_load_perm_consecutive_p (node, 0);
    8068              :           /* When this isn't a grouped access we know it's single element
    8069              :              and contiguous.  Either way, nothing further is checked for such a node.  */
    8070       543871 :           if (!STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (node)[0]))
    8071              :             {
    8072       423197 :               if (!this_load_permuted
    8073       423197 :                   && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
    8074       422402 :                       || SLP_TREE_LANES (node) == 1))
    8075       422404 :                 SLP_TREE_LOAD_PERMUTATION (node).release ();
    8076       423197 :               continue;
    8077              :             }
    8078       120674 :           stmt_vec_info first_stmt_info
    8079       120674 :             = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
    8080       121179 :           if (!this_load_permuted
    8081              :               /* The load requires permutation when unrolling exposes
    8082              :                  a gap either because the group is larger than the SLP
    8083              :                  group-size or because there is a gap between the groups.  */
    8084       120674 :               && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
    8085        98477 :                   || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
    8086          140 :                       && DR_GROUP_GAP (first_stmt_info) == 0)))
    8087              :             {
    8088          505 :               SLP_TREE_LOAD_PERMUTATION (node).release ();
    8089          505 :               continue;
    8090              :             }
    8091              :         }
    8092              :     }
    8093       681015 : }
    8094              : 
    8095              : /* Print the partition graph and layout information to the dump file.  */
    8096              : 
    8097              : void
    8098          674 : vect_optimize_slp_pass::dump ()
    8099              : {
    8100          674 :   dump_printf_loc (MSG_NOTE, vect_location,
    8101              :                    "SLP optimize permutations:\n");
    8102         1361 :   for (unsigned int layout_i = 1; layout_i < m_perms.length (); ++layout_i)
    8103              :     {
    8104          687 :       dump_printf_loc (MSG_NOTE, vect_location, "  %d: { ", layout_i);
    8105          687 :       const char *sep = "";
    8106         5866 :       for (unsigned int idx : m_perms[layout_i])
    8107              :         {
    8108         3805 :           dump_printf (MSG_NOTE, "%s%d", sep, idx);
    8109         3805 :           sep = ", ";
    8110              :         }
    8111          687 :       dump_printf (MSG_NOTE, " }\n");
    8112              :     }
    8113          674 :   dump_printf_loc (MSG_NOTE, vect_location,
    8114              :                    "SLP optimize partitions:\n");
    8115         5612 :   for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
    8116              :        ++partition_i)
    8117              :     {
    8118         4938 :       auto &partition = m_partitions[partition_i];
    8119         4938 :       dump_printf_loc (MSG_NOTE, vect_location,  "  -------------\n");
    8120         4938 :       dump_printf_loc (MSG_NOTE, vect_location,
    8121              :                        "  partition %d (layout %d):\n",
    8122              :                        partition_i, partition.layout);
    8123         4938 :       dump_printf_loc (MSG_NOTE, vect_location, "    nodes:\n");
    8124        10111 :       for (unsigned int order_i = partition.node_begin;
    8125        10111 :            order_i < partition.node_end; ++order_i)
    8126              :         {
    8127         5173 :           auto &vertex = m_vertices[m_partitioned_nodes[order_i]];
    8128        10346 :           dump_printf_loc (MSG_NOTE, vect_location, "      - %p:\n",
    8129         5173 :                            (void *) vertex.node);
    8130         5173 :           dump_printf_loc (MSG_NOTE, vect_location,
    8131              :                            "          weight: %f\n",
    8132              :                            vertex.weight.to_double ());
    8133         5173 :           if (vertex.out_degree)
    8134         4050 :             dump_printf_loc (MSG_NOTE, vect_location,
    8135              :                              "          out weight: %f (degree %d)\n",
    8136              :                              vertex.out_weight.to_double (),
    8137              :                              vertex.out_degree);
    8138         5173 :           if (SLP_TREE_PERMUTE_P (vertex.node))
    8139          506 :             dump_printf_loc (MSG_NOTE, vect_location,
    8140              :                              "          op: VEC_PERM_EXPR\n");
    8141         4667 :           else if (auto rep = SLP_TREE_REPRESENTATIVE (vertex.node))
    8142         4649 :             dump_printf_loc (MSG_NOTE, vect_location,
    8143              :                              "          op template: %G", rep->stmt);
    8144              :         }
    8145         4938 :       dump_printf_loc (MSG_NOTE, vect_location, "    edges:\n");
    8146        10111 :       for (unsigned int order_i = partition.node_begin;
    8147        10111 :            order_i < partition.node_end; ++order_i)
    8148              :         {
    8149         5173 :           unsigned int node_i = m_partitioned_nodes[order_i];
    8150         5173 :           auto &vertex = m_vertices[node_i];
    8151        15617 :           auto print_edge = [&](graph_edge *, unsigned int other_node_i)
    8152              :             {
    8153        10444 :               auto &other_vertex = m_vertices[other_node_i];
    8154        10444 :               if (other_vertex.partition < vertex.partition)
    8155         5222 :                 dump_printf_loc (MSG_NOTE, vect_location,
    8156              :                                  "      - %p [%d] --> %p\n",
    8157         5222 :                                  (void *) other_vertex.node,
    8158              :                                  other_vertex.partition,
    8159         5222 :                                  (void *) vertex.node);
    8160              :               else
    8161         5222 :                 dump_printf_loc (MSG_NOTE, vect_location,
    8162              :                                  "      - %p --> [%d] %p\n",
    8163         5222 :                                  (void *) vertex.node,
    8164              :                                  other_vertex.partition,
    8165         5222 :                                  (void *) other_vertex.node);
    8166        15617 :             };
    8167         5173 :           for_each_partition_edge (node_i, print_edge);
    8168              :         }
    8169              : 
    8170        15013 :       for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
    8171              :         {
    8172        10075 :           auto &layout_costs = partition_layout_costs (partition_i, layout_i);
    8173        10075 :           if (layout_costs.is_possible ())
    8174              :             {
    8175         8301 :               dump_printf_loc (MSG_NOTE, vect_location,
    8176              :                                "    layout %d:%s\n", layout_i,
    8177         8301 :                                partition.layout == int (layout_i)
    8178              :                                ? " (*)" : "");
    8179         8301 :               slpg_layout_cost combined_cost = layout_costs.in_cost;
    8180         8301 :               combined_cost.add_serial_cost (layout_costs.internal_cost);
    8181         8301 :               combined_cost.add_serial_cost (layout_costs.out_cost);
    8182              : #define TEMPLATE "{depth: %f, total: %f}"
    8183         8301 :               dump_printf_loc (MSG_NOTE, vect_location,
    8184              :                                "        " TEMPLATE "\n",
    8185              :                                layout_costs.in_cost.depth.to_double (),
    8186              :                                layout_costs.in_cost.total.to_double ());
    8187         8301 :               dump_printf_loc (MSG_NOTE, vect_location,
    8188              :                                "      + " TEMPLATE "\n",
    8189              :                                layout_costs.internal_cost.depth.to_double (),
    8190              :                                layout_costs.internal_cost.total.to_double ());
    8191         8301 :               dump_printf_loc (MSG_NOTE, vect_location,
    8192              :                                "      + " TEMPLATE "\n",
    8193              :                                layout_costs.out_cost.depth.to_double (),
    8194              :                                layout_costs.out_cost.total.to_double ());
    8195         8301 :               dump_printf_loc (MSG_NOTE, vect_location,
    8196              :                                "      = " TEMPLATE "\n",
    8197              :                                combined_cost.depth.to_double (),
    8198              :                                combined_cost.total.to_double ());
    8199              : #undef TEMPLATE
    8200              :             }
    8201              :           else
    8202         1774 :             dump_printf_loc (MSG_NOTE, vect_location,
    8203              :                              "    layout %d: rejected\n", layout_i);
    8204              :         }
    8205              :     }
    8206          674 : }
    8207              : 
    8208              : /* Masked load lanes discovery.  Scan all vertices for internal,
                           non-permute nodes whose representative is a grouped IFN_MASK_LOAD
                           call that is not marked STMT_VINFO_SLP_VECT_ONLY.  Such a node gets
                           ldst_lanes set (together with all of its graph predecessors) when
                           the first element of its group is not strided,
                           compare_step_with_zero is positive for it,
                           vect_load_lanes_supported does not return IFN_LAST, child 0 (the
                           mask) is a single-child permute whose lane permutation is {0, 0}
                           for every lane, and every predecessor is a single-lane permute.
                           The mask permute is then replaced by its only child.  */
    8209              : 
    8210              : void
    8211       681015 : vect_optimize_slp_pass::decide_masked_load_lanes ()
    8212              : {
    8213      7018566 :   for (auto v : m_vertices)
    8214              :     {
    8215      4975521 :       slp_tree node = v.node;
    8216      4975521 :       if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
    8217      3493334 :           || SLP_TREE_PERMUTE_P (node))
    8218      1619268 :         continue;
    8219      3356253 :       stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
    8220      1642413 :       if (! STMT_VINFO_GROUPED_ACCESS (stmt_info)
    8221              :           /* The mask has to be uniform.  */
    8222       975356 :           || STMT_VINFO_SLP_VECT_ONLY (stmt_info)
    8223       975225 :           || ! is_a <gcall *> (STMT_VINFO_STMT (stmt_info))
    8224      3356338 :           || ! gimple_call_internal_p (STMT_VINFO_STMT (stmt_info),
    8225              :                                        IFN_MASK_LOAD))
    8226      3356220 :         continue;
                              /* The remaining checks are made on the first element of the
                                 group.  */
    8227           33 :       stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
    8228           66 :       if (STMT_VINFO_STRIDED_P (stmt_info)
    8229           33 :           || compare_step_with_zero (m_vinfo, stmt_info) <= 0
    8230           63 :           || vect_load_lanes_supported (SLP_TREE_VECTYPE (node),
    8231           30 :                                         DR_GROUP_SIZE (stmt_info),
    8232              :                                         true) == IFN_LAST)
    8233           33 :         continue;
    8234              : 
    8235              :       /* Uniform masks need to be suitably represented.  */
    8236            0 :       slp_tree mask = SLP_TREE_CHILDREN (node)[0];
    8237            0 :       if (!SLP_TREE_PERMUTE_P (mask)
    8238            0 :           || SLP_TREE_CHILDREN (mask).length () != 1)
    8239            0 :         continue;
    8240            0 :       bool match = true;
    8241            0 :       for (auto perm : SLP_TREE_LANE_PERMUTATION (mask))
    8242            0 :         if (perm.first != 0 || perm.second != 0)
    8243              :           {
    8244              :             match = false;
    8245              :             break;
    8246              :           }
    8247            0 :       if (!match)
    8248            0 :         continue;
    8249              : 
    8250              :       /* Now see if the consumer side matches.  */
    8251            0 :       for (graph_edge *pred = m_slpg->vertices[node->vertex].pred;
    8252            0 :            pred; pred = pred->pred_next)
    8253              :         {
    8254            0 :           slp_tree pred_node = m_vertices[pred->src].node;
    8255              :           /* All consumers should be a permute with a single outgoing lane.  */
    8256            0 :           if (!SLP_TREE_PERMUTE_P (pred_node)
    8257            0 :               || SLP_TREE_LANES (pred_node) != 1)
    8258              :             {
    8259              :               match = false;
    8260              :               break;
    8261              :             }
    8262            0 :           gcc_assert (SLP_TREE_CHILDREN (pred_node).length () == 1);
    8263              :         }
    8264            0 :       if (!match)
    8265            0 :         continue;
    8266              :       /* Now we can mark the nodes as to use load lanes.  */
    8267            0 :       node->ldst_lanes = true;
    8268            0 :       for (graph_edge *pred = m_slpg->vertices[node->vertex].pred;
    8269            0 :            pred; pred = pred->pred_next)
    8270            0 :         m_vertices[pred->src].node->ldst_lanes = true;
    8271              :       /* The catch is we have to massage the mask.  We have arranged
    8272              :          analyzed uniform masks to be represented by a splat VEC_PERM
    8273              :          which we can now simply elide as we cannot easily re-do SLP
    8274              :          discovery here.  */
    8275            0 :       slp_tree new_mask = SLP_TREE_CHILDREN (mask)[0];
                              /* Take our own reference on NEW_MASK before MASK is freed;
                                 presumably freeing MASK drops its reference on its child --
                                 TODO confirm against vect_free_slp_tree.  */
    8276            0 :       SLP_TREE_REF_COUNT (new_mask)++;
    8277            0 :       SLP_TREE_CHILDREN (node)[0] = new_mask;
    8278            0 :       vect_free_slp_tree (mask);
    8279              :     }
    8280       681015 : }
    8281              : 
    8282              : /* Perform legitimizing attempts.  This is intended to improve the
    8283              :    situation when layout 0 is not valid which is a situation the cost
    8284              :    based propagation does not handle well.
    8285              :    Return true if further layout optimization is possible, false if
    8286              :    the layout configuration should be considered final.  When false is
                           returned every partition has been assigned the same single layout,
                           which is greater than zero and passed is_compatible_layout for
                           every partition.  */
    8287              : 
    8288              : bool
    8289        10629 : vect_optimize_slp_pass::legitimize ()
    8290              : {
    8291              :   /* Perform a very simple legitimizing attempt by attempting to choose
    8292              :      a single layout for all partitions that will make all permutations
    8293              :      a noop.  That should also be the optimal layout choice in case
    8294              :      layout zero is legitimate.
    8295              :      ???  Disconnected components of the SLP graph could have distinct
    8296              :      single layouts.  */
    8297        10629 :   int single_layout_i = -1;
    8298        10629 :   unsigned deferred_up_to = -1U;
    8299        31472 :   for (unsigned partition_i = 0; partition_i < m_partitions.length ();
    8300              :        ++partition_i)
    8301              :     {
    8302        26527 :       auto &partition = m_partitions[partition_i];
                              /* While no candidate is known (-1) adopt this partition's
                                 layout and remember its index; partitions before that index
                                 are not compatibility-checked in this loop.  */
    8303        26527 :       if (single_layout_i == -1)
    8304              :         {
    8305        13863 :           single_layout_i = partition.layout;
    8306        13863 :           deferred_up_to = partition_i;
    8307              :         }
    8308        12664 :       else if (partition.layout == single_layout_i || partition.layout == -1)
    8309              :         ;
    8310              :       else
    8311              :         single_layout_i = 0;
    8312        23255 :       if (single_layout_i == 0)
    8313              :         return true;
    8314              : 
    8315        20903 :       if (single_layout_i != -1
    8316        20903 :           && !is_compatible_layout (partition, single_layout_i))
    8317              :         return true;
    8318              :     }
    8319              : 
    8320         4945 :   if (single_layout_i <= 0)
    8321              :     return true;
    8322              : 
                          /* Partitions visited before the candidate layout was chosen were
                             skipped by the check above; verify them now.  */
    8323         5061 :   for (unsigned partition_i = 0; partition_i < deferred_up_to; ++partition_i)
    8324          122 :     if (!is_compatible_layout (m_partitions[partition_i],
    8325              :                                single_layout_i))
    8326              :       return true;
    8327              : 
                          /* Commit the single layout to every partition.  */
    8328        12517 :   for (unsigned partition_i = 0; partition_i < m_partitions.length ();
    8329              :        ++partition_i)
    8330              :     {
    8331         7578 :       auto &partition = m_partitions[partition_i];
    8332         7578 :       partition.layout = single_layout_i;
    8333              :     }
    8334              : 
    8335              :   return false;
    8336              : }
    8337              : 
    8338              : /* Main entry point for the SLP graph optimization pass.  Builds the
                           graph, creates partitions and starts choosing layouts.  When
                           m_perms holds more than one entry, legitimize () is called and the
                           forward and backward passes run only if it returns true; the
                           result is dumped when dumping is enabled, materialized, and every
                           element of m_perms is popped and released.  Otherwise only
                           remove_redundant_permutations () is called.  In both cases the
                           graph is then freed, rebuilt for decide_masked_load_lanes () and
                           freed again.  */
    8339              : 
    8340              : void
    8341       681015 : vect_optimize_slp_pass::run ()
    8342              : {
    8343       681015 :   build_graph ();
    8344       681015 :   create_partitions ();
    8345       681015 :   start_choosing_layouts ();
    8346       681015 :   if (m_perms.length () > 1)
    8347              :     {
    8348        10629 :       if (legitimize ())
    8349              :         {
    8350         5690 :           forward_pass ();
    8351         5690 :           backward_pass ();
    8352              :         }
    8353        10629 :       if (dump_enabled_p ())
    8354          674 :         dump ();
    8355        10629 :       materialize ();
    8356        42933 :       while (!m_perms.is_empty ())
    8357        21675 :         m_perms.pop ().release ();
    8358              :     }
    8359              :   else
    8360       670386 :     remove_redundant_permutations ();
                          /* The graph is rebuilt from scratch before masked load-lanes
                             discovery, which reads m_slpg and m_vertices.  */
    8361       681015 :   free_graph (m_slpg);
    8362       681015 :   build_graph ();
    8363       681015 :   decide_masked_load_lanes ();
    8364       681015 :   free_graph (m_slpg);
    8365       681015 : }
    8366              : 
    8367              : /* Apply CSE to NODE and its children using BST_MAP.  NODE is passed by
                           reference: when BST_MAP already holds a different leader for the
                           same scalar stmts, NODE is freed, the leader's reference count is
                           incremented and NODE is replaced by the leader.  Only internal
                           defs with a non-empty scalar stmt vector are looked up and
                           recorded; children are visited recursively for all nodes that are
                           not replaced or already visited.  */
    8368              : 
    8369              : static void
    8370      5377344 : vect_cse_slp_nodes (scalar_stmts_to_slp_tree_map_t *bst_map, slp_tree& node)
    8371              : {
    8372      5377344 :   bool put_p = false;
    8373      5377344 :   if (SLP_TREE_DEF_TYPE (node) == vect_internal_def
    8374              :       /* Besides some VEC_PERM_EXPR, two-operator nodes also
    8375              :          lack scalar stmts and thus CSE doesn't work via bst_map.  Ideally
    8376              :          we'd have sth that works for all internal and external nodes.  */
    8377      5377344 :       && !SLP_TREE_SCALAR_STMTS (node).is_empty ())
    8378              :     {
    8379      3869142 :       slp_tree *leader = bst_map->get (SLP_TREE_SCALAR_STMTS (node));
    8380      3869142 :       if (leader)
    8381              :         {
    8382              :           /* We've visited this node already.  */
    8383       404229 :           if (!*leader || *leader == node)
    8384              :             return;
    8385              : 
    8386         2800 :           if (dump_enabled_p ())
    8387          907 :             dump_printf_loc (MSG_NOTE, vect_location,
    8388              :                              "re-using SLP tree %p for %p\n",
    8389              :                              (void *)*leader, (void *)node);
                                  /* Drop NODE and make the caller's slot refer to the leader,
                                     accounting for the new reference.  */
    8390         2800 :           vect_free_slp_tree (node);
    8391         2800 :           (*leader)->refcnt += 1;
    8392         2800 :           node = *leader;
    8393         2800 :           return;
    8394              :         }
    8395              : 
    8396              :       /* Avoid creating a cycle by populating the map only after recursion.  */
    8397      3464913 :       bst_map->put (SLP_TREE_SCALAR_STMTS (node).copy (), nullptr);
    8398      3464913 :       node->refcnt += 1;
    8399      3464913 :       put_p = true;
    8400              :       /* And recurse.  */
    8401              :     }
    8402              : 
    8403     14868276 :   for (slp_tree &child : SLP_TREE_CHILDREN (node))
    8404      4343037 :     if (child)
    8405      3911285 :       vect_cse_slp_nodes (bst_map, child);
    8406              : 
    8407              :   /* Now record the node for CSE in other siblings.  */
    8408      4973115 :   if (put_p)
    8409      3464913 :     *bst_map->get (SLP_TREE_SCALAR_STMTS (node)) = node;
    8410              : }
    8411              : 
    8412              : /* Optimize the SLP graph of VINFO.  Does nothing when VINFO has no SLP
                           instances.  Otherwise runs vect_optimize_slp_pass on VINFO and
                           then applies vect_cse_slp_nodes to the tree of every instance,
                           using a freshly allocated map that is released before returning.  */
    8413              : 
    8414              : void
    8415      1027203 : vect_optimize_slp (vec_info *vinfo)
    8416              : {
    8417      1027203 :   if (vinfo->slp_instances.is_empty ())
    8418              :     return;
    8419       681015 :   vect_optimize_slp_pass (vinfo).run ();
    8420              : 
    8421              :   /* Apply CSE again to nodes after permute optimization.  */
    8422       681015 :   scalar_stmts_to_slp_tree_map_t *bst_map
    8423       681015 :     = new scalar_stmts_to_slp_tree_map_t ();
    8424              : 
    8425      3509104 :   for (auto inst : vinfo->slp_instances)
    8426      1466059 :     vect_cse_slp_nodes (bst_map, SLP_INSTANCE_TREE (inst));
    8427              : 
    8428       681015 :   release_scalar_stmts_to_slp_tree_map (bst_map);
    8429              : }
    8430              : 
    8431              : /* Gather loads reachable from the individual SLP graph entries.  For
                           each SLP instance of VINFO this calls the three-argument
                           vect_gather_slp_loads overload with the instance's load vector,
                           the instance's tree and a visited set that is fresh for that
                           instance.  */
    8432              : 
    8433              : void
    8434      1027203 : vect_gather_slp_loads (vec_info *vinfo)
    8435              : {
    8436      1027203 :   unsigned i;
    8437      1027203 :   slp_instance instance;
    8438      2493262 :   FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
    8439              :     {
    8440      1466059 :       hash_set<slp_tree> visited;
    8441      1466059 :       vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
    8442              :                              SLP_INSTANCE_TREE (instance), visited);
    8443      1466059 :     }
    8444      1027203 : }
    8445              : 
    8446              : /* For NODE update VF based on the number of lanes and the vector types
    8447              :    used.  Null nodes and nodes that are not vect_internal_def are
                           ignored, and VISITED ensures each node is processed only once.
                           Children are processed before NODE itself.  VF is updated in place
                           to force_common_multiple of its previous value and the unrolling
                           factor calculated from NODE's max_nunits and lane count.  */
    8448              : 
    8449              : static void
    8450      4226623 : vect_update_slp_vf_for_node (slp_tree node, poly_uint64 &vf,
    8451              :                              hash_set<slp_tree> &visited)
    8452              : {
    8453      4226623 :   if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    8454      1519506 :     return;
    8455      3070087 :   if (visited.add (node))
    8456              :     return;
    8457              : 
    8458     10265523 :   for (slp_tree child : SLP_TREE_CHILDREN (node))
    8459      3466014 :     vect_update_slp_vf_for_node (child, vf, visited);
    8460              : 
    8461              :   /* We do not visit SLP nodes for constants or externals - those neither
    8462              :      have a vector type set yet (vectorizable_* does this) nor do they
    8463              :      have max_nunits set.  Instead we rely on internal nodes max_nunit
    8464              :      to cover constant/external operands.
    8465              :      Note that when we stop using fixed size vectors externs and constants
    8466              :      shouldn't influence the (minimum) vectorization factor, instead
    8467              :      vectorizable_* should honor the vectorization factor when trying to
    8468              :      assign vector types to constants and externals and cause iteration
    8469              :      to a higher vectorization factor when required.  */
    8470      2707117 :   poly_uint64 node_vf
    8471      2707117 :     = calculate_unrolling_factor (node->max_nunits, SLP_TREE_LANES (node));
    8472      2707117 :   vf = force_common_multiple (vf, node_vf);
    8473              : 
    8474              :   /* For permute nodes that are fed from externs or constants we have to
    8475              :      consider their number of lanes as well.  Likewise for store-lanes.  */
    8476      2707117 :   if (SLP_TREE_PERMUTE_P (node) || node->ldst_lanes)
    8477       706215 :     for (slp_tree child : SLP_TREE_CHILDREN (node))
    8478       189930 :       if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
    8479              :         {
                                  /* Uses NODE's max_nunits together with the CHILD's lane
                                     count.  */
    8480         3445 :           poly_uint64 child_vf
    8481         3445 :             = calculate_unrolling_factor (node->max_nunits,
    8482              :                                           SLP_TREE_LANES (child));
    8483         3445 :           vf = force_common_multiple (vf, child_vf);
    8484              :         }
    8485              : }
    8486              : 
    8487              : /* For each possible SLP instance decide whether to SLP it and calculate overall
    8488              :    unrolling factor needed to SLP the loop.  Return TRUE if decided to SLP at
    8489              :    least one instance.  An instance is only counted when its root has a
                           vector type known to have more than one element.  The unrolling
                           factor accumulated over all instances is stored in
                           LOOP_VINFO_VECT_FACTOR of LOOP_VINFO regardless of the return
                           value.  */
    8490              : 
    8491              : bool
    8492       473922 : vect_make_slp_decision (loop_vec_info loop_vinfo)
    8493              : {
    8494       473922 :   unsigned int i;
    8495       473922 :   poly_uint64 unrolling_factor = 1;
    8496       473922 :   const vec<slp_instance> &slp_instances
    8497              :     = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
    8498       473922 :   slp_instance instance;
    8499       473922 :   int decided_to_slp = 0;
    8500              : 
    8501       473922 :   DUMP_VECT_SCOPE ("vect_make_slp_decision");
    8502              : 
                          /* One visited set is shared by all instances, so a node reachable
                             from several roots contributes only once.  */
    8503       473922 :   hash_set<slp_tree> visited;
    8504      1234531 :   FOR_EACH_VEC_ELT (slp_instances, i, instance)
    8505              :     {
    8506       760609 :       slp_tree root = SLP_INSTANCE_TREE (instance);
    8507              : 
    8508              :       /* All unroll factors have the form:
    8509              : 
    8510              :            GET_MODE_SIZE (vinfo->vector_mode) * X
    8511              : 
    8512              :          for some rational X, so they must have a common multiple.  */
    8513       760609 :       vect_update_slp_vf_for_node (root, unrolling_factor, visited);
    8514              : 
    8515              :       /* If all instances ended up with vector(1) T roots make sure to
    8516              :          not vectorize.  RVV for example relies on loop vectorization
    8517              :          when some instances are essentially kept scalar.  See PR121048.  */
    8518       760609 :       if (SLP_TREE_VECTYPE (root)
    8519       760609 :           && known_gt (TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (root)), 1U))
    8520       622309 :         decided_to_slp++;
    8521              :     }
    8522              : 
    8523       473922 :   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = unrolling_factor;
    8524              : 
    8525       473922 :   if (decided_to_slp && dump_enabled_p ())
    8526              :     {
    8527        19072 :       dump_printf_loc (MSG_NOTE, vect_location,
    8528              :                        "Decided to SLP %d instances. Unrolling factor ",
    8529              :                        decided_to_slp);
    8530        19072 :       dump_dec (MSG_NOTE, unrolling_factor);
    8531        19072 :       dump_printf (MSG_NOTE, "\n");
    8532              :     }
    8533              : 
    8534       473922 :   return (decided_to_slp > 0);
    8535       473922 : }
    8536              : 
    8537              : /* Initialize a bb_vec_info struct for the statements in BBS basic blocks.
                           Every statement in the blocks, debug statements included, gets its
                           UID set to 0, as do the PHIs of all blocks except the first.  Those
                           PHIs and all non-debug statements are registered with add_stmt.  */
    8538              : 
    8539      2205447 : _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
    8540              :   : vec_info (vec_info::bb, shared),
    8541      2205447 :     roots (vNULL)
    8542              : {
    8543              :   /* The region we are operating on.  bbs[0] is the entry, excluding
    8544              :      its PHI nodes.  In the future we might want to track an explicit
    8545              :      entry edge to cover bbs[0] PHI nodes and have a region entry
    8546              :      insert location.  */
                          /* NOTE(review): bbs is set to the address of _BBS's elements rather
                             than a copy; presumably the caller keeps that storage alive for
                             the lifetime of this object -- confirm.  */
    8547      2205447 :   bbs = _bbs.address ();
    8548      2205447 :   nbbs = _bbs.length ();
    8549              : 
    8550     17680234 :   for (unsigned i = 0; i < nbbs; ++i)
    8551              :     {
    8552     15474787 :       if (i != 0)
    8553     20118615 :         for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
    8554      6849275 :              gsi_next (&si))
    8555              :           {
    8556      6849275 :             gphi *phi = si.phi ();
    8557      6849275 :             gimple_set_uid (phi, 0);
    8558      6849275 :             add_stmt (phi);
    8559              :           }
    8560     30949574 :       for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
    8561    137971545 :            !gsi_end_p (gsi); gsi_next (&gsi))
    8562              :         {
    8563    122496758 :           gimple *stmt = gsi_stmt (gsi);
    8564    122496758 :           gimple_set_uid (stmt, 0);
    8565    122496758 :           if (is_gimple_debug (stmt))
    8566     77082069 :             continue;
    8567     45414689 :           add_stmt (stmt);
    8568              :         }
    8569              :     }
    8570      2205447 : }
    8571              : 
    8572              : 
    8573              : /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
    8574              :    stmts in the basic block.  The body below resets the UID of every
                           statement in the region, and of the PHIs of all blocks except the
                           first, to -1, and releases the stmts, roots and remain vectors of
                           each entry in ROOTS followed by ROOTS itself.  */
    8575              : 
    8576      2205447 : _bb_vec_info::~_bb_vec_info ()
    8577              : {
    8578              :   /* Reset region marker.  */
    8579     17680234 :   for (unsigned i = 0; i < nbbs; ++i)
    8580              :     {
    8581     15474787 :       if (i != 0)
    8582     20134427 :         for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
    8583      6865087 :              gsi_next (&si))
    8584              :           {
    8585      6865087 :             gphi *phi = si.phi ();
    8586      6865087 :             gimple_set_uid (phi, -1);
    8587              :           }
    8588     30949574 :       for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
    8589    137914411 :            !gsi_end_p (gsi); gsi_next (&gsi))
    8590              :         {
    8591    122439624 :           gimple *stmt = gsi_stmt (gsi);
    8592    122439624 :           gimple_set_uid (stmt, -1);
    8593              :         }
    8594              :     }
    8595              : 
    8596      3448649 :   for (unsigned i = 0; i < roots.length (); ++i)
    8597              :     {
    8598      1243202 :       roots[i].stmts.release ();
    8599      1243202 :       roots[i].roots.release ();
    8600      1243202 :       roots[i].remain.release ();
    8601              :     }
    8602      2205447 :   roots.release ();
    8603      2205447 : }
    8604              : 
    8605              : /* Subroutine of vect_slp_analyze_node_operations.  Handle the root of NODE,
    8606              :    given that child nodes have already been processed, and that
    8607              :    their def types currently match their SLP node's def type.
                           For a permute node, return false if vectorizable_slp_permutation
                           fails or if vectorizable_live_operation fails for any non-null
                           scalar stmt that is STMT_VINFO_LIVE_P; otherwise set the node's
                           SLP_TREE_TYPE to permute_info_type and return true.  For any other
                           node return the result of vect_analyze_stmt.  */
    8608              : 
    8609              : static bool
    8610      2819368 : vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
    8611              :                                     slp_instance node_instance,
    8612              :                                     stmt_vector_for_cost *cost_vec)
    8613              : {
    8614              :   /* Handle purely internal nodes.  */
    8615      2819368 :   if (SLP_TREE_PERMUTE_P (node))
    8616              :     {
    8617       122699 :       if (!vectorizable_slp_permutation (vinfo, NULL, node, cost_vec))
    8618              :         return false;
    8619              : 
    8620              :       stmt_vec_info slp_stmt_info;
    8621              :       unsigned int i;
    8622       323678 :       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
    8623              :         {
    8624       202306 :           if (slp_stmt_info
    8625       196765 :               && STMT_VINFO_LIVE_P (slp_stmt_info)
    8626       202306 :               && !vectorizable_live_operation (vinfo, slp_stmt_info, node,
    8627              :                                                node_instance, i,
    8628              :                                                false, cost_vec))
    8629              :             return false;
    8630              :         }
    8631       121372 :       SLP_TREE_TYPE (node) = permute_info_type;
    8632       121372 :       return true;
    8633              :     }
    8634              : 
    8635      2696669 :   return vect_analyze_stmt (vinfo, node, node_instance, cost_vec);
    8636              : }
    8637              : 
                        /* qsort-style comparator for ints.  A_ and B_ point to the two ints
                           being compared; the result is their difference, i.e. negative, zero
                           or positive when *A_ is less than, equal to or greater than *B_.
                           NOTE(review): the subtraction can overflow for operands of opposite
                           sign and large magnitude.  The only use visible here sorts
                           basic-block indices, which are presumably non-negative -- confirm
                           before reusing this on arbitrary ints.  */
    8638              : static int
    8639      1860568 : sort_ints (const void *a_, const void *b_)
    8640              : {
    8641      1860568 :   int a = *(const int *)a_;
    8642      1860568 :   int b = *(const int *)b_;
    8643      1860568 :   return a - b;
    8644              : }
    8645              : 
    8646              : /* Verify if we can externalize a set of internal defs.  */
    8647              : 
    8648              : static bool
    8649       383274 : vect_slp_can_convert_to_external (const vec<stmt_vec_info> &stmts)
    8650              : {
    8651              :   /* Constant generation uses get_later_stmt which can only handle
    8652              :      defs from the same BB or a set of defs that can be ordered
    8653              :      with a dominance query.  */
    8654       383274 :   basic_block bb = NULL;
    8655       383274 :   bool all_same = true;
    8656       383274 :   auto_vec<int> bbs;
    8657       766548 :   bbs.reserve_exact (stmts.length ());
    8658      2073018 :   for (stmt_vec_info stmt : stmts)
    8659              :     {
    8660       923196 :       if (!stmt)
    8661              :         return false;
    8662       923196 :       else if (!bb)
    8663       383274 :         bb = gimple_bb (stmt->stmt);
    8664       539922 :       else if (gimple_bb (stmt->stmt) != bb)
    8665       174681 :         all_same = false;
    8666       923196 :       bbs.quick_push (gimple_bb (stmt->stmt)->index);
    8667              :     }
    8668       383274 :   if (all_same)
    8669              :     return true;
    8670              : 
    8671              :   /* Produce a vector of unique BB indexes for the defs.  */
    8672       130880 :   bbs.qsort (sort_ints);
    8673              :   unsigned i, j;
    8674       318876 :   for (i = 1, j = 1; i < bbs.length (); ++i)
    8675       187996 :     if (bbs[i] != bbs[j-1])
    8676       139650 :       bbs[j++] = bbs[i];
    8677       130880 :   gcc_assert (j >= 2);
    8678       130880 :   bbs.truncate (j);
    8679              : 
    8680       261760 :   if (bbs.length () == 2)
    8681       127360 :     return (dominated_by_p (CDI_DOMINATORS,
    8682       127360 :                             BASIC_BLOCK_FOR_FN (cfun, bbs[0]),
    8683       127360 :                             BASIC_BLOCK_FOR_FN (cfun, bbs[1]))
    8684       248076 :             || dominated_by_p (CDI_DOMINATORS,
    8685       120716 :                                BASIC_BLOCK_FOR_FN (cfun, bbs[1]),
    8686       120716 :                                BASIC_BLOCK_FOR_FN (cfun, bbs[0])));
    8687              : 
    8688              :   /* ???  For more than two BBs we can sort the vector and verify the
    8689              :      result is a total order.  But we can't use vec::qsort with a
    8690              :      compare function using a dominance query since there's no way to
    8691              :      signal failure and any fallback for an unordered pair would
    8692              :      fail qsort_chk later.
    8693              :      For now simply hope that ordering by BB index provides the
    8694              :      best candidate total order.  If required we can implement our
    8695              :      own mergesort or export an entry without checking.  */
    8696       399031 :   for (unsigned i = 1; i < bbs.length (); ++i)
    8697        12266 :     if (!dominated_by_p (CDI_DOMINATORS,
    8698        12266 :                          BASIC_BLOCK_FOR_FN (cfun, bbs[i]),
    8699        12266 :                          BASIC_BLOCK_FOR_FN (cfun, bbs[i-1])))
    8700              :       return false;
    8701              : 
    8702              :   return true;
    8703       383274 : }
    8704              : 
    8705              : /* Try to build NODE from scalars, returning true on success.
    8706              :    NODE_INSTANCE is the SLP instance that contains NODE.  */
    8707              : 
    8708              : static bool
    8709       562222 : vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
    8710              :                               slp_instance node_instance)
    8711              : {
    8712       562222 :   stmt_vec_info stmt_info;
    8713       562222 :   unsigned int i;
    8714              : 
    8715       562222 :   if (!is_a <bb_vec_info> (vinfo)
    8716        71191 :       || node == SLP_INSTANCE_TREE (node_instance)
    8717        22299 :       || !SLP_TREE_SCALAR_STMTS (node).exists ()
    8718        22258 :       || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node))
    8719              :       /* Force the mask use to be built from scalars instead.  */
    8720        20022 :       || VECTOR_BOOLEAN_TYPE_P (SLP_TREE_VECTYPE (node))
    8721       582045 :       || !vect_slp_can_convert_to_external (SLP_TREE_SCALAR_STMTS (node)))
    8722       542399 :     return false;
    8723              : 
    8724        19823 :   if (dump_enabled_p ())
    8725           76 :     dump_printf_loc (MSG_NOTE, vect_location,
    8726              :                      "Building vector operands of %p from scalars instead\n",
    8727              :                      (void *) node);
    8728              : 
    8729              :   /* Don't remove and free the child nodes here, since they could be
    8730              :      referenced by other structures.  The analysis and scheduling phases
    8731              :      (need to) ignore child nodes of anything that isn't vect_internal_def.  */
    8732        19823 :   unsigned int group_size = SLP_TREE_LANES (node);
    8733        19823 :   SLP_TREE_DEF_TYPE (node) = vect_external_def;
    8734              :   /* Invariants get their vector type from the uses.  */
    8735        19823 :   SLP_TREE_VECTYPE (node) = NULL_TREE;
    8736        19823 :   SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
    8737        19823 :   SLP_TREE_LOAD_PERMUTATION (node).release ();
    8738        68899 :   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    8739              :     {
    8740        49076 :       tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
    8741        49076 :       SLP_TREE_SCALAR_OPS (node)[i] = lhs;
    8742              :     }
    8743              :   return true;
    8744              : }
    8745              : 
    8746              : /* Return true if all elements of the slice are the same.  */
    8747              : bool
    8748       483923 : vect_scalar_ops_slice::all_same_p () const
    8749              : {
    8750       532064 :   for (unsigned int i = 1; i < length; ++i)
    8751       449430 :     if (!operand_equal_p (op (0), op (i)))
    8752              :       return false;
    8753              :   return true;
    8754              : }
    8755              : 
    8756              : hashval_t
    8757       406059 : vect_scalar_ops_slice_hash::hash (const value_type &s)
    8758              : {
    8759       406059 :   hashval_t hash = 0;
    8760      1560783 :   for (unsigned i = 0; i < s.length; ++i)
    8761      1154724 :     hash = iterative_hash_expr (s.op (i), hash);
    8762       406059 :   return hash;
    8763              : }
    8764              : 
    8765              : bool
    8766       220111 : vect_scalar_ops_slice_hash::equal (const value_type &s1,
    8767              :                                    const compare_type &s2)
    8768              : {
    8769       220111 :   if (s1.length != s2.length)
    8770              :     return false;
    8771       384682 :   for (unsigned i = 0; i < s1.length; ++i)
    8772       334547 :     if (!operand_equal_p (s1.op (i), s2.op (i)))
    8773              :       return false;
    8774              :   return true;
    8775              : }
    8776              : 
    8777              : /* Compute the prologue cost for invariant or constant operands represented
    8778              :    by NODE.  */
    8779              : 
    8780              : static void
    8781      1113057 : vect_prologue_cost_for_slp (vec_info *vinfo, slp_tree node,
    8782              :                             stmt_vector_for_cost *cost_vec)
    8783              : {
    8784              :   /* There's a special case of an existing vector, that costs nothing.  */
    8785      1113057 :   if (SLP_TREE_SCALAR_OPS (node).length () == 0
    8786      1113057 :       && !SLP_TREE_VEC_DEFS (node).is_empty ())
    8787         1569 :     return;
    8788              :   /* Without looking at the actual initializer a vector of
    8789              :      constants can be implemented as load from the constant pool.
    8790              :      When all elements are the same we can use a splat.  */
    8791      1111488 :   tree vectype = SLP_TREE_VECTYPE (node);
    8792      1111488 :   unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
    8793      1111488 :   unsigned HOST_WIDE_INT const_nunits;
    8794      1111488 :   unsigned nelt_limit;
    8795      1111488 :   unsigned nvectors = vect_get_num_copies (vinfo, node);
    8796      1111488 :   auto ops = &SLP_TREE_SCALAR_OPS (node);
    8797      1111488 :   auto_vec<unsigned int> starts (nvectors);
    8798      1111488 :   if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
    8799      1111488 :       && ! multiple_p (const_nunits, group_size))
    8800              :     {
    8801        64556 :       nelt_limit = const_nunits;
    8802        64556 :       hash_set<vect_scalar_ops_slice_hash> vector_ops;
    8803       268057 :       for (unsigned int i = 0; i < nvectors; ++i)
    8804       203501 :         if (!vector_ops.add ({ ops, i * nelt_limit, nelt_limit }))
    8805       153366 :           starts.quick_push (i * nelt_limit);
    8806        64556 :     }
    8807              :   else
    8808              :     {
    8809              :       /* If either the vector has variable length or the vectors
    8810              :          are composed of repeated whole groups we only need to
    8811              :          cost construction once.  All vectors will be the same.  */
    8812      1046932 :       nelt_limit = group_size;
    8813      1046932 :       starts.quick_push (0);
    8814              :     }
    8815              :   /* ???  We're just tracking whether vectors in a single node are the same.
    8816              :      Ideally we'd do something more global.  */
    8817      1111488 :   bool passed = false;
    8818      4534762 :   for (unsigned int start : starts)
    8819              :     {
    8820      1200298 :       vect_cost_for_stmt kind;
    8821      1200298 :       if (SLP_TREE_DEF_TYPE (node) == vect_constant_def)
    8822              :         kind = vector_load;
    8823       483923 :       else if (vect_scalar_ops_slice { ops, start, nelt_limit }.all_same_p ())
    8824              :         kind = scalar_to_vec;
    8825              :       else
    8826       401289 :         kind = vec_construct;
    8827              :       /* The target cost hook has no idea which part of the SLP node
    8828              :          we are costing so avoid passing it down more than once.  Pass
    8829              :          it to the first vec_construct or scalar_to_vec part since for those
    8830              :          the x86 backend tries to account for GPR to XMM register moves.  */
    8831      1200298 :       record_stmt_cost (cost_vec, 1, kind, nullptr,
    8832      1200298 :                         (kind != vector_load && !passed) ? node : nullptr,
    8833              :                         vectype, 0, vect_prologue);
    8834      1200298 :       if (kind != vector_load)
    8835       483923 :         passed = true;
    8836              :     }
    8837      1111488 : }
    8838              : 
    8839              : /* Analyze statements contained in SLP tree NODE after recursively analyzing
    8840              :    the subtree.  NODE_INSTANCE contains NODE and VINFO contains NODE_INSTANCE.
    8841              : 
    8842              :    Return true if the operations are supported.  */
    8843              : 
    8844              : static bool
    8845      5210616 : vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
    8846              :                                   slp_instance node_instance,
    8847              :                                   hash_set<slp_tree> &visited_set,
    8848              :                                   vec<slp_tree> &visited_vec,
    8849              :                                   stmt_vector_for_cost *cost_vec)
    8850              : {
    8851      5210616 :   int i, j;
    8852      5210616 :   slp_tree child;
    8853              : 
    8854              :   /* Assume we can code-generate all invariants.  */
    8855      5210616 :   if (!node
    8856      4836089 :       || SLP_TREE_DEF_TYPE (node) == vect_constant_def
    8857      4064937 :       || SLP_TREE_DEF_TYPE (node) == vect_external_def)
    8858              :     return true;
    8859              : 
    8860      3508730 :   if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
    8861              :     {
    8862            9 :       if (dump_enabled_p ())
    8863            0 :         dump_printf_loc (MSG_NOTE, vect_location,
    8864              :                          "Failed cyclic SLP reference in %p\n", (void *) node);
    8865            9 :       return false;
    8866              :     }
    8867      3508721 :   gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
    8868              : 
    8869              :   /* If we already analyzed the exact same set of scalar stmts we're done.
    8870              :      We share the generated vector stmts for those.  */
    8871      3508721 :   if (visited_set.add (node))
    8872              :     return true;
    8873      3131319 :   visited_vec.safe_push (node);
    8874              : 
    8875      3131319 :   bool res = true;
    8876      3131319 :   unsigned visited_rec_start = visited_vec.length ();
    8877      3131319 :   unsigned cost_vec_rec_start = cost_vec->length ();
    8878      3131319 :   bool seen_non_constant_child = false;
    8879      6724145 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    8880              :     {
    8881      3904593 :       res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
    8882              :                                               visited_set, visited_vec,
    8883              :                                               cost_vec);
    8884      3904593 :       if (!res)
    8885              :         break;
    8886      3592826 :       if (child && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
    8887      3592826 :         seen_non_constant_child = true;
    8888              :     }
    8889              :   /* We're having difficulties scheduling nodes with just constant
    8890              :      operands and no scalar stmts since we then cannot compute a stmt
    8891              :      insertion place.  */
    8892      3131319 :   if (res
    8893      3131319 :       && !seen_non_constant_child
    8894      3131319 :       && SLP_TREE_SCALAR_STMTS (node).is_empty ())
    8895              :     {
    8896          184 :       if (dump_enabled_p ())
    8897            6 :         dump_printf_loc (MSG_NOTE, vect_location,
    8898              :                          "Cannot vectorize all-constant op node %p\n",
    8899              :                          (void *) node);
    8900              :       res = false;
    8901              :     }
    8902              : 
    8903      3131135 :   if (res)
    8904      2819368 :     res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
    8905              :                                               cost_vec);
    8906              :   /* If analysis failed we have to pop all recursive visited nodes
    8907              :      plus ourselves.  */
    8908      3131319 :   if (!res)
    8909              :     {
    8910      2814716 :       while (visited_vec.length () >= visited_rec_start)
    8911       845136 :         visited_set.remove (visited_vec.pop ());
    8912       562222 :       cost_vec->truncate (cost_vec_rec_start);
    8913              :     }
    8914              : 
    8915              :   /* When the node can be vectorized, cost the invariant nodes it references.
    8916              :      This is not done in DFS order to allow the referring node
    8917              :      vectorizable_* calls to nail down the invariant nodes' vector type
    8918              :      and possibly unshare it if it needs a different vector type than
    8919              :      other referrers.  */
    8920      3131319 :   if (res)
    8921      5846458 :     FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
    8922      3277361 :       if (child
    8923      2969531 :           && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
    8924      2969531 :               || SLP_TREE_DEF_TYPE (child) == vect_external_def)
    8925              :           /* Perform usual caching, note code-generation still
    8926              :              code-gens these nodes multiple times but we expect
    8927              :              to CSE them later.  */
    8928      4480794 :           && !visited_set.add (child))
    8929              :         {
    8930      1158618 :           visited_vec.safe_push (child);
    8931              :           /* ???  After auditing more code paths make a "default"
    8932              :              and push the vector type from NODE to all children
    8933              :              if it is not already set.  */
    8934              :           /* Compute the number of vectors to be generated.  */
    8935      1158618 :           tree vector_type = SLP_TREE_VECTYPE (child);
    8936      1158618 :           if (!vector_type)
    8937              :             {
    8938              :               /* Masked loads can have an undefined (default SSA definition)
    8939              :                  else operand.  We do not need to cost it.  */
    8940        45561 :               vec<tree> ops = SLP_TREE_SCALAR_OPS (child);
    8941        46996 :               if (SLP_TREE_TYPE (node) == load_vec_info_type
    8942        46996 :                   && ((ops.length ()
    8943         1435 :                        && TREE_CODE (ops[0]) == SSA_NAME
    8944            0 :                        && SSA_NAME_IS_DEFAULT_DEF (ops[0])
    8945            0 :                        && VAR_P (SSA_NAME_VAR (ops[0])))
    8946         1435 :                       || SLP_TREE_DEF_TYPE (child) == vect_constant_def))
    8947         1435 :                 continue;
    8948              : 
    8949              :               /* For shifts with a scalar argument we don't need
    8950              :                  to cost or code-generate anything.
    8951              :                  ???  Represent this more explicitly.  */
    8952        44126 :               gcc_assert (SLP_TREE_TYPE (node) == shift_vec_info_type
    8953              :                           && j == 1);
    8954        44126 :               continue;
    8955        44126 :             }
    8956              : 
    8957              :           /* And cost them.  */
    8958      1113057 :           vect_prologue_cost_for_slp (vinfo, child, cost_vec);
    8959              :         }
    8960              : 
    8961              :   /* If this node or any of its children can't be vectorized, try pruning
    8962              :      the tree here rather than felling the whole thing.  */
    8963       562222 :   if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
    8964              :     {
    8965              :       /* We'll need to revisit this for invariant costing and for
    8966              :          setting the number of vectorized stmts.  */
    8967              :       res = true;
    8968              :     }
    8969              : 
    8970              :   return res;
    8971              : }
    8972              : 
    8973              : /* Mark lanes of NODE that are live outside of the basic-block vectorized
    8974              :    region and that can be vectorized using vectorizable_live_operation
    8975              :    with STMT_VINFO_LIVE_P.  Not handled live operations will cause the
    8976              :    scalar code computing it to be retained.  */
    8977              : 
    8978              : static void
    8979       920835 : vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
    8980              :                              slp_instance instance,
    8981              :                              stmt_vector_for_cost *cost_vec,
    8982              :                              hash_set<stmt_vec_info> &svisited,
    8983              :                              hash_set<slp_tree> &visited)
    8984              : {
    8985       920835 :   if (visited.add (node))
    8986        43724 :     return;
    8987              : 
    8988       877111 :   unsigned i;
    8989       877111 :   stmt_vec_info stmt_info;
    8990       877111 :   stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
    8991      3175599 :   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    8992              :     {
    8993      2298488 :       if (!stmt_info || svisited.contains (stmt_info))
    8994        56903 :         continue;
    8995      2272305 :       stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
    8996      2272305 :       if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
    8997        12060 :           && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
    8998              :         /* Only the pattern root stmt computes the original scalar value.  */
    8999         8975 :         continue;
    9000      2263330 :       if (!PURE_SLP_STMT (orig_stmt_info))
    9001              :         /* If the stmt is not part of the vector coverage because it or
    9002              :            uses of it are used by SLP graph leaves as external input, there
    9003              :            is no point in trying to live code-generate from a vector stmt,
    9004              :            as the scalar stmt will survive anyway.  */
    9005        21745 :         continue;
    9006      2241585 :       bool mark_visited = true;
    9007      2241585 :       gimple *orig_stmt = orig_stmt_info->stmt;
    9008      2241585 :       ssa_op_iter op_iter;
    9009      2241585 :       def_operand_p def_p;
    9010      4981613 :       FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
    9011              :         {
    9012              :           /* We have to verify whether we can insert the lane extract
    9013              :              before all uses.  The following is a conservative approximation.
    9014              :              We cannot put this into vectorizable_live_operation because
    9015              :              iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
    9016              :              doesn't work.
    9017              :              Note that while the fact that we emit code for loads at the
    9018              :              first load should make this a non-problem, leaves we construct
    9019              :              from scalars are vectorized after the last scalar def.
    9020              :              ???  If we'd actually compute the insert location during
    9021              :              analysis we could use something less conservative than the last
    9022              :              scalar stmt in the node for the dominance check.  */
    9023              :           /* ???  What remains is "live" uses in vector CTORs in the same
    9024              :              SLP graph which is where those uses can end up code-generated
    9025              :              right after their definition instead of close to their original
    9026              :              use.  But that would restrict us to code-generate lane-extracts
    9027              :              from the latest stmt in a node.  So we compensate for this
    9028              :              during code-generation, simply not replacing uses for those
    9029              :              hopefully rare cases.  */
    9030       498443 :           imm_use_iterator use_iter;
    9031       498443 :           gimple *use_stmt;
    9032       498443 :           stmt_vec_info use_stmt_info;
    9033              : 
    9034       498443 :           bool live_p = false;
    9035       498443 :           bool can_insert = true;
    9036      1921145 :           FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
    9037       940151 :             if (!is_gimple_debug (use_stmt)
    9038       940151 :                 && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
    9039       702228 :                     || !PURE_SLP_STMT (use_stmt_info)))
    9040              :               {
    9041       147351 :                 live_p = true;
    9042       147351 :                 if (!vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
    9043              :                   {
    9044        15892 :                     if (dump_enabled_p ())
    9045           46 :                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    9046              :                                        "Cannot determine insertion place for "
    9047              :                                        "lane extract\n");
    9048              :                     can_insert = false;
    9049              :                     break;
    9050              :                   }
    9051       498443 :               }
    9052       498443 :           if (live_p && can_insert)
    9053              :             {
    9054              :               /* Only record a live stmt when we can replace all uses.  We
    9055              :                  record from which SLP tree we vectorize the uses, so we'll
    9056              :                  cost once and can deal with the case that not all SLP nodes
    9057              :                  may be suitable for code-generation of all live uses.
    9058              :                  ???  But we never split up the work between multiple SLP
    9059              :                  nodes.  */
    9060        65831 :               STMT_VINFO_LIVE_P (stmt_info) = true;
    9061        65831 :               if (!vectorizable_live_operation (bb_vinfo, stmt_info, node,
    9062              :                                                 instance, i, false, cost_vec))
    9063              :                 {
    9064            0 :                   STMT_VINFO_LIVE_P (stmt_info) = false;
    9065            0 :                   mark_visited = false;
    9066              :                 }
    9067              :             }
    9068              :         }
    9069      2241585 :       if (mark_visited)
    9070      2241585 :         svisited.add (stmt_info);
    9071              :     }
    9072              : 
    9073              :   slp_tree child;
    9074      2535300 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    9075       888355 :     if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
    9076       237733 :       vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance, cost_vec,
    9077              :                                    svisited, visited);
    9078              : }
    9079              : 
    9080              : /* Traverse all slp instances of BB_VINFO, and mark lanes of every node that
    9081              :    are live outside of the basic-block vectorized region and that can be
    9082              :    vectorized using vectorizable_live_operation with STMT_VINFO_LIVE_P.  */
    9083              : 
    9084              : static void
    9085       236000 : vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo)
    9086              : {
    9087       236000 :   if (bb_vinfo->slp_instances.is_empty ())
    9088            0 :     return;
    9089              : 
    9090       236000 :   hash_set<slp_tree> visited;
    9091       236000 :   hash_set<stmt_vec_info> svisited;
    9092      1391102 :   for (slp_instance instance : bb_vinfo->slp_instances)
    9093              :     {
    9094       683102 :       if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
    9095        29096 :         STMT_VINFO_LIVE_P (SLP_INSTANCE_ROOT_STMTS (instance)[0]) = true;
    9096       683102 :       vect_location = instance->location ();
    9097       683102 :       vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
    9098              :                                    instance, &instance->cost_vec,
    9099              :                                    svisited, visited);
    9100              :     }
    9101       236000 : }
    9102              : 
    9103              : /* Determine whether we can vectorize the reduction epilogue for INSTANCE.  */
    9104              : 
    9105              : static bool
    9106        74913 : vectorizable_bb_reduc_epilogue (slp_instance instance,
    9107              :                                 stmt_vector_for_cost *cost_vec)
    9108              : {
    9109        74913 :   gassign *stmt = as_a <gassign *> (instance->root_stmts[0]->stmt);
    9110        74913 :   enum tree_code reduc_code = gimple_assign_rhs_code (stmt);
    9111        74913 :   if (reduc_code == MINUS_EXPR)
    9112            0 :     reduc_code = PLUS_EXPR;
    9113        74913 :   internal_fn reduc_fn;
    9114        74913 :   tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
    9115        74913 :   if (!vectype
    9116        74901 :       || !reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
    9117        74901 :       || reduc_fn == IFN_LAST
    9118        74901 :       || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH)
    9119       110479 :       || !useless_type_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
    9120        35566 :                                      TREE_TYPE (vectype)))
    9121              :     {
    9122        49928 :       if (dump_enabled_p ())
    9123          277 :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    9124              :                          "not vectorized: basic block reduction epilogue "
    9125              :                          "operation unsupported.\n");
    9126        49928 :       return false;
    9127              :     }
    9128              : 
    9129              :   /* There's no way to cost a horizontal vector reduction via REDUC_FN so
    9130              :      cost log2 vector operations plus shuffles and one extraction.  */
    9131        24985 :   unsigned steps = floor_log2 (vect_nunits_for_cost (vectype));
    9132        24985 :   record_stmt_cost (cost_vec, steps, vector_stmt, instance->root_stmts[0],
    9133              :                     vectype, 0, vect_body);
    9134        24985 :   record_stmt_cost (cost_vec, steps, vec_perm, instance->root_stmts[0],
    9135              :                     vectype, 0, vect_body);
    9136        24985 :   record_stmt_cost (cost_vec, 1, vec_to_scalar, instance->root_stmts[0],
    9137              :                     vectype, 0, vect_body);
    9138              : 
    9139              :   /* Since we replace all stmts of a possibly longer scalar reduction
    9140              :      chain account for the extra scalar stmts for that.  */
    9141        24985 :   if (!instance->remain_defs.is_empty ())
    9142        20262 :     record_stmt_cost (cost_vec, instance->remain_defs.length (), scalar_stmt,
    9143        10131 :                       instance->root_stmts[0], 0, vect_body);
    9144              :   return true;
    9145              : }
    9146              : 
    9147              : /* Prune from ROOTS all stmts that are computed as part of lanes of NODE
    9148              :    and recurse to children.  */
    9149              : 
    9150              : static void
    9151       189776 : vect_slp_prune_covered_roots (slp_tree node, hash_set<stmt_vec_info> &roots,
    9152              :                               hash_set<slp_tree> &visited)
    9153              : {
    9154       189776 :   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
    9155       189776 :       || visited.add (node))
    9156        83463 :     return;
    9157              : 
    9158              :   stmt_vec_info stmt;
    9159              :   unsigned i;
    9160       359932 :   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    9161       253619 :     if (stmt)
    9162       258946 :       roots.remove (vect_orig_stmt (stmt));
    9163              : 
    9164              :   slp_tree child;
    9165       234910 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    9166       128597 :     if (child)
    9167       127191 :       vect_slp_prune_covered_roots (child, roots, visited);
    9168              : }
    9169              : 
    9170              : /* Hand over COST_VEC to the target COSTS grouped by SLP node.  */
    9171              : 
    9172              : static void
    9173       944801 : add_slp_costs (vector_costs *costs, stmt_vector_for_cost& cost_vec)
    9174              : {
    9175      3579128 :   for (unsigned start = 0; start < cost_vec.length ();)
    9176              :     {
    9177      2634327 :       unsigned end = start + 1;
    9178      3214686 :       while (end < cost_vec.length ()
    9179      5493175 :              && cost_vec[start].node == cost_vec[end].node)
    9180       580359 :         end++;
    9181      2634327 :       costs->add_slp_cost (cost_vec[start].node,
    9182      2634327 :                            array_slice<stmt_info_for_cost>
    9183      2634327 :                              (cost_vec.begin () + start, end - start));
    9184      2634327 :       start = end;
    9185              :     }
    9186       944801 : }
    9187              : 
    9188              : /* Analyze statements in SLP instances of VINFO.  Return true if the
    9189              :    operations are supported.
                      :
                      :    When VINFO is a loop_vec_info the first unsupported instance makes
                      :    the function return false.  Otherwise an unsupported instance is
                      :    freed and removed from VINFO->slp_instances, and the nodes left in
                      :    the per-instance visited vector by vect_slp_analyze_node_operations
                      :    are reset and dropped from the visited set.  Afterwards instances
                      :    whose root stmt is covered by another instance are removed as
                      :    well, and the result is true iff at least one instance remains.  */
    9190              : 
    9191              : bool
    9192       662023 : vect_slp_analyze_operations (vec_info *vinfo)
    9193              : {
    9194       662023 :   slp_instance instance;
    9195       662023 :   int i;
    9196              : 
    9197       662023 :   DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
    9198              : 
    9199       662023 :   hash_set<slp_tree> visited;
    9200      1727792 :   for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
    9201              :     {
    9202      1306023 :       auto_vec<slp_tree> visited_vec;
    9203      1306023 :       stmt_vector_for_cost cost_vec;
    9204      1306023 :       cost_vec.create (2);
    9205      1306023 :       if (is_a <bb_vec_info> (vinfo))
    9206       783452 :         vect_location = instance->location ();
    9207      1306023 :       if (!vect_slp_analyze_node_operations (vinfo,
    9208              :                                              SLP_INSTANCE_TREE (instance),
    9209              :                                              instance, visited, visited_vec,
    9210              :                                              &cost_vec)
    9211              :           /* CTOR instances require vectorized defs for the SLP tree root.  */
    9212      1075382 :           || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor
    9213         5641 :               && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
    9214              :                   != vect_internal_def
    9215              :                   /* Make sure we vectorized with the expected type.  */
    9216         5641 :                   || !useless_type_conversion_p
    9217         5641 :                         (TREE_TYPE (TREE_TYPE (gimple_assign_rhs1
    9218              :                                               (instance->root_stmts[0]->stmt))),
    9219         5641 :                          TREE_TYPE (SLP_TREE_VECTYPE
    9220              :                                             (SLP_INSTANCE_TREE (instance))))))
    9221              :           /* Check we can vectorize the reduction.  */
    9222      1075367 :           || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc
    9223        74913 :               && !vectorizable_bb_reduc_epilogue (instance, &cost_vec))
    9224              :           /* Check we can vectorize the gcond.  */
    9225      2331462 :           || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_gcond
    9226        61107 :               && !vectorizable_early_exit (as_a <loop_vec_info> (vinfo),
    9227        61107 :                                            SLP_INSTANCE_ROOT_STMTS (instance)[0],
    9228              :                                            NULL,
    9229              :                                            SLP_INSTANCE_TREE (instance),
    9230              :                                            &cost_vec)))
    9231              :         {
    9232       339089 :           cost_vec.release ();
    9233       339089 :           slp_tree node = SLP_INSTANCE_TREE (instance);
    9234       339089 :           stmt_vec_info stmt_info;
    9235       339089 :           if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
    9236       256254 :             stmt_info = SLP_INSTANCE_ROOT_STMTS (instance)[0];
    9237        82835 :           else if (!SLP_TREE_SCALAR_STMTS (node).is_empty ()
    9238        82835 :                    && SLP_TREE_SCALAR_STMTS (node)[0])
    9239              :             stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
    9240              :           else
    9241            0 :             stmt_info = SLP_TREE_REPRESENTATIVE (node);
    9242       339089 :           if (is_a <loop_vec_info> (vinfo))
    9243              :             {
    9244       240254 :               if (dump_enabled_p ())
    9245         6485 :                 dump_printf_loc (MSG_NOTE, vect_location,
    9246              :                                  "unsupported SLP instance starting from: %G",
    9247              :                                  stmt_info->stmt);
    9248       240254 :               return false;
    9249              :             }
    9250        98835 :           if (dump_enabled_p ())
    9251          331 :             dump_printf_loc (MSG_NOTE, vect_location,
    9252              :                              "removing SLP instance operations starting from: %G",
    9253              :                              stmt_info->stmt);
    9254       540692 :           while (!visited_vec.is_empty ())
    9255              :             {
    9256       441857 :               slp_tree node = visited_vec.pop ();
    9257       441857 :               SLP_TREE_TYPE (node) = undef_vec_info_type;
    9258       441857 :               if (node->data)
    9259              :                 {
    9260        12131 :                   delete node->data;
    9261        12131 :                   node->data = nullptr;
    9262              :                 }
    9263       441857 :               visited.remove (node);
    9264              :             }
    9265        98835 :           vect_free_slp_instance (instance);
    9266        98835 :           vinfo->slp_instances.ordered_remove (i);
    9267              :         }
    9268              :       else
    9269              :         {
    9270       966934 :           i++;
    9271       966934 :           if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
    9272              :             {
    9273       282317 :               add_slp_costs (loop_vinfo->vector_costs, cost_vec);
    9274       282317 :               cost_vec.release ();
    9275              :             }
    9276              :           else
    9277              :             /* For BB vectorization remember the SLP graph entry
    9278              :                cost for later.  */
    9279       684617 :             instance->cost_vec = cost_vec;
    9280              :         }
    9281      1306023 :     }
    9282              : 
    9283              :   /* Now look for SLP instances with a root that are covered by other
    9284              :      instances and remove them.  ROOTS starts out with the first root
                      :      stmt of every instance that has root stmts.  The walk below then
                      :      removes from ROOTS the vect_orig_stmt of every scalar stmt found
                      :      in the vect_internal_def nodes of all SLP trees, so an instance
                      :      whose first root stmt is no longer in ROOTS gets removed.  */
    9285       421769 :   hash_set<stmt_vec_info> roots;
    9286      1741656 :   for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
    9287       931319 :     if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
    9288        33201 :       roots.add (SLP_INSTANCE_ROOT_STMTS (instance)[0]);
    9289       421769 :   if (!roots.is_empty ())
    9290              :     {
    9291        13186 :       visited.empty ();
    9292        75771 :       for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
    9293        62585 :         vect_slp_prune_covered_roots (SLP_INSTANCE_TREE (instance), roots,
    9294              :                                       visited);
    9295        75771 :       for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
    9296        62585 :         if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()
    9297        33201 :             && !roots.contains (SLP_INSTANCE_ROOT_STMTS (instance)[0]))
    9298              :           {
    9299         1515 :             stmt_vec_info root = SLP_INSTANCE_ROOT_STMTS (instance)[0];
    9300         1515 :             if (dump_enabled_p ())
    9301           20 :               dump_printf_loc (MSG_NOTE, vect_location,
    9302              :                                "removing SLP instance operations starting "
    9303              :                                "from: %G", root->stmt);
    9304         1515 :             vect_free_slp_instance (instance);
    9305         1515 :             vinfo->slp_instances.ordered_remove (i);
    9306              :           }
    9307              :         else
    9308        61070 :           ++i;
    9309              :     }
    9310              : 
    9311       843538 :   return !vinfo->slp_instances.is_empty ();
    9312      1083792 : }
    9313              : 
    9314              : /* Get the SLP instance leader from INSTANCE_LEADER thereby transitively
    9315              :    closing the eventual chain.
                      :
                      :    Starting at INSTANCE the map is followed until an instance that is
                      :    mapped to itself is found; that instance is returned.  Every map
                      :    entry passed on the way is then rewritten to point directly at it.
                      :    The lookup result is dereferenced without a check, so each instance
                      :    on the chain is expected to have an entry in INSTANCE_LEADER.  */
    9316              : 
    9317              : static slp_instance
    9318       753763 : get_ultimate_leader (slp_instance instance,
    9319              :                      hash_map<slp_instance, slp_instance> &instance_leader)
    9320              : {
    9321       753763 :   auto_vec<slp_instance *, 8> chain;
    9322       753763 :   slp_instance *tem;
    9323       838223 :   while (*(tem = instance_leader.get (instance)) != instance)
    9324              :     {
    9325        84460 :       chain.safe_push (tem);
    9326        84460 :       instance = *tem;
    9327              :     }
    9328       838223 :   while (!chain.is_empty ())
    9329        84460 :     *chain.pop () = instance;
    9330       753763 :   return instance;
    9331       753763 : }
    9332              : 
    9333              : namespace {
    9334              : /* Subroutine of vect_bb_partition_graph_r.  Map KEY to INSTANCE in
    9335              :    KEY_TO_INSTANCE, making INSTANCE the leader of any previous mapping
    9336              :    for KEY.  Return true if KEY was already in KEY_TO_INSTANCE.
                      :
                      :    More precisely, when KEY was mapped to a different instance, the
                      :    ultimate leader of that instance is looked up and, unless it
                      :    already is INSTANCE, gets INSTANCE recorded as its leader.  KEY is
                      :    mapped to INSTANCE in all cases.
    9337              : 
    9338              :    INSTANCE_LEADER is as for get_ultimate_leader.  */
    9339              : 
    9340              : template<typename T>
    9341              : bool
    9342      3326223 : vect_map_to_instance (slp_instance instance, T key,
    9343              :                       hash_map<T, slp_instance> &key_to_instance,
    9344              :                       hash_map<slp_instance, slp_instance> &instance_leader)
    9345              : {
    9346              :   bool existed_p;
    9347      3326223 :   slp_instance &key_instance = key_to_instance.get_or_insert (key, &existed_p);
    9348      3326223 :   if (!existed_p)
    9349              :     ;
    9350       182591 :   else if (key_instance != instance)
    9351              :     {
    9352              :       /* If we're running into a previously marked key make us the
    9353              :          leader of the current ultimate leader.  This keeps the
    9354              :          leader chain acyclic and works even when the current instance
    9355              :          connects two previously independent graph parts.  */
    9356        70661 :       slp_instance key_leader
    9357        70661 :         = get_ultimate_leader (key_instance, instance_leader);
    9358        70661 :       if (key_leader != instance)
    9359        20843 :         instance_leader.put (key_leader, instance);
    9360              :     }
    9361      3326223 :   key_instance = instance;
    9362      3326223 :   return existed_p;
    9363              : }
    9364              : }
    9365              : 
    9366              : /* Worker of vect_bb_partition_graph, recurse on NODE.
                      :    Every non-null scalar stmt of NODE and then NODE itself is mapped to
                      :    INSTANCE with vect_map_to_instance.  If NODE already had a mapping
                      :    the walk stops here; otherwise it continues into the non-null
                      :    children whose def type is vect_internal_def.  BB_VINFO is only
                      :    handed on to the recursive calls.  */
    9367              : 
    9368              : static void
    9369       920835 : vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
    9370              :                            slp_instance instance, slp_tree node,
    9371              :                            hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
    9372              :                            hash_map<slp_tree, slp_instance> &node_to_instance,
    9373              :                            hash_map<slp_instance, slp_instance> &instance_leader)
    9374              : {
    9375       920835 :   stmt_vec_info stmt_info;
    9376       920835 :   unsigned i;
    9377              : 
    9378      3326223 :   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    9379      2405388 :     if (stmt_info)
    9380      2405388 :       vect_map_to_instance (instance, stmt_info, stmt_to_instance,
    9381              :                             instance_leader);
    9382              : 
    9383       920835 :   if (vect_map_to_instance (instance, node, node_to_instance,
    9384              :                             instance_leader))
    9385       920835 :     return;
    9386              : 
    9387              :   slp_tree child;
    9388      1765466 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    9389       888355 :     if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
    9390       237733 :       vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
    9391              :                                  node_to_instance, instance_leader);
    9392              : }
    9393              : 
    9394              : /* Partition the SLP graph into pieces that can be costed independently.
                      :    On return every SLP instance of BB_VINFO has been pushed onto the
                      :    subgraph_entries vector of its ultimate leader; an instance that is
                      :    its own leader lists itself there.  */
    9395              : 
    9396              : static void
    9397       236000 : vect_bb_partition_graph (bb_vec_info bb_vinfo)
    9398              : {
    9399       236000 :   DUMP_VECT_SCOPE ("vect_bb_partition_graph");
    9400              : 
    9401              :   /* First walk the SLP graph assigning each involved scalar stmt a
    9402              :      corresponding SLP graph entry and upon visiting a previously
    9403              :      marked stmt, make the stmt's leader the current SLP graph entry.  */
    9404       236000 :   hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
    9405       236000 :   hash_map<slp_tree, slp_instance> node_to_instance;
    9406       236000 :   hash_map<slp_instance, slp_instance> instance_leader;
    9407       236000 :   slp_instance instance;
    9408       919102 :   for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
    9409              :     {
    9410       683102 :       instance_leader.put (instance, instance);
    9411       683102 :       vect_bb_partition_graph_r (bb_vinfo,
    9412              :                                  instance, SLP_INSTANCE_TREE (instance),
    9413              :                                  stmt_to_instance, node_to_instance,
    9414              :                                  instance_leader);
    9415              :     }
    9416              : 
    9417              :   /* Then collect entries to each independent subgraph.  */
    9418      1155102 :   for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
    9419              :     {
    9420       683102 :       slp_instance leader = get_ultimate_leader (instance, instance_leader);
    9421       683102 :       leader->subgraph_entries.safe_push (instance);
    9422       683102 :       if (dump_enabled_p ()
    9423       683102 :           && leader != instance)
    9424           69 :         dump_printf_loc (MSG_NOTE, vect_location,
    9425              :                          "instance %p is leader of %p\n",
    9426              :                          (void *) leader, (void *) instance);
    9427              :     }
    9428       236000 : }
    9429              : 
    9430              : /* Compute the scalar cost of the stmts reachable from WORKLIST and
    9431              :    record it in COST_VEC.  Stmts are popped from WORKLIST until it is
    9432              :    empty.  A stmt that is not PURE_SLP_STMT is skipped, and so is a stmt
                      :    whose vect_stmt_to_vectorize is not marked live but one of whose
                      :    defs has a non-debug use in a stmt that is unknown to VINFO or not
                      :    pure SLP.  For every other stmt one cost entry is recorded and the
                      :    stmt is marked via gimple_set_visited, except for nop conversions
                      :    and single-argument PHIs.  The defs of its SLP child operands that
                      :    VINFO knows about are then pushed onto WORKLIST unless they are
                      :    already in VISITED.  */
    9433              : 
    9434              : static void
    9435       679602 : vect_bb_slp_scalar_cost (bb_vec_info vinfo,
    9436              :                          vec<stmt_vec_info> &worklist,
    9437              :                          stmt_vector_for_cost *cost_vec,
    9438              :                          hash_set<stmt_vec_info> &visited)
    9439              : {
    9440      3155976 :   while (!worklist.is_empty ())
    9441              :     {
    9442      2476374 :       stmt_vec_info stmt = worklist.pop ();
    9443      2766805 :       if (!PURE_SLP_STMT (stmt))
    9444       306076 :         continue;
    9445              : 
    9446              :       /* When the stmt is live but not actually vectorized we have
    9447              :          to keep the feeding scalar defs.  */
    9448      2188839 :       if (!STMT_VINFO_LIVE_P (vect_stmt_to_vectorize (stmt)))
    9449              :         {
    9450      2121835 :           bool live_p = false;
    9451      2121835 :           ssa_op_iter op_iter;
    9452      2121835 :           def_operand_p def_p;
    9453      4645141 :           FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt->stmt, op_iter, SSA_OP_DEF)
    9454              :             {
    9455       401471 :               imm_use_iterator use_iter;
    9456       401471 :               gimple *use_stmt;
    9457      1451120 :               FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
    9458       648178 :                 if (!is_gimple_debug (use_stmt))
    9459              :                   {
    9460       479421 :                     stmt_vec_info use_stmt_info = vinfo->lookup_stmt (use_stmt);
    9461       479421 :                     if (!use_stmt_info || !PURE_SLP_STMT (use_stmt_info))
    9462              :                       {
    9463        24420 :                         if (dump_enabled_p ())
    9464              :                           {
    9465           36 :                             dump_printf_loc (MSG_NOTE, vect_location,
    9466              :                                              "stmt considered live: %G",
    9467              :                                              stmt->stmt);
    9468           36 :                             dump_printf_loc (MSG_NOTE, vect_location,
    9469              :                                              "because of use in: %G",
    9470              :                                              use_stmt);
    9471              :                           }
    9472              :                         live_p = true;
    9473              :                       }
    9474       401471 :                   }
    9475              :             }
    9476      2121835 :           if (live_p)
    9477        15645 :             continue;
    9478              :         }
    9479              : 
    9480              :       /* The following assert verifies that vect_bb_partition_graph
    9481              :          partitions the SLP graph in a way that each scalar stmt of
    9482              :          the coverage of the SLP graph belongs to exactly one subgraph.
    9483              :          ???  This is currently not guaranteed since the function
    9484              :          works purely on SLP_TREE_SCALAR_STMTS, resulting in the assert
    9485              :          tripping or scalar stmts costed multiple times, making vectorization
    9486              :          more profitable than it really is.  */
    9487              :       /* gcc_checking_assert (!gimple_visited_p (stmt->stmt)); */
    9488              : 
    9489      2170298 :       if (vect_nop_conversion_p (stmt))
    9490              :         ;
    9491              :       /* For single-argument PHIs assume coalescing which means zero
    9492              :          cost for the scalar and the vector PHIs.  This avoids
    9493              :          artificially favoring the vector path (but may pessimize it
    9494              :          in some cases).  */
    9495      2148939 :       else if (is_a <gphi *> (stmt->stmt)
    9496      2148939 :                && gimple_phi_num_args (as_a <gphi *> (stmt->stmt)) == 1)
    9497              :         ;
    9498              :       else
    9499              :         {
    9500      2140082 :           vect_cost_for_stmt kind;
    9501      2140082 :           if (STMT_VINFO_DATA_REF (stmt))
    9502              :             {
    9503      1963897 :               data_reference_p dr = STMT_VINFO_DATA_REF (stmt);
    9504      1963897 :               tree base = get_base_address (DR_REF (dr));
    9505              :               /* When the scalar access is to a non-global not
    9506              :                  address-taken decl that is not BLKmode assume we can
    9507              :                  access it with a single non-load/store instruction.  */
    9508      1963897 :               if (DECL_P (base)
    9509      1514036 :                   && !is_global_var (base)
    9510      1438187 :                   && !TREE_ADDRESSABLE (base)
    9511      2512939 :                   && DECL_MODE (base) != BLKmode)
    9512              :                 kind = scalar_stmt;
    9513      1820792 :               else if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt)))
    9514              :                 kind = scalar_load;
    9515              :               else
    9516      1593384 :                 kind = scalar_store;
    9517              :             }
    9518              :           else
    9519              :             kind = scalar_stmt;
    9520              :           /* Cost each scalar stmt only once.  */
    9521      2140082 :           gimple_set_visited (stmt->stmt, true);
    9522      2140082 :           record_stmt_cost (cost_vec, 1, kind, stmt, NULL_TREE, 0, vect_body);
    9523              :         }
    9524              : 
    9525              :       /* Now walk relevant parts of the SSA use-def graph.  */
    9526      2170298 :       slp_oprnds child_ops (stmt);
    9527      4553796 :       for (unsigned i = 0; i < child_ops.num_slp_children; ++i)
    9528              :         {
    9529      2383498 :           tree op = child_ops.get_op_for_slp_child (stmt, i);
    9530      2383498 :           stmt_vec_info def = vinfo->lookup_def (op);
    9531      2383498 :           if (def && !visited.add (def))
    9532       697562 :             worklist.safe_push (def);
    9533              :         }
    9534              :     }
    9535       679602 : }
    9536              : 
    9537              : 
    9538              : /* Comparator for the loop-index sorted cost vectors.  A_ and B_ point
                      :    to std::pair<unsigned, stmt_info_for_cost *> elements which are
                      :    ordered by their first member only; the result is -1, 0 or 1.  The
                      :    third argument is unused.  */
    9539              : 
    9540              : static int
    9541     17038004 : li_cost_vec_cmp (const void *a_, const void *b_, void *)
    9542              : {
    9543     17038004 :   auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
    9544     17038004 :   auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
    9545     17038004 :   if (a->first < b->first)
    9546              :     return -1;
    9547     16196014 :   else if (a->first == b->first)
    9548     15491323 :     return 0;
    9549              :   return 1;
    9550              : }
    9551              : 
    9552              : /* Check if vectorization of the basic block is profitable for the
    9553              :    subgraph denoted by SLP_INSTANCES.  */
    9554              : 
    9555              : static bool
    9556       658896 : vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
    9557              :                                     vec<slp_instance> slp_instances,
    9558              :                                     loop_p orig_loop)
    9559              : {
    9560       658896 :   slp_instance instance;
    9561       658896 :   int i;
    9562       658896 :   unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
    9563       658896 :   unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
    9564              : 
    9565       658896 :   if (dump_enabled_p ())
    9566              :     {
    9567           98 :       dump_printf_loc (MSG_NOTE, vect_location, "Costing subgraph: \n");
    9568           98 :       hash_set<slp_tree> visited;
    9569          395 :       FOR_EACH_VEC_ELT (slp_instances, i, instance)
    9570          101 :         vect_print_slp_graph (MSG_NOTE, vect_location,
    9571              :                               SLP_INSTANCE_TREE (instance), visited);
    9572           98 :     }
    9573              : 
    9574              :   /* Then DFS walk scalar stmts, performing costing and handling
    9575              :      still live scalar stmts via the previously computed vector coverage.  */
    9576       658896 :   stmt_vector_for_cost scalar_costs = vNULL;
    9577       658896 :   stmt_vector_for_cost vector_costs = vNULL;
    9578       658896 :   hash_set<slp_tree> visited;
    9579       658896 :   hash_set<stmt_vec_info> svisited;
    9580      1338498 :   FOR_EACH_VEC_ELT (slp_instances, i, instance)
    9581              :     {
    9582       679602 :       auto_vec<stmt_vec_info> worklist;
    9583       679602 :       if (SLP_INSTANCE_ROOT_STMTS (instance).exists ())
    9584        57632 :         record_stmt_cost (&scalar_costs,
    9585        28816 :                           SLP_INSTANCE_ROOT_STMTS (instance).length (),
    9586              :                           scalar_stmt,
    9587        28816 :                           SLP_INSTANCE_ROOT_STMTS (instance)[0], 0, vect_body);
    9588      3825954 :       for (auto stmt : SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance)))
    9589              :         {
    9590      1787148 :           stmt = vect_orig_stmt (stmt);
    9591      1787148 :           if (!svisited.add (stmt))
    9592      1778812 :             worklist.safe_push (stmt);
    9593              :         }
    9594       679602 :       vect_bb_slp_scalar_cost (bb_vinfo, worklist, &scalar_costs, svisited);
    9595       679602 :       vector_costs.safe_splice (instance->cost_vec);
    9596       679602 :       instance->cost_vec.release ();
    9597       679602 :     }
    9598              : 
    9599       658896 :   if (dump_enabled_p ())
    9600           98 :     dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
    9601              : 
    9602              :   /* When costing non-loop vectorization we need to consider each covered
    9603              :      loop independently and make sure vectorization is profitable.  For
    9604              :      now we assume a loop may be not entered or executed an arbitrary
    9605              :      number of iterations (???  static information can provide more
    9606              :      precise info here) which means we can simply cost each containing
    9607              :      loops stmts separately.  */
    9608              : 
    9609              :   /* First produce cost vectors sorted by loop index.  */
    9610       658896 :   auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
    9611       658896 :     li_scalar_costs (scalar_costs.length ());
    9612       658896 :   auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
    9613       658896 :     li_vector_costs (vector_costs.length ());
    9614       658896 :   stmt_info_for_cost *cost;
    9615      2827794 :   FOR_EACH_VEC_ELT (scalar_costs, i, cost)
    9616              :     {
    9617      2168898 :       unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
    9618      2168898 :       li_scalar_costs.quick_push (std::make_pair (l, cost));
    9619              :     }
    9620              :   /* Use a random used loop as fallback in case the first vector_costs
    9621              :      entry does not have a stmt_info associated with it.  */
    9622       658896 :   unsigned l = li_scalar_costs[0].first;
    9623      2408453 :   FOR_EACH_VEC_ELT (vector_costs, i, cost)
    9624              :     {
    9625              :       /* We inherit from the previous COST, invariants, externals and
    9626              :          extracts immediately follow the cost for the related stmt.  */
    9627      1749557 :       if (cost->stmt_info)
    9628      1028078 :         l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
    9629      1749557 :       li_vector_costs.quick_push (std::make_pair (l, cost));
    9630              :     }
    9631       658896 :   li_scalar_costs.stablesort (li_cost_vec_cmp, NULL);
    9632       658896 :   li_vector_costs.stablesort (li_cost_vec_cmp, NULL);
    9633              : 
    9634              :   /* Now cost the portions individually.  */
    9635              :   unsigned vi = 0;
    9636              :   unsigned si = 0;
    9637      1143503 :   bool profitable = true;
    9638      1143503 :   while (si < li_scalar_costs.length ()
    9639      1807037 :          && vi < li_vector_costs.length ())
    9640              :     {
    9641       663522 :       unsigned sl = li_scalar_costs[si].first;
    9642       663522 :       unsigned vl = li_vector_costs[vi].first;
    9643       663522 :       if (sl != vl)
    9644              :         {
    9645         1038 :           if (dump_enabled_p ())
    9646            0 :             dump_printf_loc (MSG_NOTE, vect_location,
    9647              :                              "Scalar %d and vector %d loop part do not "
    9648              :                              "match up, skipping scalar part\n", sl, vl);
    9649              :           /* Skip the scalar part, assuming zero cost on the vector side.  */
    9650         1708 :           do
    9651              :             {
    9652         1708 :               si++;
    9653              :             }
    9654         1708 :           while (si < li_scalar_costs.length ()
    9655         3533 :                  && li_scalar_costs[si].first == sl);
    9656         1038 :           continue;
    9657              :         }
    9658              : 
    9659       662484 :       class vector_costs *scalar_target_cost_data = init_cost (bb_vinfo, true);
    9660      2149632 :       do
    9661              :         {
    9662      2149632 :           add_stmt_cost (scalar_target_cost_data, li_scalar_costs[si].second);
    9663      2149632 :           si++;
    9664              :         }
    9665      2149632 :       while (si < li_scalar_costs.length ()
    9666      4306840 :              && li_scalar_costs[si].first == sl);
    9667       662484 :       scalar_target_cost_data->finish_cost (nullptr);
    9668       662484 :       scalar_cost = scalar_target_cost_data->body_cost ();
    9669              : 
    9670              :       /* Complete the target-specific vector cost calculation.  */
    9671       662484 :       class vector_costs *vect_target_cost_data = init_cost (bb_vinfo, false);
    9672       662484 :       auto_vec<stmt_info_for_cost> tem;
    9673      1721602 :       do
    9674              :         {
    9675      1721602 :           tem.safe_push (*li_vector_costs[vi].second);
    9676      1721602 :           vi++;
    9677              :         }
    9678      1721602 :       while (vi < li_vector_costs.length ()
    9679      3451940 :              && li_vector_costs[vi].first == vl);
    9680       662484 :       add_slp_costs (vect_target_cost_data, tem);
    9681       662484 :       vect_target_cost_data->finish_cost (scalar_target_cost_data);
    9682       662484 :       vec_prologue_cost = vect_target_cost_data->prologue_cost ();
    9683       662484 :       vec_inside_cost = vect_target_cost_data->body_cost ();
    9684       662484 :       vec_epilogue_cost = vect_target_cost_data->epilogue_cost ();
    9685       662484 :       delete scalar_target_cost_data;
    9686       662484 :       delete vect_target_cost_data;
    9687              : 
    9688       662484 :       vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
    9689              : 
    9690       662484 :       if (dump_enabled_p ())
    9691              :         {
    9692           98 :           dump_printf_loc (MSG_NOTE, vect_location,
    9693              :                            "Cost model analysis for part in loop %d:\n", sl);
    9694           98 :           dump_printf (MSG_NOTE, "  Vector cost: %d\n",
    9695              :                        vec_inside_cost + vec_outside_cost);
    9696           98 :           dump_printf (MSG_NOTE, "  Scalar cost: %d\n", scalar_cost);
    9697              :         }
    9698              : 
    9699              :       /* Vectorization is profitable if its cost is not more than the cost of
    9700              :          the scalar version.  Note that we err on the vector side for equal
    9701              :          cost because the cost estimate is otherwise quite pessimistic
    9702              :          (constant uses are free on the scalar side but cost a load on the
    9703              :          vector side for example).  */
    9704       662484 :       if (vec_outside_cost + vec_inside_cost > scalar_cost)
    9705              :         {
    9706       178915 :           profitable = false;
    9707       178915 :           break;
    9708              :         }
    9709       483569 :     }
    9710       658896 :   if (profitable && vi < li_vector_costs.length ())
    9711              :     {
    9712         1082 :       if (dump_enabled_p ())
    9713           12 :         dump_printf_loc (MSG_NOTE, vect_location,
    9714              :                          "Excess vector cost for part in loop %d:\n",
    9715            6 :                          li_vector_costs[vi].first);
    9716              :       profitable = false;
    9717              :     }
    9718              : 
    9719              :   /* Unset visited flag.  This is delayed when the subgraph is profitable
    9720              :      and we process the loop for remaining unvectorized if-converted code.  */
    9721       658896 :   if (!orig_loop || !profitable)
    9722      2826375 :     FOR_EACH_VEC_ELT (scalar_costs, i, cost)
    9723      2167582 :       gimple_set_visited  (cost->stmt_info->stmt, false);
    9724              : 
    9725       658896 :   scalar_costs.release ();
    9726       658896 :   vector_costs.release ();
    9727              : 
    9728       658896 :   return profitable;
    9729       658896 : }
    9730              : 
    9731              : /* qsort comparator for lane defs.  A_ and B_ point to
                      :    std::pair<unsigned, tree> entries which are ordered by their first
                      :    member, the lane index.  The result is the unsigned difference of
                      :    the two lane indices converted to int, so the ordering is only
                      :    meaningful while that difference is representable as an int.  */
    9732              : 
    9733              : static int
    9734           40 : vld_cmp (const void *a_, const void *b_)
    9735              : {
    9736           40 :   auto *a = (const std::pair<unsigned, tree> *)a_;
    9737           40 :   auto *b = (const std::pair<unsigned, tree> *)b_;
    9738           40 :   return a->first - b->first;
    9739              : }
    9740              : 
    9741              : /* Return true if USE_STMT is a vector lane insert into VEC and set
    9742              :    *THIS_LANE to the lane number that is set.  USE_STMT has to be an
                      :    assignment with rhs code BIT_INSERT_EXPR.  When VEC is NULL_TREE the
                      :    vector operand (rhs1) of USE_STMT is not compared against VEC but
                      :    used in its place for the remaining checks.  The type of the
                      :    inserted value (rhs2) has to pass useless_type_conversion_p against
                      :    the element type of the vector and the insert position (rhs3) has
                      :    to be a constant multiple of the element's TYPE_SIZE;
                      :    constant_multiple_p is handed THIS_LANE to receive that multiple.  */
    9743              : 
    9744              : static bool
    9745          248 : vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
    9746              : {
    9747          248 :   gassign *use_ass = dyn_cast <gassign *> (use_stmt);
    9748           91 :   if (!use_ass
    9749           91 :       || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
    9750           22 :       || (vec
    9751           22 :           ? gimple_assign_rhs1 (use_ass) != vec
    9752           24 :           : ((vec = gimple_assign_rhs1 (use_ass)), false))
    9753           46 :       || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
    9754           46 :                                      TREE_TYPE (gimple_assign_rhs2 (use_ass)))
    9755           46 :       || !constant_multiple_p
    9756           46 :             (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
    9757           92 :              tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
    9758              :              this_lane))
    9759          202 :     return false;
    9760              :   return true;
    9761              : }
    9762              : 
    9763              : /* Find SLP roots in the region described by BB_VINFO and record them in
    9764              :    BB_VINFO->roots.  Three kinds of assignments are matched: vector
                      :    CONSTRUCTORs whose elements are all SSA names for which
                      :    BB_VINFO->lookup_def succeeds, chains of BIT_INSERT_EXPRs that
                      :    together set every lane of a vector (both recorded as
                      :    slp_inst_kind_ctor), and the ends of scalar association chains for
                      :    which a reduction function exists (recorded as
                      :    slp_inst_kind_bb_reduc).  */
    9765              : 
    9766              : static void
    9767      2205447 : vect_slp_check_for_roots (bb_vec_info bb_vinfo)
    9768              : {
    9769     17680234 :   for (unsigned i = 0; i < bb_vinfo->nbbs; ++i)
    9770     30949574 :     for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
    9771    137971545 :          !gsi_end_p (gsi); gsi_next (&gsi))
    9772              :     {
    9773    122496758 :       gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
    9774              :       /* Non-assignments are skipped.  This is the place where SLP discovery
    9775              :          for BB early breaks could be started when we get that far.  */
    9776    122496758 :       if (!assign)
    9777    184803758 :         continue;
    9778              : 
    9779     31028261 :       tree rhs = gimple_assign_rhs1 (assign);
    9780     31028261 :       enum tree_code code = gimple_assign_rhs_code (assign);
    9781     31028261 :       use_operand_p use_p;
    9782     31028261 :       gimple *use_stmt;
    9783     31028261 :       if (code == CONSTRUCTOR)
    9784              :         {
    9785      1597947 :           if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
    9786        63480 :               || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
    9787        92543 :                            CONSTRUCTOR_NELTS (rhs))
    9788        42948 :               || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
    9789      1640891 :               || uniform_vector_p (rhs))
    9790      1585095 :             continue;
    9791              : 
    9792              :           unsigned j;
    9793              :           tree val;
    9794        63645 :           FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
    9795        50793 :             if (TREE_CODE (val) != SSA_NAME
    9796        50793 :                 || !bb_vinfo->lookup_def (val))
    9797              :               break;
    9798        31736 :           if (j != CONSTRUCTOR_NELTS (rhs))
    9799         3016 :             continue;
    9800              : 
    9801        12852 :           vec<stmt_vec_info> roots = vNULL;
    9802        12852 :           roots.safe_push (bb_vinfo->lookup_stmt (assign));
    9803        12852 :           vec<stmt_vec_info> stmts;
    9804        12852 :           stmts.create (CONSTRUCTOR_NELTS (rhs));
    9805        71840 :           FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
    9806        46136 :             stmts.quick_push
    9807        46136 :               (vect_stmt_to_vectorize (bb_vinfo->lookup_def (val)));
    9808        12852 :           bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
    9809        12852 :                                                stmts, roots));
    9810              :         }
    9811     29430314 :       else if (code == BIT_INSERT_EXPR
    9812          933 :                && VECTOR_TYPE_P (TREE_TYPE (rhs))
    9813          611 :                && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
    9814          611 :                && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
    9815          608 :                && integer_zerop (gimple_assign_rhs3 (assign))
    9816          341 :                && useless_type_conversion_p
    9817          341 :                     (TREE_TYPE (TREE_TYPE (rhs)),
    9818          341 :                      TREE_TYPE (gimple_assign_rhs2 (assign)))
    9819     29430936 :                && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
    9820              :         {
    9821              :           /* We start to match on insert to lane zero but since the
    9822              :              inserts need not be ordered we'd have to search both
    9823              :              the def and the use chains.  */
    9824          215 :           tree vectype = TREE_TYPE (rhs);
    9825          215 :           unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
    9826          215 :           auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
    9827          215 :           auto_sbitmap lanes (nlanes);
    9828          215 :           bitmap_clear (lanes);
    9829          215 :           bitmap_set_bit (lanes, 0);
    9830          215 :           tree def = gimple_assign_lhs (assign);
    9831          215 :           lane_defs.quick_push
    9832          215 :                       (std::make_pair (0, gimple_assign_rhs2 (assign)));
    9833          215 :           unsigned lanes_found = 1;
    9834              :           /* Start with the use chains, the last stmt will be the root.  */
    9835          215 :           stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
    9836          215 :           vec<stmt_vec_info> roots = vNULL;
    9837          215 :           roots.safe_push (last);
    9838          217 :           do
    9839              :             {
    9840          217 :               use_operand_p use_p;
    9841          217 :               gimple *use_stmt;
    9842          217 :               if (!single_imm_use (def, &use_p, &use_stmt))
    9843              :                 break;
    9844          211 :               unsigned this_lane;
    9845          211 :               if (!bb_vinfo->lookup_stmt (use_stmt)
    9846          211 :                   || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
    9847          233 :                   || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
    9848              :                 break;
    9849           22 :               if (bitmap_bit_p (lanes, this_lane))
    9850              :                 break;
    9851            2 :               lanes_found++;
    9852            2 :               bitmap_set_bit (lanes, this_lane);
    9853            2 :               gassign *use_ass = as_a <gassign *> (use_stmt);
    9854            2 :               lane_defs.quick_push (std::make_pair
    9855            2 :                                      (this_lane, gimple_assign_rhs2 (use_ass)));
    9856            2 :               last = bb_vinfo->lookup_stmt (use_ass);
    9857            2 :               roots.safe_push (last);
    9858            2 :               def = gimple_assign_lhs (use_ass);
    9859              :             }
    9860            2 :           while (lanes_found < nlanes);
    9861          215 :           if (roots.length () > 1)
    9862            2 :             std::swap(roots[0], roots[roots.length () - 1]);
    9863          215 :           if (lanes_found < nlanes)
    9864              :             {
    9865              :               /* Now search the def chain.  */
    9866          215 :               def = gimple_assign_rhs1 (assign);
    9867          217 :               do
    9868              :                 {
    9869          217 :                   if (TREE_CODE (def) != SSA_NAME
    9870          217 :                       || !has_single_use (def))
    9871              :                     break;
    9872           56 :                   gimple *def_stmt = SSA_NAME_DEF_STMT (def);
    9873           56 :                   unsigned this_lane;
    9874           56 :                   if (!bb_vinfo->lookup_stmt (def_stmt)
    9875           37 :                       || !vect_slp_is_lane_insert (def_stmt,
    9876              :                                                    NULL_TREE, &this_lane)
    9877           80 :                       || !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
    9878              :                     break;
    9879           24 :                   if (bitmap_bit_p (lanes, this_lane))
    9880              :                     break;
    9881            4 :                   lanes_found++;
    9882            4 :                   bitmap_set_bit (lanes, this_lane);
    9883            8 :                   lane_defs.quick_push (std::make_pair
    9884            4 :                                           (this_lane,
    9885            4 :                                            gimple_assign_rhs2 (def_stmt)));
    9886            4 :                   roots.safe_push (bb_vinfo->lookup_stmt (def_stmt));
    9887            4 :                   def = gimple_assign_rhs1 (def_stmt);
    9888              :                 }
    9889            4 :               while (lanes_found < nlanes);
    9890              :             }
    9891          215 :           if (lanes_found == nlanes)
    9892              :             {
    9893              :               /* Sort lane_defs after the lane index and register the root.  */
    9894            2 :               lane_defs.qsort (vld_cmp);
    9895            2 :               vec<stmt_vec_info> stmts;
    9896            2 :               stmts.create (nlanes);
    9897           10 :               for (unsigned i = 0; i < nlanes; ++i)
    9898            8 :                 stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
    9899            2 :               bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
    9900            2 :                                                    stmts, roots));
    9901              :             }
    9902              :           else
    9903          213 :             roots.release ();
    9904          215 :         }
    9905     29430099 :       else if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
    9906     28450600 :                && (associative_tree_code (code) || code == MINUS_EXPR)
    9907              :                /* ???  This pessimizes a two-element reduction.  PR54400.
    9908              :                   ???  In-order reduction could be handled if we only
    9909              :                   traverse one operand chain in vect_slp_linearize_chain.  */
    9910     33377145 :                && !needs_fold_left_reduction_p (TREE_TYPE (rhs), code)
    9911              :                /* Ops with constants at the tail can be stripped here.  */
    9912      5829311 :                && TREE_CODE (rhs) == SSA_NAME
    9913      5762785 :                && TREE_CODE (gimple_assign_rhs2 (assign)) == SSA_NAME
    9914              :                /* Should be the chain end.  */
    9915     31720412 :                && (!single_imm_use (gimple_assign_lhs (assign),
    9916              :                                     &use_p, &use_stmt)
    9917      1763985 :                    || !is_gimple_assign (use_stmt)
    9918      1209555 :                    || (gimple_assign_rhs_code (use_stmt) != code
    9919       899164 :                        && ((code != PLUS_EXPR && code != MINUS_EXPR)
    9920       498179 :                            || (gimple_assign_rhs_code (use_stmt)
    9921       498179 :                                != (code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR))))))
    9922              :         {
    9923              :           /* We start the match at the end of a possible association
    9924              :              chain.  MINUS_EXPR is treated as PLUS_EXPR from here on, both
                      :              for the reduction function lookup and for linearizing
                      :              the chain.  */
    9925      1882265 :           auto_vec<chain_op_t> chain;
    9926      1882265 :           auto_vec<std::pair<tree_code, gimple *> > worklist;
    9927      1882265 :           auto_vec<gimple *> chain_stmts;
    9928      1882265 :           gimple *code_stmt = NULL, *alt_code_stmt = NULL;
    9929      1882265 :           if (code == MINUS_EXPR)
    9930       304825 :             code = PLUS_EXPR;
    9931      1882265 :           internal_fn reduc_fn;
    9932      2163934 :           if (!reduction_fn_for_scalar_code (code, &reduc_fn)
    9933      1882265 :               || reduc_fn == IFN_LAST)
    9934       281669 :             continue;
    9935      1600596 :           vect_slp_linearize_chain (bb_vinfo, worklist, chain, code, assign,
    9936              :                                     /* ??? */
    9937              :                                     code_stmt, alt_code_stmt, &chain_stmts,
    9938              :                                     false);
    9939      3201192 :           if (chain.length () > 1)
    9940              :             {
    9941              :               /* Sort the chain according to def_type and operation.  */
    9942      1600596 :               chain.sort (dt_sort_cmp, bb_vinfo);
    9943              :               /* ???  Now we'd want to strip externals and constants
    9944              :                  but record those to be handled in the epilogue.  */
    9945              :               /* ???  For now do not allow mixing ops or externs/constants.  */
    9946      1600596 :               bool invalid = false;
    9947      1600596 :               unsigned remain_cnt = 0;
    9948      1600596 :               unsigned last_idx = 0;
    9949      4831863 :               for (unsigned i = 0; i < chain.length (); ++i)
    9950              :                 {
    9951      3536092 :                   if (chain[i].code != code)
    9952              :                     {
    9953              :                       invalid = true;
    9954              :                       break;
    9955              :                     }
    9956      3231267 :                   if (chain[i].dt != vect_internal_def
    9957              :                       /* Avoid stmts where the def is not the LHS, like
    9958              :                          ASMs.  */
    9959      6249703 :                       || (gimple_get_lhs (bb_vinfo->lookup_def
    9960      3018436 :                                                       (chain[i].op)->stmt)
    9961      3018436 :                           != chain[i].op))
    9962       215775 :                     remain_cnt++;
    9963              :                   else
    9964              :                     last_idx = i;
    9965              :                 }
    9966              :               /* Make sure to have an even number of lanes as we later do
    9967              :                  all-or-nothing discovery, not trying to split further.  */
    9968      1600596 :               if ((chain.length () - remain_cnt) & 1)
    9969       169686 :                 remain_cnt++;
    9970      1600596 :               if (!invalid && chain.length () - remain_cnt > 1)
    9971              :                 {
    9972      1230348 :                   vec<stmt_vec_info> stmts;
    9973      1230348 :                   vec<tree> remain = vNULL;
    9974      1230348 :                   stmts.create (chain.length ());
    9975      1230348 :                   if (remain_cnt > 0)
    9976       115003 :                     remain.create (remain_cnt);
    9977      3951375 :                   for (unsigned i = 0; i < chain.length (); ++i)
    9978              :                     {
    9979      2721027 :                       stmt_vec_info stmt_info;
    9980      2721027 :                       if (chain[i].dt == vect_internal_def
    9981      2681057 :                           && ((stmt_info = bb_vinfo->lookup_def (chain[i].op)),
    9982      2681057 :                               gimple_get_lhs (stmt_info->stmt) == chain[i].op)
    9983      5402000 :                           && (i != last_idx
    9984      1230348 :                               || (stmts.length () & 1)))
    9985      2594710 :                         stmts.quick_push (stmt_info);
    9986              :                       else
    9987       126317 :                         remain.quick_push (chain[i].op);
    9988              :                     }
    9989      1230348 :                   vec<stmt_vec_info> roots;
    9990      1230348 :                   roots.create (chain_stmts.length ());
    9991      2721027 :                   for (unsigned i = 0; i < chain_stmts.length (); ++i)
    9992      1490679 :                     roots.quick_push (bb_vinfo->lookup_stmt (chain_stmts[i]));
    9993      1230348 :                   bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_bb_reduc,
    9994      1230348 :                                                        stmts, roots, remain));
    9995              :                 }
    9996              :             }
    9997      1882265 :         }
    9998              :     }
    9999      2205447 : }
   10000              : 
   10001              : /* Walk the grouped store chains in VINFO->grouped_stores and replace
   10002              :    entries with their pattern variant (STMT_VINFO_RELATED_STMT) if any.
                      :    The DR_GROUP_GAP and DR_GROUP_NEXT_ELEMENT of a replaced stmt, and for
                      :    a replaced first element also its DR_GROUP_SIZE, are copied over to
                      :    the pattern stmt.  DR_GROUP_FIRST_ELEMENT of every group member is
                      :    set to the (possibly replaced) first element.  */
   10003              : 
   10004              : static void
   10005       614383 : vect_fixup_store_groups_with_patterns (vec_info *vinfo)
   10006              : {
   10007       614383 :   stmt_vec_info first_element;
   10008       614383 :   unsigned i;
   10009              : 
   10010      1508085 :   FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
   10011              :     {
   10012              :       /* We also have CTORs in this array.  */
   10013       893702 :       if (!STMT_VINFO_GROUPED_ACCESS (first_element))
   10014            0 :         continue;
   10015       893702 :       if (STMT_VINFO_IN_PATTERN_P (first_element))
   10016              :         {
   10017          252 :           stmt_vec_info orig = first_element;
   10018          252 :           first_element = STMT_VINFO_RELATED_STMT (first_element);
   10019          252 :           DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
   10020          252 :           DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
   10021          252 :           DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
   10022          252 :           DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
   10023          252 :           vinfo->grouped_stores[i] = first_element;
   10024              :         }
   10025       893702 :       stmt_vec_info prev = first_element;
   10026      2511675 :       while (DR_GROUP_NEXT_ELEMENT (prev))
   10027              :         {
   10028      1617973 :           stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
   10029      1617973 :           if (STMT_VINFO_IN_PATTERN_P (elt))
   10030              :             {
   10031          849 :               stmt_vec_info orig = elt;
   10032          849 :               elt = STMT_VINFO_RELATED_STMT (elt);
   10033          849 :               DR_GROUP_NEXT_ELEMENT (prev) = elt;
   10034          849 :               DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
   10035          849 :               DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
   10036              :             }
   10037      1617973 :           DR_GROUP_FIRST_ELEMENT (elt) = first_element;
   10038      1617973 :           prev = elt;
   10039              :         }
   10040              :     }
   10041       614383 : }
   10042              : 
   10043              : /* Check if the region described by BB_VINFO can be vectorized, returning
   10044              :    true if so.  When returning false, set FATAL to true if the same failure
   10045              :    would prevent vectorization at other vector sizes, false if it is still
   10046              :    worth trying other sizes.  N_STMTS is the number of statements in the
   10047              :    region.  DATAREF_GROUPS is passed on to vect_analyze_data_ref_accesses.
                      :    SLP instances failing the alignment or dependence analysis are freed
                      :    and removed from BB_VINFO; when true is returned the remaining
                      :    instances passed vect_slp_analyze_operations and the SLP graph has
                      :    been partitioned by vect_bb_partition_graph.  */
   10048              : 
   10049              : static bool
   10050      2205447 : vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
   10051              :                        vec<int> *dataref_groups)
   10052              : {
   10053      2205447 :   DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
   10054              : 
   10055      2205447 :   slp_instance instance;
   10056      2205447 :   int i;
   10057              : 
   10058              :   /* The first group of checks is independent of the vector size.  */
   10059      2205447 :   fatal = true;
   10060              : 
   10061              :   /* Analyze the data references.  */
   10062              : 
   10063      2205447 :   if (!vect_analyze_data_refs (bb_vinfo, NULL))
   10064              :     {
   10065            0 :       if (dump_enabled_p ())
   10066            0 :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   10067              :                          "not vectorized: unhandled data-ref in basic "
   10068              :                          "block.\n");
   10069            0 :       return false;
   10070              :     }
   10071              : 
   10072      2205447 :   if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
   10073              :     {
   10074            0 :      if (dump_enabled_p ())
   10075            0 :        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   10076              :                         "not vectorized: unhandled data access in "
   10077              :                         "basic block.\n");
   10078            0 :       return false;
   10079              :     }
   10080              : 
   10081      2205447 :   vect_slp_check_for_roots (bb_vinfo);
   10082              : 
   10083              :   /* If there are no grouped stores and no constructors in the region
   10084              :      there is no need to continue with pattern recog as vect_analyze_slp
   10085              :      will fail anyway.  */
   10086      2205447 :   if (bb_vinfo->grouped_stores.is_empty ()
   10087      1862201 :       && bb_vinfo->roots.is_empty ())
   10088              :     {
   10089      1591064 :       if (dump_enabled_p ())
   10090         1022 :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   10091              :                          "not vectorized: no grouped stores in "
   10092              :                          "basic block.\n");
   10093      1591064 :       return false;
   10094              :     }
   10095              : 
   10096              :   /* While the rest of the analysis below depends on the vector size in
                      :      some way.  */
   10097       614383 :   fatal = false;
   10098              : 
   10099       614383 :   vect_pattern_recog (bb_vinfo);
   10100              : 
   10101              :   /* Update store groups from pattern processing.  */
   10102       614383 :   vect_fixup_store_groups_with_patterns (bb_vinfo);
   10103              : 
   10104              :   /* Check the SLP opportunities in the basic block, analyze and build SLP
   10105              :      trees.  */
   10106       614383 :   if (!vect_analyze_slp (bb_vinfo, n_stmts, false))
   10107              :     {
   10108            0 :       if (dump_enabled_p ())
   10109              :         {
   10110            0 :           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   10111              :                            "Failed to SLP the basic block.\n");
   10112            0 :           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   10113              :                            "not vectorized: failed to find SLP opportunities "
   10114              :                            "in basic block.\n");
   10115              :         }
   10116            0 :       return false;
   10117              :     }
   10118              : 
   10119              :   /* Optimize permutations.  */
   10120       614383 :   vect_optimize_slp (bb_vinfo);
   10121              : 
   10122              :   /* Gather the loads reachable from the SLP graph entries.  */
   10123       614383 :   vect_gather_slp_loads (bb_vinfo);
   10124              : 
   10125       614383 :   vect_record_base_alignments (bb_vinfo);
   10126              : 
   10127              :   /* Analyze and verify the alignment of data references and the
   10128              :      dependence in the SLP instances.  The index I is only advanced for
                      :      instances that are kept since removing one shifts the following
                      :      instances down.  */
   10129      1406625 :   for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
   10130              :     {
   10131       792242 :       vect_location = instance->location ();
   10132       792242 :       if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
   10133       792242 :           || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
   10134              :         {
   10135         8790 :           slp_tree node = SLP_INSTANCE_TREE (instance);
   10136         8790 :           stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
   10137         8790 :           if (dump_enabled_p ())
   10138            4 :             dump_printf_loc (MSG_NOTE, vect_location,
   10139              :                              "removing SLP instance operations starting from: %G",
   10140              :                              stmt_info->stmt);
   10141         8790 :           vect_free_slp_instance (instance);
   10142         8790 :           BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
   10143         8790 :           continue;
   10144         8790 :         }
   10145              : 
   10146              :       /* Mark all the statements that we want to vectorize as relevant.  */
   10147       783452 :       vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
   10148              : 
   10149       783452 :       i++;
   10150              :     }
   10151      2235637 :   if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
   10152              :     return false;
   10153              : 
   10154       266190 :   if (!vect_slp_analyze_operations (bb_vinfo))
   10155              :     {
   10156        30190 :       if (dump_enabled_p ())
   10157           87 :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   10158              :                          "not vectorized: bad operation in basic block.\n");
   10159        30190 :       return false;
   10160              :     }
   10161              : 
   10162              :   /* Mark all the statements that we vectorize.  */
   10163       236000 :   vect_bb_slp_mark_stmts_vectorized (bb_vinfo);
   10164              : 
   10165              :   /* Compute vectorizable live stmts.  */
   10166       236000 :   vect_bb_slp_mark_live_stmts (bb_vinfo);
   10167              : 
   10168       236000 :   vect_bb_partition_graph (bb_vinfo);
   10169              : 
   10170       236000 :   return true;
   10171              : }
   10172              : 
   10173              : /* Subroutine of vect_slp_bbs.  Try to vectorize the statements for all
   10174              :    basic blocks in BBS, returning true if any SLP subgraph was scheduled.
   10175              :    The region has N_STMTS statements and has the datarefs given by DATAREFS.  DATAREF_GROUPS is passed on to vect_slp_analyze_bb_1; a non-null ORIG_LOOP means BBS is an if-converted loop body.  */
   10176              : 
   10177              : static bool
   10178      1883769 : vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
   10179              :                  vec<int> *dataref_groups, unsigned int n_stmts,
   10180              :                  loop_p orig_loop)
   10181              : {
   10182      1883769 :   bb_vec_info bb_vinfo;
   10183      1883769 :   auto_vector_modes vector_modes;
   10184              : 
   10185              :   /* Start with VOIDmode so that the first vector mode we try is autodetected.  */
   10186      1883769 :   machine_mode next_vector_mode = VOIDmode;
   10187      1883769 :   targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
   10188      1883769 :   unsigned int mode_i = 0;
   10189              : 
   10190      1883769 :   vec_info_shared shared;
   10191              : 
   10192      1883769 :   machine_mode autodetected_vector_mode = VOIDmode;
   10193      2527125 :   while (1)
   10194              :     {
   10195      2205447 :       bool vectorized = false;
   10196      2205447 :       bool fatal = false;
   10197      2205447 :       bb_vinfo = new _bb_vec_info (bbs, &shared);
   10198              : 
   10199      2205447 :       bool first_time_p = shared.datarefs.is_empty (); /* Nothing saved in SHARED yet.  */
   10200      2205447 :       BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
   10201      2205447 :       if (first_time_p)
   10202      1906626 :         bb_vinfo->shared->save_datarefs ();
   10203              :       else
   10204       298821 :         bb_vinfo->shared->check_datarefs ();
   10205      2205447 :       bb_vinfo->vector_mode = next_vector_mode;
   10206              : 
   10207      2205447 :       if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
   10208              :         {
   10209       236000 :           if (dump_enabled_p ())
   10210              :             {
   10211         1506 :               dump_printf_loc (MSG_NOTE, vect_location,
   10212              :                                "***** Analysis succeeded with vector mode"
   10213          753 :                                " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
   10214          753 :               dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
   10215              :             }
   10216              : 
   10217       236000 :           bb_vinfo->shared->check_datarefs ();
   10218              : 
   10219       236000 :           bool force_clear = false; /* Set when dbg_cnt rejects an instance.  */
   10220       236000 :           auto_vec<slp_instance> profitable_subgraphs;
   10221      1391102 :           for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo))
   10222              :             {
   10223       683102 :               if (instance->subgraph_entries.is_empty ())
   10224       221683 :                 continue;
   10225              : 
   10226       662259 :               dump_user_location_t saved_vect_location = vect_location;
   10227       662259 :               vect_location = instance->location ();
   10228       662259 :               if (!unlimited_cost_model (NULL)
   10229       658901 :                   && !param_vect_allow_possibly_not_worthwhile_vectorizations
   10230      1321155 :                   && !vect_bb_vectorization_profitable_p
   10231       658896 :                         (bb_vinfo, instance->subgraph_entries, orig_loop))
   10232              :                 {
   10233       179997 :                   if (dump_enabled_p ())
   10234           28 :                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   10235              :                                      "not vectorized: vectorization is not "
   10236              :                                      "profitable.\n");
   10237       179997 :                   vect_location = saved_vect_location;
   10238       179997 :                   continue;
   10239              :                 }
   10240              : 
   10241       482262 :               vect_location = saved_vect_location;
   10242       482262 :               if (!dbg_cnt (vect_slp))
   10243              :                 {
   10244            0 :                   force_clear = true;
   10245            0 :                   continue;
   10246              :                 }
   10247              : 
   10248       482262 :               profitable_subgraphs.safe_push (instance);
   10249              :             }
   10250              : 
   10251              :           /* When we're vectorizing an if-converted loop body make sure
   10252              :              we vectorized all if-converted code.  */
   10253       395384 :           if ((!profitable_subgraphs.is_empty () || force_clear) && orig_loop)
   10254              :             {
   10255          106 :               gcc_assert (bb_vinfo->nbbs == 1);
   10256          212 :               for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]);
   10257         4388 :                    !gsi_end_p (gsi); gsi_next (&gsi))
   10258              :                 {
   10259              :                   /* The costing above left us with DCEable vectorized scalar
   10260              :                      stmts having the visited flag set on profitable
   10261              :                      subgraphs.  Do the delayed clearing of the flag here.  */
   10262         4282 :                   if (gimple_visited_p (gsi_stmt (gsi)))
   10263              :                     {
   10264         1260 :                       gimple_set_visited (gsi_stmt (gsi), false);
   10265         1260 :                       continue;
   10266              :                     }
   10267         3022 :                   if (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED)
   10268          813 :                     continue;
   10269              : 
   10270         6334 :                   if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi)))
   10271         2666 :                     if (gimple_assign_rhs_code (ass) == COND_EXPR)
   10272              :                       {
   10273           69 :                         if (!profitable_subgraphs.is_empty ()
   10274           31 :                             && dump_enabled_p ())
   10275            0 :                           dump_printf_loc (MSG_NOTE, vect_location,
   10276              :                                            "not profitable because of "
   10277              :                                            "unprofitable if-converted scalar "
   10278              :                                            "code\n");
   10279           38 :                         profitable_subgraphs.truncate (0); /* A remaining scalar COND_EXPR drops all subgraphs.  */
   10280              :                       }
   10281              :                 }
   10282              :             }
   10283              : 
   10284              :           /* Finally schedule the profitable subgraphs.  */
   10285      1036984 :           for (slp_instance instance : profitable_subgraphs)
   10286              :             {
   10287       482216 :               if (!vectorized && dump_enabled_p ())
   10288          728 :                 dump_printf_loc (MSG_NOTE, vect_location,
   10289              :                                  "Basic block will be vectorized "
   10290              :                                  "using SLP\n");
   10291       482216 :               vectorized = true;
   10292              : 
   10293              :               /* Dump before scheduling as store vectorization will remove
   10294              :                  the original stores and mess with the instance tree
   10295              :                  so querying its location will eventually ICE.  */
   10296       482216 :               if (flag_checking)
   10297      1940004 :                 for (slp_instance sub : instance->subgraph_entries)
   10298       493356 :                   gcc_assert (SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub)));
   10299       482216 :               unsigned HOST_WIDE_INT bytes;
   10300       482216 :               if (dump_enabled_p ())
   10301         3465 :                 for (slp_instance sub : instance->subgraph_entries)
   10302              :                   {
   10303          918 :                     tree vtype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub));
   10304         1836 :                     if (GET_MODE_SIZE (TYPE_MODE (vtype)).is_constant (&bytes))
   10305          918 :                       dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
   10306          918 :                                        sub->location (),
   10307              :                                        "basic block part vectorized using %wu "
   10308              :                                        "byte vectors\n", bytes);
   10309              :                     else
   10310              :                       dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
   10311              :                                        sub->location (),
   10312              :                                        "basic block part vectorized using "
   10313              :                                        "variable length vectors\n");
   10314              :                   }
   10315              : 
   10316       482216 :               dump_user_location_t saved_vect_location = vect_location;
   10317       482216 :               vect_location = instance->location ();
   10318              : 
   10319       482216 :               vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
   10320              : 
   10321       482216 :               vect_location = saved_vect_location;
   10322              :             }
   10323              : 
   10324              : 
   10325              :           /* Generate the invariant statements.  */
   10326       236000 :           if (!gimple_seq_empty_p (bb_vinfo->inv_pattern_def_seq))
   10327              :             {
   10328           23 :               if (dump_enabled_p ())
   10329            0 :                 dump_printf_loc (MSG_NOTE, vect_location,
   10330              :                          "------>generating invariant statements\n");
   10331              : 
   10332           23 :               bb_vinfo->insert_seq_on_entry (NULL,
   10333              :                                              bb_vinfo->inv_pattern_def_seq);
   10334              :             }
   10335       236000 :         }
   10336              :       else
   10337              :         {
   10338      1969447 :           if (dump_enabled_p ())
   10339         1314 :             dump_printf_loc (MSG_NOTE, vect_location,
   10340              :                              "***** Analysis failed with vector mode %s\n",
   10341         1314 :                              GET_MODE_NAME (bb_vinfo->vector_mode));
   10342              :         }
   10343              : 
   10344      2205447 :       if (mode_i == 0)
   10345      1883769 :         autodetected_vector_mode = bb_vinfo->vector_mode;
   10346              : 
   10347      2205447 :       if (!fatal)
   10348      3154940 :         while (mode_i < vector_modes.length ()
   10349      1765589 :                && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
   10350              :           {
   10351       335110 :             if (dump_enabled_p ())
   10352         1658 :               dump_printf_loc (MSG_NOTE, vect_location,
   10353              :                                "***** The result for vector mode %s would"
   10354              :                                " be the same\n",
   10355          829 :                                GET_MODE_NAME (vector_modes[mode_i]));
   10356       335110 :             mode_i += 1;
   10357              :           }
   10358              : 
   10359      2205447 :       delete bb_vinfo;
   10360              : 
   10361      2205447 :       if (mode_i < vector_modes.length ()
   10362      2027632 :           && VECTOR_MODE_P (autodetected_vector_mode)
   10363      2003166 :           && (related_vector_mode (vector_modes[mode_i],
   10364              :                                    GET_MODE_INNER (autodetected_vector_mode))
   10365      1001583 :               == autodetected_vector_mode)
   10366      4233079 :           && (related_vector_mode (autodetected_vector_mode,
   10367       520154 :                                    GET_MODE_INNER (vector_modes[mode_i]))
   10368      1040308 :               == vector_modes[mode_i]))
   10369              :         {
   10370       520154 :           if (dump_enabled_p ())
   10371          205 :             dump_printf_loc (MSG_NOTE, vect_location,
   10372              :                              "***** Skipping vector mode %s, which would"
   10373              :                              " repeat the analysis for %s\n",
   10374          205 :                              GET_MODE_NAME (vector_modes[mode_i]),
   10375          205 :                              GET_MODE_NAME (autodetected_vector_mode));
   10376       520154 :           mode_i += 1;
   10377              :         }
   10378              : 
   10379      2205447 :       if (vectorized
   10380      2046094 :           || mode_i == vector_modes.length ()
   10381      1868324 :           || autodetected_vector_mode == VOIDmode
   10382              :           /* If vect_slp_analyze_bb_1 signaled that analysis for all
   10383              :              vector sizes will fail do not bother iterating.  */
   10384      3047722 :           || fatal)
   10385      3767538 :         return vectorized;
   10386              : 
   10387              :       /* Try the next vector mode in VECTOR_MODES.  */
   10388       321678 :       next_vector_mode = vector_modes[mode_i++];
   10389       321678 :       if (dump_enabled_p ())
   10390          218 :         dump_printf_loc (MSG_NOTE, vect_location,
   10391              :                          "***** Re-trying analysis with vector mode %s\n",
   10392          218 :                          GET_MODE_NAME (next_vector_mode));
   10393       321678 :     }
   10394      1883769 : }
   10395              : 
   10396              : 
   10397              : /* Worker for the BB vectorizer entry points.  Count the non-debug stmts and collect the data references of BBS, then analyze and transform the region
   10398              :    via vect_slp_region, forwarding ORIG_LOOP.  Returns true if anything in BBS was vectorized.  */
   10399              : 
   10400              : static bool
   10401      1883769 : vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop)
   10402              : {
   10403      1883769 :   vec<data_reference_p> datarefs = vNULL;
   10404      1883769 :   auto_vec<int> dataref_groups;
   10405      1883769 :   int insns = 0;
   10406      1883769 :   int current_group = 0;
   10407              : 
   10408     12471296 :   for (unsigned i = 0; i < bbs.length (); i++)
   10409              :     {
   10410     10587527 :       basic_block bb = bbs[i];
   10411     89725375 :       for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
   10412     79137848 :            gsi_next (&gsi))
   10413              :         {
   10414     79137848 :           gimple *stmt = gsi_stmt (gsi);
   10415     79137848 :           if (is_gimple_debug (stmt))
   10416     49414821 :             continue;
   10417              : 
   10418     29723027 :           insns++;
   10419              : 
   10420     29723027 :           if (gimple_location (stmt) != UNKNOWN_LOCATION)
   10421     26676015 :             vect_location = stmt;
   10422              : 
   10423     29723027 :           if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
   10424              :                                               &dataref_groups, current_group))
   10425      5113055 :             ++current_group; /* A false return starts a new DR group.  */
   10426              :         }
   10427              :       /* New BBs always start a new DR group.  */
   10428     10587527 :       ++current_group;
   10429              :     }
   10430              : 
   10431      1883769 :   return vect_slp_region (bbs, datarefs, &dataref_groups, insns, orig_loop);
   10432      1883769 : }
   10433              : 
   10434              : /* Special entry for the BB vectorizer.  Analyze and transform a single
   10435              :    if-converted BB, with ORIG_LOOP's body being the non-if-converted
   10436              :    representation.  Returns true if anything in the basic-block was
   10437              :    vectorized.  */
   10438              : 
   10439              : bool
   10440        19332 : vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop)
   10441              : {
   10442        19332 :   auto_vec<basic_block> bbs;
   10443        19332 :   bbs.safe_push (bb); /* The region is just BB.  */
   10444        19332 :   return vect_slp_bbs (bbs, orig_loop);
   10445        19332 : }
   10446              : 
   10447              : /* Main entry for the BB vectorizer.  Analyze and transform FUN, returns
   10448              :    true if anything in the function was vectorized.  */
   10449              : 
   10450              : bool
   10451       910766 : vect_slp_function (function *fun)
   10452              : {
   10453       910766 :   bool r = false;
   10454       910766 :   int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
   10455       910766 :   auto_bitmap exit_bbs;
   10456       910766 :   bitmap_set_bit (exit_bbs, EXIT_BLOCK);
   10457       910766 :   edge entry = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (fun));
   10458       910766 :   unsigned n = rev_post_order_and_mark_dfs_back_seme (fun, entry, exit_bbs,
   10459       910766 :                                                       true, rpo, NULL);
   10460              : 
   10461              :   /* For the moment split the function into pieces to avoid making
   10462              :      the iteration on the vector mode moot.  Split at points we know
   10463              :      to not handle well which is CFG merges (SLP discovery doesn't
   10464              :      handle non-loop-header PHIs) and loop exits.  Since pattern
   10465              :      recog requires reverse iteration to visit uses before defs
   10466              :      simply chop RPO into pieces.  */
   10467       910766 :   auto_vec<basic_block> bbs;
   10468     11509907 :   for (unsigned i = 0; i < n; i++)
   10469              :     {
   10470     10599141 :       basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
   10471     10599141 :       bool split = false;
   10472              : 
   10473              :       /* Split when a BB is not dominated by the first block.  */
   10474     19980938 :       if (!bbs.is_empty ()
   10475      9381797 :           && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
   10476              :         {
   10477       666941 :           if (dump_enabled_p ())
   10478          146 :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   10479              :                              "splitting region at dominance boundary bb%d\n",
   10480              :                              bb->index);
   10481              :           split = true;
   10482              :         }
   10483              :       /* Split when the loop determined by the first block
   10484              :          is exited.  This is because we eventually insert
   10485              :          invariants at region begin.  */
   10486     18647056 :       else if (!bbs.is_empty ()
   10487      8714856 :                && bbs[0]->loop_father != bb->loop_father
   10488      2270550 :                && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
   10489              :         {
   10490         3827 :           if (dump_enabled_p ())
   10491            6 :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   10492              :                              "splitting region at loop %d exit at bb%d\n",
   10493            3 :                              bbs[0]->loop_father->num, bb->index);
   10494              :           split = true;
   10495              :         }
   10496      9928373 :       else if (!bbs.is_empty ()
   10497      8711029 :                && bb->loop_father->header == bb
   10498       470204 :                && bb->loop_father->dont_vectorize)
   10499              :         {
   10500         7271 :           if (dump_enabled_p ())
   10501           72 :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   10502              :                              "splitting region at dont-vectorize loop %d "
   10503              :                              "entry at bb%d\n",
   10504              :                              bb->loop_father->num, bb->index);
   10505              :           split = true;
   10506              :         }
   10507              : 
   10508     11277180 :       if (split && !bbs.is_empty ())
   10509              :         {
   10510       678039 :           r |= vect_slp_bbs (bbs, NULL); /* Vectorize the region collected so far.  */
   10511       678039 :           bbs.truncate (0);
   10512              :         }
   10513              : 
   10514     10599141 :       if (bbs.is_empty ())
   10515              :         {
   10516              :           /* We need to be able to insert at the head of the region which
   10517              :              we cannot for a region starting with a returns-twice call.  */
   10518      1895383 :           if (gcall *first = safe_dyn_cast <gcall *> (first_stmt (bb)))
   10519       404420 :             if (gimple_call_flags (first) & ECF_RETURNS_TWICE)
   10520              :               {
   10521          303 :                 if (dump_enabled_p ())
   10522            2 :                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   10523              :                                    "skipping bb%d as start of region as it "
   10524              :                                    "starts with returns-twice call\n",
   10525              :                                    bb->index);
   10526        30946 :                 continue;
   10527              :               }
   10528              :           /* If the loop this BB belongs to is marked as not to be vectorized
   10529              :              honor that also for BB vectorization.  */
   10530      1895080 :           if (bb->loop_father->dont_vectorize)
   10531        30643 :             continue;
   10532              :         }
   10533              : 
   10534     10568195 :       bbs.safe_push (bb);
   10535              : 
   10536              :       /* When we have a stmt ending this block and defining a
   10537              :          value we have to insert on edges when inserting after it for
   10538              :          a vector containing its definition.  Avoid this for now.  */
   10539     21136390 :       if (gimple *last = *gsi_last_bb (bb))
   10540      8573611 :         if (gimple_get_lhs (last)
   10541      8573611 :             && is_ctrl_altering_stmt (last))
   10542              :           {
   10543       275639 :             if (dump_enabled_p ())
   10544            2 :               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   10545              :                                "splitting region at control altering "
   10546              :                                "definition %G", last);
   10547       275639 :             r |= vect_slp_bbs (bbs, NULL);
   10548       275639 :             bbs.truncate (0);
   10549              :           }
   10550              :     }
   10551              : 
   10552       910766 :   if (!bbs.is_empty ())
   10553       910759 :     r |= vect_slp_bbs (bbs, NULL); /* Vectorize the trailing region.  */
   10554              : 
   10555       910766 :   free (rpo);
   10556              : 
   10557       910766 :   return r;
   10558       910766 : }
   10559              : 
   10560              : /* Build a variable-length vector in which the elements in ELTS are repeated
   10561              :    to fill NRESULTS vectors of type VECTOR_TYPE.  Store the vectors in
   10562              :    RESULTS and add any new instructions to SEQ.
   10563              : 
   10564              :    The approach we use is:
   10565              : 
   10566              :    (1) Find a vector mode VM with integer elements of mode IM.
   10567              : 
   10568              :    (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
   10569              :        ELTS' has mode IM.  This involves creating NELTS' VIEW_CONVERT_EXPRs
   10570              :        from small vectors to IM.
   10571              : 
   10572              :    (3) Duplicate each ELTS'[I] into a vector of mode VM.
   10573              : 
   10574              :    (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
   10575              :        correct byte contents.
   10576              : 
   10577              :    (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
   10578              : 
   10579              :    We try to find the largest IM for which this sequence works, in order
   10580              :    to cut down on the number of interleaves.  */
   10581              : 
   10582              : void
   10583            0 : duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
   10584              :                           const vec<tree> &elts, unsigned int nresults,
   10585              :                           vec<tree> &results)
   10586              : {
   10587            0 :   unsigned int nelts = elts.length ();
   10588            0 :   tree element_type = TREE_TYPE (vector_type);
   10589              : 
   10590              :   /* (1) Find a vector mode VM with integer elements of mode IM.  */
   10591            0 :   unsigned int nvectors = 1;
   10592            0 :   tree new_vector_type;
   10593            0 :   tree permutes[2];
   10594            0 :   if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
   10595              :                                        &nvectors, &new_vector_type,
   10596              :                                        permutes))
   10597            0 :     gcc_unreachable (); /* Failure of the check here is treated as a bug.  */
   10598              : 
   10599              :   /* Get a vector type that holds ELTS[0:NELTS/NELTS'].  */
   10600            0 :   unsigned int partial_nelts = nelts / nvectors;
   10601            0 :   tree partial_vector_type = build_vector_type (element_type, partial_nelts);
   10602              : 
   10603            0 :   tree_vector_builder partial_elts;
   10604            0 :   auto_vec<tree, 32> pieces (nvectors * 2);
   10605            0 :   pieces.quick_grow_cleared (nvectors * 2);
   10606            0 :   for (unsigned int i = 0; i < nvectors; ++i)
   10607              :     {
   10608              :       /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
   10609              :              ELTS' has mode IM.  */
   10610            0 :       partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
   10611            0 :       for (unsigned int j = 0; j < partial_nelts; ++j)
   10612            0 :         partial_elts.quick_push (elts[i * partial_nelts + j]);
   10613            0 :       tree t = gimple_build_vector (seq, &partial_elts);
   10614            0 :       t = gimple_build (seq, VIEW_CONVERT_EXPR,
   10615            0 :                         TREE_TYPE (new_vector_type), t);
   10616              : 
   10617              :       /* (3) Duplicate each ELTS'[I] into a vector of mode VM.  */
   10618            0 :       pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
   10619              :     }
   10620              : 
   10621              :   /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
   10622              :          correct byte contents.
   10623              : 
   10624              :      Conceptually, we need to repeat the following operation log2(nvectors)
   10625              :      times, where hi_start = nvectors / 2:
   10626              : 
   10627              :         out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
   10628              :         out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
   10629              : 
   10630              :      However, if each input repeats every N elements and the VF is
   10631              :      a multiple of N * 2, the HI result is the same as the LO result.
   10632              :      This will be true for the first N1 iterations of the outer loop,
   10633              :      followed by N2 iterations for which both the LO and HI results
   10634              :      are needed.  I.e.:
   10635              : 
   10636              :         N1 + N2 = log2(nvectors)
   10637              : 
   10638              :      Each "N1 iteration" doubles the number of redundant vectors and the
   10639              :      effect of the process as a whole is to have a sequence of nvectors/2**N1
   10640              :      vectors that repeats 2**N1 times.  Rather than generate these redundant
   10641              :      vectors, we halve the number of vectors for each N1 iteration.  */
   10642              :   unsigned int in_start = 0;
   10643              :   unsigned int out_start = nvectors;
   10644              :   unsigned int new_nvectors = nvectors;
   10645            0 :   for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
   10646              :     {
   10647            0 :       unsigned int hi_start = new_nvectors / 2;
   10648            0 :       unsigned int out_i = 0;
   10649            0 :       for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i)
   10650              :         {
   10651            0 :           if ((in_i & 1) != 0
   10652            0 :               && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
   10653              :                              2 * in_repeat))
   10654            0 :             continue;
   10655              : 
   10656            0 :           tree output = make_ssa_name (new_vector_type);
   10657            0 :           tree input1 = pieces[in_start + (in_i / 2)];
   10658            0 :           tree input2 = pieces[in_start + (in_i / 2) + hi_start];
   10659            0 :           gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
   10660              :                                                input1, input2,
   10661              :                                                permutes[in_i & 1]);
   10662            0 :           gimple_seq_add_stmt (seq, stmt);
   10663            0 :           pieces[out_start + out_i] = output;
   10664            0 :           out_i += 1;
   10665              :         }
   10666            0 :       std::swap (in_start, out_start); /* This round's outputs are the next round's inputs.  */
   10667            0 :       new_nvectors = out_i;
   10668              :     }
   10669              : 
   10670              :   /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type.  */
   10671            0 :   results.reserve (nresults);
   10672            0 :   for (unsigned int i = 0; i < nresults; ++i)
   10673            0 :     if (i < new_nvectors)
   10674            0 :       results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
   10675            0 :                                         pieces[in_start + i]));
   10676              :     else
   10677            0 :       results.quick_push (results[i - new_nvectors]); /* Reuse the earlier result NEW_NVECTORS back.  */
   10678            0 : }
   10679              : 
   10680              : 
   10681              : /* For constant and loop invariant defs in OP_NODE this function creates
   10682              :    vector defs that will be used in the vectorized stmts and stores them
   10683              :    to SLP_TREE_VEC_DEFS of OP_NODE.  Stmts needed to build a vector are
                      :    collected in a sequence that is emitted each time a vector is
                      :    complete: after the later of the operand definitions found through
                      :    VINFO->lookup_def (only looked for when VINFO is a bb_vec_info) if
                      :    there is one, otherwise through VINFO->insert_seq_on_entry.  */
   10684              : 
   10685              : static void
   10686       491495 : vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
   10687              : {
   10688       491495 :   unsigned HOST_WIDE_INT nunits;
   10689       491495 :   tree vec_cst;
   10690       491495 :   unsigned j, number_of_places_left_in_vector;
   10691       491495 :   tree vector_type;
   10692       491495 :   tree vop;
   10693       491495 :   int group_size = op_node->ops.length ();
   10694       491495 :   unsigned int vec_num, i;
   10695       491495 :   unsigned number_of_copies = 1;
   10696       491495 :   bool constant_p;
   10697       491495 :   gimple_seq ctor_seq = NULL;
   10698       491495 :   auto_vec<tree, 16> permute_results;
   10699              : 
   10700              :   /* We always want SLP_TREE_VECTYPE (op_node) here correctly set.  */
   10701       491495 :   vector_type = SLP_TREE_VECTYPE (op_node);
   10702              : 
   10703       491495 :   unsigned int number_of_vectors = vect_get_num_copies (vinfo, op_node);
   10704       491495 :   SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
   10705       491495 :   auto_vec<tree> voprnds (number_of_vectors);
   10706              : 
   10707              :   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
   10708              :      created vectors.  It is greater than 1 if unrolling is performed.
   10709              : 
   10710              :      For example, we have two scalar operands, s1 and s2 (e.g., group of
   10711              :      strided accesses of size two), while NUNITS is four (i.e., four scalars
   10712              :      of this type can be packed in a vector).  The output vector will contain
   10713              :      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
   10714              :      will be 2).
   10715              : 
   10716              :      If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
   10717              :      containing the operands.
   10718              : 
   10719              :      For example, NUNITS is four as before, and the group size is 8
   10720              :      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
   10721              :      {s5, s6, s7, s8}.  */
   10722              : 
   10723              :   /* If the number of elements of VECTOR_TYPE is not a compile-time constant,
   10724              :      build GROUP_SIZE elements at a time: when using duplicate_and_interleave,
                      :      we just need one element for each scalar statement.  */
   10725       491495 :   if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
   10726              :     nunits = group_size;
   10727              : 
   10728       491495 :   number_of_copies = nunits * number_of_vectors / group_size;
   10729              : 
   10730       491495 :   number_of_places_left_in_vector = nunits;
   10731       491495 :   constant_p = true;
   10732       491495 :   tree uniform_elt = NULL_TREE;
   10733       491495 :   tree_vector_builder elts (vector_type, nunits, 1);
   10734       491495 :   elts.quick_grow (nunits);
   10735       491495 :   stmt_vec_info insert_after = NULL;
   10736      1464084 :   for (j = 0; j < number_of_copies; j++)
   10737              :     {
   10738       972589 :       tree op;
   10739      3731964 :       for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
   10740              :         {
   10741              :           /* Create 'vect_ = {op0,op1,...,opn}'.  The operands are visited
                      :              from last to first, so ELTS is filled from its highest
                      :              index down.  UNIFORM_ELT stays set only while every operand
                      :              seen for the current vector equals the first one seen; an
                      :              equal operand reuses the element stored just before it.  */
   10742      1786786 :           tree orig_op = op;
   10743      1786786 :           if (number_of_places_left_in_vector == nunits)
   10744              :             uniform_elt = op;
   10745      1165550 :           else if (uniform_elt && operand_equal_p (uniform_elt, op))
   10746       740162 :             op = elts[number_of_places_left_in_vector];
   10747              :           else
   10748              :             uniform_elt = NULL_TREE;
   10749      1786786 :           number_of_places_left_in_vector--;
   10750      1786786 :           if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
   10751              :             {
   10752       276070 :               if (CONSTANT_CLASS_P (op))
   10753              :                 {
   10754       100787 :                   if (VECTOR_BOOLEAN_TYPE_P (vector_type))
   10755              :                     {
   10756              :                       /* Can't use VIEW_CONVERT_EXPR for booleans because
   10757              :                          of possibly different sizes of scalar value and
   10758              :                          vector element.  */
   10759           51 :                       if (integer_zerop (op))
   10760           51 :                         op = build_int_cst (TREE_TYPE (vector_type), 0);
   10761            0 :                       else if (integer_onep (op))
   10762            0 :                         op = build_all_ones_cst (TREE_TYPE (vector_type));
   10763              :                       else
   10764            0 :                         gcc_unreachable ();
   10765              :                     }
   10766              :                   else
   10767       100736 :                     op = fold_unary (VIEW_CONVERT_EXPR,
   10768              :                                      TREE_TYPE (vector_type), op);
   10769       100787 :                   gcc_assert (op && CONSTANT_CLASS_P (op));
   10770              :                 }
   10771              :               else
   10772              :                 {
   10773       175283 :                   tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
   10774       175283 :                   gimple *init_stmt;
   10775       175283 :                   if (VECTOR_BOOLEAN_TYPE_P (vector_type))
   10776              :                     {
   10777          403 :                       tree true_val
   10778          403 :                         = build_all_ones_cst (TREE_TYPE (vector_type));
   10779          403 :                       tree false_val
   10780          403 :                         = build_zero_cst (TREE_TYPE (vector_type));
   10781          403 :                       gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
   10782          403 :                       init_stmt = gimple_build_assign (new_temp, COND_EXPR,
   10783              :                                                        op, true_val,
   10784              :                                                        false_val);
   10785              :                     }
   10786              :                   else
   10787              :                     {
   10788       174880 :                       op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
   10789              :                                    op);
   10790       174880 :                       init_stmt
   10791       174880 :                         = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
   10792              :                                                op);
   10793              :                     }
   10794       175283 :                   gimple_seq_add_stmt (&ctor_seq, init_stmt);
   10795       175283 :                   op = new_temp;
   10796              :                 }
   10797              :             }
   10798      1786786 :           elts[number_of_places_left_in_vector] = op;
   10799      1786786 :           if (!CONSTANT_CLASS_P (op))
   10800       317003 :             constant_p = false;
   10801              :           /* For BB vectorization we have to compute an insert location
   10802              :              when a def is inside the analyzed region since we cannot
   10803              :              simply insert at the BB start in this case.  */
   10804      1786786 :           stmt_vec_info opdef;
   10805      1786786 :           if (TREE_CODE (orig_op) == SSA_NAME
   10806       181966 :               && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
   10807       162156 :               && is_a <bb_vec_info> (vinfo)
   10808      1891345 :               && (opdef = vinfo->lookup_def (orig_op)))
   10809              :             {
   10810        85536 :               if (!insert_after)
   10811              :                 insert_after = opdef;
   10812              :               else
   10813        47162 :                 insert_after = get_later_stmt (insert_after, opdef);
   10814              :             }
   10815              : 
   10816      1786786 :           if (number_of_places_left_in_vector == 0)
   10817              :             {
   10818       621236 :               auto type_nunits = TYPE_VECTOR_SUBPARTS (vector_type);
   10819       621236 :               if (uniform_elt)
   10820       646912 :                 vec_cst = gimple_build_vector_from_val (&ctor_seq, vector_type,
   10821       323456 :                                                         elts[0]);
   10822       595560 :               else if (constant_p
   10823       595560 :                        ? multiple_p (type_nunits, nunits)
   10824       109421 :                        : known_eq (type_nunits, nunits))
   10825       297780 :                 vec_cst = gimple_build_vector (&ctor_seq, &elts);
   10826              :               else
   10827              :                 {
   10828            0 :                   if (permute_results.is_empty ())
   10829            0 :                     duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
   10830              :                                               elts, number_of_vectors,
   10831              :                                               permute_results);
   10832            0 :                   vec_cst = permute_results[number_of_vectors - j - 1];
   10833              :                 }
   10834       621236 :               if (!gimple_seq_empty_p (ctor_seq))
   10835              :                 {
   10836       136692 :                   if (insert_after)
   10837              :                     {
   10838        38374 :                       gimple_stmt_iterator gsi;
   10839        38374 :                       if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
   10840              :                         {
   10841          630 :                           gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
   10842          630 :                           gsi_insert_seq_before (&gsi, ctor_seq,
   10843              :                                                  GSI_CONTINUE_LINKING);
   10844              :                         }
   10845        37744 :                       else if (!stmt_ends_bb_p (insert_after->stmt))
   10846              :                         {
   10847        37744 :                           gsi = gsi_for_stmt (insert_after->stmt);
   10848        37744 :                           gsi_insert_seq_after (&gsi, ctor_seq,
   10849              :                                                 GSI_CONTINUE_LINKING);
   10850              :                         }
   10851              :                       else
   10852              :                         {
   10853              :                           /* When we want to insert after a def where the
   10854              :                              defining stmt ends its block (e.g. it throws) then
   10855              :                              insert on the fallthru edge.  */
   10856            0 :                           edge e = find_fallthru_edge
   10857            0 :                                      (gimple_bb (insert_after->stmt)->succs);
   10858            0 :                           basic_block new_bb
   10859            0 :                             = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
   10860            0 :                           gcc_assert (!new_bb);
   10861              :                         }
   10862              :                     }
   10863              :                   else
   10864        98318 :                     vinfo->insert_seq_on_entry (NULL, ctor_seq);
   10865       136692 :                   ctor_seq = NULL;
   10866              :                 }
   10867       621236 :               voprnds.quick_push (vec_cst);
   10868       621236 :               insert_after = NULL;
   10869       621236 :               number_of_places_left_in_vector = nunits;
   10870       621236 :               constant_p = true;
   10871       621236 :               elts.new_vector (vector_type, nunits, 1);
   10872       621236 :               elts.quick_grow (nunits);
   10873              :             }
   10874              :         }
   10875              :     }
   10876              : 
   10877              :   /* Since the vectors are created in the reverse order, we should invert
   10878              :      them.  */
   10879       491495 :   vec_num = voprnds.length ();
   10880      1112731 :   for (j = vec_num; j != 0; j--)
   10881              :     {
   10882       621236 :       vop = voprnds[j - 1];
   10883       621236 :       SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
   10884              :     }
   10885              : 
   10886              :   /* In case that VF is greater than the unrolling factor needed for the SLP
   10887              :      group of stmts, NUMBER_OF_VECTORS to be created is greater than
   10888              :      NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
   10889              :      to replicate the vectors.  */
   10890       491495 :   while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
   10891       491495 :     for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
   10892              :          i++)
   10893            0 :       SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
   10894       491495 : }
   10895              : 
   10896              : /* Get the scalar definition of the Nth lane from SLP_NODE or NULL_TREE
   10897              :    if there is no definition for it in the scalar IL or it is not known.
                      :    For a vect_internal_def node this is the LHS of the Nth scalar stmt
                      :    (NULL_TREE if the node has no scalar stmts or that entry is null);
                      :    for any other def type it is the Nth scalar operand of the node.  */
   10898              : 
   10899              : tree
   10900         2665 : vect_get_slp_scalar_def (slp_tree slp_node, unsigned n)
   10901              : {
   10902         2665 :   if (SLP_TREE_DEF_TYPE (slp_node) == vect_internal_def)
   10903              :     {
   10904         2653 :       if (!SLP_TREE_SCALAR_STMTS (slp_node).exists ())
   10905              :         return NULL_TREE;
   10906         2653 :       stmt_vec_info def = SLP_TREE_SCALAR_STMTS (slp_node)[n];
   10907         2653 :       if (!def)
   10908              :         return NULL_TREE;
   10909         2653 :       return gimple_get_lhs (STMT_VINFO_STMT (def));
   10910              :     }
   10911              :   else
   10912           12 :     return SLP_TREE_SCALAR_OPS (slp_node)[n];
   10913              : }
   10914              : 
   10915              : /* Get the Ith vectorized definition from SLP_NODE, i.e. element I of
                      :    SLP_TREE_VEC_DEFS (SLP_NODE).  */
   10916              : 
   10917              : tree
   10918       145669 : vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
   10919              : {
   10920       145669 :   return SLP_TREE_VEC_DEFS (slp_node)[i];
   10921              : }
   10922              : 
   10923              : /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS.  *VEC_DEFS is
                      :    created here with room for all of SLP_TREE_VEC_DEFS (SLP_NODE), which
                      :    is then spliced into it.  */
   10924              : 
   10925              : void
   10926       931113 : vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
   10927              : {
   10928      1862226 :   vec_defs->create (SLP_TREE_VEC_DEFS (slp_node).length ());
   10929       931113 :   vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
   10930       931113 : }
   10931              : 
   10932              : /* Get the vectorized definitions of the first N children of SLP_NODE,
                      :    pushing one newly created vector of defs per child onto *VEC_OPRNDS
                      :    in child order.  If N is -1U all children of SLP_NODE are processed.
                      :    NOTE(review): the defs are added with quick_push, so presumably the
                      :    caller must already have reserved space for N entries in *VEC_OPRNDS
                      :    - confirm against the callers.  */
   10933              : 
   10934              : void
   10935         2965 : vect_get_slp_defs (vec_info *,
   10936              :                    slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
   10937              : {
   10938         2965 :   if (n == -1U)
   10939         2965 :     n = SLP_TREE_CHILDREN (slp_node).length ();
   10940              : 
   10941        10681 :   for (unsigned i = 0; i < n; ++i)
   10942              :     {
   10943         7716 :       slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
   10944         7716 :       vec<tree> vec_defs = vNULL;
   10945         7716 :       vect_get_slp_defs (child, &vec_defs);
   10946         7716 :       vec_oprnds->quick_push (vec_defs);
   10947              :     }
   10948         2965 : }
   10949              : 
   10950              : /* A subroutine of vect_transform_slp_perm_load with two extra arguments:
   10951              :    - PERM gives the permutation that the caller wants to use for NODE,
   10952              :      which might be different from SLP_LOAD_PERMUTATION.
   10953              :    - DUMP_P controls whether the function dumps information.
                      : 
                      :    Returns false when the permutation cannot be generated; apart from
                      :    the early exit for non-constant NUNITS or VF this is asserted to
                      :    happen only when ANALYZE_ONLY.  *N_PERMS is reset to zero on entry
                      :    and counts the permutes needed; *N_LOADS is written only when N_LOADS
                      :    is non-null and the function succeeds.  If DCE_CHAIN, the entries of
                      :    DR_CHAIN not marked as used by a generated stmt are removed on
                      :    success, following VIEW_CONVERT_EXPR and CONSTRUCTOR assignments
                      :    back through the operand returned by single_ssa_tree_operand.  */
   10954              : 
   10955              : static bool
   10956       129808 : vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node,
   10957              :                                 load_permutation_t &perm,
   10958              :                                 const vec<tree> &dr_chain,
   10959              :                                 gimple_stmt_iterator *gsi, poly_uint64 vf,
   10960              :                                 bool analyze_only, bool dump_p,
   10961              :                                 unsigned *n_perms, unsigned int *n_loads,
   10962              :                                 bool dce_chain)
   10963              : {
   10964       129808 :   stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
   10965       129808 :   int vec_index = 0;
   10966       129808 :   tree vectype = SLP_TREE_VECTYPE (node);
   10967       129808 :   unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
   10968       129808 :   unsigned int mask_element;
   10969       129808 :   unsigned dr_group_size;
   10970       129808 :   machine_mode mode;
   10971              : 
   10972       129808 :   if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
   10973              :     {
   10974              :       /* We have both splats of the same non-grouped load and groups
   10975              :          of distinct invariant loads entering here.  */
   10976         1603 :       unsigned max_idx = 0;
   10977         8819 :       for (auto idx : perm)
   10978         4010 :         max_idx = idx > max_idx ? idx : max_idx;
   10979         1603 :       dr_group_size = max_idx + 1;
   10980              :     }
   10981              :   else
   10982              :     {
   10983       128205 :       stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
   10984       128205 :       dr_group_size = DR_GROUP_SIZE (stmt_info);
   10985              :     }
   10986              : 
   10987       129808 :   mode = TYPE_MODE (vectype);
   10988       129808 :   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
   10989       129808 :   unsigned int nstmts = vect_get_num_copies (vinfo, node);
   10990              : 
   10991              :   /* Initialize the vect stmts of NODE to properly insert the generated
   10992              :      stmts later.  */
   10993       129808 :   if (! analyze_only)
   10994        57144 :     for (unsigned i = SLP_TREE_VEC_DEFS (node).length (); i < nstmts; i++)
   10995        22052 :       SLP_TREE_VEC_DEFS (node).quick_push (NULL_TREE);
   10996              : 
   10997              :   /* Generate permutation masks for every NODE.  Number of masks for each NODE
   10998              :      is equal to GROUP_SIZE.
   10999              :      E.g., we have a group of three nodes with three loads from the same
   11000              :      location in each node, and the vector size is 4.  I.e., we have a
   11001              :      a0b0c0a1b1c1... sequence and we need to create the following vectors:
   11002              :      for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
   11003              :      for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
   11004              :      ...
   11005              : 
   11006              :      The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
   11007              :      The last mask is illegal since we assume two operands for permute
   11008              :      operation, and the mask element values can't be outside that range.
   11009              :      Hence, the last mask must be converted into {2,5,5,5}.
   11010              :      For the first two permutations we need the first and the second input
   11011              :      vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
   11012              :      we need the second and the third vectors: {b1,c1,a2,b2} and
   11013              :      {c2,a3,b3,c3}.  */
   11014              : 
   11015       129808 :   int vect_stmts_counter = 0;
   11016       129808 :   unsigned int index = 0;
   11017       129808 :   int first_vec_index = -1;
   11018       129808 :   int second_vec_index = -1;
   11019       129808 :   bool noop_p = true;
   11020       129808 :   *n_perms = 0;
   11021              : 
   11022       129808 :   vec_perm_builder mask;
   11023       129808 :   unsigned int nelts_to_build;
   11024       129808 :   unsigned int nvectors_per_build;
   11025       129808 :   unsigned int in_nlanes;
   11026       129808 :   bool repeating_p = (group_size == dr_group_size
   11027       164782 :                       && multiple_p (nunits, group_size));
   11028       129808 :   if (repeating_p)
   11029              :     {
   11030              :       /* A single vector contains a whole number of copies of the node, so:
   11031              :          (a) all permutes can use the same mask; and
   11032              :          (b) the permutes only need a single vector input.  */
   11033        32754 :       mask.new_vector (nunits, group_size, 3);
   11034        32754 :       nelts_to_build = mask.encoded_nelts ();
   11035              :       /* It's possible to obtain zero nstmts during analyze_only, so make
   11036              :          it at least one to ensure the later computation for n_perms
   11037              :          proceeds.  */
   11038        32754 :       nvectors_per_build = nstmts > 0 ? nstmts : 1;
   11039        32754 :       in_nlanes = dr_group_size * 3;
   11040              :     }
   11041              :   else
   11042              :     {
   11043              :       /* We need to construct a separate mask for each vector statement.  */
   11044        97054 :       unsigned HOST_WIDE_INT const_nunits, const_vf;
   11045        97054 :       if (!nunits.is_constant (&const_nunits)
   11046        97054 :           || !vf.is_constant (&const_vf))
   11047              :         return false;
   11048        97054 :       mask.new_vector (const_nunits, const_nunits, 1);
   11049        97054 :       nelts_to_build = const_vf * group_size;
   11050        97054 :       nvectors_per_build = 1;
   11051        97054 :       in_nlanes = const_vf * dr_group_size;
   11052              :     }
   11053       129808 :   auto_sbitmap used_in_lanes (in_nlanes);
   11054       129808 :   bitmap_clear (used_in_lanes);
   11055       129808 :   auto_bitmap used_defs;
   11056              : 
   11057       129808 :   unsigned int count = mask.encoded_nelts ();
   11058       129808 :   mask.quick_grow (count);
   11059       129808 :   vec_perm_indices indices;
   11060              : 
   11061       689287 :   for (unsigned int j = 0; j < nelts_to_build; j++)
   11062              :     {
   11063       569097 :       unsigned int iter_num = j / group_size;
   11064       569097 :       unsigned int stmt_num = j % group_size;
   11065       569097 :       unsigned int i = (iter_num * dr_group_size + perm[stmt_num]);
   11066       569097 :       bitmap_set_bit (used_in_lanes, i);
   11067       569097 :       if (repeating_p)
   11068              :         {
   11069              :           first_vec_index = 0;
   11070              :           mask_element = i;
   11071              :         }
   11072              :       else
   11073              :         {
   11074              :           /* Enforced before the loop when !repeating_p.  */
   11075       359163 :           unsigned int const_nunits = nunits.to_constant ();
   11076       359163 :           vec_index = i / const_nunits;
   11077       359163 :           mask_element = i % const_nunits;
   11078       359163 :           if (vec_index == first_vec_index
   11079       359163 :               || first_vec_index == -1)
   11080              :             {
   11081              :               first_vec_index = vec_index;
   11082              :             }
   11083       143777 :           else if (vec_index == second_vec_index
   11084       143777 :                    || second_vec_index == -1)
   11085              :             {
   11086       137684 :               second_vec_index = vec_index;
   11087       137684 :               mask_element += const_nunits;
   11088              :             }
   11089              :           else
   11090              :             {
   11091         6093 :               if (dump_p)
   11092          280 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   11093              :                                  "permutation requires at "
   11094              :                                  "least three vectors %G",
   11095              :                                  stmt_info->stmt);
   11096         6093 :               gcc_assert (analyze_only);
   11097              :               return false;
   11098              :             }
   11099              : 
   11100       353070 :           gcc_assert (mask_element < 2 * const_nunits);
   11101              :         }
   11102              : 
   11103       563004 :       if (mask_element != index)
   11104       362644 :         noop_p = false;
   11105       563004 :       mask[index++] = mask_element;
   11106              : 
   11107       563004 :       if (index == count)
   11108              :         {
   11109       153073 :           if (!noop_p)
   11110              :             {
   11111       210617 :               indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
   11112       124624 :               if (!can_vec_perm_const_p (mode, mode, indices))
   11113              :                 {
   11114         3525 :                   if (dump_p)
   11115              :                     {
   11116           79 :                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   11117              :                                        "unsupported vect permute { ");
   11118          669 :                       for (i = 0; i < count; ++i)
   11119              :                         {
   11120          590 :                           dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
   11121          590 :                           dump_printf (MSG_MISSED_OPTIMIZATION, " ");
   11122              :                         }
   11123           79 :                       dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
   11124              :                     }
   11125         3525 :                   gcc_assert (analyze_only);
   11126              :                   return false;
   11127              :                 }
   11128              : 
   11129       121099 :               tree mask_vec = NULL_TREE;
   11130       121099 :               if (!analyze_only)
   11131        20371 :                 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
   11132              : 
   11133       121099 :               if (second_vec_index == -1)
   11134        36685 :                 second_vec_index = first_vec_index;
   11135              : 
   11136       245072 :               for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
   11137              :                 {
   11138       123973 :                   ++*n_perms;
   11139       123973 :                   if (analyze_only)
   11140       103320 :                     continue;
   11141              :                   /* Generate the permute statement if necessary.  */
   11142        20653 :                   tree first_vec = dr_chain[first_vec_index + ri];
   11143        20653 :                   tree second_vec = dr_chain[second_vec_index + ri];
   11144        20653 :                   gassign *stmt = as_a<gassign *> (stmt_info->stmt);
   11145        20653 :                   tree perm_dest
   11146        20653 :                     = vect_create_destination_var (gimple_assign_lhs (stmt),
   11147              :                                                    vectype);
   11148        20653 :                   perm_dest = make_ssa_name (perm_dest);
   11149        20653 :                   gimple *perm_stmt
   11150        20653 :                     = gimple_build_assign (perm_dest, VEC_PERM_EXPR, first_vec,
   11151              :                                            second_vec, mask_vec);
   11152        20653 :                   vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
   11153              :                                                gsi);
   11154        20653 :                   if (dce_chain)
   11155              :                     {
   11156        19964 :                       bitmap_set_bit (used_defs, first_vec_index + ri);
   11157        19964 :                       bitmap_set_bit (used_defs, second_vec_index + ri);
   11158              :                     }
   11159              : 
   11160              :                   /* Store the vector statement in NODE.  */
   11161        20653 :                   SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = perm_dest;
   11162              :                 }
   11163              :             }
   11164        28449 :           else if (!analyze_only)
   11165              :             {
   11166         2798 :               for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
   11167              :                 {
   11168         1399 :                   tree first_vec = dr_chain[first_vec_index + ri];
   11169              :                   /* The permutation is a no-op, so no mask was built; pass
   11170              :                      the input vector through as the identity transform.  */
   11171         1399 :                   if (dce_chain)
   11172         1392 :                     bitmap_set_bit (used_defs, first_vec_index + ri);
   11173              : 
   11174              :                   /* Store the vector statement in NODE.  */
   11175         1399 :                   SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = first_vec;
   11176              :                 }
   11177              :             }
   11178              : 
   11179              :           index = 0;
   11180              :           first_vec_index = -1;
   11181              :           second_vec_index = -1;
   11182              :           noop_p = true;
   11183              :         }
   11184              :     }
   11185              : 
   11186       120190 :   if (n_loads)
   11187              :     {
   11188        81612 :       if (repeating_p)
   11189        10602 :         *n_loads = nstmts;
   11190              :       else
   11191              :         {
   11192              :           /* Enforced above when !repeating_p.  */
   11193        71010 :           unsigned int const_nunits = nunits.to_constant ();
   11194        71010 :           *n_loads = 0;
   11195        71010 :           bool load_seen = false;
   11196       991368 :           for (unsigned i = 0; i < in_nlanes; ++i)
   11197              :             {
   11198       920358 :               if (i % const_nunits == 0)
   11199              :                 {
   11200       389597 :                   if (load_seen)
   11201       110626 :                     *n_loads += 1;
   11202              :                   load_seen = false;
   11203              :                 }
   11204       920358 :               if (bitmap_bit_p (used_in_lanes, i))
   11205       253382 :                 load_seen = true;
   11206              :             }
   11207        71010 :           if (load_seen)
   11208        48451 :             *n_loads += 1;
   11209              :         }
   11210              :     }
   11211              : 
   11212       120190 :   if (dce_chain)
   11213       218752 :     for (unsigned i = 0; i < dr_chain.length (); ++i)
   11214        71954 :       if (!bitmap_bit_p (used_defs, i))
   11215              :         {
   11216        39323 :           tree def = dr_chain[i];
   11217        39670 :           do
   11218              :             {
   11219        39670 :               gimple *stmt = SSA_NAME_DEF_STMT (def);
   11220        39670 :               if (is_gimple_assign (stmt)
   11221        39670 :                   && (gimple_assign_rhs_code (stmt) == VIEW_CONVERT_EXPR
   11222        39670 :                       || gimple_assign_rhs_code (stmt) == CONSTRUCTOR))
   11223         4916 :                 def = single_ssa_tree_operand (stmt, SSA_OP_USE);
   11224              :               else
   11225              :                 def = NULL;
   11226        39670 :               gimple_stmt_iterator rgsi = gsi_for_stmt (stmt);
   11227        39670 :               gsi_remove (&rgsi, true);
   11228        39670 :               release_defs (stmt);
   11229              :             }
   11230        39670 :           while (def);
   11231              :         }
   11232              : 
   11233              :   return true;
   11234       129808 : }
   11235              : 
   11236              : /* Generate vector permute statements from a list of loads in DR_CHAIN.
   11237              :    If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
   11238              :    permute statements for the SLP node NODE.  Store the number of vector
   11239              :    permute instructions in *N_PERMS and the number of vector load
   11240              :    instructions in *N_LOADS.  If DCE_CHAIN is true, remove all definitions
   11241              :    that were not needed.
                      : 
                      :    This is a thin wrapper around vect_transform_slp_perm_load_1: it
                      :    supplies NODE's own SLP_TREE_LOAD_PERMUTATION as the permutation,
                      :    passes dump_enabled_p () as the dump flag and forwards every other
                      :    argument unchanged.  The helper's result is returned as is.  */
   11242              : 
   11243              : bool
   11244        90294 : vect_transform_slp_perm_load (vec_info *vinfo,
   11245              :                               slp_tree node, const vec<tree> &dr_chain,
   11246              :                               gimple_stmt_iterator *gsi, poly_uint64 vf,
   11247              :                               bool analyze_only, unsigned *n_perms,
   11248              :                               unsigned int *n_loads, bool dce_chain)
   11249              : {
   11250        90294 :   return vect_transform_slp_perm_load_1 (vinfo, node,
   11251        90294 :                                          SLP_TREE_LOAD_PERMUTATION (node),
   11252              :                                          dr_chain, gsi, vf, analyze_only,
   11253              :                                          dump_enabled_p (), n_perms, n_loads,
   11254        90294 :                                          dce_chain);
   11255              : }
   11256              : 
   11257              : /* Produce the next vector result for SLP permutation NODE by adding a vector
   11258              :    statement at GSI.  If MASK_VEC is nonnull, add:
   11259              : 
   11260              :       <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC>
   11261              : 
   11262              :    otherwise add:
   11263              : 
   11264              :       <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF,
   11265              :                                       { N, N+1, N+2, ... }>
   11266              : 
   11267              :    where N == IDENTITY_OFFSET which is either zero or equal to the
   11268              :    number of elements of the result.
                      : 
                      :    In the second case no VEC_PERM_EXPR is actually built.  The def
                      :    selected by IDENTITY_OFFSET is copied, or, when its type is not
                      :    compatible with NODE's vector type, a BIT_FIELD_REF extract or a
                      :    two-operand CONSTRUCTOR is emitted instead.
                      : 
                      :    The new statement is emitted through vect_finish_stmt_generation
                      :    (using VINFO and GSI) and recorded in NODE with push_vec_def.  */
   11269              : 
   11270              : static void
   11271        31263 : vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
   11272              :                           slp_tree node, tree first_def, tree second_def,
   11273              :                           tree mask_vec, poly_uint64 identity_offset)
   11274              : {
   11275        31263 :   tree vectype = SLP_TREE_VECTYPE (node);
   11276              : 
   11277              :   /* ???  We SLP match existing vector element extracts but
   11278              :      allow punning which we need to re-instantiate at uses
   11279              :      but have no good way of explicitly representing.
                      : 
                      :      FIRST_DEF is view-converted here; SECOND_DEF is only converted
                      :      further down, and only when a permute mask is used.
                      :      NOTE(review): that later conversion is guarded by the size of
                      :      FIRST_DEF's type rather than SECOND_DEF's -- confirm that both
                      :      defs always have the same size.  */
   11280        31263 :   if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)), TYPE_SIZE (vectype))
   11281        31263 :       && !types_compatible_p (TREE_TYPE (first_def), vectype))
   11282              :     {
   11283           14 :       gassign *conv_stmt
   11284           14 :         = gimple_build_assign (make_ssa_name (vectype),
   11285              :                                build1 (VIEW_CONVERT_EXPR, vectype, first_def));
   11286           14 :       vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
   11287           14 :       first_def = gimple_assign_lhs (conv_stmt);
   11288              :     }
   11289        31263 :   gassign *perm_stmt;
   11290        31263 :   tree perm_dest = make_ssa_name (vectype);
   11291        31263 :   if (mask_vec)
   11292              :     {
   11293        27957 :       if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)),
   11294        27957 :                            TYPE_SIZE (vectype))
   11295        27957 :           && !types_compatible_p (TREE_TYPE (second_def), vectype))
   11296              :         {
   11297            8 :           gassign *conv_stmt
   11298            8 :             = gimple_build_assign (make_ssa_name (vectype),
   11299              :                                    build1 (VIEW_CONVERT_EXPR,
   11300              :                                            vectype, second_def));
   11301            8 :           vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
   11302            8 :           second_def = gimple_assign_lhs (conv_stmt);
   11303              :         }
   11304        27957 :       perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
   11305              :                                        first_def, second_def,
   11306              :                                        mask_vec);
   11307              :     }
   11308              :   else
   11309              :     {
   11310         3306 :       auto def_nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (first_def));
   11311         3306 :       unsigned HOST_WIDE_INT vecno;
   11312         3306 :       poly_uint64 eltno;
   11313         3306 :       if (!can_div_trunc_p (poly_uint64 (identity_offset), def_nunits,
   11314              :                             &vecno, &eltno))
   11315              :         gcc_unreachable ();
   11316         3306 :       tree def = vecno & 1 ? second_def : first_def;
   11317         3306 :       if (!types_compatible_p (TREE_TYPE (def), vectype))
   11318              :         {
   11319              :           /* For identity permutes we still need to handle the case
   11320              :              of offsetted extracts or concats.  A result with no more
                      :              elements than DEF is extracted with a BIT_FIELD_REF at bit
                      :              offset ELTNO * element size; a result with exactly twice as
                      :              many elements is built as a CONSTRUCTOR concatenating
                      :              FIRST_DEF and SECOND_DEF.  Nothing else is expected.  */
   11321          261 :           unsigned HOST_WIDE_INT c;
   11322          261 :           if (known_le (TYPE_VECTOR_SUBPARTS (vectype), def_nunits))
   11323              :             {
   11324          257 :               unsigned HOST_WIDE_INT elsz
   11325          257 :                 = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (def))));
   11326          514 :               tree lowpart = build3 (BIT_FIELD_REF, vectype, def,
   11327          257 :                                      TYPE_SIZE (vectype),
   11328          257 :                                      bitsize_int (eltno * elsz));
   11329          257 :               perm_stmt = gimple_build_assign (perm_dest, lowpart);
   11330              :             }
   11331            4 :           else if (constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype),
   11332            4 :                                         def_nunits, &c) && c == 2)
   11333              :             {
   11334            4 :               gcc_assert (known_eq (identity_offset, 0U));
   11335            4 :               tree ctor = build_constructor_va (vectype, 2,
   11336              :                                                 NULL_TREE, first_def,
   11337              :                                                 NULL_TREE, second_def);
   11338            4 :               perm_stmt = gimple_build_assign (perm_dest, ctor);
   11339              :             }
   11340              :           else
   11341            0 :             gcc_unreachable ();
   11342              :         }
   11343              :       else
   11344              :         {
   11345              :           /* We need a copy here in case the def was external.  */
   11346         3045 :           gcc_assert (known_eq (eltno, 0U));
   11347         3045 :           perm_stmt = gimple_build_assign (perm_dest, def);
   11348              :         }
   11349              :     }
   11350        31263 :   vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
   11351              :   /* Store the vector statement in NODE.  */
   11352        31263 :   node->push_vec_def (perm_stmt);
   11353        31263 : }
   11354              : 
   11355              : /* Subroutine of vectorizable_slp_permutation.  Check whether the target
   11356              :    can perform permutation PERM on the (1 or 2) input nodes in CHILDREN.
   11357              :    If GSI is nonnull, emit the permutation there.
   11358              : 
   11359              :    When GSI is null, the only purpose of NODE is to give properties
   11360              :    of the result, such as the vector type and number of SLP lanes.
   11361              :    The node does not need to be a VEC_PERM_EXPR.
   11362              : 
   11363              :    If the target supports the operation, return the number of individual
   11364              :    VEC_PERM_EXPRs needed, otherwise return -1.  Print information to the
   11365              :    dump file if DUMP_P is true.
                      : 
                      :    Identity permutes are not included in the returned count, and a NODE
                      :    with ldst_lanes set always yields zero since it merely forwards
                      :    vector defs of its single child.
                      : 
                      :    Failure (-1) is reported when the children do not all have a vector
                      :    type compatible with the first one found, when a non-repeating
                      :    permute would need variable-length vectors or a variable VF, when
                      :    one output vector would draw from more than two input vectors, or
                      :    when a mask is rejected: a non-identity mask by
                      :    can_vec_perm_const_p, an identity mask when the result vector has
                      :    more elements than the input vector but not exactly twice as many.
                      :    The last two failures are asserted to occur only when GSI is null.  */
   11366              : 
   11367              : static int
   11368       490300 : vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
   11369              :                                 slp_tree node, lane_permutation_t &perm,
   11370              :                                 vec<slp_tree> &children, bool dump_p)
   11371              : {
   11372       490300 :   tree vectype = SLP_TREE_VECTYPE (node);
   11373              : 
   11374              :   /* ???  We currently only support all same vector input types
   11375              :      while the SLP IL should really do a concat + select and thus accept
   11376              :      arbitrary mismatches.  */
   11377       490300 :   slp_tree child;
   11378       490300 :   unsigned i;
   11379       490300 :   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
   11380       490300 :   bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node));
   11381              :   /* True if we're permuting a single input of 2N vectors down
   11382              :      to N vectors.  This case doesn't generalize beyond 2 since
   11383              :      VEC_PERM_EXPR only takes 2 inputs.  */
   11384       490300 :   bool pack_p = false;
   11385              :   /* If we're permuting inputs of N vectors each into X*N outputs,
   11386              :      this is the value of X, otherwise it is 1.  */
   11387       490300 :   unsigned int unpack_factor = 1;
   11388       490300 :   tree op_vectype = NULL_TREE;
   11389       491865 :   FOR_EACH_VEC_ELT (children, i, child)
   11390       491790 :     if (SLP_TREE_VECTYPE (child))
   11391              :       {
   11392              :         op_vectype = SLP_TREE_VECTYPE (child);
   11393              :         break;
   11394              :       }
   11395       490300 :   if (!op_vectype)
   11396           75 :     op_vectype = vectype;
   11397      1064748 :   FOR_EACH_VEC_ELT (children, i, child)
   11398              :     {
   11399       574448 :       if ((SLP_TREE_DEF_TYPE (child) != vect_internal_def
   11400        10467 :            && !vect_maybe_update_slp_op_vectype (child, op_vectype))
   11401       574448 :           || !types_compatible_p (SLP_TREE_VECTYPE (child), op_vectype)
   11402      1148896 :           || !types_compatible_p (TREE_TYPE (vectype), TREE_TYPE (op_vectype)))
   11403              :         {
   11404            0 :           if (dump_p)
   11405            0 :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   11406              :                              "Unsupported vector types in lane permutation\n");
   11407            0 :           return -1;
   11408              :         }
   11409       574448 :       auto op_nunits = TYPE_VECTOR_SUBPARTS (op_vectype);
   11410       574448 :       unsigned int this_unpack_factor;
   11411              :       /* Detect permutations of external, pre-existing vectors.  The external
   11412              :          node's SLP_TREE_LANES stores the total number of units in the vector,
   11413              :          or zero if the vector has variable length.
   11414              : 
   11415              :          We are expected to keep the original VEC_PERM_EXPR for such cases.
   11416              :          There is no repetition to model.  */
   11417       574448 :       if (SLP_TREE_DEF_TYPE (child) == vect_external_def
   11418       574448 :           && SLP_TREE_SCALAR_OPS (child).is_empty ())
   11419              :         repeating_p = false;
   11420              :       /* Check whether the input has twice as many lanes per vector.  */
   11421       566540 :       else if (children.length () == 1
   11422       566540 :                && known_eq (SLP_TREE_LANES (child) * nunits,
   11423              :                             SLP_TREE_LANES (node) * op_nunits * 2))
   11424              :         pack_p = true;
   11425              :       /* Check whether the output has N times as many lanes per vector.  */
   11426       574448 :       else if (constant_multiple_p (SLP_TREE_LANES (node) * op_nunits,
   11427       522767 :                                     SLP_TREE_LANES (child) * nunits,
   11428              :                                     &this_unpack_factor)
   11429       487833 :                && (i == 0 || unpack_factor == this_unpack_factor))
   11430              :         unpack_factor = this_unpack_factor;
   11431              :       else
   11432              :         repeating_p = false;
   11433              :     }
   11434              : 
   11435       980600 :   gcc_assert (perm.length () == SLP_TREE_LANES (node));
   11436              : 
   11437              :   /* Load-lanes permute.  This permute only acts as a forwarder to
   11438              :      select the correct vector def of the load-lanes load which
   11439              :      has the permuted vectors in its vector defs like
   11440              :      { v0, w0, r0, v1, w1, r1 ... } for a ld3.  All costs are
   11441              :      accounted for in the costing for the actual load so we
   11442              :      return zero here.  */
   11443       490300 :   if (node->ldst_lanes)
   11444              :     {
   11445            0 :       gcc_assert (children.length () == 1);
   11446            0 :       if (!gsi)
   11447              :         /* This is a trivial op always supported.  */
   11448              :         return 0;
   11449            0 :       slp_tree child = children[0];
   11450            0 :       unsigned vec_idx = (SLP_TREE_LANE_PERMUTATION (node)[0].second
   11451            0 :                           / SLP_TREE_LANES (node));
   11452            0 :       unsigned vec_num = SLP_TREE_LANES (child) / SLP_TREE_LANES (node);
   11453            0 :       unsigned nvectors = vect_get_num_copies (vinfo, node);
   11454            0 :       for (unsigned i = 0; i < nvectors; ++i)
   11455              :         {
   11456            0 :           tree def = SLP_TREE_VEC_DEFS (child)[i * vec_num  + vec_idx];
   11457            0 :           node->push_vec_def (def);
   11458              :         }
   11459              :       return 0;
   11460              :     }
   11461              : 
   11462              :   /* Set REPEATING_P to true if the permutations are cyclical wrt UNPACK_FACTOR
   11463              :      and if we can generate the vectors in a vector-length agnostic way.
   11464              :      This requires UNPACK_STEP == NUNITS / UNPACK_FACTOR to be known at
   11465              :      compile time.
   11466              : 
   11467              :      The significance of UNPACK_STEP is that, when PACK_P is false,
   11468              :      output vector I operates on a window of UNPACK_STEP elements from each
   11469              :      input, starting at lane UNPACK_STEP * (I % UNPACK_FACTOR).  For example,
   11470              :      when UNPACK_FACTOR is 2, the first output vector operates on lanes
   11471              :      [0, NUNITS / 2 - 1] of each input vector and the second output vector
   11472              :      operates on lanes [NUNITS / 2, NUNITS - 1] of each input vector.
   11473              : 
   11474              :      When REPEATING_P is true, NOUTPUTS holds the total number of outputs
   11475              :      that we actually need to generate.  */
   11476       490300 :   uint64_t noutputs = 0;
   11477       490300 :   poly_uint64 unpack_step = 0;
   11478       490300 :   loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo);
   11479       182124 :   if (!linfo
   11480       529416 :       || !multiple_p (nunits, unpack_factor, &unpack_step)
   11481       181190 :       || !constant_multiple_p (LOOP_VINFO_VECT_FACTOR (linfo)
   11482       181190 :                                * SLP_TREE_LANES (node), nunits, &noutputs))
   11483              :     repeating_p = false;
   11484              : 
   11485              :   /* We can handle the conditions described for REPEATING_P above for
   11486              :      both variable- and constant-length vectors.  The fallback requires
   11487              :      us to generate every element of every permute vector explicitly,
   11488              :      which is only possible for constant-length permute vectors.
   11489              : 
   11490              :      Set:
   11491              : 
   11492              :      - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
   11493              :        mask vectors that we want to build.
   11494              : 
   11495              :      - NCOPIES to the number of copies of PERM that we need in order
   11496              :        to build the necessary permute mask vectors.  */
   11497       181190 :   uint64_t npatterns;
   11498       181190 :   unsigned nelts_per_pattern;
   11499       181190 :   uint64_t ncopies;
   11500       181190 :   if (repeating_p)
   11501              :     {
   11502              :       /* We need permute mask vectors that have the form:
   11503              : 
   11504              :            { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }
   11505              : 
   11506              :          In other words, the original n-element permute in PERM is
   11507              :          "unrolled" to fill a full vector.  The stepped vector encoding
   11508              :          that we use for permutes requires 3n elements.  */
   11509       142074 :       npatterns = SLP_TREE_LANES (node);
   11510       142074 :       nelts_per_pattern = ncopies = 3;
   11511              :     }
   11512              :   else
   11513              :     {
   11514              :       /* Calculate every element of every permute mask vector explicitly,
   11515              :          instead of relying on the pattern described above.  */
   11516       348226 :       if (!nunits.is_constant (&npatterns)
   11517       348226 :           || !TYPE_VECTOR_SUBPARTS (op_vectype).is_constant ())
   11518              :         {
   11519              :           if (dump_p)
   11520              :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   11521              :                              "unsupported permutation %p on variable-length"
   11522              :                              " vectors\n", (void *) node);
   11523              :           return -1;
   11524              :         }
   11525       348226 :       nelts_per_pattern = ncopies = 1;
   11526       348226 :       if (linfo && !LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
   11527              :         {
   11528              :           if (dump_p)
   11529              :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   11530              :                              "unsupported permutation %p for variable VF\n",
   11531              :                              (void *) node);
   11532              :           return -1;
   11533              :         }
   11534              :       pack_p = false;
   11535              :       unpack_factor = 1;
   11536              :     }
   11537       490300 :   unsigned olanes = unpack_factor * ncopies * SLP_TREE_LANES (node);
   11538       490300 :   gcc_assert (repeating_p || multiple_p (olanes, nunits));
   11539              : 
   11540              :   /* Compute the { { SLP operand, vector index}, lane } permutation sequence
   11541              :      from the { SLP operand, scalar lane } permutation as recorded in the
   11542              :      SLP node as intermediate step.  This part should already work
   11543              :      with SLP children with arbitrary number of lanes.  */
   11544       490300 :   auto_vec<std::pair<std::pair<unsigned, unsigned>, poly_uint64>> vperm;
   11545       490300 :   auto_vec<poly_uint64> active_lane;
   11546       490300 :   vperm.create (olanes);
   11547       490300 :   active_lane.safe_grow_cleared (children.length (), true);
   11548       988827 :   for (unsigned int ui = 0; ui < unpack_factor; ++ui)
   11549              :     {
   11550      2178810 :       for (unsigned j = 0; j < children.length (); ++j)
   11551       590878 :         active_lane[j] = ui * unpack_step;
   11552      1397593 :       for (unsigned i = 0; i < ncopies; ++i)
   11553              :         {
   11554      5597910 :           for (unsigned pi = 0; pi < perm.length (); ++pi)
   11555              :             {
   11556      1899889 :               std::pair<unsigned, unsigned> p = perm[pi];
   11557      1899889 :               tree vtype = SLP_TREE_VECTYPE (children[p.first]);
   11558      1899889 :               if (repeating_p)
   11559       827625 :                 vperm.quick_push ({{p.first, 0},
   11560       827625 :                                    p.second + active_lane[p.first]});
   11561              :               else
   11562              :                 {
   11563              :                   /* We checked above that the vectors are constant-length.  */
   11564      1072264 :                   unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype)
   11565      1072264 :                     .to_constant ();
   11566      1072264 :                   unsigned lane = active_lane[p.first].to_constant ();
   11567      1072264 :                   unsigned vi = (lane + p.second) / vnunits;
   11568      1072264 :                   unsigned vl = (lane + p.second) % vnunits;
   11569      1072264 :                   vperm.quick_push ({{p.first, vi}, vl});
   11570              :                 }
   11571              :             }
   11572              :           /* Advance to the next group.  */
   11573      1954347 :           for (unsigned j = 0; j < children.length (); ++j)
   11574      1055281 :             active_lane[j] += SLP_TREE_LANES (children[j]);
   11575              :         }
   11576              :     }
   11577              : 
   11578       490300 :   if (dump_p)
   11579              :     {
   11580         8909 :       dump_printf_loc (MSG_NOTE, vect_location,
   11581              :                        "vectorizing permutation %p", (void *)node);
   11582        32209 :       for (unsigned i = 0; i < perm.length (); ++i)
   11583        23300 :         dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
   11584         8909 :       if (repeating_p)
   11585         7502 :         dump_printf (MSG_NOTE, " (repeat %d)", SLP_TREE_LANES (node));
   11586         8909 :       dump_printf (MSG_NOTE, "\n");
   11587         8909 :       dump_printf_loc (MSG_NOTE, vect_location, "as");
   11588        89301 :       for (unsigned i = 0; i < vperm.length (); ++i)
   11589              :         {
   11590        80392 :           if (i != 0
   11591        80392 :               && (repeating_p
   11592        54232 :                   ? multiple_p (i, npatterns)
   11593        59784 :                   : multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype))))
   11594        24113 :             dump_printf (MSG_NOTE, ",");
   11595        80392 :           dump_printf (MSG_NOTE, " vops%u[%u][",
   11596        80392 :                        vperm[i].first.first, vperm[i].first.second);
   11597        80392 :           dump_dec (MSG_NOTE, vperm[i].second);
   11598        80392 :           dump_printf (MSG_NOTE, "]");
   11599              :         }
   11600         8909 :       dump_printf (MSG_NOTE, "\n");
   11601              :     }
   11602              : 
   11603              :   /* We can only handle two-vector permutes, everything else should
   11604              :      be lowered on the SLP level.  The following is closely inspired
   11605              :      by vect_transform_slp_perm_load and is supposed to eventually
   11606              :      replace it.
   11607              :      ???   As intermediate step do code-gen in the SLP tree representation
   11608              :      somehow?  */
   11609       490300 :   std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
   11610       490300 :   std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
   11611       490300 :   unsigned int index = 0;
   11612       490300 :   poly_uint64 mask_element;
   11613       490300 :   vec_perm_builder mask;
   11614       490300 :   mask.new_vector (nunits, npatterns, nelts_per_pattern);
   11615       490300 :   unsigned int count = mask.encoded_nelts ();
   11616       490300 :   mask.quick_grow (count);
   11617       490300 :   vec_perm_indices indices;
   11618       490300 :   unsigned nperms = 0;
   11619              :   /* When REPEATING_P is true, we only have UNPACK_FACTOR unique permute
   11620              :      vectors to check during analysis, but we need to generate NOUTPUTS
   11621              :      vectors during transformation.
                      : 
                      :      TOTAL_NELTS counts the mask elements of everything that will be
                      :      generated and PROCESS_NELTS those actually walked by the loop
                      :      below.  When accumulating NPERMS each checked non-identity mask
                      :      stands for CEIL (TOTAL_NELTS, PROCESS_NELTS) permutes, less one
                      :      if it ends beyond LAST_EI, the position of the final element in
                      :      the last, possibly partial, round.  */
   11622       490300 :   unsigned total_nelts = olanes;
   11623       490300 :   unsigned process_nelts = olanes;
   11624       490300 :   if (repeating_p)
   11625              :     {
   11626       142074 :       total_nelts = (total_nelts / unpack_factor) * noutputs;
   11627       142074 :       if (gsi)
   11628         9805 :         process_nelts = total_nelts;
   11629              :     }
   11630       490300 :   unsigned last_ei = (total_nelts - 1) % process_nelts;
   11631      2399472 :   for (unsigned i = 0; i < process_nelts; ++i)
   11632              :     {
   11633              :       /* VI is the input vector index when generating code for REPEATING_P.  */
   11634      1916513 :       unsigned vi = i / olanes * (pack_p ? 2 : 1);
   11635      1916513 :       unsigned ei = i % olanes;
   11636      1916513 :       mask_element = vperm[ei].second;
   11637      1916513 :       if (pack_p)
   11638              :         {
   11639              :           /* In this case, we have N outputs and the single child provides 2N
   11640              :              inputs.  Output X permutes inputs 2X and 2X+1.
   11641              : 
   11642              :              The mask indices are taken directly from the SLP permutation node.
   11643              :              Index X selects from the first vector if (X / NUNITS) % 2 == 0;
   11644              :              X selects from the second vector otherwise.  These conditions
   11645              :              are only known at compile time for constant-length vectors.  */
   11646              :           first_vec = std::make_pair (0, 0);
   11647              :           second_vec = std::make_pair (0, 1);
   11648              :         }
   11649      1747877 :       else if (first_vec.first == -1U
   11650      1747877 :                || first_vec == vperm[ei].first)
   11651      1515665 :         first_vec = vperm[ei].first;
   11652       232212 :       else if (second_vec.first == -1U
   11653       232212 :                || second_vec == vperm[ei].first)
   11654              :         {
   11655       231815 :           second_vec = vperm[ei].first;
   11656       231815 :           mask_element += nunits;
   11657              :         }
   11658              :       else
   11659              :         {
   11660          397 :           if (dump_p)
   11661            7 :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   11662              :                              "permutation requires at "
   11663              :                              "least three vectors\n");
   11664          397 :           gcc_assert (!gsi);
   11665              :           return -1;
   11666              :         }
   11667              : 
   11668      1916116 :       mask[index++] = mask_element;
   11669              : 
   11670      1916116 :       if (index == count)
   11671              :         {
   11672       806993 :           indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
   11673              :                               TYPE_VECTOR_SUBPARTS (op_vectype));
   11674       632810 :           bool identity_p = (indices.series_p (0, 1, mask[0], 1)
   11675       968846 :                              && constant_multiple_p (mask[0], nunits));
   11676       632810 :           machine_mode vmode = TYPE_MODE (vectype);
   11677       632810 :           machine_mode op_vmode = TYPE_MODE (op_vectype);
   11678       632810 :           unsigned HOST_WIDE_INT c;
   11679       632810 :           if ((!identity_p
   11680       589488 :                && !can_vec_perm_const_p (vmode, op_vmode, indices))
   11681       632810 :               || (identity_p
   11682        43322 :                   && !known_le (nunits,
   11683              :                                 TYPE_VECTOR_SUBPARTS (op_vectype))
   11684         6952 :                   && (!constant_multiple_p (nunits,
   11685            8 :                                             TYPE_VECTOR_SUBPARTS (op_vectype),
   11686            8 :                                             &c) || c != 2)))
   11687              :             {
   11688         6944 :               if (dump_p)
   11689              :                 {
   11690          152 :                   dump_printf_loc (MSG_MISSED_OPTIMIZATION,
   11691              :                                    vect_location,
   11692              :                                    "unsupported vect permute { ");
   11693         1586 :                   for (i = 0; i < count; ++i)  /* Reuses the enclosing loop's I; we return below.  */
   11694              :                     {
   11695         1434 :                       dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
   11696         1434 :                       dump_printf (MSG_MISSED_OPTIMIZATION, " ");
   11697              :                     }
   11698          152 :                   dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
   11699              :                 }
   11700         6944 :               gcc_assert (!gsi);
   11701         7341 :               return -1;
   11702              :             }
   11703              : 
   11704       625866 :           if (!identity_p)
   11705       582544 :             nperms += CEIL (total_nelts, process_nelts) - (ei > last_ei);
   11706       625866 :           if (gsi)
   11707              :             {
   11708        31263 :               if (second_vec.first == -1U)
   11709         7079 :                 second_vec = first_vec;
   11710              : 
   11711        31263 :               slp_tree
   11712        31263 :                 first_node = children[first_vec.first],
   11713        31263 :                 second_node = children[second_vec.first];
   11714              : 
   11715        31263 :               tree mask_vec = NULL_TREE;
   11716        31263 :               if (!identity_p)
   11717        27957 :                 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
   11718              : 
   11719        31263 :               tree first_def
   11720        31263 :                 = vect_get_slp_vect_def (first_node, first_vec.second + vi);
   11721        31263 :               tree second_def
   11722        31263 :                 = vect_get_slp_vect_def (second_node, second_vec.second + vi);
   11723        31263 :               vect_add_slp_permutation (vinfo, gsi, node, first_def,
   11724        31263 :                                         second_def, mask_vec, mask[0]);
   11725              :             }
   11726              : 
   11727              :           index = 0;
   11728              :           first_vec = std::make_pair (-1U, -1U);
   11729              :           second_vec = std::make_pair (-1U, -1U);
   11730              :         }
   11731              :     }
   11732              : 
   11733       482959 :   return nperms;
   11734       490300 : }
   11735              : 
   11736              : /* Vectorize the SLP permutations in NODE as specified
   11737              :    in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
   11738              :    child number and lane number.
   11739              :    Interleaving of two two-lane two-child SLP subtrees (not supported):
   11740              :      [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
   11741              :    A blend of two four-lane two-child SLP subtrees:
   11742              :      [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
   11743              :    Highpart of a four-lane one-child SLP subtree (not supported):
   11744              :      [ { 0, 2 }, { 0, 3 } ]
   11745              :    Only a subset of these is currently supported by the code generation
                      :    below.
                      :
                      :    The work is delegated to vectorizable_slp_permutation_1, to which
                      :    VINFO, GSI, NODE, its lane permutation and its children are passed.
                      :    Return false if that helper reports a negative permute count and
                      :    true otherwise.  When GSI is null and the count is nonzero, that
                      :    many vec_perm operations are recorded in COST_VEC as vect_body
                      :    cost.  */
   11746              : 
   11747              : bool
   11748       139166 : vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
   11749              :                               slp_tree node, stmt_vector_for_cost *cost_vec)
   11750              : {
   11751       139166 :   tree vectype = SLP_TREE_VECTYPE (node);
   11752       139166 :   lane_permutation_t &perm = SLP_TREE_LANE_PERMUTATION (node);
   11753       139166 :   int nperms = vectorizable_slp_permutation_1 (vinfo, gsi, node, perm,
   11754       139166 :                                                SLP_TREE_CHILDREN (node),
   11755              :                                                dump_enabled_p ());
   11756       139166 :   if (nperms < 0)  /* The helper rejected the permutation.  */
   11757              :     return false;
   11758              : 
   11759       137839 :   if (!gsi && nperms != 0)  /* Cost only when no insertion point is given.  */
   11760       115857 :     record_stmt_cost (cost_vec, nperms, vec_perm, node, vectype, 0, vect_body);
   11761              : 
   11762              :   return true;
   11763              : }
   11764              : 
   11765              : /* Vectorize SLP NODE.
                      :
                      :    Constant and external nodes (vect_constant_def / vect_external_def)
                      :    only get their vectors created via vect_create_constant_vectors,
                      :    and only when the node has a vector type and no vector defs exist
                      :    yet.
                      :
                      :    For all other nodes this allocates SLP_TREE_VEC_DEFS (when the node
                      :    has a vector type), determines the insertion iterator and finally
                      :    calls vect_transform_stmt with it:
                      :      - for non-permute nodes with a data reference the iterator is
                      :        positioned at the first scalar stmt (reads) or at the last
                      :        scalar stmt (writes);
                      :      - non-permute nodes of type cycle_phi_info_type,
                      :        induc_vec_info_type or phi_info_type use gsi_none ();
                      :      - everything else is placed after the last of the children's
                      :        defs, with adjustments for possibly trapping stmts in BB
                      :        vectorization and for loop vectorization.
                      :
                      :    INSTANCE is only forwarded to vect_transform_stmt.  */
   11766              : 
   11767              : static void
   11768      1472867 : vect_schedule_slp_node (vec_info *vinfo,
   11769              :                         slp_tree node, slp_instance instance)
   11770              : {
   11771      1472867 :   gimple_stmt_iterator si;
   11772      1472867 :   int i;
   11773      1472867 :   slp_tree child;
   11774              : 
   11775              :   /* Vectorize externals and constants.  */
   11776      1472867 :   if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
   11777      1472867 :       || SLP_TREE_DEF_TYPE (node) == vect_external_def)
   11778              :     {
   11779              :       /* ???  vectorizable_shift can end up using a scalar operand which is
   11780              :          currently denoted as !SLP_TREE_VECTYPE.  No need to vectorize the
   11781              :          node in this case.  */
   11782       499294 :       if (!SLP_TREE_VECTYPE (node))
   11783       499294 :         return;
   11784              : 
   11785              :       /* There are two reasons vector defs might already exist.  The first
   11786              :          is that we are vectorizing an existing vector def.  The second is
   11787              :          when performing BB vectorization shared constant/external nodes
   11788              :          are not split apart during partitioning so during the code-gen
   11789              :          DFS walk we can end up visiting them twice.  */
   11790       492318 :       if (! SLP_TREE_VEC_DEFS (node).exists ())
   11791       491495 :         vect_create_constant_vectors (vinfo, node);
   11792       492318 :       return;
   11793              :     }
   11794              : 
   11795       973573 :   stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
   11796              : 
   11797       973573 :   gcc_assert (SLP_TREE_VEC_DEFS (node).is_empty ());
   11798       973573 :   if (SLP_TREE_VECTYPE (node))
   11799       973567 :     SLP_TREE_VEC_DEFS (node).create (vect_get_num_copies (vinfo, node));
   11800              : 
   11801       973573 :   if (!SLP_TREE_PERMUTE_P (node) && STMT_VINFO_DATA_REF (stmt_info))
   11802              :     {
   11803              :       /* Vectorized loads go before the first scalar load to make it
   11804              :          ready early, vectorized stores go before the last scalar
   11805              :          stmt which is where all uses are ready.  */
   11806       713130 :       stmt_vec_info last_stmt_info = NULL;
   11807       713130 :       if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
   11808       166233 :         last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
   11809              :       else /* DR_IS_WRITE */
   11810       546897 :         last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
   11811       713130 :       si = gsi_for_stmt (last_stmt_info->stmt);
   11812       713130 :     }
   11813       260443 :   else if (!SLP_TREE_PERMUTE_P (node)
   11814       243976 :            && (SLP_TREE_TYPE (node) == cycle_phi_info_type
   11815              :                || SLP_TREE_TYPE (node) == induc_vec_info_type
   11816              :                || SLP_TREE_TYPE (node) == phi_info_type))
   11817              :     {
   11818              :       /* For PHI node vectorization we do not use the insertion iterator.  */
   11819        53945 :       si = gsi_none ();
   11820              :     }
   11821              :   else
   11822              :     {
   11823              :       /* Emit other stmts after the children vectorized defs which is
   11824              :          earliest possible.  */
   11825              :       gimple *last_stmt = NULL;
   11826              :       bool seen_vector_def = false;
   11827       574218 :       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
   11828       367720 :         if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
   11829              :           {
   11830              :             /* For fold-left reductions we are retaining the scalar
   11831              :                reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
   11832              :                set so the representation isn't perfect.  Resort to the
   11833              :                last scalar def here.  */
   11834       294724 :             if (SLP_TREE_VEC_DEFS (child).is_empty ())
   11835              :               {
   11836          878 :                 gcc_assert (SLP_TREE_TYPE (child) == cycle_phi_info_type);
   11837          878 :                 gphi *phi = as_a <gphi *>
   11838          878 :                               (vect_find_last_scalar_stmt_in_slp (child)->stmt);
   11839          878 :                 if (!last_stmt)
   11840              :                   last_stmt = phi;
   11841          662 :                 else if (vect_stmt_dominates_stmt_p (last_stmt, phi))
   11842              :                   last_stmt = phi;
   11843          651 :                 else if (vect_stmt_dominates_stmt_p (phi, last_stmt))
   11844              :                   ;
   11845              :                 else
   11846            0 :                   gcc_unreachable ();
   11847              :               }
   11848              :             /* We are emitting all vectorized stmts in the same place and
   11849              :                the last one is the last.
   11850              :                ???  Unless we have a load permutation applied and that
   11851              :                figures to re-use an earlier generated load.  */
   11852              :             unsigned j;
   11853              :             tree vdef;
   11854       697047 :             FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
   11855              :               {
   11856       402323 :                 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
   11857       402323 :                 if (!last_stmt)
   11858              :                   last_stmt = vstmt;
   11859       206563 :                 else if (vect_stmt_dominates_stmt_p (last_stmt, vstmt))
   11860              :                   last_stmt = vstmt;
   11861        45219 :                 else if (vect_stmt_dominates_stmt_p (vstmt, last_stmt))
   11862              :                   ;
   11863              :                 else
   11864            0 :                   gcc_unreachable ();
   11865              :               }
   11866              :           }
   11867        72996 :         else if (!SLP_TREE_VECTYPE (child))
   11868              :           {
   11869              :             /* For externals used unvectorized look at all scalar defs.  */
   11870              :             unsigned j;
   11871              :             tree def;
   11872        14831 :             FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
   11873         8491 :               if (TREE_CODE (def) == SSA_NAME
   11874         8491 :                   && !SSA_NAME_IS_DEFAULT_DEF (def))
   11875              :                 {
   11876          295 :                   gimple *stmt = SSA_NAME_DEF_STMT (def);
   11877          295 :                   if (gimple_uid (stmt) == -1u)
   11878              :                     /* If the stmt is not inside the region do not
   11879              :                        use it as possible insertion point.  */
   11880              :                     ;
   11881          285 :                   else if (!last_stmt)
   11882              :                     last_stmt = stmt;
   11883          261 :                   else if (vect_stmt_dominates_stmt_p (last_stmt, stmt))
   11884              :                     last_stmt = stmt;
   11885          159 :                   else if (vect_stmt_dominates_stmt_p (stmt, last_stmt))
   11886              :                     ;
   11887              :                   else
   11888            0 :                     gcc_unreachable ();
   11889              :                 }
   11890              :           }
   11891              :         else
   11892              :           {
   11893              :             /* For externals we have to look at all defs since their
   11894              :                insertion place is decided per vector.  But beware
   11895              :                of pre-existing vectors where we need to make sure
   11896              :                we do not insert before the region boundary.  */
   11897        66656 :             if (SLP_TREE_SCALAR_OPS (child).is_empty ()
   11898          650 :                 && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
   11899              :               seen_vector_def = true;
   11900              :             else
   11901              :               {
   11902              :                 unsigned j;
   11903              :                 tree vdef;
   11904       528822 :                 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
   11905        94563 :                   if (TREE_CODE (vdef) == SSA_NAME
   11906        94563 :                       && !SSA_NAME_IS_DEFAULT_DEF (vdef))
   11907              :                     {
   11908        19780 :                       gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
   11909        19780 :                       if (!last_stmt)
   11910              :                         last_stmt = vstmt;
   11911        11005 :                       else if (vect_stmt_dominates_stmt_p (last_stmt, vstmt))
   11912              :                         last_stmt = vstmt;
   11913         8738 :                       else if (vect_stmt_dominates_stmt_p (vstmt, last_stmt))
   11914              :                         ;
   11915              :                       else
   11916            0 :                         gcc_unreachable ();
   11917              :                     }
   11918              :               }
   11919              :           }
   11920              :       /* This can happen when all children are pre-existing vectors or
   11921              :          constants.  */
   11922       206498 :       if (!last_stmt)
   11923         1723 :         last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
   11924         1723 :       if (!last_stmt)
   11925              :         {
   11926            0 :           gcc_assert (seen_vector_def);
   11927            0 :           si = gsi_after_labels (vinfo->bbs[0]);
   11928              :         }
   11929       206498 :       else if (is_ctrl_altering_stmt (last_stmt))
   11930              :         {
   11931              :           /* We split regions to vectorize at control altering stmts
   11932              :              with a definition so this must be an external which
   11933              :              we can insert at the start of the region.  */
   11934            0 :           si = gsi_after_labels (vinfo->bbs[0]);
   11935              :         }
   11936       206498 :       else if (is_a <bb_vec_info> (vinfo)
   11937        18190 :                && !SLP_TREE_PERMUTE_P (node)
   11938        16704 :                && gimple_bb (last_stmt) != gimple_bb (stmt_info->stmt)
   11939       207883 :                && gimple_could_trap_p (stmt_info->stmt))
   11940              :         {
   11941              :           /* We've constrained possibly trapping operations to all come
   11942              :              from the same basic-block, if vectorized defs would allow earlier
   11943              :              scheduling still force vectorized stmts to the original block.
   11944              :              This is only necessary for BB vectorization since for loop vect
   11945              :              all operations are in a single BB and scalar stmt based
   11946              :              placement doesn't play well with epilogue vectorization.  */
   11947           53 :           gcc_assert (dominated_by_p (CDI_DOMINATORS,
   11948              :                                       gimple_bb (stmt_info->stmt),
   11949              :                                       gimple_bb (last_stmt)));
   11950           53 :           si = gsi_after_labels (gimple_bb (stmt_info->stmt));
   11951              :         }
   11952       206445 :       else if (is_a <gphi *> (last_stmt))
   11953        14439 :         si = gsi_after_labels (gimple_bb (last_stmt));
   11954              :       else
   11955              :         {
   11956       192006 :           si = gsi_for_stmt (last_stmt);
   11957       192006 :           gsi_next (&si);
   11958              : 
   11959       192006 :           if (auto loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
   11960              :             {
   11961              :               /* Avoid scheduling stmts to random places in the CFG, any
   11962              :                  stmt dominance check we performed is possibly wrong as UIDs
   11963              :                  are not initialized for all of the function for loop
   11964              :                  vectorization.  Instead append to the loop preheader.  */
   11965       174085 :               if ((LOOP_VINFO_LOOP (loop_vinfo)->header
   11966       174085 :                    != gimple_bb (last_stmt))
   11967       177302 :                   && dominated_by_p (CDI_DOMINATORS,
   11968              :                                      LOOP_VINFO_LOOP (loop_vinfo)->header,
   11969         3217 :                                      gimple_bb (last_stmt)))
   11970         1406 :                 si = gsi_end_bb (loop_preheader_edge
   11971          703 :                                    (LOOP_VINFO_LOOP (loop_vinfo))->src);
   11972              :               /* Avoid scheduling internal defs outside of the loop when
   11973              :                  we might have only implicitly tracked loop mask/len defs.  */
   11974           74 :               if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
   11975       174085 :                   || LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
   11976              :                 {
   11977           74 :                   gimple_stmt_iterator si2
   11978           74 :                     = gsi_after_labels (LOOP_VINFO_LOOP (loop_vinfo)->header);
   11979           74 :                   if ((gsi_end_p (si2)
   11980            0 :                        && (LOOP_VINFO_LOOP (loop_vinfo)->header
   11981            0 :                            != gimple_bb (last_stmt))
   11982            0 :                        && dominated_by_p (CDI_DOMINATORS,
   11983              :                                           LOOP_VINFO_LOOP (loop_vinfo)->header,
   11984            0 :                                           gimple_bb (last_stmt)))
   11985           74 :                       || (!gsi_end_p (si2)
   11986           74 :                           && last_stmt != *si2
   11987           72 :                           && vect_stmt_dominates_stmt_p (last_stmt, *si2)))
   11988            3 :                     si = si2;
   11989              :                 }
   11990              :             }
   11991              :         }
   11992              :     }
   11993              : 
   11994       973573 :   if (dump_enabled_p ())
   11995              :     {
   11996        71489 :       if (stmt_info)
   11997        71436 :         dump_printf_loc (MSG_NOTE, vect_location,
   11998              :                          "------>vectorizing SLP node starting from: %G",
   11999              :                          stmt_info->stmt);
   12000              :       else
   12001              :         {
   12002           53 :           dump_printf_loc (MSG_NOTE, vect_location,
   12003              :                            "------>vectorizing SLP node:\n");
   12004           53 :           vect_print_slp_tree (MSG_NOTE, vect_location, node);
   12005              :         }
   12006              :     }
   12007       973573 :   vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
   12008              : }
   12009              : 
   12010              : /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
   12011              :    For loop vectorization this is done in vectorizable_call, but for SLP
   12012              :    it needs to be deferred until end of vect_schedule_slp, because multiple
   12013              :    SLP instances may refer to the same scalar stmt.
                      :
                      :    The walk handles the children of NODE before NODE itself.  VISITED
                      :    records the nodes already processed so that each node is handled
                      :    only once.  Null nodes and nodes that are not vect_internal_def are
                      :    ignored.  A call without a lhs is replaced by a GIMPLE nop
                      :    instead.  */
   12014              : 
   12015              : static void
   12016       598713 : vect_remove_slp_scalar_calls (vec_info *vinfo,
   12017              :                               slp_tree node, hash_set<slp_tree> &visited)
   12018              : {
   12019       598713 :   gimple *new_stmt;
   12020       598713 :   gimple_stmt_iterator gsi;
   12021       598713 :   int i;
   12022       598713 :   slp_tree child;
   12023       598713 :   tree lhs;
   12024       598713 :   stmt_vec_info stmt_info;
   12025              : 
   12026       598713 :   if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
   12027       187549 :     return;
   12028              : 
   12029       454485 :   if (visited.add (node))  /* Already processed via another path.  */
   12030              :     return;
   12031              : 
   12032       920222 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
   12033       509058 :     vect_remove_slp_scalar_calls (vinfo, child, visited);
   12034              : 
   12035      1301816 :   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
   12036              :     {
   12037       483620 :       if (!stmt_info)
   12038         3974 :         continue;
   12039       479646 :       stmt_info = vect_orig_stmt (stmt_info);
   12040       479646 :       gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
   12041         5239 :       if (!stmt || gimple_bb (stmt) == NULL)  /* Not a call, or in no BB.  */
   12042       474453 :         continue;
   12043         5193 :       lhs = gimple_call_lhs (stmt);
   12044         5193 :       if (lhs)
   12045         4585 :         new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
   12046              :       else
   12047          608 :         new_stmt = gimple_build_nop ();
   12048         5193 :       unlink_stmt_vdef (stmt_info->stmt);
   12049         5193 :       gsi = gsi_for_stmt (stmt);
   12050         5193 :       vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
   12051         5193 :       if (lhs)  /* The lhs is now defined by the zero assignment.  */
   12052         4585 :         SSA_NAME_DEF_STMT (lhs) = new_stmt;
   12053              :     }
   12054              : }
   12055              : /* Overload of vect_remove_slp_scalar_calls that starts the walk at NODE
                      :    with a fresh, initially empty visited set.  */
   12056              : static void
   12057        89655 : vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
   12058              : {
   12059        89655 :   hash_set<slp_tree> visited;
   12060        89655 :   vect_remove_slp_scalar_calls (vinfo, node, visited);
   12061        89655 : }
   12062              : 
   12063              : /* Vectorize the instance root.
                      :
                      :    NODE supplies the vector defs and INSTANCE the root stmt(s).  What
                      :    is emitted depends on INSTANCE->kind:
                      :      - slp_inst_kind_ctor: the first root stmt is replaced by an
                      :        assignment of the single vector def (wrapped in a
                      :        VIEW_CONVERT_EXPR when the types are not trivially convertible)
                      :        or, with more than one vector def, of a CONSTRUCTOR built from
                      :        all of them.
                      :      - slp_inst_kind_bb_reduc: the vector defs are combined with the
                      :        reduction code, reduced to a scalar through the matching
                      :        internal function, combined with any remaining scalar defs of
                      :        the instance and installed as the new RHS of the first root
                      :        stmt.  For integral vector types with undefined overflow and an
                      :        operation that can overflow the computation is carried out in
                      :        the corresponding unsigned type.
                      :      - slp_inst_kind_gcond: code generation is handed over to
                      :        vectorizable_early_exit.
                      :    Any other kind is unreachable.  */
   12064              : 
   12065              : void
   12066        10935 : vectorize_slp_instance_root_stmt (vec_info *vinfo, slp_tree node, slp_instance instance)
   12067              : {
   12068        10935 :   gassign *rstmt = NULL;
   12069              : 
   12070        10935 :   if (instance->kind == slp_inst_kind_ctor)
   12071              :     {
   12072         5236 :       if (SLP_TREE_VEC_DEFS (node).length () == 1)
   12073              :         {
   12074         5199 :           tree vect_lhs = SLP_TREE_VEC_DEFS (node)[0];
   12075         5199 :           tree root_lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
   12076         5199 :           if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
   12077         5199 :                                           TREE_TYPE (vect_lhs)))
   12078            0 :             vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
   12079              :                                vect_lhs);
   12080         5199 :           rstmt = gimple_build_assign (root_lhs, vect_lhs);
   12081              :         }
   12082              :       else
   12083              :         {
   12084           37 :           gcc_assert (SLP_TREE_VEC_DEFS (node).length () > 1);
   12085           37 :           tree child_def;
   12086           37 :           int j;
   12087           37 :           vec<constructor_elt, va_gc> *v;
   12088           37 :           vec_alloc (v, SLP_TREE_VEC_DEFS (node).length ());
   12089              : 
   12090              :           /* A CTOR can handle V16HI composition from VNx8HI so we
   12091              :              do not need to convert vector elements if the types
   12092              :              do not match.  */
   12093          111 :           FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (node), j, child_def)
   12094           74 :             CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, child_def);
   12095           37 :           tree lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
   12096           37 :           tree rtype
   12097           37 :             = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmts[0]->stmt));
   12098           37 :           tree r_constructor = build_constructor (rtype, v);
   12099           37 :           rstmt = gimple_build_assign (lhs, r_constructor);
   12100              :         }
   12101              :     }
   12102         5699 :   else if (instance->kind == slp_inst_kind_bb_reduc)
   12103              :     {
   12104              :       /* Largely inspired by reduction chain epilogue handling in
   12105              :          vect_create_epilog_for_reduction.  */
   12106         4131 :       vec<tree> vec_defs = vNULL;
   12107         4131 :       vect_get_slp_defs (node, &vec_defs);
   12108         4131 :       enum tree_code reduc_code
   12109         4131 :         = gimple_assign_rhs_code (instance->root_stmts[0]->stmt);
   12110              :       /* ???  We actually have to reflect signs somewhere.  */
   12111         4131 :       if (reduc_code == MINUS_EXPR)
   12112            0 :         reduc_code = PLUS_EXPR;
   12113         4131 :       gimple_seq epilogue = NULL;
   12114              :       /* We may end up with more than one vector result, reduce them
   12115              :          to one vector.  */
   12116         4131 :       tree vec_def = vec_defs[0];
   12117         4131 :       tree vectype = TREE_TYPE (vec_def);
   12118         4131 :       tree compute_vectype = vectype;
   12119         4131 :       bool pun_for_overflow_p = (ANY_INTEGRAL_TYPE_P (vectype)
   12120         3932 :                                  && TYPE_OVERFLOW_UNDEFINED (vectype)
   12121         6896 :                                  && operation_can_overflow (reduc_code));
   12122         2622 :       if (pun_for_overflow_p)
   12123              :         {
   12124         2622 :           compute_vectype = unsigned_type_for (vectype);
   12125         2622 :           vec_def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
   12126              :                                   compute_vectype, vec_def);
   12127              :         }
   12128         6519 :       for (unsigned i = 1; i < vec_defs.length (); ++i)
   12129              :         {
   12130         2388 :           tree def = vec_defs[i];
   12131         2388 :           if (pun_for_overflow_p)
   12132         2285 :             def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
   12133              :                                 compute_vectype, def);
   12134         2388 :           vec_def = gimple_build (&epilogue, reduc_code, compute_vectype,
   12135              :                                   vec_def, def);
   12136              :         }
   12137         4131 :       vec_defs.release ();
   12138              :       /* ???  Support other schemes than direct internal fn.  */
   12139         4131 :       internal_fn reduc_fn;
   12140         4131 :       if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
   12141         4131 :           || reduc_fn == IFN_LAST)
   12142            0 :         gcc_unreachable ();
   12143         4131 :       tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
   12144         4131 :                                       TREE_TYPE (compute_vectype), vec_def);
   12145         4131 :       if (!SLP_INSTANCE_REMAIN_DEFS (instance).is_empty ())
   12146              :         {
   12147         2565 :           tree rem_def = NULL_TREE;
   12148        11907 :           for (auto def : SLP_INSTANCE_REMAIN_DEFS (instance))
   12149              :             {
   12150         9342 :               def = gimple_convert (&epilogue, TREE_TYPE (scalar_def), def);
   12151         9342 :               if (!rem_def)
   12152              :                 rem_def = def;
   12153              :               else
   12154         6777 :                 rem_def = gimple_build (&epilogue, reduc_code,
   12155         6777 :                                         TREE_TYPE (scalar_def),
   12156              :                                         rem_def, def);
   12157              :             }
   12158         2565 :           scalar_def = gimple_build (&epilogue, reduc_code,
   12159         2565 :                                      TREE_TYPE (scalar_def),
   12160              :                                      scalar_def, rem_def);
   12161              :         }
   12162         4131 :       scalar_def = gimple_convert (&epilogue,
   12163         4131 :                                    TREE_TYPE (vectype), scalar_def);
   12164         4131 :       gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
   12165         4131 :       gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT);
   12166         4131 :       gimple_assign_set_rhs_from_tree (&rgsi, scalar_def);
   12167         4131 :       update_stmt (gsi_stmt (rgsi));
   12168         4131 :       return;
   12169              :     }
   12170         1568 :   else if (instance->kind == slp_inst_kind_gcond)
   12171              :     {
   12172              :       /* Only support a single root for now as we can't codegen CFG yet and so we
   12173              :          can't support lane > 1 at this time.  */
   12174         1568 :       gcc_assert (instance->root_stmts.length () == 1);
   12175         1568 :       auto root_stmt_info = instance->root_stmts[0];
   12176         1568 :       auto last_stmt = STMT_VINFO_STMT (vect_orig_stmt (root_stmt_info));
   12177         1568 :       gimple_stmt_iterator rgsi = gsi_for_stmt (last_stmt);
   12178         1568 :       gcc_assert (!SLP_TREE_VEC_DEFS (node).is_empty ());
   12179         1568 :       bool res = vectorizable_early_exit (as_a <loop_vec_info> (vinfo),
   12180              :                                           root_stmt_info, &rgsi, node, NULL);
   12181         1568 :       gcc_assert (res);
   12182         1568 :       return;
   12183              :     }
   12184              :   else
   12185            0 :     gcc_unreachable ();
   12186              : 
   12187         5236 :   gcc_assert (rstmt);  /* Only the CTOR case falls through to here.  */
   12188              : 
   12189         5236 :   gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
   12190         5236 :   gsi_replace (&rgsi, rstmt, true);
   12191              : }
   12192              : /* Per-node state kept in the scc_info map by vect_schedule_scc while it
                      :    walks the SLP graph.  */
   12193              : struct slp_scc_info
   12194              : {
   12195              :   bool on_stack;  /* Set when the node is pushed on the walk's stack; false for leaves.  */
   12196              :   int dfs;  /* Value of the maxdfs counter when the node was first visited.  */
   12197              :   int lowlink;  /* Starts equal to DFS; lowered to the minimum seen via children.  */
   12198              : };
   12199              : 
   12200              : /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs.
                           NODE is the node to visit; it must not yet have an entry in
                           SCC_INFO, which records the per-node DFS state.  MAXDFS is the next
                           DFS number to hand out and STACK holds the internal-def nodes that
                           were visited but are not yet scheduled.  Nodes that are not
                           internal defs are scheduled right away; the others are scheduled
                           once the SCC they belong to is complete.  The PHIs scheduled that
                           way finally get the arguments coming from internal-def children
                           filled in.  */
   12201              : 
   12202              : static void
   12203      1472867 : vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
   12204              :                    hash_map<slp_tree, slp_scc_info> &scc_info,
   12205              :                    int &maxdfs, vec<slp_tree> &stack)
   12206              : {
   12207      1472867 :   bool existed_p;
   12208      1472867 :   slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
   12209      1472867 :   gcc_assert (!existed_p);
   12210      1472867 :   info->dfs = maxdfs;
   12211      1472867 :   info->lowlink = maxdfs;
   12212      1472867 :   maxdfs++;
   12213              : 
   12214              :   /* Leaf.  */
   12215      1472867 :   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
   12216              :     {
   12217       499294 :       info->on_stack = false;
   12218       499294 :       vect_schedule_slp_node (vinfo, node, instance);
   12219      1030021 :       return;
   12220              :     }
   12221              : 
   12222       973573 :   info->on_stack = true;
   12223       973573 :   stack.safe_push (node);
   12224              : 
   12225       973573 :   unsigned i;
   12226       973573 :   slp_tree child;
   12227              :   /* DFS recurse.  */
   12228      2008299 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
   12229              :     {
   12230      1034726 :       if (!child)
   12231        55074 :         continue;
   12232       979652 :       slp_scc_info *child_info = scc_info.get (child);
   12233       979652 :       if (!child_info)
   12234              :         {
   12235       889963 :           vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
   12236              :           /* Recursion might have re-allocated the node.  */
   12237       889963 :           info = scc_info.get (node);
   12238       889963 :           child_info = scc_info.get (child);
   12239       889963 :           info->lowlink = MIN (info->lowlink, child_info->lowlink);
   12240              :         }
   12241        89689 :       else if (child_info->on_stack)
   12242        25289 :         info->lowlink = MIN (info->lowlink, child_info->dfs);
   12243              :     }
   12244       973573 :   if (info->lowlink != info->dfs)
   12245              :     return; /* NODE is not the root of its SCC; it stays on STACK.  */
   12246              : 
   12247       942140 :   auto_vec<slp_tree, 4> phis_to_fixup;
   12248              : 
   12249              :   /* Singleton.  */
   12250       942140 :   if (stack.last () == node)
   12251              :     {
   12252       918526 :       stack.pop ();
   12253       918526 :       info->on_stack = false;
   12254       918526 :       vect_schedule_slp_node (vinfo, node, instance);
   12255       918526 :       if (!SLP_TREE_PERMUTE_P (node)
   12256       918526 :           && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
   12257        30458 :         phis_to_fixup.quick_push (node);
   12258              :     }
   12259              :   else
   12260              :     {
   12261              :       /* SCC.  */
   12262        23614 :       int last_idx = stack.length () - 1;
   12263        55047 :       while (stack[last_idx] != node)
   12264        31433 :         last_idx--;
   12265              :       /* We can break the cycle at PHIs who have at least one child
   12266              :          code generated.  Then we could re-start the DFS walk until
   12267              :          all nodes in the SCC are covered (we might have new entries
   12268              :          for only back-reachable nodes).  But it's simpler to just
   12269              :          iterate and schedule those that are ready.  */
   12270        23614 :       unsigned todo = stack.length () - last_idx; /* SCC members still unscheduled.  */
   12271        23953 :       do
   12272              :         {
   12273       104737 :           for (int idx = stack.length () - 1; idx >= last_idx; --idx)
   12274              :             {
   12275        56831 :               slp_tree entry = stack[idx];
   12276        56831 :               if (!entry) /* Scheduled in an earlier pass over the SCC.  */
   12277          956 :                 continue;
   12278        55875 :               bool phi = (!SLP_TREE_PERMUTE_P (entry)
   12279        55875 :                           && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
                                      /* A non-PHI is ready once none of its children is on the
                                         stack; a PHI is ready as soon as one child is missing
                                         or no longer on the stack.  */
   12280        55875 :               bool ready = !phi;
   12281       141383 :               FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
   12282       110366 :                   if (!child)
   12283              :                     {
   12284        22736 :                       gcc_assert (phi);
   12285              :                       ready = true;
   12286              :                       break;
   12287              :                     }
   12288        87630 :                   else if (scc_info.get (child)->on_stack)
   12289              :                     {
   12290        23823 :                       if (!phi)
   12291              :                         {
   12292              :                           ready = false;
   12293              :                           break;
   12294              :                         }
   12295              :                     }
   12296              :                   else
   12297              :                     {
   12298        63807 :                       if (phi)
   12299              :                         {
   12300              :                           ready = true;
   12301              :                           break;
   12302              :                         }
   12303              :                     }
   12304        33139 :               if (ready)
   12305              :                 {
   12306        55047 :                   vect_schedule_slp_node (vinfo, entry, instance);
   12307        55047 :                   scc_info.get (entry)->on_stack = false;
   12308        55047 :                   stack[idx] = NULL;
   12309        55047 :                   todo--;
   12310        55047 :                   if (phi)
   12311        24060 :                     phis_to_fixup.safe_push (entry);
   12312              :                 }
   12313              :             }
   12314              :         }
   12315        23953 :       while (todo != 0);
   12316              : 
   12317              :       /* Pop the SCC.  */
   12318        23614 :       stack.truncate (last_idx);
   12319              :     }
   12320              : 
   12321              :   /* Now fixup the backedge def of the vectorized PHIs in this SCC.  */
   12322              :   slp_tree phi_node;
   12323      1938798 :   FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
   12324              :     {
   12325        54518 :       gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
   12326        54518 :       edge_iterator ei;
   12327        54518 :       edge e;
   12328       172209 :       FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
   12329              :         {
   12330       117691 :           unsigned dest_idx = e->dest_idx;
   12331       117691 :           child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
   12332       117691 :           if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
   12333        66092 :             continue;
   12334        51599 :           unsigned n = SLP_TREE_VEC_DEFS (phi_node).length ();
   12335              :           /* Simply fill all args.  */
   12336        51599 :           if (STMT_VINFO_DEF_TYPE (SLP_TREE_REPRESENTATIVE (phi_node))
   12337              :               != vect_first_order_recurrence)
   12338       110856 :             for (unsigned i = 0; i < n; ++i)
   12339              :               {
   12340        59300 :                 tree phidef = SLP_TREE_VEC_DEFS (phi_node)[i];
   12341        59300 :                 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
   12342        59300 :                 add_phi_arg (phi, vect_get_slp_vect_def (child, i),
   12343              :                              e, gimple_phi_arg_location (phi, dest_idx));
   12344              :               }
   12345              :           else
   12346              :             {
   12347              :               /* Unless it is a first order recurrence which needs
   12348              :                  args filled in for both the PHI node and the permutes.  */
   12349           43 :               gimple *perm
   12350           43 :                 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[0]);
   12351           43 :               gimple *rphi = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (perm));
   12352           43 :               add_phi_arg (as_a <gphi *> (rphi),
   12353              :                            vect_get_slp_vect_def (child, n - 1),
   12354              :                            e, gimple_phi_arg_location (phi, dest_idx));
   12355          123 :               for (unsigned i = 0; i < n; ++i)
   12356              :                 {
   12357           80 :                   gimple *perm
   12358           80 :                     = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[i]);
   12359           80 :                   if (i > 0)
   12360           37 :                     gimple_assign_set_rhs1 (perm,
   12361              :                                             vect_get_slp_vect_def (child, i - 1));
   12362           80 :                   gimple_assign_set_rhs2 (perm,
   12363              :                                           vect_get_slp_vect_def (child, i));
   12364           80 :                   update_stmt (perm);
   12365              :                 }
   12366              :             }
   12367              :         }
   12368              :     }
   12369       942140 : }
   12370              : 
   12371              : /* Generate vector code for SLP_INSTANCES in the loop/basic block.
                           A first walk schedules the tree of each instance with
                           vect_schedule_scc, unless its root was already visited for an
                           earlier instance, and then vectorizes the instance root stmts if
                           there are any.  A second walk removes scalar call stmts (for loop
                           vectorization only) and the scalar stores found in the scalar stmts
                           of each instance root.  */
   12372              : 
   12373              : void
   12374       543782 : vect_schedule_slp (vec_info *vinfo, const vec<slp_instance> &slp_instances)
   12375              : {
   12376       543782 :   slp_instance instance;
   12377       543782 :   unsigned int i;
   12378              : 
   12379       543782 :   hash_map<slp_tree, slp_scc_info> scc_info; /* Shared by all instances.  */
   12380       543782 :   int maxdfs = 0;
   12381      1126793 :   FOR_EACH_VEC_ELT (slp_instances, i, instance)
   12382              :     {
   12383       583011 :       slp_tree node = SLP_INSTANCE_TREE (instance);
   12384       583011 :       if (dump_enabled_p ())
   12385              :         {
   12386        16034 :           dump_printf_loc (MSG_NOTE, vect_location,
   12387              :                            "Vectorizing SLP tree:\n");
   12388              :           /* ???  Dump all?  */
   12389        16034 :           if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
   12390          465 :             dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
   12391          465 :                          SLP_INSTANCE_ROOT_STMTS (instance)[0]->stmt);
   12392        16034 :           vect_print_slp_graph (MSG_NOTE, vect_location,
   12393              :                                 SLP_INSTANCE_TREE (instance));
   12394              :         }
   12395              :       /* Schedule the tree of INSTANCE, scheduling SCCs in a way to
   12396              :          have a PHI be the node breaking the cycle.  */
   12397       583011 :       auto_vec<slp_tree> stack;
                              /* An existing entry means the root was already visited while
                                 scheduling an earlier instance.  */
   12398       583011 :       if (!scc_info.get (node))
   12399       582904 :         vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
   12400              : 
   12401       583011 :       if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
   12402        10935 :         vectorize_slp_instance_root_stmt (vinfo, node, instance);
   12403              : 
   12404       583011 :       if (dump_enabled_p ())
   12405        16034 :         dump_printf_loc (MSG_NOTE, vect_location,
   12406              :                          "vectorizing stmts using SLP.\n");
   12407       583011 :     }
   12408              : 
   12409      1670575 :   FOR_EACH_VEC_ELT (slp_instances, i, instance)
   12410              :     {
   12411       583011 :       slp_tree root = SLP_INSTANCE_TREE (instance);
   12412       583011 :       stmt_vec_info store_info;
   12413       583011 :       unsigned int j;
   12414              : 
   12415              :       /* Remove scalar call stmts.  Do not do this for basic-block
   12416              :          vectorization as not all uses may be vectorized.
   12417              :          ???  Why should this be necessary?  DCE should be able to
   12418              :          remove the stmts itself.
   12419              :          ???  For BB vectorization we can as well remove scalar
   12420              :          stmts starting from the SLP tree root if they have no
   12421              :          uses.  */
   12422       583011 :       if (is_a <loop_vec_info> (vinfo))
   12423        89655 :         vect_remove_slp_scalar_calls (vinfo, root);
   12424              : 
   12425              :       /* Remove vectorized stores original scalar stmts.  */
   12426      2603408 :       for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
   12427              :         {
   12428      1473500 :           if (!store_info
   12429      1473486 :               || !STMT_VINFO_DATA_REF (store_info)
   12430      1445992 :               || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
   12431              :             break; /* Stop at the first entry that is not a store.  */
   12432              : 
   12433      1437386 :           store_info = vect_orig_stmt (store_info);
   12434              :           /* Free the attached stmt_vec_info and remove the stmt.  */
   12435      1437386 :           vinfo->remove_stmt (store_info);
   12436              : 
   12437              :           /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it
   12438              :              to not crash in vect_free_slp_tree later.  */
   12439      1437386 :           if (SLP_TREE_REPRESENTATIVE (root) == store_info)
   12440       546566 :             SLP_TREE_REPRESENTATIVE (root) = NULL;
   12441              :         }
   12442              :     }
   12443       543782 : }
        

Generated by: LCOV version 2.4-beta

The LCOV profile is generated on an x86_64 machine using the following configure options: configure --disable-bootstrap --enable-coverage=opt --enable-languages=c,c++,fortran,go,jit,lto,rust,m2 --enable-host-shared. The GCC test suite is run with the built compiler.