LCOV - code coverage report
Current view: top level - gcc - tree-vect-slp.cc (source / functions) Coverage Total Hit
Test: gcc.info Lines: 92.4 % 5963 5508
Test Date: 2026-03-28 14:25:54 Functions: 95.0 % 180 171
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /* SLP - Basic Block Vectorization
       2              :    Copyright (C) 2007-2026 Free Software Foundation, Inc.
       3              :    Contributed by Dorit Naishlos <dorit@il.ibm.com>
       4              :    and Ira Rosen <irar@il.ibm.com>
       5              : 
       6              : This file is part of GCC.
       7              : 
       8              : GCC is free software; you can redistribute it and/or modify it under
       9              : the terms of the GNU General Public License as published by the Free
      10              : Software Foundation; either version 3, or (at your option) any later
      11              : version.
      12              : 
      13              : GCC is distributed in the hope that it will be useful, but WITHOUT ANY
      14              : WARRANTY; without even the implied warranty of MERCHANTABILITY or
      15              : FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
      16              : for more details.
      17              : 
      18              : You should have received a copy of the GNU General Public License
      19              : along with GCC; see the file COPYING3.  If not see
      20              : <http://www.gnu.org/licenses/>.  */
      21              : 
      22              : #include "config.h"
      23              : #define INCLUDE_ALGORITHM
      24              : #include "system.h"
      25              : #include "coretypes.h"
      26              : #include "backend.h"
      27              : #include "target.h"
      28              : #include "rtl.h"
      29              : #include "tree.h"
      30              : #include "gimple.h"
      31              : #include "tree-pass.h"
      32              : #include "ssa.h"
      33              : #include "optabs-tree.h"
      34              : #include "insn-config.h"
      35              : #include "recog.h"            /* FIXME: for insn_data */
      36              : #include "fold-const.h"
      37              : #include "stor-layout.h"
      38              : #include "gimple-iterator.h"
      39              : #include "cfgloop.h"
      40              : #include "tree-vectorizer.h"
      41              : #include "langhooks.h"
      42              : #include "gimple-walk.h"
      43              : #include "dbgcnt.h"
      44              : #include "tree-vector-builder.h"
      45              : #include "vec-perm-indices.h"
      46              : #include "gimple-fold.h"
      47              : #include "internal-fn.h"
      48              : #include "dump-context.h"
      49              : #include "cfganal.h"
      50              : #include "tree-eh.h"
      51              : #include "tree-cfg.h"
      52              : #include "alloc-pool.h"
      53              : #include "sreal.h"
      54              : #include "predict.h"
      55              : /* Evaluate to (S)->first_element, first asserting that (S)->dr_aux.dr is unset.  */
      56              : #define REDUC_GROUP_FIRST_ELEMENT(S) \
      57              :   (gcc_checking_assert (!(S)->dr_aux.dr), (S)->first_element)
      58              : /* Forward declarations of file-local (static) helpers.  */
      59              : static bool vect_transform_slp_perm_load_1 (vec_info *, slp_tree,
      60              :                                             load_permutation_t &,
      61              :                                             const vec<tree> &,
      62              :                                             gimple_stmt_iterator *,
      63              :                                             poly_uint64, bool, bool,
      64              :                                             unsigned *,
      65              :                                             unsigned * = nullptr,
      66              :                                             bool = false);
      67              : static int vectorizable_slp_permutation_1 (vec_info *, gimple_stmt_iterator *,
      68              :                                            slp_tree, lane_permutation_t &,
      69              :                                            vec<slp_tree> &, bool);
      70              : static void vect_print_slp_tree (dump_flags_t, dump_location_t, slp_tree);
      71              : static bool vect_slp_can_convert_to_external (const vec<stmt_vec_info> &);
      72              : /* Pool that _slp_tree nodes are allocated from, and head of the doubly-linked list of live nodes.  */
      73              : static object_allocator<_slp_tree> *slp_tree_pool;
      74              : static slp_tree slp_first_node;
      75              : /* Create the allocation pool ("SLP nodes") that _slp_tree::operator new draws from.  */
      76              : void
      77      1113321 : vect_slp_init (void)
      78              : {
      79      1113321 :   slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
      80      1113321 : }
      81              : /* Delete every SLP node still on the live-node list, then destroy the pool.  */
      82              : void
      83      1113321 : vect_slp_fini (void)
      84              : {
      85      1615569 :   while (slp_first_node)
      86       502248 :     delete slp_first_node;  /* The destructor unlinks the node, advancing slp_first_node.  */
      87      2226642 :   delete slp_tree_pool;
      88      1113321 :   slp_tree_pool = NULL;
      89      1113321 : }
      90              : /* Allocate raw storage for one _slp_tree from slp_tree_pool; N must be sizeof (_slp_tree).  */
      91              : void *
      92      7115296 : _slp_tree::operator new (size_t n)
      93              : {
      94      7115296 :   gcc_assert (n == sizeof (_slp_tree));
      95      7115296 :   return slp_tree_pool->allocate_raw ();
      96              : }
      97              : /* Hand the storage at NODE back to slp_tree_pool; N must be sizeof (_slp_tree).  */
      98              : void
      99      7115296 : _slp_tree::operator delete (void *node, size_t n)
     100              : {
     101      7115296 :   gcc_assert (n == sizeof (_slp_tree));
     102      7115296 :   slp_tree_pool->remove_raw (node);
     103      7115296 : }
     104              : 
     105              : 
     106              : /* Initialize a SLP node: link it at the head of the live-node list and reset all fields to their empty defaults.  */
     107              : 
     108      7115296 : _slp_tree::_slp_tree ()
     109              : {
     110      7115296 :   this->prev_node = NULL;
     111      7115296 :   if (slp_first_node)
     112      6156127 :     slp_first_node->prev_node = this;
     113      7115296 :   this->next_node = slp_first_node;
     114      7115296 :   slp_first_node = this;
     115      7115296 :   SLP_TREE_SCALAR_STMTS (this) = vNULL;
     116      7115296 :   SLP_TREE_SCALAR_OPS (this) = vNULL;
     117      7115296 :   SLP_TREE_VEC_DEFS (this) = vNULL;
     118      7115296 :   SLP_TREE_CHILDREN (this) = vNULL;
     119      7115296 :   SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
     120      7115296 :   SLP_TREE_LANE_PERMUTATION (this) = vNULL;
     121      7115296 :   SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
     122      7115296 :   SLP_TREE_CODE (this) = ERROR_MARK;
     123      7115296 :   SLP_TREE_GS_SCALE (this) = 0;
     124      7115296 :   SLP_TREE_GS_BASE (this) = NULL_TREE;
     125      7115296 :   this->ldst_lanes = false;
     126      7115296 :   this->avoid_stlf_fail = false;
     127      7115296 :   SLP_TREE_VECTYPE (this) = NULL_TREE;
     128      7115296 :   SLP_TREE_REPRESENTATIVE (this) = NULL;
     129      7115296 :   this->cycle_info.id = -1;
     130      7115296 :   this->cycle_info.reduc_idx = -1;
     131      7115296 :   SLP_TREE_REF_COUNT (this) = 1;  /* A new node starts with one reference.  */
     132      7115296 :   this->failed = NULL;
     133      7115296 :   this->max_nunits = 1;
     134      7115296 :   this->lanes = 0;
     135      7115296 :   SLP_TREE_TYPE (this) = undef_vec_info_type;
     136      7115296 :   this->data = NULL;
     137      7115296 : }
     138              : 
     139              : /* Tear down a SLP node: unlink it from the live-node list, release the vectors it owns and free FAILED and DATA if set.  */
     140              : 
     141      7115296 : _slp_tree::~_slp_tree ()
     142              : {
     143      7115296 :   if (this->prev_node)
     144      4389504 :     this->prev_node->next_node = this->next_node;
     145              :   else
     146      2725792 :     slp_first_node = this->next_node;  /* This node was the list head.  */
     147      7115296 :   if (this->next_node)
     148      5235470 :     this->next_node->prev_node = this->prev_node;
     149      7115296 :   SLP_TREE_CHILDREN (this).release ();
     150      7115296 :   SLP_TREE_SCALAR_STMTS (this).release ();
     151      7115296 :   SLP_TREE_SCALAR_OPS (this).release ();
     152      7115296 :   SLP_TREE_VEC_DEFS (this).release ();
     153      7115296 :   SLP_TREE_LOAD_PERMUTATION (this).release ();
     154      7115296 :   SLP_TREE_LANE_PERMUTATION (this).release ();
     155      7115296 :   if (this->failed)
     156      1926318 :     free (failed);
     157      7115296 :   if (this->data)
     158      1127404 :     delete this->data;
     159      7115296 : }
     160              : 
     161              : /* Push the single SSA definition of stmt DEF onto vec_defs: the PHI result for a PHI, otherwise the stmt's single def operand.  */
     162              : 
     163              : void
     164       525542 : _slp_tree::push_vec_def (gimple *def)
     165              : {
     166       525542 :   if (gphi *phi = dyn_cast <gphi *> (def))
     167        58642 :     vec_defs.quick_push (gimple_phi_result (phi));
     168              :   else
     169              :     {
     170       466900 :       def_operand_p defop = single_ssa_def_operand (def, SSA_OP_ALL_DEFS);
     171       466900 :       vec_defs.quick_push (get_def_from_ptr (defop));
     172              :     }
     173       525542 : }
     174              : 
     175              : /* Drop one reference to the SLP tree rooted at NODE; once the count reaches zero, recursively free its children and then NODE itself.  */
     176              : 
     177              : void
     178     13381788 : vect_free_slp_tree (slp_tree node)
     179              : {
     180     13381788 :   int i;
     181     13381788 :   slp_tree child;
     182              : 
     183     13381788 :   if (--SLP_TREE_REF_COUNT (node) != 0)
     184     13381788 :     return;
     185              : 
     186     10175018 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
     187      3561970 :     if (child)
     188      3237869 :       vect_free_slp_tree (child);
     189              : 
     190              :   /* If the node defines any SLP only patterns then those patterns are no
     191              :      longer valid and should be removed.  */
     192      6613048 :   stmt_vec_info rep_stmt_info = SLP_TREE_REPRESENTATIVE (node);
     193      6613048 :   if (rep_stmt_info && STMT_VINFO_SLP_VECT_ONLY_PATTERN (rep_stmt_info))
     194              :     {
     195          973 :       stmt_vec_info stmt_info = vect_orig_stmt (rep_stmt_info);
     196          973 :       STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
     197          973 :       STMT_SLP_TYPE (stmt_info) = STMT_SLP_TYPE (rep_stmt_info);
     198              :     }
     199              : 
     200      6613048 :   delete node;
     201              : }
     202              : 
     203              : /* Return a location suitable for dumps related to the SLP instance: the first root stmt if any, else the first scalar stmt of the root node.  */
     204              : 
     205              : dump_user_location_t
     206      3373462 : _slp_instance::location () const
     207              : {
     208      3373462 :   if (!root_stmts.is_empty ())
     209       314978 :     return root_stmts[0]->stmt;
     210              :   else
     211      3058484 :     return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
     212              : }
     213              : 
     214              : 
     215              : /* Free the memory allocated for the SLP instance: drop its reference to the SLP tree, release its vectors and free INSTANCE itself.  */
     216              : 
     217              : void
     218      1451065 : vect_free_slp_instance (slp_instance instance)
     219              : {
     220      1451065 :   vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
     221      1451065 :   SLP_INSTANCE_LOADS (instance).release ();
     222      1451065 :   SLP_INSTANCE_ROOT_STMTS (instance).release ();
     223      1451065 :   SLP_INSTANCE_REMAIN_DEFS (instance).release ();
     224      1451065 :   instance->subgraph_entries.release ();
     225      1451065 :   instance->cost_vec.release ();
     226      1451065 :   free (instance);
     227      1451065 : }
     228              : 
     229              : 
     230              : /* Create a new internal-def SLP node with operation code CODE, no scalar stmts and room for NOPS children.  */
     231              : 
     232              : slp_tree
     233        86729 : vect_create_new_slp_node (unsigned nops, tree_code code)
     234              : {
     235        86729 :   slp_tree node = new _slp_tree;
     236        86729 :   SLP_TREE_SCALAR_STMTS (node) = vNULL;
     237        86729 :   SLP_TREE_CHILDREN (node).create (nops);
     238        86729 :   SLP_TREE_DEF_TYPE (node) = vect_internal_def;
     239        86729 :   SLP_TREE_CODE (node) = code;
     240        86729 :   return node;
     241              : }
     242              : /* Set up the existing NODE as an internal-def SLP node for SCALAR_STMTS with room for NOPS children and return it; SCALAR_STMTS[0] becomes the representative, so SCALAR_STMTS must not be empty.  */
     243              : 
     244              : static slp_tree
     245      3339883 : vect_create_new_slp_node (slp_tree node,
     246              :                           vec<stmt_vec_info> scalar_stmts, unsigned nops)
     247              : {
     248      3339883 :   SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
     249      3339883 :   SLP_TREE_CHILDREN (node).create (nops);
     250      3339883 :   SLP_TREE_DEF_TYPE (node) = vect_internal_def;
     251      3339883 :   SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
     252      3339883 :   SLP_TREE_LANES (node) = scalar_stmts.length ();
     253      3339883 :   return node;
     254              : }
     255              : 
     256              : /* Allocate a new SLP node and set it up as an internal-def node for SCALAR_STMTS with room for NOPS children.  */
     257              : 
     258              : static slp_tree
     259         6276 : vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
     260              : {
     261         6276 :   return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
     262              : }
     263              : 
     264              : /* Set up the existing NODE as an external-def SLP node for the scalar operands OPS, one lane per operand, and return it.  */
     265              : 
     266              : static slp_tree
     267      1752640 : vect_create_new_slp_node (slp_tree node, vec<tree> ops)
     268              : {
     269      1752640 :   SLP_TREE_SCALAR_OPS (node) = ops;
     270      1752640 :   SLP_TREE_DEF_TYPE (node) = vect_external_def;
     271            0 :   SLP_TREE_LANES (node) = ops.length ();
     272      1752640 :   return node;
     273              : }
     274              : 
     275              : /* Allocate a new SLP node and set it up as an external-def node for OPS.  */
     276              : 
     277              : static slp_tree
     278      1752640 : vect_create_new_slp_node (vec<tree> ops)
     279              : {
     280      1752640 :   return vect_create_new_slp_node (new _slp_tree, ops);
     281              : }
     282              : 
     283              : 
     284              : /* This structure is used in creation of an SLP tree.  Each instance
     285              :    corresponds to the same operand in a group of scalar stmts in an SLP
     286              :    node.  Instances are allocated by vect_create_oprnd_info and freed by vect_free_oprnd_info.  */
     287              : typedef struct _slp_oprnd_info
     288              : {
     289              :   /* Def-stmts for the operands.  */
     290              :   vec<stmt_vec_info> def_stmts;
     291              :   /* Operands.  */
     292              :   vec<tree> ops;
     293              :   /* Information about the first statement: the type of its operand, its
     294              :      vector def-type, an indication if it's a pattern stmt, and whether it
     295              :      is a gather/scatter together with the gather/scatter info.  */
     296              :   tree first_op_type;
     297              :   enum vect_def_type first_dt;
     298              :   bool any_pattern;
     299              :   bool first_gs_p;
     300              :   gather_scatter_info first_gs_info;
     301              : } *slp_oprnd_info;
     302              : 
     303              : 
     304              : /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
     305              :    operand.  The result is to be released with vect_free_oprnd_info.  */
     306              : static vec<slp_oprnd_info>
     307      2981204 : vect_create_oprnd_info (int nops, int group_size)
     308              : {
     309      2981204 :   int i;
     310      2981204 :   slp_oprnd_info oprnd_info;
     311      2981204 :   vec<slp_oprnd_info> oprnds_info;
     312              : 
     313      2981204 :   oprnds_info.create (nops);
     314     10653248 :   for (i = 0; i < nops; i++)
     315              :     {
     316      4690840 :       oprnd_info = XNEW (struct _slp_oprnd_info);  /* NOTE(review): first_gs_info is not assigned below.  */
     317      4690840 :       oprnd_info->def_stmts.create (group_size);
     318      4690840 :       oprnd_info->ops.create (group_size);
     319      4690840 :       oprnd_info->first_dt = vect_uninitialized_def;
     320      4690840 :       oprnd_info->first_op_type = NULL_TREE;
     321      4690840 :       oprnd_info->any_pattern = false;
     322      4690840 :       oprnd_info->first_gs_p = false;
     323      4690840 :       oprnds_info.quick_push (oprnd_info);
     324              :     }
     325              : 
     326      2981204 :   return oprnds_info;
     327              : }
     328              : 
     329              : 
     330              : /* Free operands info: release both vectors of each entry, free the entry, then release OPRNDS_INFO itself.  */
     331              : 
     332              : static void
     333      2981204 : vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
     334              : {
     335      2981204 :   int i;
     336      2981204 :   slp_oprnd_info oprnd_info;
     337              : 
     338      7672044 :   FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
     339              :     {
     340      4690840 :       oprnd_info->def_stmts.release ();
     341      4690840 :       oprnd_info->ops.release ();
     342      4690840 :       XDELETE (oprnd_info);
     343              :     }
     344              : 
     345      2981204 :   oprnds_info.release ();
     346      2981204 : }
     347              : 
     348              : /* Return the execution frequency of NODE (so that a higher value indicates
     349              :    a "more important" node when optimizing for speed).  It is the count of the block holding the original stmt of NODE's representative, scaled by the count of the function's entry block.  */
     350              : 
     351              : static sreal
     352      3139585 : vect_slp_node_weight (slp_tree node)
     353              : {
     354      3139585 :   stmt_vec_info stmt_info = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (node));
     355      3139585 :   basic_block bb = gimple_bb (stmt_info->stmt);
     356      3139585 :   return bb->count.to_sreal_scale (ENTRY_BLOCK_PTR_FOR_FN (cfun)->count);
     357              : }
     358              : 
     359              : /* Return true if STMTS contains a pattern statement; null entries are skipped.  */
     360              : 
     361              : static bool
     362        22303 : vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
     363              : {
     364        22303 :   stmt_vec_info stmt_info;
     365        22303 :   unsigned int i;
     366        72067 :   FOR_EACH_VEC_ELT (stmts, i, stmt_info)
     367        52069 :     if (stmt_info && is_pattern_stmt_p (stmt_info))
     368              :       return true;
     369              :   return false;
     370              : }
     371              : 
     372              : /* Return true when all lanes in the external or constant NODE have
     373              :    the same value.  Asserts that NODE is a constant or external def.  */
     374              : 
     375              : static bool
     376       591195 : vect_slp_tree_uniform_p (slp_tree node)
     377              : {
     378       591195 :   gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
     379              :               || SLP_TREE_DEF_TYPE (node) == vect_external_def);
     380              : 
     381              :   /* Pre-existing vectors have no scalar ops and are not treated as uniform.  */
     382      1041084 :   if (SLP_TREE_SCALAR_OPS (node).is_empty ())
     383              :     return false;
     384              : 
     385              :   unsigned i;
     386              :   tree op, first = NULL_TREE;
     387      1353495 :   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
     388      1212189 :     if (!first)
     389              :       first = op;
     390       620994 :     else if (!operand_equal_p (first, op, 0))
     391              :       return false;
     392              : 
     393              :   return true;
     394              : }
     395              : 
     396              : /* Find the place of the data-ref in STMT_INFO in the interleaving chain
     397              :    that starts from FIRST_STMT_INFO.  Return -1 if the data-ref is not a part
     398              :    of the chain.  The place is 0 for the first element and grows by the DR_GROUP_GAP of each following element.  */
     399              : 
     400              : int
     401       660328 : vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
     402              :                                       stmt_vec_info first_stmt_info)
     403              : {
     404       660328 :   stmt_vec_info next_stmt_info = first_stmt_info;
     405       660328 :   int result = 0;
     406              : 
     407       660328 :   if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
     408              :     return -1;
     409              : 
     410      1660588 :   do
     411              :     {
     412      1660588 :       if (next_stmt_info == stmt_info)
     413              :         return result;
     414      1000260 :       next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
     415      1000260 :       if (next_stmt_info)
     416      1000260 :         result += DR_GROUP_GAP (next_stmt_info);
     417              :     }
     418      1000260 :   while (next_stmt_info);
     419              : 
     420              :   return -1;
     421              : }
     422              : 
     423              : /* Check whether it is possible to load COUNT elements of type ELT_TYPE
     424              :    using the method implemented by duplicate_and_interleave.  Return true
     425              :    if so, returning the number of intermediate vectors in *NVECTORS_OUT
     426              :    (if nonnull), the type of each intermediate vector in *VECTOR_TYPE_OUT
     427              :    (if nonnull) and the two permute masks in PERMUTES[0] and PERMUTES[1] (if nonnull).  */
     428              : 
     429              : bool
     430            0 : can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
     431              :                                 tree elt_type, unsigned int *nvectors_out,
     432              :                                 tree *vector_type_out,
     433              :                                 tree *permutes)
     434              : {
     435            0 :   tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
     436            0 :   if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
     437            0 :     return false;
     438              : 
     439            0 :   machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
     440            0 :   poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
     441            0 :   unsigned int nvectors = 1;
     442            0 :   for (;;)
     443              :     {
     444            0 :       scalar_int_mode int_mode;
     445            0 :       poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
     446            0 :       if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
     447              :         {
     448              :           /* Get the natural vector type for this SLP group size.  */
     449            0 :           tree int_type = build_nonstandard_integer_type
     450            0 :             (GET_MODE_BITSIZE (int_mode), 1);
     451            0 :           tree vector_type
     452            0 :             = get_vectype_for_scalar_type (vinfo, int_type, count);
     453            0 :           poly_int64 half_nelts;
     454            0 :           if (vector_type
     455            0 :               && VECTOR_MODE_P (TYPE_MODE (vector_type))
     456            0 :               && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
     457              :                            GET_MODE_SIZE (base_vector_mode))
     458            0 :               && multiple_p (GET_MODE_NUNITS (TYPE_MODE (vector_type)),
     459              :                              2, &half_nelts))
     460              :             {
     461              :               /* Try fusing consecutive sequences of COUNT / NVECTORS elements
     462              :                  together into elements of type INT_TYPE and using the result
     463              :                  to build NVECTORS vectors.  */
     464            0 :               poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
     465            0 :               vec_perm_builder sel1 (nelts, 2, 3);
     466            0 :               vec_perm_builder sel2 (nelts, 2, 3);
     467              : 
     468            0 :               for (unsigned int i = 0; i < 3; ++i)
     469              :                 {
     470            0 :                   sel1.quick_push (i);
     471            0 :                   sel1.quick_push (i + nelts);
     472            0 :                   sel2.quick_push (half_nelts + i);
     473            0 :                   sel2.quick_push (half_nelts + i + nelts);
     474              :                 }
     475            0 :               vec_perm_indices indices1 (sel1, 2, nelts);
     476            0 :               vec_perm_indices indices2 (sel2, 2, nelts);
     477            0 :               machine_mode vmode = TYPE_MODE (vector_type);
     478            0 :               if (can_vec_perm_const_p (vmode, vmode, indices1)
     479            0 :                   && can_vec_perm_const_p (vmode, vmode, indices2))
     480              :                 {
     481            0 :                   if (nvectors_out)
     482            0 :                     *nvectors_out = nvectors;
     483            0 :                   if (vector_type_out)
     484            0 :                     *vector_type_out = vector_type;
     485            0 :                   if (permutes)
     486              :                     {
     487            0 :                       permutes[0] = vect_gen_perm_mask_checked (vector_type,
     488              :                                                                 indices1);
     489            0 :                       permutes[1] = vect_gen_perm_mask_checked (vector_type,
     490              :                                                                 indices2);
     491              :                     }
     492            0 :                   return true;
     493              :                 }
     494            0 :             }
     495              :         }
     496            0 :       if (!multiple_p (elt_bytes, 2, &elt_bytes))  /* Halve ELT_BYTES for the next attempt.  */
     497              :         return false;
     498            0 :       nvectors *= 2;
     499              :       /* We need to be able to fuse COUNT / NVECTORS elements together.  */
     500            0 :       if (!multiple_p (count, nvectors))
     501              :         return false;
     502              :     }
     503              : }
     504              : 
     505              : /* Return true if DTA and DTB match: they are equal, or each of them is an external or constant def.  */
     506              : 
     507              : static bool
     508     16788186 : vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
     509              : {
     510     16788186 :   return (dta == dtb
     511       341072 :           || ((dta == vect_external_def || dta == vect_constant_def)
     512       213062 :               && (dtb == vect_external_def || dtb == vect_constant_def)));
     513              : }
     514              : 
     515              : #define GATHER_SCATTER_OFFSET (-3)  /* Map entry standing for the gather/scatter offset.  */
     516              : /* Operand maps handed out by vect_get_operand_map: element 0 is the number of child nodes, followed by one argument index per child.  */
     517              : static const int no_arg_map[] = { 0 };
     518              : static const int arg0_map[] = { 1, 0 };
     519              : static const int arg2_map[] = { 1, 2 };
     520              : static const int arg2_arg3_map[] = { 2, 2, 3 };
     521              : static const int arg2_arg4_map[] = { 2, 2, 4 };
     522              : static const int arg2_arg5_arg6_map[] = { 3, 2, 5, 6 };
     523              : static const int arg2_arg4_arg5_map[] = { 3, 2, 4, 5 };
     524              : static const int arg3_arg2_map[] = { 2, 3, 2 };
     525              : static const int op1_op0_map[] = { 2, 1, 0 };
     526              : static const int off_map[] = { 1, GATHER_SCATTER_OFFSET };
     527              : static const int off_op0_map[] = { 2, GATHER_SCATTER_OFFSET, 0 };
     528              : static const int off_arg2_arg3_map[] = { 3, GATHER_SCATTER_OFFSET, 2, 3 };
     529              : static const int off_arg3_arg2_map[] = { 3, GATHER_SCATTER_OFFSET, 3, 2 };
     530              : static const int mask_call_maps[6][7] = {  /* Row N-2 is used for an IFN_MASK_CALL with N arguments.  */
     531              :   { 1, 1, },
     532              :   { 2, 1, 2, },
     533              :   { 3, 1, 2, 3, },
     534              :   { 4, 1, 2, 3, 4, },
     535              :   { 5, 1, 2, 3, 4, 5, },
     536              :   { 6, 1, 2, 3, 4, 5, 6 },
     537              : };
     538              : 
     539              : /* For most SLP statements, there is a one-to-one mapping between
     540              :    gimple arguments and child nodes.  If that is not true for STMT,
     541              :    return an array that contains:
     542              : 
     543              :    - the number of child nodes, followed by
     544              :    - for each child node, the index of the argument associated with that node.
     545              :      The special index -1 is the first operand of an embedded comparison and
     546              :      the special index -2 is the second operand of an embedded comparison.
     547              :      The special index -3 (GATHER_SCATTER_OFFSET) is the offset of a gather
     548              :      as analyzed by vect_check_gather_scatter.
     549              :    GATHER_SCATTER_P selects the maps that include the gather/scatter offset and
     550              :    SWAP is as for vect_get_and_check_slp_defs.  Return nullptr if no map applies.  */
     551              : 
     552              : static const int *
     553     18814899 : vect_get_operand_map (const gimple *stmt, bool gather_scatter_p = false,
     554              :                       unsigned char swap = 0)
     555              : {
     556     18814899 :   if (auto assign = dyn_cast<const gassign *> (stmt))
     557              :     {
     558     17672689 :       if (gimple_assign_rhs_code (assign) == COND_EXPR
     559     17672689 :           && COMPARISON_CLASS_P (gimple_assign_rhs1 (assign)))
     560            0 :         gcc_unreachable ();  /* A COND_EXPR with an embedded comparison is not expected here.  */
     561     17672689 :       if ((TREE_CODE_CLASS (gimple_assign_rhs_code (assign)) == tcc_comparison
     562     16386855 :            || commutative_tree_code (gimple_assign_rhs_code (assign)))
     563     26201648 :           && swap)
     564              :         return op1_op0_map;
     565     17632391 :       if (gather_scatter_p)
     566        42215 :         return (TREE_CODE (gimple_assign_lhs (assign)) != SSA_NAME
     567        42215 :                 ? off_op0_map : off_map);  /* A non-SSA lhs also gets operand 0 as a child.  */
     568              :     }
     569     18732386 :   gcc_assert (!swap);
     570     18732386 :   if (auto call = dyn_cast<const gcall *> (stmt))
     571              :     {
     572       139370 :       if (gimple_call_internal_p (call))
     573        73317 :         switch (gimple_call_internal_fn (call))
     574              :           {
     575        12168 :           case IFN_MASK_LOAD:
     576        20116 :             return gather_scatter_p ? off_arg2_arg3_map : arg2_arg3_map;
     577              : 
     578            0 :           case IFN_GATHER_LOAD:
     579            0 :             return arg2_map;
     580              : 
     581            0 :           case IFN_MASK_GATHER_LOAD:
     582            0 :           case IFN_MASK_LEN_GATHER_LOAD:
     583            0 :             return arg2_arg5_arg6_map;
     584              : 
     585            0 :           case IFN_SCATTER_STORE:
     586            0 :             return arg2_arg4_map;
     587              : 
     588            0 :           case IFN_MASK_SCATTER_STORE:
     589            0 :           case IFN_MASK_LEN_SCATTER_STORE:
     590            0 :             return arg2_arg4_arg5_map;
     591              : 
     592         6227 :           case IFN_MASK_STORE:
     593        11178 :             return gather_scatter_p ? off_arg3_arg2_map : arg3_arg2_map;
     594              : 
     595          988 :           case IFN_MASK_CALL:
     596          988 :             {
     597          988 :               unsigned nargs = gimple_call_num_args (call);
     598          988 :               if (nargs >= 2 && nargs <= 7)  /* mask_call_maps has rows for 2 to 7 arguments.  */
     599          988 :                 return mask_call_maps[nargs-2];
     600              :               else
     601              :                 return nullptr;
     602              :             }
     603              : 
     604          140 :           case IFN_CLZ:
     605          140 :           case IFN_CTZ:
     606          140 :             return arg0_map;
     607              : 
     608         6306 :           case IFN_GOMP_SIMD_LANE:
     609         6306 :             return no_arg_map;
     610              : 
     611              :           default:
     612              :             break;
     613              :           }
     614              :     }
     615              :   return nullptr;
     616              : }
     617              : 
     618              : /* Return the SLP node child index for operand OP of STMT.  */
     619              : 
     620              : int
     621      1325462 : vect_slp_child_index_for_operand (const gimple *stmt, int op,
     622              :                                   bool gather_scatter_p)
     623              : {
     624      1325462 :   const int *opmap = vect_get_operand_map (stmt, gather_scatter_p);
     625      1325462 :   if (!opmap)
     626              :     return op;
     627        18015 :   for (int i = 1; i < 1 + opmap[0]; ++i)
     628        18015 :     if (opmap[i] == op)
     629         9882 :       return i - 1;
     630            0 :   gcc_unreachable ();
     631              : }
     632              : 
     633              : /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
     634              :    they are of a valid type and that they match the defs of the first stmt of
     635              :    the SLP group (stored in OPRNDS_INFO).  This function tries to match stmts
     636              :    by swapping operands of STMTS[STMT_NUM] when possible.  Non-zero SWAP
     637              :    indicates swap is required for cond_expr stmts.  Specifically, SWAP
     638              :    is 1 if STMT is cond and operands of comparison need to be swapped;
     639              :    SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
     640              : 
     641              :    If there was a fatal error return -1; if the error could be corrected by
     642              :    swapping operands of father node of this one, return 1; if everything is
     643              :    ok return 0.  */
     644              : static int
     645     12216205 : vect_get_and_check_slp_defs (vec_info *vinfo, tree vectype, unsigned char swap,
     646              :                              bool *skip_args,
     647              :                              vec<stmt_vec_info> stmts, unsigned stmt_num,
     648              :                              vec<slp_oprnd_info> *oprnds_info)
     649              : {
     650     12216205 :   stmt_vec_info stmt_info = stmts[stmt_num];
     651     12216205 :   tree oprnd;
     652     12216205 :   unsigned int i, number_of_oprnds;
     653     12216205 :   enum vect_def_type dt = vect_uninitialized_def;
     654     12216205 :   slp_oprnd_info oprnd_info;
     655     12216205 :   gather_scatter_info gs_info;
     656     12216205 :   unsigned int gs_op = -1u;
     657     12216205 :   unsigned int commutative_op = -1U;
     658     12216205 :   bool first = stmt_num == 0;
     659              : 
     660     12216205 :   if (!stmt_info)
     661              :     {
     662            0 :       for (auto oi : *oprnds_info)
     663              :         {
     664            0 :           oi->def_stmts.quick_push (NULL);
     665            0 :           oi->ops.quick_push (NULL_TREE);
     666              :         }
     667              :       return 0;
     668              :     }
     669              : 
     670     12216205 :   if (!is_a<gcall *> (stmt_info->stmt)
     671              :       && !is_a<gassign *> (stmt_info->stmt)
     672              :       && !is_a<gphi *> (stmt_info->stmt))
     673              :     return -1;
     674              : 
     675     12216205 :   number_of_oprnds = gimple_num_args (stmt_info->stmt);
     676     12216205 :   const int *map
     677     24432410 :     = vect_get_operand_map (stmt_info->stmt,
     678     12216205 :                             STMT_VINFO_GATHER_SCATTER_P (stmt_info), swap);
     679     12216205 :   if (map)
     680        69662 :     number_of_oprnds = *map++;
     681     12216205 :   if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
     682              :     {
     683        40096 :       if (gimple_call_internal_p (stmt))
     684              :         {
     685        24252 :           internal_fn ifn = gimple_call_internal_fn (stmt);
     686        24252 :           commutative_op = first_commutative_argument (ifn);
     687        24252 :           if (internal_gather_scatter_fn_p (ifn))
     688              :             {
     689            0 :               vect_describe_gather_scatter_call
     690            0 :                 (stmt_info,
     691            0 :                  first ? &(*oprnds_info)[0]->first_gs_info : &gs_info);
     692            0 :               if (first)
     693            0 :                 (*oprnds_info)[0]->first_gs_p = true;
     694              :               gs_op = 0;
     695              :             }
     696              :         }
     697              :     }
     698     12176109 :   else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
     699              :     {
     700     14238804 :       if (commutative_tree_code (gimple_assign_rhs_code (stmt)))
     701      8177067 :         commutative_op = 0;
     702              :     }
     703              : 
     704     12216205 :   bool swapped = (swap != 0);
     705     12216205 :   bool backedge = false;
     706     12216205 :   enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
     707     33832805 :   for (i = 0; i < number_of_oprnds; i++)
     708              :     {
     709     21617709 :       oprnd_info = (*oprnds_info)[i];
     710     21617709 :       int opno = map ? map[i] : int (i);
     711     21617709 :       if (opno == GATHER_SCATTER_OFFSET)
     712              :         {
     713        22050 :           gcc_assert (STMT_VINFO_GATHER_SCATTER_P (stmt_info));
     714        22050 :           if (!is_a <loop_vec_info> (vinfo)
     715        22050 :               || !vect_check_gather_scatter (stmt_info, vectype,
     716              :                                              as_a <loop_vec_info> (vinfo),
     717              :                                              first ? &oprnd_info->first_gs_info
     718              :                                              : &gs_info))
     719         1109 :             return -1;
     720              : 
     721        22050 :           if (first)
     722              :             {
     723        21813 :               oprnd_info->first_gs_p = true;
     724        21813 :               oprnd = oprnd_info->first_gs_info.offset;
     725              :             }
     726              :           else
     727              :             {
     728          237 :               gs_op = i;
     729          237 :               oprnd = gs_info.offset;
     730              :             }
     731              :         }
     732     21595659 :       else if (opno < 0)
     733            0 :         oprnd = TREE_OPERAND (gimple_arg (stmt_info->stmt, 0), -1 - opno);
     734              :       else
     735              :         {
     736     21595659 :           oprnd = gimple_arg (stmt_info->stmt, opno);
     737     21595659 :           if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
     738              :             {
     739      1090364 :               edge e = gimple_phi_arg_edge (stmt, opno);
     740      2180728 :               backedge = (is_a <bb_vec_info> (vinfo)
     741      1629296 :                           ? e->flags & EDGE_DFS_BACK
     742       538932 :                           : dominated_by_p (CDI_DOMINATORS, e->src,
     743       538932 :                                             gimple_bb (stmt_info->stmt)));
     744              :             }
     745              :         }
     746     21617709 :       if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
     747         2650 :         oprnd = TREE_OPERAND (oprnd, 0);
     748              : 
     749     21617709 :       stmt_vec_info def_stmt_info;
     750     21617709 :       if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
     751              :         {
     752          957 :           if (dump_enabled_p ())
     753            0 :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
     754              :                              "Build SLP failed: can't analyze def for %T\n",
     755              :                              oprnd);
     756              : 
     757          957 :           return -1;
     758              :         }
     759              : 
     760     21616752 :       if (skip_args[i])
     761              :         {
     762       445318 :           oprnd_info->def_stmts.quick_push (NULL);
     763       445318 :           oprnd_info->ops.quick_push (NULL_TREE);
     764       445318 :           oprnd_info->first_dt = vect_uninitialized_def;
     765       445318 :           continue;
     766              :         }
     767              : 
     768     21171434 :       oprnd_info->def_stmts.quick_push (def_stmt_info);
     769     21171434 :       oprnd_info->ops.quick_push (oprnd);
     770              : 
     771     21171434 :       if (def_stmt_info
     772     21171434 :           && is_pattern_stmt_p (def_stmt_info))
     773              :         {
     774       345226 :           if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
     775              :               != def_stmt_info)
     776       248007 :             oprnd_info->any_pattern = true;
     777              :           else
     778              :             /* If we promote this to external use the original stmt def.  */
     779        97219 :             oprnd_info->ops.last ()
     780       194438 :               = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
     781              :         }
     782              : 
     783              :       /* If there's an extern def on a backedge make sure we can
     784              :          code-generate at the region start.
     785              :          ???  This is another case that could be fixed by adjusting
     786              :          how we split the function but at the moment we'd have conflicting
     787              :          goals there.  */
     788     21171434 :       if (backedge
     789       126901 :           && dts[i] == vect_external_def
     790          173 :           && is_a <bb_vec_info> (vinfo)
     791          173 :           && TREE_CODE (oprnd) == SSA_NAME
     792          152 :           && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
     793     21171586 :           && !dominated_by_p (CDI_DOMINATORS, vinfo->bbs[0],
     794          152 :                               gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
     795              :         {
     796          152 :           if (dump_enabled_p ())
     797            0 :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
     798              :                              "Build SLP failed: extern def %T only defined "
     799              :                              "on backedge\n", oprnd);
     800          152 :           return -1;
     801              :         }
     802              : 
     803     21171282 :       if (first)
     804              :         {
     805      4269103 :           tree type = TREE_TYPE (oprnd);
     806      4269103 :           dt = dts[i];
     807              : 
     808              :           /* For the swapping logic below force vect_reduction_def
     809              :              for the reduction op in a SLP reduction group.  */
     810      4269103 :           if (!STMT_VINFO_DATA_REF (stmt_info)
     811      3181119 :               && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
     812         3288 :               && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
     813      4270723 :               && def_stmt_info)
     814         1620 :             dts[i] = dt = vect_reduction_def;
     815              : 
     816              :           /* Check the types of the definition.  */
     817      4269103 :           switch (dt)
     818              :             {
     819      4269103 :             case vect_external_def:
     820      4269103 :             case vect_constant_def:
     821      4269103 :             case vect_internal_def:
     822      4269103 :             case vect_reduction_def:
     823      4269103 :             case vect_double_reduction_def:
     824      4269103 :             case vect_induction_def:
     825      4269103 :             case vect_nested_cycle:
     826      4269103 :             case vect_first_order_recurrence:
     827      4269103 :               break;
     828              : 
     829            0 :             default:
     830              :               /* FORNOW: Not supported.  */
     831            0 :               if (dump_enabled_p ())
     832            0 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
     833              :                                  "Build SLP failed: illegal type of def %T\n",
     834              :                                  oprnd);
     835            0 :               return -1;
     836              :             }
     837              : 
     838      4269103 :           oprnd_info->first_dt = dt;
     839      4269103 :           oprnd_info->first_op_type = type;
     840              :         }
     841              :     }
     842     12215096 :   if (first)
     843              :     return 0;
     844              : 
     845              :   /* Now match the operand definition types to that of the first stmt.  */
     846     25871678 :   for (i = 0; i < number_of_oprnds;)
     847              :     {
     848     16898349 :       if (skip_args[i])
     849              :         {
     850        27772 :           ++i;
     851        27772 :           continue;
     852              :         }
     853              : 
     854     16870577 :       oprnd_info = (*oprnds_info)[i];
     855     16870577 :       dt = dts[i];
     856     16870577 :       stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
     857     16870577 :       oprnd = oprnd_info->ops[stmt_num];
     858     16870577 :       tree type = TREE_TYPE (oprnd);
     859              : 
     860     16870577 :       if (!types_compatible_p (oprnd_info->first_op_type, type))
     861              :         {
     862        88604 :           if (dump_enabled_p ())
     863          107 :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
     864              :                              "Build SLP failed: different operand types\n");
     865        88604 :           return 1;
     866              :         }
     867              : 
     868     16781973 :       if ((gs_op == i) != oprnd_info->first_gs_p)
     869              :         {
     870            0 :           if (dump_enabled_p ())
     871            0 :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
     872              :                              "Build SLP failed: mixed gather and non-gather\n");
     873            0 :           return 1;
     874              :         }
     875     16781973 :       else if (gs_op == i)
     876              :         {
     877          207 :           if (!operand_equal_p (oprnd_info->first_gs_info.base,
     878          207 :                                 gs_info.base))
     879              :             {
     880           16 :               if (dump_enabled_p ())
     881            6 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
     882              :                                  "Build SLP failed: different gather base\n");
     883           16 :               return 1;
     884              :             }
     885          191 :           if (oprnd_info->first_gs_info.scale != gs_info.scale)
     886              :             {
     887            8 :               if (dump_enabled_p ())
     888            2 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
     889              :                                  "Build SLP failed: different gather scale\n");
     890            8 :               return 1;
     891              :             }
     892              :         }
     893              : 
     894              :       /* Not first stmt of the group, check that the def-stmt/s match
     895              :          the def-stmt/s of the first stmt.  Allow different definition
     896              :          types for reduction chains: the first stmt must be a
     897              :          vect_reduction_def (a phi node), and the rest
     898              :          end in the reduction chain.  */
     899     16781949 :       if ((!vect_def_types_match (oprnd_info->first_dt, dt)
     900       284723 :            && !(oprnd_info->first_dt == vect_reduction_def
     901         2777 :                 && !STMT_VINFO_DATA_REF (stmt_info)
     902         2777 :                 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
     903         2767 :                 && def_stmt_info
     904         2767 :                 && !STMT_VINFO_DATA_REF (def_stmt_info)
     905         2767 :                 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
     906              :                     == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
     907     16499993 :           || (!STMT_VINFO_DATA_REF (stmt_info)
     908     15223902 :               && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
     909         5814 :               && ((!def_stmt_info
     910         5652 :                    || STMT_VINFO_DATA_REF (def_stmt_info)
     911        10379 :                    || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
     912              :                        != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
     913         5814 :                   != (oprnd_info->first_dt != vect_reduction_def))))
     914              :         {
     915              :           /* Try swapping operands if we got a mismatch.  For BB
     916              :              vectorization only in case it will clearly improve things.  */
     917       283887 :           if (i == commutative_op && !swapped
     918       281956 :               && (!is_a <bb_vec_info> (vinfo)
     919         4983 :                   || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
     920         4983 :                                              dts[i+1])
     921         1108 :                       && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
     922              :                           || vect_def_types_match
     923          146 :                                ((*oprnds_info)[i+1]->first_dt, dts[i])))))
     924              :             {
     925         1931 :               if (dump_enabled_p ())
     926          144 :                 dump_printf_loc (MSG_NOTE, vect_location,
     927              :                                  "trying swapped operands\n");
     928         1931 :               std::swap (dts[i], dts[i+1]);
     929         1931 :               std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
     930         1931 :                          (*oprnds_info)[i+1]->def_stmts[stmt_num]);
     931         1931 :               std::swap ((*oprnds_info)[i]->ops[stmt_num],
     932         1931 :                          (*oprnds_info)[i+1]->ops[stmt_num]);
     933              :               /* After swapping some operands we lost track whether an
     934              :                  operand has any pattern defs so be conservative here.  */
     935         1931 :               if ((*oprnds_info)[i]->any_pattern
     936         1931 :                   || (*oprnds_info)[i+1]->any_pattern)
     937            4 :                 (*oprnds_info)[i]->any_pattern
     938            2 :                   = (*oprnds_info)[i+1]->any_pattern = true;
     939         1931 :               swapped = true;
     940         1931 :               continue;
     941              :             }
     942              : 
     943       280025 :           if (is_a <bb_vec_info> (vinfo)
     944       269538 :               && !oprnd_info->any_pattern
     945       549325 :               && number_of_oprnds > 1)
     946              :             {
     947              :               /* Now for commutative ops we should see whether we can
     948              :                  make the other operand matching.  */
     949       104259 :               if (dump_enabled_p ())
     950          149 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
     951              :                                  "treating operand as external\n");
     952       104259 :               oprnd_info->first_dt = dt = vect_external_def;
     953              :             }
     954              :           else
     955              :             {
     956       175766 :               if (dump_enabled_p ())
     957          406 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
     958              :                                  "Build SLP failed: different types\n");
     959       175766 :               return 1;
     960              :             }
     961              :         }
     962              : 
     963              :       /* Make sure to demote the overall operand to external.  */
     964     16604252 :       if (dt == vect_external_def)
     965       329846 :         oprnd_info->first_dt = vect_external_def;
     966              :       /* For a SLP reduction chain we want to duplicate the reduction to
     967              :          each of the chain members.  That gets us a sane SLP graph (still
     968              :          the stmts are not 100% correct wrt the initial values).  */
     969     16274406 :       else if ((dt == vect_internal_def
     970     16274406 :                 || dt == vect_reduction_def)
     971     15370735 :                && oprnd_info->first_dt == vect_reduction_def
     972        64716 :                && !STMT_VINFO_DATA_REF (stmt_info)
     973        64716 :                && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
     974         2767 :                && !STMT_VINFO_DATA_REF (def_stmt_info)
     975     16277173 :                && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
     976              :                    == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
     977              :         {
     978         2767 :           oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
     979         2767 :           oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
     980              :         }
     981              : 
     982     16604252 :       ++i;
     983              :     }
     984              : 
     985              :   /* Swap operands.  */
     986      8973329 :   if (swapped)
     987              :     {
     988        39972 :       if (dump_enabled_p ())
     989          432 :         dump_printf_loc (MSG_NOTE, vect_location,
     990              :                          "swapped operands to match def types in %G",
     991              :                          stmt_info->stmt);
     992              :     }
     993              : 
     994              :   return 0;
     995              : }
     996              : 
     997              : /* Return true if call statements CALL1 and CALL2 are similar enough
     998              :    to be combined into the same SLP group.  */
     999              : 
    1000              : bool
    1001        20900 : compatible_calls_p (gcall *call1, gcall *call2, bool allow_two_operators)
    1002              : {
    1003        20900 :   unsigned int nargs = gimple_call_num_args (call1);
    1004        20900 :   if (nargs != gimple_call_num_args (call2))
    1005              :     return false;
    1006              : 
    1007        18964 :   auto cfn1 = gimple_call_combined_fn (call1);
    1008        18964 :   auto cfn2 = gimple_call_combined_fn (call2);
    1009        18964 :   if (cfn1 != cfn2
    1010            2 :       && (!allow_two_operators
    1011            2 :           || !((cfn1 == CFN_FMA || cfn1 == CFN_FMS)
    1012            2 :                && (cfn2 == CFN_FMA || cfn2 == CFN_FMS))))
    1013              :     return false;
    1014              : 
    1015        18964 :   if (gimple_call_internal_p (call1))
    1016              :     {
    1017         7084 :       if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
    1018         7084 :                                TREE_TYPE (gimple_call_lhs (call2))))
    1019              :         return false;
    1020        14393 :       for (unsigned int i = 0; i < nargs; ++i)
    1021         7309 :         if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
    1022         7309 :                                  TREE_TYPE (gimple_call_arg (call2, i))))
    1023              :           return false;
    1024              :     }
    1025              :   else
    1026              :     {
    1027        11880 :       if (!operand_equal_p (gimple_call_fn (call1),
    1028        11880 :                             gimple_call_fn (call2), 0))
    1029              :         return false;
    1030              : 
    1031        25884 :       if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
    1032              :         return false;
    1033              :     }
    1034              : 
    1035              :   /* Check that any unvectorized arguments are equal.  */
    1036        15712 :   if (const int *map = vect_get_operand_map (call1))
    1037              :     {
    1038           15 :       unsigned int nkept = *map++;
    1039           15 :       unsigned int mapi = 0;
    1040           57 :       for (unsigned int i = 0; i < nargs; ++i)
    1041           42 :         if (mapi < nkept && map[mapi] == int (i))
    1042           27 :           mapi += 1;
    1043           15 :         else if (!operand_equal_p (gimple_call_arg (call1, i),
    1044           15 :                                    gimple_call_arg (call2, i)))
    1045              :           return false;
    1046              :     }
    1047              : 
    1048              :   return true;
    1049              : }
    1050              : 
    1051              : /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
    1052              :    caller's attempt to find the vector type in STMT_INFO with the narrowest
    1053              :    element type.  Return true if VECTYPE is nonnull and if it is valid
    1054              :    for STMT_INFO.  When returning true, update MAX_NUNITS to reflect the
    1055              :    number of units in VECTYPE.  GROUP_SIZE and MAX_NUNITS are as for
    1056              :    vect_build_slp_tree.  */
    1057              : 
    1058              : static bool
    1059      4966443 : vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
    1060              :                         unsigned int group_size,
    1061              :                         tree vectype, poly_uint64 *max_nunits)
    1062              : {
    1063      4966443 :   if (!vectype)
    1064              :     {
    1065         4489 :       if (dump_enabled_p ())
    1066            7 :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1067              :                          "Build SLP failed: unsupported data-type in %G\n",
    1068              :                          stmt_info->stmt);
    1069              :       /* Fatal mismatch.  */
    1070         4489 :       return false;
    1071              :     }
    1072              : 
    1073              :   /* If populating the vector type requires unrolling then fail
    1074              :      before adjusting *max_nunits for basic-block vectorization.  */
    1075      4961954 :   if (is_a <bb_vec_info> (vinfo)
    1076      4961954 :       && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
    1077              :     {
    1078       140914 :       if (dump_enabled_p ())
    1079           34 :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1080              :                          "Build SLP failed: unrolling required "
    1081              :                          "in basic block SLP\n");
    1082              :       /* Fatal mismatch.  */
    1083       140914 :       return false;
    1084              :     }
    1085              : 
    1086              :   /* In case of multiple types we need to detect the smallest type.  */
    1087      4821040 :   vect_update_max_nunits (max_nunits, vectype);
    1088      4821040 :   return true;
    1089              : }
    1090              : 
    1091              : /* Verify if the scalar stmts STMTS are isomorphic, require data
    1092              :    permutation or are of unsupported types of operation.  Return
    1093              :    true if they are, otherwise return false and indicate in *MATCHES
    1094              :    which stmts are not isomorphic to the first one.  If MATCHES[0]
    1095              :    is false then this indicates the comparison could not be
    1096              :    carried out or the stmts will never be vectorized by SLP.
    1097              : 
    1098              :    Note COND_EXPR is possibly isomorphic to another one after swapping its
    1099              :    operands.  Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
    1100              :    the first stmt by swapping the two operands of comparison; set SWAP[i]
    1101              :    to 2 if stmt I is isomorphic to the first stmt by inverting the code
    1102              :    of comparison.  Take A1 >= B1 ? X1 : Y1 as an example, it can be swapped
    1103              :    to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1.  */
    1104              : 
    1105              : static bool
    1106      5253029 : vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
    1107              :                        vec<stmt_vec_info> stmts, unsigned int group_size,
    1108              :                        poly_uint64 *max_nunits, bool *matches,
    1109              :                        bool *two_operators, tree *node_vectype)
    1110              : {
    1111      5253029 :   unsigned int i;
    1112      5253029 :   stmt_vec_info first_stmt_info = stmts[0];
    1113      5253029 :   code_helper first_stmt_code = ERROR_MARK;
    1114      5253029 :   code_helper alt_stmt_code = ERROR_MARK;
    1115      5253029 :   code_helper first_cond_code = ERROR_MARK;
    1116      5253029 :   bool need_same_oprnds = false;
    1117      5253029 :   tree first_lhs = NULL_TREE;
    1118      5253029 :   tree first_op1 = NULL_TREE;
    1119      5253029 :   stmt_vec_info first_load = NULL, prev_first_load = NULL;
    1120      5253029 :   bool first_stmt_ldst_p = false, first_stmt_ldst_masklen_p = false;
    1121      5253029 :   bool first_stmt_phi_p = false;
    1122      5253029 :   int first_reduc_idx = -1;
    1123      5253029 :   bool maybe_soft_fail = false;
    1124      5253029 :   tree soft_fail_nunits_vectype = NULL_TREE;
    1125              : 
    1126      5253029 :   tree vectype, nunits_vectype;
    1127      5253029 :   if (!vect_get_vector_types_for_stmt (vinfo, first_stmt_info, &vectype,
    1128              :                                        &nunits_vectype, group_size))
    1129              :     {
    1130              :       /* Fatal mismatch.  */
    1131       194867 :       matches[0] = false;
    1132       194867 :       return false;
    1133              :     }
    1134      5058162 :   if (is_a <bb_vec_info> (vinfo)
    1135      5058162 :       && known_le (TYPE_VECTOR_SUBPARTS (vectype), 1U))
    1136              :     {
    1137       344295 :       if (dump_enabled_p ())
    1138          290 :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1139              :                          "Build SLP failed: not using single lane "
    1140              :                          "vector type %T\n", vectype);
    1141       344295 :       matches[0] = false;
    1142       344295 :       return false;
    1143              :     }
    1144              :   /* Record nunits required but continue analysis, producing matches[]
    1145              :      as if nunits was not an issue.  This allows splitting of groups
    1146              :      to happen.  */
    1147      4713867 :   if (nunits_vectype
    1148      4713867 :       && !vect_record_max_nunits (vinfo, first_stmt_info, group_size,
    1149              :                                   nunits_vectype, max_nunits))
    1150              :     {
    1151       140914 :       gcc_assert (is_a <bb_vec_info> (vinfo));
    1152       140914 :       maybe_soft_fail = true;
    1153       140914 :       soft_fail_nunits_vectype = nunits_vectype;
    1154              :     }
    1155              : 
    1156      4713867 :   gcc_assert (vectype || !gimple_get_lhs (first_stmt_info->stmt));
    1157      4713867 :   *node_vectype = vectype;
    1158              : 
    1159              :   /* For every stmt in NODE find its def stmt/s.  */
    1160      4713867 :   stmt_vec_info stmt_info;
    1161     20977825 :   FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    1162              :     {
    1163     16423394 :       bool ldst_p = false;
    1164     16423394 :       bool ldst_masklen_p = false;
    1165     16423394 :       bool phi_p = false;
    1166     16423394 :       code_helper rhs_code = ERROR_MARK;
    1167              : 
    1168     16423394 :       swap[i] = 0;
    1169     16423394 :       matches[i] = false;
    1170     16423394 :       if (!stmt_info)
    1171              :         {
    1172        39741 :           matches[i] = true;
    1173     16303699 :           continue;
    1174              :         }
    1175              : 
    1176     16383653 :       gimple *stmt = stmt_info->stmt;
    1177     16383653 :       if (dump_enabled_p ())
    1178       213753 :         dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
    1179              : 
    1180              :       /* Fail to vectorize statements marked as unvectorizable, throw
    1181              :          or are volatile.  */
    1182     16383653 :       if (!STMT_VINFO_VECTORIZABLE (stmt_info)
    1183     16195315 :           || stmt_can_throw_internal (cfun, stmt)
    1184     31854467 :           || gimple_has_volatile_ops (stmt))
    1185              :         {
    1186       193822 :           if (dump_enabled_p ())
    1187          199 :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1188              :                              "Build SLP failed: unvectorizable statement %G",
    1189              :                              stmt);
    1190              :           /* ???  For BB vectorization we want to commutate operands in a way
    1191              :              to shuffle all unvectorizable defs into one operand and have
    1192              :              the other still vectorized.  The following doesn't reliably
    1193              :              work for this though but it's the easiest we can do here.  */
    1194       193822 :           if (is_a <bb_vec_info> (vinfo) && i != 0)
    1195        63392 :             continue;
    1196              :           /* Fatal mismatch.  */
    1197       130430 :           matches[0] = false;
    1198       130430 :           return false;
    1199              :         }
    1200              : 
    1201     16189831 :       gcall *call_stmt = dyn_cast <gcall *> (stmt);
    1202     16189831 :       tree lhs = gimple_get_lhs (stmt);
    1203     16189831 :       if (lhs == NULL_TREE && !call_stmt)
    1204              :         {
    1205           36 :           if (dump_enabled_p ())
    1206            0 :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1207              :                              "Build SLP failed: not GIMPLE_ASSIGN nor "
    1208              :                              "GIMPLE_CALL %G", stmt);
    1209           36 :           if (is_a <bb_vec_info> (vinfo) && i != 0)
    1210           36 :             continue;
    1211              :           /* Fatal mismatch.  */
    1212            0 :           matches[0] = false;
    1213            0 :           return false;
    1214              :         }
    1215              : 
    1216     16189795 :       if (call_stmt)
    1217              :         {
    1218        92922 :           combined_fn cfn = gimple_call_combined_fn (call_stmt);
    1219        92922 :           if (cfn != CFN_LAST && cfn != CFN_MASK_CALL)
    1220        49417 :             rhs_code = cfn;
    1221              :           else
    1222              :             rhs_code = CALL_EXPR;
    1223              : 
    1224        92922 :           if (cfn == CFN_GATHER_LOAD
    1225        92922 :               || cfn == CFN_SCATTER_STORE)
    1226              :             ldst_p = true;
    1227              :           else if (cfn == CFN_MASK_LOAD
    1228              :                    || cfn == CFN_MASK_GATHER_LOAD
    1229              :                    || cfn == CFN_MASK_LEN_GATHER_LOAD
    1230              :                    || cfn == CFN_MASK_SCATTER_STORE
    1231              :                    || cfn == CFN_MASK_LEN_SCATTER_STORE)
    1232              :             {
    1233              :               ldst_p = true;
    1234              :               ldst_masklen_p = true;
    1235              :             }
    1236              :           else if (cfn == CFN_MASK_STORE)
    1237              :             {
    1238              :               ldst_p = true;
    1239              :               ldst_masklen_p = true;
    1240              :               rhs_code = CFN_MASK_STORE;
    1241              :             }
    1242              :           else if (cfn == CFN_GOMP_SIMD_LANE)
    1243              :             ;
    1244        83779 :           else if ((cfn != CFN_LAST
    1245              :                     && cfn != CFN_MASK_CALL
    1246        40274 :                     && internal_fn_p (cfn)
    1247        31185 :                     && !vectorizable_internal_fn_p (as_internal_fn (cfn)))
    1248        83705 :                    || gimple_call_tail_p (call_stmt)
    1249        83705 :                    || gimple_call_noreturn_p (call_stmt)
    1250       167484 :                    || gimple_call_chain (call_stmt))
    1251              :             {
    1252          423 :               if (dump_enabled_p ())
    1253           13 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1254              :                                  "Build SLP failed: unsupported call type %G",
    1255              :                                  (gimple *) call_stmt);
    1256          423 :               if (is_a <bb_vec_info> (vinfo) && i != 0)
    1257           62 :                 continue;
    1258              :               /* Fatal mismatch.  */
    1259          361 :               matches[0] = false;
    1260          361 :               return false;
    1261              :             }
    1262              :         }
    1263     16096873 :       else if (gimple_code (stmt) == GIMPLE_PHI)
    1264              :         {
    1265              :           rhs_code = ERROR_MARK;
    1266              :           phi_p = true;
    1267              :         }
    1268              :       else
    1269              :         {
    1270     15372372 :           rhs_code = gimple_assign_rhs_code (stmt);
    1271     15372372 :           ldst_p = STMT_VINFO_DATA_REF (stmt_info) != nullptr;
    1272              :         }
    1273              : 
    1274              :       /* Check the operation.  */
    1275     16189372 :       if (i == 0)
    1276              :         {
    1277      4583076 :           first_lhs = lhs;
    1278      4583076 :           first_stmt_code = rhs_code;
    1279      4583076 :           first_stmt_ldst_p = ldst_p;
    1280      4583076 :           first_stmt_ldst_masklen_p = ldst_masklen_p;
    1281      4583076 :           first_stmt_phi_p = phi_p;
    1282      4583076 :           first_reduc_idx = STMT_VINFO_REDUC_IDX (stmt_info);
    1283              : 
    1284              :           /* Shift arguments should be equal in all the packed stmts for a
    1285              :              vector shift with scalar shift operand.  */
    1286      4583076 :           if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
    1287      4460480 :               || rhs_code == LROTATE_EXPR
    1288      9043514 :               || rhs_code == RROTATE_EXPR)
    1289              :             {
    1290              :               /* First see if we have a vector/vector shift.  */
    1291       122839 :               if (!directly_supported_p (rhs_code, vectype, optab_vector))
    1292              :                 {
    1293              :                   /* No vector/vector shift, try for a vector/scalar shift.  */
    1294       114779 :                   if (!directly_supported_p (rhs_code, vectype, optab_scalar))
    1295              :                     {
    1296         9423 :                       if (dump_enabled_p ())
    1297          375 :                         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1298              :                                          "Build SLP failed: "
    1299              :                                          "op not supported by target.\n");
    1300         9423 :                       if (is_a <bb_vec_info> (vinfo) && i != 0)
    1301              :                         continue;
    1302              :                       /* Fatal mismatch.  */
    1303         9423 :                       matches[0] = false;
    1304         9423 :                       return false;
    1305              :                     }
    1306       105356 :                   need_same_oprnds = true;
    1307       105356 :                   first_op1 = gimple_assign_rhs2 (stmt);
    1308              :                 }
    1309              :             }
    1310      4460237 :           else if (rhs_code == WIDEN_LSHIFT_EXPR)
    1311              :             {
    1312            0 :               need_same_oprnds = true;
    1313            0 :               first_op1 = gimple_assign_rhs2 (stmt);
    1314              :             }
    1315      4460237 :           else if (!ldst_p
    1316      4460237 :                    && rhs_code == BIT_FIELD_REF)
    1317              :             {
    1318         5748 :               tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
    1319         5748 :               if (!is_a <bb_vec_info> (vinfo)
    1320         5622 :                   || TREE_CODE (vec) != SSA_NAME
    1321              :                   /* When the element types are not compatible we pun the
    1322              :                      source to the target vectype which requires equal size.  */
    1323        11358 :                   || ((!VECTOR_TYPE_P (TREE_TYPE (vec))
    1324         4895 :                        || !types_compatible_p (TREE_TYPE (vectype),
    1325         4895 :                                                TREE_TYPE (TREE_TYPE (vec))))
    1326         1031 :                       && !operand_equal_p (TYPE_SIZE (vectype),
    1327         1031 :                                            TYPE_SIZE (TREE_TYPE (vec)))))
    1328              :                 {
    1329          781 :                   if (dump_enabled_p ())
    1330            0 :                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1331              :                                      "Build SLP failed: "
    1332              :                                      "BIT_FIELD_REF not supported\n");
    1333              :                   /* Fatal mismatch.  */
    1334          781 :                   matches[0] = false;
    1335          781 :                   return false;
    1336              :                 }
    1337              :             }
    1338      4454489 :           else if (rhs_code == CFN_DIV_POW2)
    1339              :             {
    1340            0 :               need_same_oprnds = true;
    1341            0 :               first_op1 = gimple_call_arg (call_stmt, 1);
    1342              :             }
    1343      4454489 :           else if (rhs_code == CFN_GOMP_SIMD_LANE)
    1344              :             {
    1345         3153 :               need_same_oprnds = true;
    1346         3153 :               first_op1 = gimple_call_arg (call_stmt, 1);
    1347              :             }
    1348              :         }
    1349              :       else
    1350              :         {
    1351     11606628 :           if (first_reduc_idx != STMT_VINFO_REDUC_IDX (stmt_info)
    1352              :               /* For SLP reduction groups the index isn't necessarily
    1353              :                  uniform but only that of the first stmt matters.  */
    1354         1640 :               && !(first_reduc_idx != -1
    1355         1640 :                    && STMT_VINFO_REDUC_IDX (stmt_info) != -1
    1356         1640 :                    && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
    1357     11606296 :               && !(first_reduc_idx != -1
    1358          898 :                    && STMT_VINFO_REDUC_IDX (stmt_info) != -1
    1359          898 :                    && rhs_code.is_tree_code ()
    1360          898 :                    && commutative_tree_code (tree_code (rhs_code))
    1361          704 :                    && first_reduc_idx == 1 - STMT_VINFO_REDUC_IDX (stmt_info)))
    1362              :             {
    1363          332 :               if (dump_enabled_p ())
    1364              :                 {
    1365           12 :                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1366              :                                    "Build SLP failed: different reduc_idx "
    1367              :                                    "%d instead of %d in %G",
    1368              :                                    STMT_VINFO_REDUC_IDX (stmt_info),
    1369              :                                    first_reduc_idx, stmt);
    1370              :                 }
    1371              :               /* Mismatch.  */
    1372          332 :               continue;
    1373              :             }
    1374     11605964 :           if (!ldst_p
    1375      9174603 :               && first_stmt_code != rhs_code
    1376     12991131 :               && alt_stmt_code == ERROR_MARK)
    1377              :             alt_stmt_code = rhs_code;
    1378     12971420 :           if ((!ldst_p
    1379      9174603 :                && first_stmt_code != rhs_code
    1380      1385167 :                && (first_stmt_code != IMAGPART_EXPR
    1381          127 :                    || rhs_code != REALPART_EXPR)
    1382      1385147 :                && (first_stmt_code != REALPART_EXPR
    1383          458 :                    || rhs_code != IMAGPART_EXPR)
    1384              :                /* Handle mismatches in plus/minus by computing both
    1385              :                   and merging the results.  */
    1386      1385136 :                && !((((first_stmt_code == PLUS_EXPR
    1387      1288268 :                        || first_stmt_code == MINUS_EXPR)
    1388       116915 :                       && (alt_stmt_code == PLUS_EXPR
    1389       108102 :                           || alt_stmt_code == MINUS_EXPR))
    1390      1362557 :                      || ((first_stmt_code == CFN_FMA
    1391      1362555 :                           || first_stmt_code == CFN_FMS)
    1392            2 :                          && (alt_stmt_code == CFN_FMA
    1393            2 :                              || alt_stmt_code == CFN_FMS)))
    1394        22581 :                     && rhs_code == alt_stmt_code)
    1395      1402434 :                && !(first_stmt_code.is_tree_code ()
    1396      1286575 :                     && rhs_code.is_tree_code ()
    1397      1193820 :                     && (TREE_CODE_CLASS (tree_code (first_stmt_code))
    1398              :                         == tcc_comparison)
    1399       128023 :                     && (swap_tree_comparison (tree_code (first_stmt_code))
    1400       128023 :                         == tree_code (rhs_code))
    1401              :                     && (first_reduc_idx == -1
    1402            0 :                         || REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
    1403              :               || (ldst_p
    1404      4862722 :                   && (STMT_VINFO_GROUPED_ACCESS (stmt_info)
    1405      2431361 :                       != STMT_VINFO_GROUPED_ACCESS (first_stmt_info)))
    1406              :               || (ldst_p
    1407      2389070 :                   && (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
    1408      2389070 :                       != STMT_VINFO_GATHER_SCATTER_P (first_stmt_info)))
    1409     10240650 :               || first_stmt_ldst_p != ldst_p
    1410     10240516 :               || (ldst_p && first_stmt_ldst_masklen_p != ldst_masklen_p)
    1411     21846472 :               || first_stmt_phi_p != phi_p)
    1412              :             {
    1413      1365456 :               if (dump_enabled_p ())
    1414              :                 {
    1415         2845 :                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1416              :                                    "Build SLP failed: different operation "
    1417              :                                    "in stmt %G", stmt);
    1418         2845 :                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1419              :                                    "original stmt %G", first_stmt_info->stmt);
    1420              :                 }
    1421              :               /* Mismatch.  */
    1422      1365456 :               continue;
    1423              :             }
    1424              : 
    1425     10242871 :           if (!ldst_p
    1426      7851568 :               && first_stmt_code == BIT_FIELD_REF
    1427     10246317 :               && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
    1428         5809 :                   != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
    1429              :             {
    1430         2363 :               if (dump_enabled_p ())
    1431           40 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1432              :                                  "Build SLP failed: different BIT_FIELD_REF "
    1433              :                                  "arguments in %G", stmt);
    1434              :               /* Mismatch.  */
    1435         2363 :               continue;
    1436              :             }
    1437              : 
    1438     10238145 :           if (call_stmt
    1439        21732 :               && first_stmt_code != CFN_MASK_LOAD
    1440     10259391 :               && first_stmt_code != CFN_MASK_STORE)
    1441              :             {
    1442        20900 :               if (!is_a <gcall *> (stmts[0]->stmt)
    1443        20900 :                   || !compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
    1444              :                                           call_stmt, true))
    1445              :                 {
    1446         5188 :                   if (dump_enabled_p ())
    1447            0 :                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1448              :                                      "Build SLP failed: different calls in %G",
    1449              :                                      stmt);
    1450              :                   /* Mismatch.  */
    1451         5188 :                   continue;
    1452              :                 }
    1453              :             }
    1454              : 
    1455     10062180 :           if ((phi_p || gimple_could_trap_p (stmt_info->stmt))
    1456     10931037 :               && (gimple_bb (first_stmt_info->stmt)
    1457       868857 :                   != gimple_bb (stmt_info->stmt)))
    1458              :             {
    1459        27048 :               if (dump_enabled_p ())
    1460            8 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1461              :                                  "Build SLP failed: different BB for PHI "
    1462              :                                  "or possibly trapping operation in %G", stmt);
    1463              :               /* Mismatch.  */
    1464        27048 :               continue;
    1465              :             }
    1466              : 
    1467     10205909 :           if (need_same_oprnds)
    1468              :             {
    1469        54759 :               tree other_op1 = gimple_arg (stmt, 1);
    1470        54759 :               if (!operand_equal_p (first_op1, other_op1, 0))
    1471              :                 {
    1472         7457 :                   if (dump_enabled_p ())
    1473          123 :                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1474              :                                      "Build SLP failed: different shift "
    1475              :                                      "arguments in %G", stmt);
    1476              :                   /* Mismatch.  */
    1477         7457 :                   continue;
    1478              :                 }
    1479              :             }
    1480              : 
    1481     10199189 :           if (first_lhs
    1482     10198452 :               && lhs
    1483     10198452 :               && !types_compatible_p (TREE_TYPE (lhs), TREE_TYPE (first_lhs)))
    1484              :             {
    1485          737 :               if (dump_enabled_p ())
    1486            6 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1487              :                                  "Build SLP failed: different vector type "
    1488              :                                  "in %G", stmt);
    1489              :               /* Mismatch.  */
    1490          737 :               continue;
    1491              :             }
    1492              :         }
    1493              : 
    1494              :       /* Grouped store or load.  */
    1495     14770587 :       if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    1496              :         {
    1497      3742014 :           gcc_assert (ldst_p);
    1498      3742014 :           if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
    1499              :             {
    1500              :               /* Store.  */
    1501      2968228 :               gcc_assert (rhs_code == CFN_MASK_STORE
    1502              :                           || REFERENCE_CLASS_P (lhs)
    1503              :                           || DECL_P (lhs));
    1504              :             }
    1505              :           else
    1506              :             {
    1507              :               /* Load.  */
    1508       773786 :               first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
    1509       773786 :               if (prev_first_load)
    1510              :                 {
    1511              :                   /* Check that there are no loads from different interleaving
    1512              :                      chains in the same node.  */
    1513       344053 :                   if (prev_first_load != first_load)
    1514              :                     {
    1515        41488 :                       if (dump_enabled_p ())
    1516         1988 :                         dump_printf_loc (MSG_MISSED_OPTIMIZATION,
    1517              :                                          vect_location,
    1518              :                                          "Build SLP failed: different "
    1519              :                                          "interleaving chains in one node %G",
    1520              :                                          stmt);
    1521              :                       /* Mismatch.  */
    1522        41488 :                       continue;
    1523              :                     }
    1524              :                 }
    1525              :               else
    1526              :                 prev_first_load = first_load;
    1527              :            }
    1528              :         }
    1529              :       /* Non-grouped store or load.  */
    1530     11028573 :       else if (ldst_p)
    1531              :         {
    1532       706860 :           if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))
    1533       495038 :               && rhs_code != CFN_GATHER_LOAD
    1534              :               && rhs_code != CFN_MASK_GATHER_LOAD
    1535              :               && rhs_code != CFN_MASK_LEN_GATHER_LOAD
    1536              :               && rhs_code != CFN_SCATTER_STORE
    1537              :               && rhs_code != CFN_MASK_SCATTER_STORE
    1538              :               && rhs_code != CFN_MASK_LEN_SCATTER_STORE
    1539       495038 :               && !STMT_VINFO_GATHER_SCATTER_P (stmt_info)
    1540              :               /* Not grouped loads are handled as externals for BB
    1541              :                  vectorization.  For loop vectorization we can handle
    1542              :                  splats the same we handle single element interleaving.
    1543              :                  Likewise we can handle a collection of invariant refs.  */
    1544      1183564 :               && (is_a <bb_vec_info> (vinfo)
    1545       476704 :                   || (stmt_info != first_stmt_info
    1546        44304 :                   && !(integer_zerop (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)))
    1547          157 :                       && integer_zerop (DR_STEP (STMT_VINFO_DATA_REF
    1548              :                                                          (first_stmt_info)))))))
    1549              :             {
    1550              :               /* Not grouped load.  */
    1551        43990 :               if (dump_enabled_p ())
    1552          121 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1553              :                                  "Build SLP failed: not grouped load %G", stmt);
    1554              : 
    1555        43990 :               if (i != 0)
    1556        43990 :                 continue;
    1557              :               /* Fatal mismatch.  */
    1558            0 :               matches[0] = false;
    1559            0 :               return false;
    1560              :             }
    1561              :         }
    1562              :       /* Not memory operation.  */
    1563              :       else
    1564              :         {
    1565     10321713 :           if (!phi_p
    1566      9719280 :               && rhs_code.is_tree_code ()
    1567      9677508 :               && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_binary
    1568      1420635 :               && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_unary
    1569       902142 :               && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_expression
    1570       853756 :               && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_comparison
    1571        64529 :               && rhs_code != VIEW_CONVERT_EXPR
    1572              :               && rhs_code != CALL_EXPR
    1573              :               && rhs_code != BIT_FIELD_REF
    1574     10321713 :               && rhs_code != SSA_NAME)
    1575              :             {
    1576        18441 :               if (dump_enabled_p ())
    1577            7 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1578              :                                  "Build SLP failed: operation unsupported %G",
    1579              :                                  stmt);
    1580        18441 :               if (is_a <bb_vec_info> (vinfo) && i != 0)
    1581            0 :                 continue;
    1582              :               /* Fatal mismatch.  */
    1583        18441 :               matches[0] = false;
    1584        18441 :               return false;
    1585              :             }
    1586              : 
    1587     10303272 :           if (rhs_code == COND_EXPR)
    1588              :             {
    1589        45970 :               tree cond_expr = gimple_assign_rhs1 (stmt);
    1590        45970 :               enum tree_code cond_code = TREE_CODE (cond_expr);
    1591        45970 :               enum tree_code swap_code = ERROR_MARK;
    1592        45970 :               enum tree_code invert_code = ERROR_MARK;
    1593              : 
    1594        45970 :               if (i == 0)
    1595        37213 :                 first_cond_code = TREE_CODE (cond_expr);
    1596         8757 :               else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
    1597              :                 {
    1598            0 :                   bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
    1599            0 :                   swap_code = swap_tree_comparison (cond_code);
    1600            0 :                   invert_code = invert_tree_comparison (cond_code, honor_nans);
    1601              :                 }
    1602              : 
    1603        45970 :               if (first_cond_code == cond_code)
    1604              :                 ;
    1605              :               /* Isomorphic can be achieved by swapping.  */
    1606            0 :               else if (first_cond_code == swap_code)
    1607            0 :                 swap[i] = 1;
    1608              :               /* Isomorphic can be achieved by inverting.  */
    1609            0 :               else if (first_cond_code == invert_code)
    1610            0 :                 swap[i] = 2;
    1611              :               else
    1612              :                 {
    1613            0 :                   if (dump_enabled_p ())
    1614            0 :                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    1615              :                                      "Build SLP failed: different"
    1616              :                                      " operation %G", stmt);
    1617              :                   /* Mismatch.  */
    1618            0 :                   continue;
    1619              :                 }
    1620              :             }
    1621              : 
    1622     10303272 :           if (i != 0
    1623      7809956 :               && first_stmt_code != rhs_code
    1624        62132 :               && first_stmt_code.is_tree_code ()
    1625        62130 :               && rhs_code.is_tree_code ()
    1626        62130 :               && TREE_CODE_CLASS ((tree_code)first_stmt_code) == tcc_comparison
    1627     10342987 :               && (swap_tree_comparison ((tree_code)first_stmt_code)
    1628        39715 :                   == (tree_code)rhs_code))
    1629        39715 :             swap[i] = 1;
    1630              : 
    1631     10303272 :           if (i != 0
    1632      7809956 :               && first_reduc_idx != STMT_VINFO_REDUC_IDX (stmt_info)
    1633         1084 :               && first_reduc_idx != -1
    1634         1084 :               && STMT_VINFO_REDUC_IDX (stmt_info) != -1
    1635         1084 :               && rhs_code.is_tree_code ()
    1636         1084 :               && commutative_tree_code (tree_code (rhs_code))
    1637     10304356 :               && first_reduc_idx == 1 - STMT_VINFO_REDUC_IDX (stmt_info))
    1638         1084 :             swap[i] = 1;
    1639              :         }
    1640              : 
    1641     14666668 :       matches[i] = true;
    1642              :     }
    1643              : 
    1644     19235432 :   for (i = 0; i < group_size; ++i)
    1645     15336862 :     if (!matches[i])
    1646              :       return false;
    1647              : 
    1648              :   /* If we allowed a two-operation SLP node verify the target can cope
    1649              :      with the permute we are going to use.  */
    1650      3898570 :   if (alt_stmt_code != ERROR_MARK
    1651      3898570 :       && (!alt_stmt_code.is_tree_code ()
    1652        51464 :           || (TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_reference
    1653        51464 :               && TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_comparison)))
    1654              :     {
    1655        12322 :       *two_operators = true;
    1656              :     }
    1657              : 
    1658      3898570 :   if (maybe_soft_fail)
    1659              :     {
    1660       140499 :       unsigned HOST_WIDE_INT const_nunits;
    1661       140499 :       if (!TYPE_VECTOR_SUBPARTS
    1662       140499 :             (soft_fail_nunits_vectype).is_constant (&const_nunits)
    1663       140499 :           || const_nunits > group_size)
    1664            0 :         matches[0] = false;
    1665              :       else
    1666              :         {
    1667              :           /* With constant vector elements simulate a mismatch at the
    1668              :              point we need to split.  */
    1669       140499 :           unsigned tail = group_size & (const_nunits - 1);
    1670       140499 :           memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
    1671              :         }
    1672       140499 :       return false;
    1673              :     }
    1674              : 
    1675              :   return true;
    1676              : }
    1677              : 
    1678              : /* Traits for hashing a vector of scalar stmts, used as the key traits
                           of scalar_stmts_to_slp_tree_map_t to record SLP builds for a stmt set.
    1679              :    Note we never remove apart from at destruction time so we do not
    1680              :    need a special value for deleted that differs from empty.
                           Consequently is_empty and is_deleted perform the same test (the
                           vector does not exist) and mark_empty, mark_deleted and remove all
                           just release the vector.  */
    1681              : struct bst_traits
    1682              : {
    1683              :   typedef vec <stmt_vec_info> value_type;
    1684              :   typedef vec <stmt_vec_info> compare_type;
                          /* Hash and equality are defined out of line after the struct.  */
    1685              :   static inline hashval_t hash (value_type);
    1686              :   static inline bool equal (value_type existing, value_type candidate);
    1687    433889645 :   static inline bool is_empty (value_type x) { return !x.exists (); }
    1688     96562673 :   static inline bool is_deleted (value_type x) { return !x.exists (); }
    1689              :   static const bool empty_zero_p = true;
    1690            0 :   static inline void mark_empty (value_type &x) { x.release (); }
    1691              :   static inline void mark_deleted (value_type &x) { x.release (); }
    1692      8376442 :   static inline void remove (value_type &x) { x.release (); }
    1693              : };
    1694              : inline hashval_t
    1695     84046806 : bst_traits::hash (value_type x)
    1696              : {
    1697     84046806 :   inchash::hash h;
                          /* Mix in the gimple UID of every lane's stmt, in lane order.
                             A NULL lane contributes -1 instead.  */
    1698    398965932 :   for (unsigned i = 0; i < x.length (); ++i)
    1699    314919126 :     h.add_int (x[i] ? gimple_uid (x[i]->stmt) : -1);
    1700     84046806 :   return h.end ();
    1701              : }
    1702              : inline bool
    1703     73414547 : bst_traits::equal (value_type existing, value_type candidate)
    1704              : {
                          /* Two keys are equal only when they have the same number of lanes
                             and the same stmt_vec_info pointer in every lane.  */
    1705    220243641 :   if (existing.length () != candidate.length ())
    1706              :     return false;
    1707     75225775 :   for (unsigned i = 0; i < existing.length (); ++i)
    1708     71393620 :     if (existing[i] != candidate[i])
    1709              :       return false;
    1710              :   return true;
    1711              : }
    1712              : 
    1713              : typedef hash_map <vec <stmt_vec_info>, slp_tree,
    1714              :                   simple_hashmap_traits <bst_traits, slp_tree> >
    1715              :   scalar_stmts_to_slp_tree_map_t;
    1716              : 
    1717              : /* Release BST_MAP, a map from a vector of scalar stmts to an SLP node.
                           Every non-NULL node stored in the map is passed to
                           vect_free_slp_tree and then the map object itself is deleted, so
                           BST_MAP must not be used afterwards.  */
    1718              : 
    1719              : static void
    1720      1661254 : release_scalar_stmts_to_slp_tree_map (scalar_stmts_to_slp_tree_map_t *bst_map)
    1721              : {
    1722              :   /* The map keeps a reference on SLP nodes built, release that.  */
    1723     10037696 :   for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
    1724     18414138 :        it != bst_map->end (); ++it)
    1725      8376442 :     if ((*it).second)
    1726      8376442 :       vect_free_slp_tree ((*it).second);
    1727      1661254 :   delete bst_map;
    1728      1661254 : }
    1729              : 
    1730              : /* ???  This was std::pair<std::pair<tree_code, vect_def_type>, tree>
    1731              :    but then vec::insert does memmove and that's not compatible with
    1732              :    std::pair.
                           One entry of a linearized associatable chain as pushed by
                           vect_slp_linearize_chain: the operand OP, its def type DT and the
                           operation CODE with which it enters the chain.  */
    1733              : struct chain_op_t
    1734              : {
    1735      3646349 :   chain_op_t (tree_code code_, vect_def_type dt_, tree op_)
    1736      3646349 :       : code (code_), dt (dt_), op (op_) {}
    1737              :   tree_code code;
    1738              :   vect_def_type dt;
    1739              :   tree op;
    1740              : };
    1741              : 
    1742              : /* Comparator for sorting associatable chains.  OP1_ and OP2_ point to
                           chain_op_t entries.  Entries with different DT are ordered by DT,
                           entries with the same DT are ordered by CODE.  The operand OP does
                           not take part in the comparison and the third argument is unused.  */
    1743              : 
    1744              : static int
    1745      8448485 : dt_sort_cmp (const void *op1_, const void *op2_, void *)
    1746              : {
    1747      8448485 :   auto *op1 = (const chain_op_t *) op1_;
    1748      8448485 :   auto *op2 = (const chain_op_t *) op2_;
    1749      8448485 :   if (op1->dt != op2->dt)
    1750      1032834 :     return (int)op1->dt - (int)op2->dt;
    1751      7415651 :   return (int)op1->code - (int)op2->code;
    1752              : }
    1753              : 
    1754              : /* Linearize the associatable expression chain at START with the
    1755              :    associatable operation CODE (where PLUS_EXPR also allows MINUS_EXPR),
    1756              :    filling CHAIN with the result and using WORKLIST as intermediate storage.
    1757              :    CODE_STMT and ALT_CODE_STMT are filled with the first stmt using CODE
    1758              :    or MINUS_EXPR.  *CHAIN_STMTS if not NULL is filled with all computation
    1759              :    stmts, starting with START.
                           CODE_STMT and ALT_CODE_STMT are only written while they are still
                           NULL.  Each CHAIN entry records the operand together with its def
                           type and the operation with which it effectively enters the chain,
                           taking subtractions on the path from START into account.  Every
                           visited stmt must be a gassign and every operand must be a simple
                           use, otherwise the checks in the body trigger.  */
    1760              : 
    1761              : static void
    1762      1633968 : vect_slp_linearize_chain (vec_info *vinfo,
    1763              :                           vec<std::pair<tree_code, gimple *> > &worklist,
    1764              :                           vec<chain_op_t> &chain,
    1765              :                           enum tree_code code, gimple *start,
    1766              :                           gimple *&code_stmt, gimple *&alt_code_stmt,
    1767              :                           vec<gimple *> *chain_stmts)
    1768              : {
    1769              :   /* For each lane linearize the addition/subtraction (or other
    1770              :      uniform associatable operation) expression tree.  */
    1771      1633968 :   worklist.safe_push (std::make_pair (code, start));
    1772      3646349 :   while (!worklist.is_empty ())
    1773              :     {
    1774      2012381 :       auto entry = worklist.pop ();
    1775      2012381 :       gassign *stmt = as_a <gassign *> (entry.second);
                              /* IN_CODE is the operation with which the result of STMT enters
                                 the chain, THIS_CODE the operation STMT itself performs.  */
    1776      2012381 :       enum tree_code in_code = entry.first;
    1777      4024762 :       enum tree_code this_code = gimple_assign_rhs_code (stmt);
    1778              :       /* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE.  */
    1779      2012381 :       if (!code_stmt
    1780      2012381 :           && gimple_assign_rhs_code (stmt) == code)
    1781      1382544 :         code_stmt = stmt;
    1782       629837 :       else if (!alt_code_stmt
    1783       629837 :                && gimple_assign_rhs_code (stmt) == MINUS_EXPR)
    1784       328262 :         alt_code_stmt = stmt;
    1785      2012381 :       if (chain_stmts)
    1786      1963081 :         chain_stmts->safe_push (stmt);
                              /* Visit the two operands of STMT (gimple ops 1 and 2).  */
    1787      6037143 :       for (unsigned opnum = 1; opnum <= 2; ++opnum)
    1788              :         {
    1789      4024762 :           tree op = gimple_op (stmt, opnum);
    1790      4024762 :           vect_def_type dt;
    1791      4024762 :           stmt_vec_info def_stmt_info;
    1792      4024762 :           bool res = vect_is_simple_use (op, vinfo, &dt, &def_stmt_info);
    1793      4024762 :           gcc_assert (res);
                                  /* When the def is a pattern stmt continue with the LHS of
                                     that pattern stmt as the operand.  */
    1794      4024762 :           if (dt == vect_internal_def
    1795      4024762 :               && is_pattern_stmt_p (def_stmt_info))
    1796         6498 :             op = gimple_get_lhs (def_stmt_info->stmt);
    1797      4024762 :           gimple *use_stmt;
    1798      4024762 :           use_operand_p use_p;
                                  /* Walk into the definition only if it is an internal def with
                                     a single immediate use that is an assignment using CODE, or
                                     MINUS_EXPR when CODE is PLUS_EXPR.  Anything else becomes
                                     a leaf of the chain.  */
    1799      4024762 :           if (dt == vect_internal_def
    1800      3734873 :               && single_imm_use (op, &use_p, &use_stmt)
    1801      2296275 :               && is_gimple_assign (def_stmt_info->stmt)
    1802      6139862 :               && (gimple_assign_rhs_code (def_stmt_info->stmt) == code
    1803      1764313 :                   || (code == PLUS_EXPR
    1804       884532 :                       && (gimple_assign_rhs_code (def_stmt_info->stmt)
    1805              :                           == MINUS_EXPR))))
    1806              :             {
                                      /* The operand enters with the operation of STMT, except
                                         that the first operand of a MINUS_EXPR is added.  If
                                         STMT itself enters the chain subtracted the result is
                                         toggled between PLUS_EXPR and MINUS_EXPR.  */
    1807       378413 :               tree_code op_def_code = this_code;
    1808       378413 :               if (op_def_code == MINUS_EXPR && opnum == 1)
    1809        55519 :                 op_def_code = PLUS_EXPR;
    1810       378413 :               if (in_code == MINUS_EXPR)
    1811          193 :                 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
    1812       378413 :               worklist.safe_push (std::make_pair (op_def_code,
    1813       378413 :                                                   def_stmt_info->stmt));
    1814              :             }
    1815              :           else
    1816              :             {
                                      /* Same operation computation as above, but the operand
                                         is recorded as a chain leaf instead of being queued.  */
    1817      3646349 :               tree_code op_def_code = this_code;
    1818      3646349 :               if (op_def_code == MINUS_EXPR && opnum == 1)
    1819       277231 :                 op_def_code = PLUS_EXPR;
    1820      3646349 :               if (in_code == MINUS_EXPR)
    1821         6241 :                 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
    1822      3646349 :               chain.safe_push (chain_op_t (op_def_code, dt, op));
    1823              :             }
    1824              :         }
    1825              :     }
    1826      1633968 : }
    1827              : 
    1828              : static slp_tree
    1829              : vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
    1830              :                        vec<stmt_vec_info> stmts, unsigned int group_size,
    1831              :                        poly_uint64 *max_nunits,
    1832              :                        bool *matches, unsigned *limit, unsigned *tree_size,
    1833              :                        scalar_stmts_to_slp_tree_map_t *bst_map);
    1834              : 
                        /* Look up or build the SLP node for the scalar stmts STMTS.
                           If BST_MAP already has an entry for STMTS it is re-used: a node that
                           did not fail gets its reference count incremented, *MAX_NUNITS is
                           updated from it, STMTS is released and the node is returned; for a
                           failed node its recorded failure lanes are copied to MATCHES and
                           NULL is returned.
                           Otherwise, when STMTS has more than one element, one unit of the
                           discovery budget *LIMIT is consumed; if none is left all GROUP_SIZE
                           entries of MATCHES are cleared and NULL is returned.  Then a stub node
                           is entered into BST_MAP under a copy of STMTS and handed to
                           vect_build_slp_tree_2.  If that fails the stub stays in the map
                           marked as failed with a copy of MATCHES and NULL is returned.  On
                           success the node is returned with its max_nunits recorded,
                           *MAX_NUNITS updated and one extra reference held for BST_MAP.
                           MATCHES must provide room for GROUP_SIZE entries.  */
    1835              : static slp_tree
    1836      5649975 : vect_build_slp_tree (vec_info *vinfo,
    1837              :                      vec<stmt_vec_info> stmts, unsigned int group_size,
    1838              :                      poly_uint64 *max_nunits,
    1839              :                      bool *matches, unsigned *limit, unsigned *tree_size,
    1840              :                      scalar_stmts_to_slp_tree_map_t *bst_map)
    1841              : {
                          /* Re-use the result of an earlier discovery for the same stmts,
                             be it successful or failed.  */
    1842      5649975 :   if (slp_tree *leader = bst_map->get (stmts))
    1843              :     {
    1844       390883 :       if (dump_enabled_p ())
    1845        16822 :         dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
    1846        16822 :                          !(*leader)->failed ? "" : "failed ",
    1847              :                          (void *) *leader);
    1848       390883 :       if (!(*leader)->failed)
    1849              :         {
    1850       343909 :           SLP_TREE_REF_COUNT (*leader)++;
    1851       343909 :           vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
    1852       343909 :           stmts.release ();
    1853       343909 :           return *leader;
    1854              :         }
    1855        46974 :       memcpy (matches, (*leader)->failed, sizeof (bool) * group_size);
    1856        46974 :       return NULL;
    1857              :     }
    1858              : 
    1859              :   /* Single-lane SLP doesn't have the chance of run-away, do not account
    1860              :      it to the limit.  */
    1861      5259092 :   if (stmts.length () > 1)
    1862              :     {
    1863      3074262 :       if (*limit == 0)
    1864              :         {
    1865         1501 :           if (dump_enabled_p ())
    1866           12 :             dump_printf_loc (MSG_NOTE, vect_location,
    1867              :                              "SLP discovery limit exceeded\n");
    1868         1501 :           memset (matches, 0, sizeof (bool) * group_size);
    1869         1501 :           return NULL;
    1870              :         }
    1871      3072761 :       --*limit;
    1872              :     }
    1873              : 
    1874              :   /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
    1875              :      so we can pick up backedge destinations during discovery.  */
    1876      5257591 :   slp_tree res = new _slp_tree;
    1877      5257591 :   SLP_TREE_DEF_TYPE (res) = vect_internal_def;
    1878      5257591 :   SLP_TREE_SCALAR_STMTS (res) = stmts;
    1879      5257591 :   bst_map->put (stmts.copy (), res);
    1880              : 
    1881      5257591 :   if (dump_enabled_p ())
    1882       142299 :     dump_printf_loc (MSG_NOTE, vect_location,
    1883              :                      "starting SLP discovery for node %p\n", (void *) res);
    1884              : 
                          /* Discovery of this node starts from one unit; the caller's
                             *MAX_NUNITS is only merged in on success below.  */
    1885      5257591 :   poly_uint64 this_max_nunits = 1;
    1886      5257591 :   slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
    1887              :                                         &this_max_nunits,
    1888              :                                         matches, limit, tree_size, bst_map);
    1889      5257591 :   if (!res_)
    1890              :     {
    1891      1926318 :       if (dump_enabled_p ())
    1892         8016 :         dump_printf_loc (MSG_NOTE, vect_location,
    1893              :                          "SLP discovery for node %p failed\n", (void *) res);
    1894              :       /* Mark the node invalid so we can detect those when still in use
    1895              :          as backedge destinations.  */
    1896      1926318 :       SLP_TREE_SCALAR_STMTS (res) = vNULL;
    1897      1926318 :       SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
    1898      1926318 :       res->failed = XNEWVEC (bool, group_size);
                              /* With checking enabled verify that a failed discovery cleared
                                 at least one lane of MATCHES.  */
    1899      1926318 :       if (flag_checking)
    1900              :         {
    1901              :           unsigned i;
    1902      3421979 :           for (i = 0; i < group_size; ++i)
    1903      3421979 :             if (!matches[i])
    1904              :               break;
    1905      1926318 :           gcc_assert (i < group_size);
    1906              :         }
    1907      1926318 :       memcpy (res->failed, matches, sizeof (bool) * group_size);
    1908              :     }
    1909              :   else
    1910              :     {
    1911      3331273 :       if (dump_enabled_p ())
    1912       134283 :         dump_printf_loc (MSG_NOTE, vect_location,
    1913              :                          "SLP discovery for node %p succeeded\n",
    1914              :                          (void *) res);
    1915      3331273 :       gcc_assert (res_ == res);
    1916      3331273 :       res->max_nunits = this_max_nunits;
    1917      3331273 :       vect_update_max_nunits (max_nunits, this_max_nunits);
    1918              :       /* Keep a reference for the bst_map use.  */
    1919      3331273 :       SLP_TREE_REF_COUNT (res)++;
    1920              :     }
    1921              :   return res_;
    1922              : }
    1923              : 
    1924              : /* Helper for building an associated SLP node chain.
                           Fill in PERM as a VEC_PERM_EXPR node of type VECTYPE with lane
                           permutation LPERM and two newly allocated children.  Both children
                           are internal defs of type VECTYPE with OP0 and OP1 as their operands;
                           the first uses OPER1 and the second OPER2 as representative.  The
                           number of lanes of all three nodes is taken from OP1.  OP0 and OP1
                           each receive one additional reference for their use in the second
                           child.  PERM's children are added with quick_push, so its children
                           vector presumably has room reserved by the caller -- confirm there.  */
    1925              : 
    1926              : static void
    1927          122 : vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype,
    1928              :                                    slp_tree op0, slp_tree op1,
    1929              :                                    stmt_vec_info oper1, stmt_vec_info oper2,
    1930              :                                    vec<std::pair<unsigned, unsigned> > lperm)
    1931              : {
    1932          122 :   unsigned group_size = SLP_TREE_LANES (op1);
    1933              : 
                          /* First operation node, represented by OPER1.  */
    1934          122 :   slp_tree child1 = new _slp_tree;
    1935          122 :   SLP_TREE_DEF_TYPE (child1) = vect_internal_def;
    1936          122 :   SLP_TREE_VECTYPE (child1) = vectype;
    1937          122 :   SLP_TREE_LANES (child1) = group_size;
    1938          122 :   SLP_TREE_CHILDREN (child1).create (2);
    1939          122 :   SLP_TREE_CHILDREN (child1).quick_push (op0);
    1940          122 :   SLP_TREE_CHILDREN (child1).quick_push (op1);
    1941          122 :   SLP_TREE_REPRESENTATIVE (child1) = oper1;
    1942              : 
                          /* Second operation node, represented by OPER2.  It shares OP0 and
                             OP1 with the first one, hence the reference count increments.  */
    1943          122 :   slp_tree child2 = new _slp_tree;
    1944          122 :   SLP_TREE_DEF_TYPE (child2) = vect_internal_def;
    1945          122 :   SLP_TREE_VECTYPE (child2) = vectype;
    1946          122 :   SLP_TREE_LANES (child2) = group_size;
    1947          122 :   SLP_TREE_CHILDREN (child2).create (2);
    1948          122 :   SLP_TREE_CHILDREN (child2).quick_push (op0);
    1949          122 :   SLP_TREE_REF_COUNT (op0)++;
    1950          122 :   SLP_TREE_CHILDREN (child2).quick_push (op1);
    1951          122 :   SLP_TREE_REF_COUNT (op1)++;
    1952          122 :   SLP_TREE_REPRESENTATIVE (child2) = oper2;
    1953              : 
                          /* The permute node selecting lanes from the two operation nodes.  */
    1954          122 :   SLP_TREE_DEF_TYPE (perm) = vect_internal_def;
    1955          122 :   SLP_TREE_CODE (perm) = VEC_PERM_EXPR;
    1956          122 :   SLP_TREE_VECTYPE (perm) = vectype;
    1957          122 :   SLP_TREE_LANES (perm) = group_size;
    1958              :   /* ???  We should set this NULL but that's not expected.  */
    1959          122 :   SLP_TREE_REPRESENTATIVE (perm) = oper1;
    1960          122 :   SLP_TREE_LANE_PERMUTATION (perm) = lperm;
    1961          122 :   SLP_TREE_CHILDREN (perm).quick_push (child1);
    1962          122 :   SLP_TREE_CHILDREN (perm).quick_push (child2);
    1963          122 : }
    1964              : 
    1965              : /* Recursively build an SLP tree starting from NODE.
    1966              :    Fail (and return NULL) if def-stmts are not isomorphic, require
    1967              :    data permutation or are of unsupported types of operation.  In
    1968              :    that case MATCHES has at least one lane cleared to tell the caller
    1969              :    where the mismatch was found.  Otherwise, return NODE filled in
    1970              :    with the result of the discovery.  */
    1971              : 
    1972              : static slp_tree
    1973      5257591 : vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
    1974              :                        vec<stmt_vec_info> stmts, unsigned int group_size,
    1975              :                        poly_uint64 *max_nunits,
    1976              :                        bool *matches, unsigned *limit, unsigned *tree_size,
    1977              :                        scalar_stmts_to_slp_tree_map_t *bst_map)
    1978              : {
    1979      5257591 :   unsigned nops, i, this_tree_size = 0;
    1980      5257591 :   poly_uint64 this_max_nunits = *max_nunits;
    1981              : 
    1982      5257591 :   matches[0] = false;
    1983              : 
    1984      5257591 :   stmt_vec_info stmt_info = stmts[0];
    1985      5257591 :   if (!is_a<gcall *> (stmt_info->stmt)
    1986              :       && !is_a<gassign *> (stmt_info->stmt)
    1987              :       && !is_a<gphi *> (stmt_info->stmt))
    1988              :     return NULL;
    1989              : 
    1990      5257520 :   nops = gimple_num_args (stmt_info->stmt);
    1991      5257520 :   if (const int *map = vect_get_operand_map (stmt_info->stmt,
    1992      5257520 :                                              STMT_VINFO_GATHER_SCATTER_P
    1993              :                                                (stmt_info)))
    1994        28783 :     nops = map[0];
    1995              : 
    1996              :   /* If the SLP node is a PHI (induction or reduction), terminate
    1997              :      the recursion.  */
    1998      5257520 :   bool *skip_args = XALLOCAVEC (bool, nops);
    1999      5257520 :   memset (skip_args, 0, sizeof (bool) * nops);
    2000      5257520 :   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
    2001      2329600 :     if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
    2002              :       {
    2003       252596 :         tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
    2004       252596 :         tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
    2005              :                                                     group_size);
    2006       252596 :         if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
    2007              :                                      max_nunits))
    2008              :           return NULL;
    2009              : 
    2010       248107 :         vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
    2011       248107 :         if (def_type == vect_induction_def)
    2012              :           {
    2013              :             /* Induction PHIs are not cycles but walk the initial
    2014              :                value.  Only for inner loops through, for outer loops
    2015              :                we need to pick up the value from the actual PHIs
    2016              :                to more easily support peeling and epilogue vectorization.  */
    2017       172772 :             class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
    2018       172772 :             if (!nested_in_vect_loop_p (loop, stmt_info))
    2019       172029 :               skip_args[loop_preheader_edge (loop)->dest_idx] = true;
    2020              :             else
    2021              :               loop = loop->inner;
    2022       172772 :             skip_args[loop_latch_edge (loop)->dest_idx] = true;
    2023              :           }
    2024        75335 :         else if (def_type == vect_reduction_def
    2025              :                  || def_type == vect_double_reduction_def
    2026              :                  || def_type == vect_nested_cycle
    2027        75335 :                  || def_type == vect_first_order_recurrence)
    2028              :           {
    2029              :             /* Else def types have to match.  */
    2030              :             stmt_vec_info other_info;
    2031              :             bool all_same = true;
    2032       166728 :             FOR_EACH_VEC_ELT (stmts, i, other_info)
    2033              :               {
    2034        92527 :                 if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
    2035      1709972 :                   return NULL;
    2036        92525 :                 if (other_info != stmt_info)
    2037        15673 :                   all_same = false;
    2038              :               }
    2039        74201 :             class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
    2040              :             /* Reduction initial values are not explicitly represented.  */
    2041        74201 :             if (def_type != vect_first_order_recurrence
    2042        74201 :                 && gimple_bb (stmt_info->stmt) == loop->header)
    2043        71331 :               skip_args[loop_preheader_edge (loop)->dest_idx] = true;
    2044              :             /* Reduction chain backedge defs are filled manually.
    2045              :                ???  Need a better way to identify a SLP reduction chain PHI.
    2046              :                Or a better overall way to SLP match those.  */
    2047        74201 :             if (stmts.length () > 1
    2048        74201 :                 && all_same && def_type == vect_reduction_def)
    2049         1414 :               skip_args[loop_latch_edge (loop)->dest_idx] = true;
    2050              :           }
    2051         1132 :         else if (def_type != vect_internal_def)
    2052              :           return NULL;
    2053              :       }
    2054              : 
    2055              : 
    2056      5253029 :   bool two_operators = false;
    2057      5253029 :   unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
    2058      5253029 :   tree vectype = NULL_TREE;
    2059      5253029 :   if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
    2060              :                               &this_max_nunits, matches, &two_operators,
    2061              :                               &vectype))
    2062              :     return NULL;
    2063              : 
    2064              :   /* If the SLP node is a load, terminate the recursion unless masked.  */
    2065      3758071 :   if (STMT_VINFO_DATA_REF (stmt_info)
    2066      1854957 :       && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
    2067              :     {
    2068       797224 :       if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
    2069              :         gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
    2070              :       else
    2071              :         {
    2072       779127 :           *max_nunits = this_max_nunits;
    2073       779127 :           (*tree_size)++;
    2074       779127 :           node = vect_create_new_slp_node (node, stmts, 0);
    2075       779127 :           SLP_TREE_VECTYPE (node) = vectype;
    2076              :           /* And compute the load permutation.  Whether it is actually
    2077              :              a permutation depends on the unrolling factor which is
    2078              :              decided later.  */
    2079       779127 :           vec<unsigned> load_permutation;
    2080       779127 :           int j;
    2081       779127 :           stmt_vec_info load_info;
    2082       779127 :           load_permutation.create (group_size);
    2083       779127 :           stmt_vec_info first_stmt_info
    2084       779127 :             = STMT_VINFO_GROUPED_ACCESS (stmt_info)
    2085       779127 :               ? DR_GROUP_FIRST_ELEMENT (stmt_info) : stmt_info;
    2086       779127 :           bool any_permute = false;
    2087      1894265 :           FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
    2088              :             {
    2089      1115138 :               int load_place;
    2090      1115138 :               if (! load_info)
    2091              :                 {
    2092        39421 :                   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    2093              :                     load_place = j;
    2094              :                   else
    2095              :                     load_place = 0;
    2096              :                 }
    2097      1075717 :               else if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    2098       660328 :                 load_place = vect_get_place_in_interleaving_chain
    2099       660328 :                     (load_info, first_stmt_info);
    2100              :               else
    2101              :                 /* Recognize the splat case as { 0, 0, ... } but make
    2102              :                    sure to use the appropriate refs for collections
    2103              :                    of invariant refs.  */
    2104       415389 :                 load_place = (load_info == stmt_info) ? 0 : j;
    2105       699906 :               gcc_assert (load_place != -1);
    2106      1115138 :               any_permute |= load_place != j;
    2107      1115138 :               load_permutation.quick_push (load_place);
    2108              :             }
    2109              : 
    2110       779127 :           if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    2111              :             {
    2112         2350 :               gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD));
    2113         2350 :               bool has_gaps = false;
    2114         2350 :               if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    2115          209 :                 for (stmt_vec_info si = DR_GROUP_NEXT_ELEMENT (first_stmt_info);
    2116         1346 :                      si; si = DR_GROUP_NEXT_ELEMENT (si))
    2117         1137 :                   if (DR_GROUP_GAP (si) != 1)
    2118          160 :                     has_gaps = true;
    2119              :               /* We cannot handle permuted masked loads directly, see
    2120              :                  PR114375.  We cannot handle strided masked loads or masked
    2121              :                  loads with gaps unless the mask is uniform.  */
    2122         2350 :               if ((STMT_VINFO_GROUPED_ACCESS (stmt_info)
    2123          209 :                    && (DR_GROUP_GAP (first_stmt_info) != 0
    2124          149 :                        || (has_gaps
    2125           55 :                            && STMT_VINFO_SLP_VECT_ONLY (first_stmt_info))))
    2126         4605 :                   || STMT_VINFO_STRIDED_P (stmt_info))
    2127              :                 {
    2128          108 :                   load_permutation.release ();
    2129          108 :                   matches[0] = false;
    2130       776929 :                   return NULL;
    2131              :                 }
    2132              : 
    2133              :               /* For permuted masked loads do an unpermuted masked load of
    2134              :                  the whole group followed by a SLP permute node.  */
    2135         2242 :               if (any_permute
    2136         2242 :                   || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
    2137           84 :                       && DR_GROUP_SIZE (first_stmt_info) != group_size))
    2138              :                 {
    2139              :                   /* Discover the whole unpermuted load.  */
    2140           44 :                   vec<stmt_vec_info> stmts2;
    2141           44 :                   unsigned dr_group_size = STMT_VINFO_GROUPED_ACCESS (stmt_info)
    2142           78 :                       ? DR_GROUP_SIZE (first_stmt_info) : 1;
    2143           44 :                   stmts2.create (dr_group_size);
    2144           44 :                   stmts2.quick_grow_cleared (dr_group_size);
    2145           44 :                   unsigned i = 0;
    2146           44 :                   for (stmt_vec_info si = first_stmt_info;
    2147          594 :                        si; si = DR_GROUP_NEXT_ELEMENT (si))
    2148              :                     {
    2149          550 :                       if (si != first_stmt_info)
    2150         2106 :                         for (unsigned k = 1; k < DR_GROUP_GAP (si); ++k)
    2151         1600 :                           stmts2[i++] = NULL;
    2152          550 :                       stmts2[i++] = si;
    2153              :                     }
    2154           44 :                   bool *matches2 = XALLOCAVEC (bool, dr_group_size);
    2155           44 :                   slp_tree unperm_load
    2156           44 :                     = vect_build_slp_tree (vinfo, stmts2, dr_group_size,
    2157              :                                            &this_max_nunits, matches2, limit,
    2158           44 :                                            &this_tree_size, bst_map);
    2159              :                   /* When we are able to do the full masked load emit that
    2160              :                      followed by 'node' being the desired final permutation.  */
    2161           44 :                   if (unperm_load)
    2162              :                     {
    2163           16 :                       gcc_assert
    2164              :                         (!SLP_TREE_LOAD_PERMUTATION (unperm_load).exists ());
    2165           16 :                       lane_permutation_t lperm;
    2166           16 :                       lperm.create (group_size);
    2167           56 :                       for (unsigned j = 0; j < load_permutation.length (); ++j)
    2168           40 :                         lperm.quick_push
    2169           40 :                           (std::make_pair (0, load_permutation[j]));
    2170           16 :                       SLP_TREE_CODE (node) = VEC_PERM_EXPR;
    2171           16 :                       SLP_TREE_CHILDREN (node).safe_push (unperm_load);
    2172           16 :                       SLP_TREE_LANE_PERMUTATION (node) = lperm;
    2173           16 :                       load_permutation.release ();
    2174           16 :                       return node;
    2175              :                     }
    2176           28 :                   stmts2.release ();
    2177           28 :                   load_permutation.release ();
    2178           28 :                   matches[0] = false;
    2179           28 :                   return NULL;
    2180              :                 }
    2181         2198 :               load_permutation.release ();
    2182              :             }
    2183              :           else
    2184              :             {
    2185       776777 :               if (!any_permute
    2186       676520 :                   && STMT_VINFO_GROUPED_ACCESS (stmt_info)
    2187      1054505 :                   && group_size == DR_GROUP_SIZE (first_stmt_info))
    2188       120450 :                 load_permutation.release ();
    2189       776777 :               SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
    2190       776777 :               return node;
    2191              :             }
    2192              :         }
    2193              :     }
    2194      2960847 :   else if (gimple_assign_single_p (stmt_info->stmt)
    2195      2121890 :            && !gimple_vuse (stmt_info->stmt)
    2196      2968556 :            && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
    2197              :     {
    2198              :       /* vect_build_slp_tree_2 determined all BIT_FIELD_REFs reference
    2199              :          the same SSA name vector of a compatible type to vectype.  */
    2200         2391 :       vec<std::pair<unsigned, unsigned> > lperm = vNULL;
    2201         2391 :       tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
    2202         2391 :       stmt_vec_info estmt_info;
    2203         7531 :       FOR_EACH_VEC_ELT (stmts, i, estmt_info)
    2204              :         {
    2205         5287 :           gassign *estmt = as_a <gassign *> (estmt_info->stmt);
    2206         5287 :           tree bfref = gimple_assign_rhs1 (estmt);
    2207         5287 :           HOST_WIDE_INT lane;
    2208         5287 :           if (!known_eq (bit_field_size (bfref),
    2209              :                          tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
    2210        10427 :               || !constant_multiple_p (bit_field_offset (bfref),
    2211         5140 :                                        bit_field_size (bfref), &lane))
    2212              :             {
    2213          147 :               lperm.release ();
    2214          147 :               matches[0] = false;
    2215          147 :               return NULL;
    2216              :             }
    2217         5140 :           lperm.safe_push (std::make_pair (0, (unsigned)lane));
    2218              :         }
    2219         2244 :       slp_tree vnode = vect_create_new_slp_node (vNULL);
    2220         2244 :       if (operand_equal_p (TYPE_SIZE (vectype), TYPE_SIZE (TREE_TYPE (vec))))
    2221              :         /* ???  We record vectype here but we hide eventually necessary
    2222              :            punning and instead rely on code generation to materialize
    2223              :            VIEW_CONVERT_EXPRs as necessary.  We instead should make
    2224              :            this explicit somehow.  */
    2225          710 :         SLP_TREE_VECTYPE (vnode) = vectype;
    2226              :       else
    2227              :         {
    2228              :           /* For different size but compatible elements we can still
    2229              :              use VEC_PERM_EXPR without punning.  */
    2230         1534 :           gcc_assert (VECTOR_TYPE_P (TREE_TYPE (vec))
    2231              :                       && types_compatible_p (TREE_TYPE (vectype),
    2232              :                                              TREE_TYPE (TREE_TYPE (vec))));
    2233         1534 :           SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
    2234              :         }
    2235         2244 :       auto nunits = TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (vnode));
    2236         2244 :       unsigned HOST_WIDE_INT const_nunits;
    2237         2244 :       if (nunits.is_constant (&const_nunits))
    2238         2244 :         SLP_TREE_LANES (vnode) = const_nunits;
    2239         2244 :       SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
    2240              :       /* We are always building a permutation node even if it is an identity
    2241              :          permute to shield the rest of the vectorizer from the odd node
    2242              :          representing an actual vector without any scalar ops.
    2243              :          ???  We could hide it completely by making the permute node
    2244              :          external?  */
    2245         2244 :       node = vect_create_new_slp_node (node, stmts, 1);
    2246         2244 :       SLP_TREE_CODE (node) = VEC_PERM_EXPR;
    2247         2244 :       SLP_TREE_LANE_PERMUTATION (node) = lperm;
    2248         2244 :       SLP_TREE_VECTYPE (node) = vectype;
    2249         2244 :       SLP_TREE_CHILDREN (node).quick_push (vnode);
    2250         2244 :       return node;
    2251              :     }
    2252              :   /* When discovery reaches an associatable operation see whether we can
    2253              :      improve that to match up lanes in a way superior to the operand
    2254              :      swapping code which at most looks at two defs.
    2255              :      ???  For BB vectorization we cannot do the brute-force search
    2256              :      for matching as we can succeed by means of builds from scalars
    2257              :      and have no good way to "cost" one build against another.  */
    2258      2958456 :   else if (is_a <loop_vec_info> (vinfo)
    2259              :            /* Do not bother for single-lane SLP.  */
    2260      1627427 :            && group_size > 1
    2261              :            /* ???  We don't handle !vect_internal_def defs below.  */
    2262        80180 :            && STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
    2263              :            /* ???  Do not associate a reduction, this will wreck REDUC_IDX
    2264              :               mapping as long as that exists on the stmt_info level.  */
    2265        63620 :            && STMT_VINFO_REDUC_IDX (stmt_info) == -1
    2266        58477 :            && is_gimple_assign (stmt_info->stmt)
    2267        58209 :            && (associative_tree_code (gimple_assign_rhs_code (stmt_info->stmt))
    2268        40641 :                || gimple_assign_rhs_code (stmt_info->stmt) == MINUS_EXPR)
    2269      2977613 :            && ((FLOAT_TYPE_P (vectype) && flag_associative_math)
    2270        11656 :                || (INTEGRAL_TYPE_P (TREE_TYPE (vectype))
    2271         9713 :                    && TYPE_OVERFLOW_WRAPS (TREE_TYPE (vectype)))))
    2272              :     {
    2273              :       /* See if we have a chain of (mixed) adds or subtracts or other
    2274              :          associatable ops.  */
    2275        13653 :       enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
    2276        13653 :       if (code == MINUS_EXPR)
    2277          686 :         code = PLUS_EXPR;
    2278        13653 :       stmt_vec_info other_op_stmt_info = NULL;
    2279        13653 :       stmt_vec_info op_stmt_info = NULL;
    2280        13653 :       unsigned chain_len = 0;
    2281        13653 :       auto_vec<chain_op_t> chain;
    2282        13653 :       auto_vec<std::pair<tree_code, gimple *> > worklist;
    2283        13653 :       auto_vec<vec<chain_op_t> > chains (group_size);
    2284        13653 :       auto_vec<slp_tree, 4> children;
    2285        13653 :       bool hard_fail = true;
    2286        14538 :       for (unsigned lane = 0; lane < group_size; ++lane)
    2287              :         {
    2288        14269 :           if (!stmts[lane])
    2289              :             {
    2290              :               /* ???  Below we require that lane zero is present.  */
    2291            0 :               if (lane == 0)
    2292              :                 {
    2293              :                   hard_fail = false;
    2294        13384 :                   break;
    2295              :                 }
    2296            0 :               chains.quick_push (vNULL);
    2297            0 :               continue;
    2298              :             }
    2299              :           /* For each lane linearize the addition/subtraction (or other
    2300              :              uniform associatable operation) expression tree.  */
    2301        14269 :           gimple *op_stmt = NULL, *other_op_stmt = NULL;
    2302        14269 :           vect_slp_linearize_chain (vinfo, worklist, chain, code,
    2303        14269 :                                     stmts[lane]->stmt, op_stmt, other_op_stmt,
    2304              :                                     NULL);
    2305        14269 :           if (!op_stmt_info && op_stmt)
    2306        13123 :             op_stmt_info = vinfo->lookup_stmt (op_stmt);
    2307        14269 :           if (!other_op_stmt_info && other_op_stmt)
    2308          722 :             other_op_stmt_info = vinfo->lookup_stmt (other_op_stmt);
    2309        14269 :           if (chain.length () == 2)
    2310              :             {
    2311              :               /* In a chain of just two elements resort to the regular
    2312              :                  operand swapping scheme.  Likewise, if we run into a
    2313              :                  length mismatch, process regularly as well: as we did not
    2314              :                  process the other lanes we cannot report a good hint what
    2315              :                  lanes to try swapping in the parent.  */
    2316              :               hard_fail = false;
    2317              :               break;
    2318              :             }
    2319          888 :           else if (chain_len == 0)
    2320          309 :             chain_len = chain.length ();
    2321         1158 :           else if (chain.length () != chain_len)
    2322              :             {
    2323              :               /* ???  Here we could slip in magic to compensate with
    2324              :                  neutral operands.  */
    2325            3 :               matches[lane] = false;
    2326            3 :               if (lane != group_size - 1)
    2327            3 :                 matches[0] = false;
    2328              :               break;
    2329              :             }
    2330          885 :           chains.quick_push (chain.copy ());
    2331          885 :           chain.truncate (0);
    2332              :         }
    2333        27306 :       if (chains.length () == group_size)
    2334              :         {
    2335              :           /* We cannot yet use SLP_TREE_CODE to communicate the operation.  */
    2336          269 :           if (!op_stmt_info)
    2337              :             {
    2338            2 :               hard_fail = false;
    2339            2 :               goto out;
    2340              :             }
    2341              :           /* Now we have a set of chains with the same length.  */
    2342              :           /* 1. pre-sort according to def_type and operation.  */
    2343         1042 :           for (unsigned lane = 0; lane < group_size; ++lane)
    2344         1550 :             chains[lane].stablesort (dt_sort_cmp, vinfo);
    2345          267 :           if (dump_enabled_p ())
    2346              :             {
    2347          145 :               dump_printf_loc (MSG_NOTE, vect_location,
    2348              :                                "pre-sorted chains of %s\n",
    2349              :                                get_tree_code_name (code));
    2350          649 :               for (unsigned lane = 0; lane < group_size; ++lane)
    2351              :                 {
    2352          504 :                   if (!stmts[lane])
    2353            0 :                     dump_printf (MSG_NOTE, "--");
    2354              :                   else
    2355         2326 :                     for (unsigned opnum = 0; opnum < chain_len; ++opnum)
    2356         3644 :                       dump_printf (MSG_NOTE, "%s %T ",
    2357         1822 :                                    get_tree_code_name (chains[lane][opnum].code),
    2358         1822 :                                    chains[lane][opnum].op);
    2359          504 :                   dump_printf (MSG_NOTE, "\n");
    2360              :                 }
    2361              :             }
    2362              :           /* 2. try to build children nodes, associating as necessary.  */
    2363              :           /* 2a. prepare and perform early checks to avoid eating into
    2364              :              discovery limit unnecessarily.  */
    2365          267 :           vect_def_type *dts = XALLOCAVEC (vect_def_type, chain_len);
    2366         1135 :           for (unsigned n = 0; n < chain_len; ++n)
    2367              :             {
    2368          868 :               vect_def_type dt = chains[0][n].dt;
    2369          868 :               unsigned lane;
    2370         3535 :               for (lane = 0; lane < group_size; ++lane)
    2371         5334 :                 if (stmts[lane] && chains[lane][n].dt != dt)
    2372              :                   {
    2373            0 :                     if (dt == vect_constant_def
    2374            0 :                         && chains[lane][n].dt == vect_external_def)
    2375              :                       dt = vect_external_def;
    2376            0 :                     else if (dt == vect_external_def
    2377            0 :                              && chains[lane][n].dt == vect_constant_def)
    2378              :                       ;
    2379              :                     else
    2380              :                       break;
    2381              :                   }
    2382          868 :               if (lane != group_size)
    2383              :                 {
    2384            0 :                   if (dump_enabled_p ())
    2385            0 :                     dump_printf_loc (MSG_NOTE, vect_location,
    2386              :                                      "giving up on chain due to mismatched "
    2387              :                                      "def types\n");
    2388            0 :                   matches[lane] = false;
    2389            0 :                   if (lane != group_size - 1)
    2390            0 :                     matches[0] = false;
    2391            0 :                   goto out;
    2392              :                 }
    2393          868 :               dts[n] = dt;
    2394          868 :               if (dt == vect_constant_def
    2395          868 :                   || dt == vect_external_def)
    2396              :                 {
    2397              :                   /* Check whether we can build the invariant.  If we can't
    2398              :                      we never will be able to.  */
    2399           77 :                   tree type = TREE_TYPE (chains[0][n].op);
    2400          868 :                   if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
    2401              :                       && (TREE_CODE (type) == BOOLEAN_TYPE
    2402              :                           || !can_duplicate_and_interleave_p (vinfo, group_size,
    2403              :                                                               type)))
    2404              :                     {
    2405              :                       matches[0] = false;
    2406              :                       goto out;
    2407              :                     }
    2408              :                 }
    2409          791 :               else if (dt != vect_internal_def)
    2410              :                 {
    2411              :                   /* Not sure, we might need something special.
    2412              :                      gcc.dg/vect/pr96854.c,
    2413              :                      gfortran.dg/vect/fast-math-pr37021.f90
    2414              :                      and gfortran.dg/vect/pr61171.f trigger.  */
    2415              :                   /* Soft-fail for now.  */
    2416            0 :                   hard_fail = false;
    2417            0 :                   goto out;
    2418              :                 }
    2419              :             }
    2420              :           /* 2b. do the actual build.  */
    2421         1081 :           for (unsigned n = 0; n < chain_len; ++n)
    2422              :             {
    2423          833 :               vect_def_type dt = dts[n];
    2424          833 :               unsigned lane;
    2425          833 :               if (dt == vect_constant_def
    2426          833 :                   || dt == vect_external_def)
    2427              :                 {
    2428           77 :                   vec<tree> ops;
    2429           77 :                   ops.create (group_size);
    2430          397 :                   for (lane = 0; lane < group_size; ++lane)
    2431          243 :                     if (stmts[lane])
    2432          243 :                       ops.quick_push (chains[lane][n].op);
    2433              :                     else
    2434            0 :                       ops.quick_push (NULL_TREE);
    2435           77 :                   slp_tree child = vect_create_new_slp_node (ops);
    2436           77 :                   SLP_TREE_DEF_TYPE (child) = dt;
    2437           77 :                   children.safe_push (child);
    2438              :                 }
    2439              :               else
    2440              :                 {
    2441          756 :                   vec<stmt_vec_info> op_stmts;
    2442          756 :                   op_stmts.create (group_size);
    2443          756 :                   slp_tree child = NULL;
    2444              :                   /* Brute-force our way.  We have to consider a lane
    2445              :                      failing after fixing an earlier fail up in the
    2446              :                      SLP discovery recursion.  So track the current
    2447              :                      permute per lane.  */
    2448          756 :                   unsigned *perms = XALLOCAVEC (unsigned, group_size);
    2449          756 :                   memset (perms, 0, sizeof (unsigned) * group_size);
    2450          835 :                   do
    2451              :                     {
    2452          835 :                       op_stmts.truncate (0);
    2453         4248 :                       for (lane = 0; lane < group_size; ++lane)
    2454         2578 :                         if (stmts[lane])
    2455         2578 :                           op_stmts.quick_push
    2456         2578 :                             (vinfo->lookup_def (chains[lane][n].op));
    2457              :                         else
    2458            0 :                           op_stmts.quick_push (NULL);
    2459          835 :                       child = vect_build_slp_tree (vinfo, op_stmts,
    2460              :                                                    group_size, &this_max_nunits,
    2461              :                                                    matches, limit,
    2462              :                                                    &this_tree_size, bst_map);
    2463              :                       /* ???  We're likely getting too many fatal mismatches
    2464              :                          here so maybe we want to ignore them (but then we
    2465              :                          have no idea which lanes fatally mismatched).  */
    2466          835 :                       if (child || !matches[0])
    2467              :                         break;
    2468              :                       /* Swap another lane we have not yet matched up into
    2469              :                          lanes that did not match.  If we run out of
    2470              :                          permute possibilities for a lane terminate the
    2471              :                          search.  */
    2472          257 :                       bool term = false;
    2473          257 :                       for (lane = 1; lane < group_size; ++lane)
    2474          178 :                         if (!matches[lane])
    2475              :                           {
    2476          150 :                             if (n + perms[lane] + 1 == chain_len)
    2477              :                               {
    2478              :                                 term = true;
    2479              :                                 break;
    2480              :                               }
    2481          131 :                             if (dump_enabled_p ())
    2482          113 :                               dump_printf_loc (MSG_NOTE, vect_location,
    2483              :                                                "swapping operand %d and %d "
    2484              :                                                "of lane %d\n",
    2485              :                                                n, n + perms[lane] + 1, lane);
    2486          262 :                             std::swap (chains[lane][n],
    2487          131 :                                        chains[lane][n + perms[lane] + 1]);
    2488          131 :                             perms[lane]++;
    2489              :                           }
    2490           98 :                       if (term)
    2491              :                         break;
    2492              :                     }
    2493              :                   while (1);
    2494          756 :                   if (!child)
    2495              :                     {
    2496           19 :                       if (dump_enabled_p ())
    2497           18 :                         dump_printf_loc (MSG_NOTE, vect_location,
    2498              :                                          "failed to match up op %d\n", n);
    2499           19 :                       op_stmts.release ();
    2500           19 :                       if (lane != group_size - 1)
    2501            9 :                         matches[0] = false;
    2502              :                       else
    2503           10 :                         matches[lane] = false;
    2504           19 :                       goto out;
    2505              :                     }
    2506          737 :                   if (dump_enabled_p ())
    2507              :                     {
    2508          397 :                       dump_printf_loc (MSG_NOTE, vect_location,
    2509              :                                        "matched up op %d to\n", n);
    2510          397 :                       vect_print_slp_tree (MSG_NOTE, vect_location, child);
    2511              :                     }
    2512          737 :                   children.safe_push (child);
    2513              :                 }
    2514              :             }
    2515              :           /* 3. build SLP nodes to combine the chain.  */
    2516          950 :           for (unsigned lane = 0; lane < group_size; ++lane)
    2517         1416 :             if (stmts[lane] && chains[lane][0].code != code)
    2518              :               {
    2519              :                 /* See if there's any alternate all-PLUS entry.  */
    2520              :                 unsigned n;
    2521            6 :                 for (n = 1; n < chain_len; ++n)
    2522              :                   {
    2523           30 :                     for (lane = 0; lane < group_size; ++lane)
    2524           48 :                       if (stmts[lane] && chains[lane][n].code != code)
    2525              :                         break;
    2526            6 :                     if (lane == group_size)
    2527              :                       break;
    2528              :                   }
    2529            6 :                 if (n != chain_len)
    2530              :                   {
    2531              :                     /* Swap that in at first position.  */
    2532            6 :                     std::swap (children[0], children[n]);
    2533           30 :                     for (lane = 0; lane < group_size; ++lane)
    2534           24 :                       if (stmts[lane])
    2535           24 :                         std::swap (chains[lane][0], chains[lane][n]);
    2536              :                   }
    2537              :                 else
    2538              :                   {
    2539              :                     /* ???  When this triggers and we end up with two
    2540              :                        vect_constant/external_def up-front, things break (ICE)
    2541              :                        spectacularly finding an insertion place for the
    2542              :                        all-constant op.  We should have a fully
    2543              :                        vect_internal_def operand though(?) so we can swap
    2544              :                        that into first place and then prepend the all-zero
    2545              :                        constant.  */
    2546            0 :                     if (dump_enabled_p ())
    2547            0 :                       dump_printf_loc (MSG_NOTE, vect_location,
    2548              :                                        "inserting constant zero to compensate "
    2549              :                                        "for (partially) negated first "
    2550              :                                        "operand\n");
    2551            0 :                     chain_len++;
    2552            0 :                     for (lane = 0; lane < group_size; ++lane)
    2553            0 :                       if (stmts[lane])
    2554            0 :                         chains[lane].safe_insert
    2555            0 :                           (0, chain_op_t (code, vect_constant_def, NULL_TREE));
    2556            0 :                     vec<tree> zero_ops;
    2557            0 :                     zero_ops.create (group_size);
    2558            0 :                     zero_ops.quick_push (build_zero_cst (TREE_TYPE (vectype)));
    2559            0 :                     for (lane = 1; lane < group_size; ++lane)
    2560            0 :                       if (stmts[lane])
    2561            0 :                         zero_ops.quick_push (zero_ops[0]);
    2562              :                       else
    2563            0 :                         zero_ops.quick_push (NULL_TREE);
    2564            0 :                     slp_tree zero = vect_create_new_slp_node (zero_ops);
    2565            0 :                     SLP_TREE_DEF_TYPE (zero) = vect_constant_def;
    2566            0 :                     children.safe_insert (0, zero);
    2567              :                   }
    2568              :                 break;
    2569              :               }
    2570          809 :           for (unsigned i = 1; i < children.length (); ++i)
    2571              :             {
    2572          561 :               slp_tree op0 = children[i - 1];
    2573          561 :               slp_tree op1 = children[i];
    2574          561 :               bool this_two_op = false;
    2575         2169 :               for (unsigned lane = 0; lane < group_size; ++lane)
    2576         3460 :                 if (stmts[lane] && chains[lane][i].code != chains[0][i].code)
    2577              :                   {
    2578              :                     this_two_op = true;
    2579              :                     break;
    2580              :                   }
    2581          561 :               slp_tree child;
    2582          561 :               if (i == children.length () - 1)
    2583          248 :                 child = vect_create_new_slp_node (node, stmts, 2);
    2584              :               else
    2585          313 :                 child = vect_create_new_slp_node (2, ERROR_MARK);
    2586          561 :               if (this_two_op)
    2587              :                 {
    2588          122 :                   vec<std::pair<unsigned, unsigned> > lperm;
    2589          122 :                   lperm.create (group_size);
    2590          462 :                   for (unsigned lane = 0; lane < group_size; ++lane)
    2591          680 :                     lperm.quick_push (std::make_pair
    2592          340 :                       (chains[lane][i].code != chains[0][i].code, lane));
    2593          244 :                   vect_slp_build_two_operator_nodes (child, vectype, op0, op1,
    2594          122 :                                                      (chains[0][i].code == code
    2595              :                                                       ? op_stmt_info
    2596              :                                                       : other_op_stmt_info),
    2597          122 :                                                      (chains[0][i].code == code
    2598              :                                                       ? other_op_stmt_info
    2599              :                                                       : op_stmt_info),
    2600              :                                                      lperm);
    2601              :                 }
    2602              :               else
    2603              :                 {
    2604          439 :                   SLP_TREE_DEF_TYPE (child) = vect_internal_def;
    2605          439 :                   SLP_TREE_VECTYPE (child) = vectype;
    2606          439 :                   SLP_TREE_LANES (child) = group_size;
    2607          439 :                   SLP_TREE_CHILDREN (child).quick_push (op0);
    2608          439 :                   SLP_TREE_CHILDREN (child).quick_push (op1);
    2609          439 :                   SLP_TREE_REPRESENTATIVE (child)
    2610          878 :                     = (chains[0][i].code == code
    2611          439 :                        ? op_stmt_info : other_op_stmt_info);
    2612              :                 }
    2613          561 :               children[i] = child;
    2614              :             }
    2615          248 :           *tree_size += this_tree_size + 1;
    2616          248 :           *max_nunits = this_max_nunits;
    2617         1244 :           while (!chains.is_empty ())
    2618          726 :             chains.pop ().release ();
    2619              :           return node;
    2620              :         }
    2621        13384 : out:
    2622        13405 :       if (dump_enabled_p ())
    2623         2775 :         dump_printf_loc (MSG_NOTE, vect_location,
    2624              :                          "failed to line up SLP graph by re-associating "
    2625              :                          "operations in lanes%s\n",
    2626              :                          !hard_fail ? " trying regular discovery" : "");
    2627        13410 :       while (!children.is_empty ())
    2628            5 :         vect_free_slp_tree (children.pop ());
    2629        13564 :       while (!chains.is_empty ())
    2630          159 :         chains.pop ().release ();
    2631              :       /* Hard-fail, otherwise we might run into quadratic processing of the
    2632              :          chains starting one stmt into the chain again.  */
    2633        13405 :       if (hard_fail)
    2634              :         return NULL;
    2635              :       /* Fall thru to normal processing.  */
    2636        13653 :     }
    2637              : 
    2638              :   /* Get at the operands, verifying they are compatible.  */
    2639      2978481 :   vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
    2640      2978481 :   slp_oprnd_info oprnd_info;
    2641     15193577 :   FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    2642              :     {
    2643     24432410 :       int res = vect_get_and_check_slp_defs (vinfo, vectype,
    2644     12216205 :                                              swap[i], skip_args,
    2645              :                                              stmts, i, &oprnds_info);
    2646     12216205 :       if (res != 0)
    2647       529897 :         matches[(res == -1) ? 0 : i] = false;
    2648     12216205 :       if (!matches[0])
    2649              :         break;
    2650              :     }
    2651     14893616 :   for (i = 0; i < group_size; ++i)
    2652     12125587 :     if (!matches[i])
    2653              :       {
    2654       210452 :         vect_free_oprnd_info (oprnds_info);
    2655       210452 :         return NULL;
    2656              :       }
    2657      8304087 :   swap = NULL;
    2658              : 
    2659      8304087 :   bool has_two_operators_perm = false;
    2660     16608174 :   auto_vec<unsigned> two_op_perm_indices[2];
    2661      2768029 :   vec<stmt_vec_info> two_op_scalar_stmts[2] = {vNULL, vNULL};
    2662              : 
    2663      2780193 :   if (two_operators && oprnds_info.length () == 2 && group_size > 2)
    2664              :     {
    2665         2723 :       unsigned idx = 0;
    2666         2723 :       hash_map<gimple *, unsigned> seen;
    2667         2723 :       vec<slp_oprnd_info> new_oprnds_info
    2668         2723 :         = vect_create_oprnd_info (1, group_size);
    2669         2723 :       bool success = true;
    2670              : 
    2671         2723 :       enum tree_code code = ERROR_MARK;
    2672         2723 :       if (oprnds_info[0]->def_stmts[0]
    2673         2723 :           && is_a<gassign *> (oprnds_info[0]->def_stmts[0]->stmt))
    2674         2665 :         code = gimple_assign_rhs_code (oprnds_info[0]->def_stmts[0]->stmt);
    2675         2723 :       basic_block bb = nullptr;
    2676              : 
    2677         5992 :       for (unsigned j = 0; j < group_size; ++j)
    2678              :         {
    2679        14323 :           FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    2680              :             {
    2681        11054 :               stmt_vec_info stmt_info = oprnd_info->def_stmts[j];
    2682        11054 :               if (!stmt_info
    2683        10843 :                   || !is_a<gassign *> (stmt_info->stmt)
    2684        10840 :                   || gimple_assign_rhs_code (stmt_info->stmt) != code
    2685        19783 :                   || skip_args[i])
    2686              :                 {
    2687              :                   success = false;
    2688         2329 :                   break;
    2689              :                 }
    2690              :               /* Avoid mixing lanes with defs in different basic-blocks.  */
    2691         8729 :               if (!bb)
    2692         2821 :                 bb = gimple_bb (vect_orig_stmt (stmt_info)->stmt);
    2693         7428 :               else if (gimple_bb (vect_orig_stmt (stmt_info)->stmt) != bb)
    2694              :                 {
    2695              :                   success = false;
    2696              :                   break;
    2697              :                 }
    2698              : 
    2699         8725 :               bool exists;
    2700         8725 :               unsigned &stmt_idx
    2701         8725 :                 = seen.get_or_insert (stmt_info->stmt, &exists);
    2702              : 
    2703         8725 :               if (!exists)
    2704              :                 {
    2705         7676 :                   new_oprnds_info[0]->def_stmts.safe_push (stmt_info);
    2706         7676 :                   new_oprnds_info[0]->ops.safe_push (oprnd_info->ops[j]);
    2707         7676 :                   stmt_idx = idx;
    2708         7676 :                   idx++;
    2709              :                 }
    2710              : 
    2711         8725 :               two_op_perm_indices[i].safe_push (stmt_idx);
    2712              :             }
    2713              : 
    2714         5598 :           if (!success)
    2715              :             break;
    2716              :         }
    2717              : 
    2718         2723 :       if (success && idx == group_size)
    2719              :         {
    2720           56 :           if (dump_enabled_p ())
    2721              :             {
    2722            0 :               dump_printf_loc (MSG_NOTE, vect_location,
    2723              :                                "Replace two_operators operands:\n");
    2724              : 
    2725            0 :               FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    2726              :                 {
    2727            0 :                   dump_printf_loc (MSG_NOTE, vect_location,
    2728              :                                    "Operand %u:\n", i);
    2729            0 :                   for (unsigned j = 0; j < group_size; j++)
    2730            0 :                     dump_printf_loc (MSG_NOTE, vect_location, "\tstmt %u %G",
    2731            0 :                                      j, oprnd_info->def_stmts[j]->stmt);
    2732              :                 }
    2733              : 
    2734            0 :               dump_printf_loc (MSG_NOTE, vect_location,
    2735              :                                "With a single operand:\n");
    2736            0 :               for (unsigned j = 0; j < group_size; j++)
    2737            0 :                 dump_printf_loc (MSG_NOTE, vect_location, "\tstmt %u %G",
    2738            0 :                                  j, new_oprnds_info[0]->def_stmts[j]->stmt);
    2739              :             }
    2740              : 
    2741           56 :           two_op_scalar_stmts[0].safe_splice (oprnds_info[0]->def_stmts);
    2742           56 :           two_op_scalar_stmts[1].safe_splice (oprnds_info[1]->def_stmts);
    2743              : 
    2744           56 :           new_oprnds_info[0]->first_op_type = oprnds_info[0]->first_op_type;
    2745           56 :           new_oprnds_info[0]->first_dt = oprnds_info[0]->first_dt;
    2746           56 :           new_oprnds_info[0]->any_pattern = oprnds_info[0]->any_pattern;
    2747           56 :           new_oprnds_info[0]->first_gs_p = oprnds_info[0]->first_gs_p;
    2748           56 :           new_oprnds_info[0]->first_gs_info = oprnds_info[0]->first_gs_info;
    2749              : 
    2750           56 :           vect_free_oprnd_info (oprnds_info);
    2751           56 :           oprnds_info = new_oprnds_info;
    2752           56 :           nops = 1;
    2753           56 :           has_two_operators_perm = true;
    2754              :         }
    2755              :       else
    2756         2667 :         vect_free_oprnd_info (new_oprnds_info);
    2757         2723 :     }
    2758              : 
    2759      5536058 :   auto_vec<slp_tree, 4> children;
    2760              : 
    2761      2768029 :   stmt_info = stmts[0];
    2762              : 
    2763      2768029 :   int reduc_idx = -1;
    2764      2768029 :   int gs_scale = 0;
    2765      2768029 :   tree gs_base = NULL_TREE;
    2766              : 
    2767              :   /* Create SLP_TREE nodes for the definition node/s.  */
    2768      7074900 :   FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    2769              :     {
    2770      4394467 :       slp_tree child = nullptr;
    2771      4394467 :       unsigned int j;
    2772              : 
    2773              :       /* We're skipping certain operands from processing, for example
    2774              :          outer loop reduction initial defs.  */
    2775      4394467 :       if (skip_args[i])
    2776              :         {
    2777       417546 :           children.safe_push (NULL);
    2778      4724417 :           continue;
    2779              :         }
    2780              : 
    2781      3976921 :       if (oprnd_info->first_dt == vect_uninitialized_def)
    2782              :         {
    2783              :           /* COND_EXPR have one too many eventually if the condition
    2784              :              is a SSA name.  */
    2785            0 :           gcc_assert (i == 3 && nops == 4);
    2786            0 :           continue;
    2787              :         }
    2788              : 
    2789      3976921 :       if (oprnd_info->first_gs_p)
    2790              :         {
    2791        21765 :           gs_scale = oprnd_info->first_gs_info.scale;
    2792        21765 :           gs_base = oprnd_info->first_gs_info.base;
    2793              :         }
    2794              : 
    2795      3976921 :       if (is_a <bb_vec_info> (vinfo)
    2796      1564464 :           && oprnd_info->first_dt == vect_internal_def
    2797      4788378 :           && !oprnd_info->any_pattern)
    2798              :         {
    2799              :           /* For BB vectorization, if all defs are the same do not
    2800              :              bother to continue the build along the single-lane
    2801              :              graph but use a splat of the scalar value.  */
    2802       768525 :           stmt_vec_info first_def = oprnd_info->def_stmts[0];
    2803       824490 :           for (j = 1; j < group_size; ++j)
    2804       784394 :             if (oprnd_info->def_stmts[j] != first_def)
    2805              :               break;
    2806       768525 :           if (j == group_size
    2807              :               /* But avoid doing this for loads where we may be
    2808              :                  able to CSE things, unless the stmt is not
    2809              :                  vectorizable.  */
    2810       768525 :               && (!STMT_VINFO_VECTORIZABLE (first_def)
    2811        49356 :                   || !gimple_vuse (first_def->stmt)))
    2812              :             {
    2813        30833 :               if (dump_enabled_p ())
    2814           93 :                 dump_printf_loc (MSG_NOTE, vect_location,
    2815              :                                  "Using a splat of the uniform operand %G",
    2816              :                                  first_def->stmt);
    2817        30833 :               oprnd_info->first_dt = vect_external_def;
    2818              :             }
    2819              :         }
    2820              : 
    2821      3976921 :       if (oprnd_info->first_dt == vect_external_def
    2822      3976921 :           || oprnd_info->first_dt == vect_constant_def)
    2823              :         {
    2824      1388551 :           if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ())
    2825              :             {
    2826              :               tree op0;
    2827              :               tree uniform_val = op0 = oprnd_info->ops[0];
    2828              :               for (j = 1; j < oprnd_info->ops.length (); ++j)
    2829              :                 if (oprnd_info->ops[j]
    2830              :                     && !operand_equal_p (uniform_val, oprnd_info->ops[j]))
    2831              :                   {
    2832              :                     uniform_val = NULL_TREE;
    2833              :                     break;
    2834              :                   }
    2835              :               if (!uniform_val
    2836              :                   && !can_duplicate_and_interleave_p (vinfo,
    2837              :                                                       oprnd_info->ops.length (),
    2838              :                                                       TREE_TYPE (op0)))
    2839              :                 {
    2840              :                   matches[j] = false;
    2841              :                   if (dump_enabled_p ())
    2842              :                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    2843              :                                      "Build SLP failed: invalid type of def "
    2844              :                                      "for variable-length SLP %T\n", op0);
    2845              :                   goto fail;
    2846              :                 }
    2847              :             }
    2848      1388551 :           slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
    2849      1388551 :           SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
    2850      1388551 :           oprnd_info->ops = vNULL;
    2851      1388551 :           children.safe_push (invnode);
    2852      1388551 :           continue;
    2853      1388551 :         }
    2854              : 
    2855              :       /* See which SLP operand a reduction chain continues on.  We want
    2856              :          to chain even PHIs but not backedges.  */
    2857      2588370 :       if (STMT_VINFO_REDUC_DEF (oprnd_info->def_stmts[0])
    2858      2588370 :           || STMT_VINFO_REDUC_IDX (oprnd_info->def_stmts[0]) != -1)
    2859              :         {
    2860       160550 :           if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
    2861              :             {
    2862          638 :               if (oprnd_info->first_dt == vect_double_reduction_def)
    2863          319 :                 reduc_idx = i;
    2864              :             }
    2865       159912 :           else if (is_a <gphi *> (stmt_info->stmt)
    2866       159912 :                    && gimple_phi_num_args
    2867        70241 :                         (as_a <gphi *> (stmt_info->stmt)) != 1)
    2868              :             ;
    2869        89995 :           else if (STMT_VINFO_REDUC_IDX (stmt_info) == -1
    2870          324 :                    && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
    2871              :             ;
    2872        89995 :           else if (reduc_idx == -1)
    2873        85679 :             reduc_idx = i;
    2874              :           else
    2875              :             /* For .COND_* reduction operations the else value can be the
    2876              :                same as one of the operation operands.  The other def
    2877              :                stmts have been moved, so we can't check easily.  Check
    2878              :                it's a call at least.  */
    2879         4316 :             gcc_assert (is_a <gcall *> (stmt_info->stmt));
    2880              :         }
    2881              : 
    2882              :       /* When we have a masked load with uniform mask discover this
    2883              :          as a single-lane mask with a splat permute.  This way we can
    2884              :          recognize this as a masked load-lane by stripping the splat.  */
    2885      2588370 :       if (is_a <gcall *> (STMT_VINFO_STMT (stmt_info))
    2886        34757 :           && gimple_call_internal_p (STMT_VINFO_STMT (stmt_info),
    2887              :                                      IFN_MASK_LOAD)
    2888         4737 :           && STMT_VINFO_GROUPED_ACCESS (stmt_info)
    2889      2588447 :           && ! STMT_VINFO_SLP_VECT_ONLY (DR_GROUP_FIRST_ELEMENT (stmt_info)))
    2890              :         {
    2891           35 :           vec<stmt_vec_info> def_stmts2;
    2892           35 :           def_stmts2.create (1);
    2893           35 :           def_stmts2.quick_push (oprnd_info->def_stmts[0]);
    2894           35 :           child = vect_build_slp_tree (vinfo, def_stmts2, 1,
    2895              :                                        &this_max_nunits,
    2896              :                                        matches, limit,
    2897              :                                        &this_tree_size, bst_map);
    2898           35 :           if (child)
    2899              :             {
    2900           35 :               slp_tree pnode = vect_create_new_slp_node (1, VEC_PERM_EXPR);
    2901           35 :               SLP_TREE_VECTYPE (pnode) = SLP_TREE_VECTYPE (child);
    2902           35 :               SLP_TREE_LANES (pnode) = group_size;
    2903           35 :               SLP_TREE_SCALAR_STMTS (pnode).create (group_size);
    2904           35 :               SLP_TREE_LANE_PERMUTATION (pnode).create (group_size);
    2905          210 :               for (unsigned k = 0; k < group_size; ++k)
    2906              :                 {
    2907          175 :                   SLP_TREE_SCALAR_STMTS (pnode)
    2908          175 :                     .quick_push (oprnd_info->def_stmts[0]);
    2909          175 :                   SLP_TREE_LANE_PERMUTATION (pnode)
    2910          175 :                     .quick_push (std::make_pair (0u, 0u));
    2911              :                 }
    2912           35 :               SLP_TREE_CHILDREN (pnode).quick_push (child);
    2913           35 :               pnode->max_nunits = child->max_nunits;
    2914           35 :               children.safe_push (pnode);
    2915           35 :               oprnd_info->def_stmts = vNULL;
    2916           35 :               continue;
    2917           35 :             }
    2918              :           else
    2919            0 :             def_stmts2.release ();
    2920              :         }
    2921              : 
    2922      2588335 :       if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
    2923              :                                         group_size, &this_max_nunits,
    2924              :                                         matches, limit,
    2925              :                                         &this_tree_size, bst_map)) != NULL)
    2926              :         {
    2927      2135375 :           oprnd_info->def_stmts = vNULL;
    2928      2135375 :           children.safe_push (child);
    2929      2135375 :           continue;
    2930              :         }
    2931              : 
    2932              :       /* If the SLP build for operand zero failed and operand zero
    2933              :          and one can be commutated try that for the scalar stmts
    2934              :          that failed the match.  */
    2935       452960 :       if (i == 0
    2936              :           /* A first scalar stmt mismatch signals a fatal mismatch.  */
    2937       356413 :           && matches[0]
    2938              :           /* ???  For COND_EXPRs we can swap the comparison operands
    2939              :              as well as the arms under some constraints.  */
    2940       168514 :           && (nops == 2 || nops == 3)
    2941       101227 :           && oprnds_info[1]->first_dt == vect_internal_def
    2942        55262 :           && (is_gimple_assign (stmt_info->stmt)
    2943        11437 :               || is_gimple_call (stmt_info->stmt))
    2944              :           /* Swapping operands for reductions breaks assumptions later on.  */
    2945       496798 :           && STMT_VINFO_REDUC_IDX (stmt_info) == -1)
    2946              :         {
    2947              :           /* See whether we can swap the matching or the non-matching
    2948              :              stmt operands.  */
    2949              :           bool swap_not_matching = true;
    2950        49259 :           do
    2951              :             {
    2952      7033900 :               for (j = 0; j < group_size; ++j)
    2953              :                 {
    2954      6998334 :                   if (matches[j] != !swap_not_matching)
    2955        64107 :                     continue;
    2956      6934227 :                   stmt_vec_info stmt_info = stmts[j];
    2957              :                   /* Verify if we can swap operands of this stmt.  */
    2958      6934227 :                   if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    2959              :                     {
    2960      6934201 :                       tree_code code = gimple_assign_rhs_code (stmt);
    2961      6934201 :                       if (! commutative_tree_code (code)
    2962      6934201 :                           && ! commutative_ternary_tree_code (code))
    2963              :                         {
    2964        13669 :                           if (!swap_not_matching)
    2965         6279 :                             goto fail;
    2966              :                           swap_not_matching = false;
    2967              :                           break;
    2968              :                         }
    2969              :                     }
    2970      6984667 :                   else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
    2971              :                     {
    2972           26 :                       internal_fn fn = (gimple_call_internal_p (call)
    2973           26 :                                         ? gimple_call_internal_fn (call)
    2974              :                                         : IFN_LAST);
    2975           26 :                       if ((! commutative_binary_fn_p (fn)
    2976           26 :                            && ! commutative_ternary_fn_p (fn))
    2977           28 :                           || first_commutative_argument (fn) != 0)
    2978              :                         {
    2979           24 :                           if (!swap_not_matching)
    2980           12 :                             goto fail;
    2981              :                           swap_not_matching = false;
    2982              :                           break;
    2983              :                         }
    2984              :                     }
    2985              :                 }
    2986              :             }
    2987        42968 :           while (j != group_size);
    2988              : 
    2989              :           /* Swap mismatched definition stmts.  */
    2990        35566 :           if (dump_enabled_p ())
    2991          345 :             dump_printf_loc (MSG_NOTE, vect_location,
    2992              :                              "Re-trying with swapped operands of stmts ");
    2993      7012000 :           for (j = 0; j < group_size; ++j)
    2994      6976434 :             if (matches[j] == !swap_not_matching)
    2995              :               {
    2996     13840756 :                 std::swap (oprnds_info[0]->def_stmts[j],
    2997      6920378 :                            oprnds_info[1]->def_stmts[j]);
    2998     13840756 :                 std::swap (oprnds_info[0]->ops[j],
    2999      6920378 :                            oprnds_info[1]->ops[j]);
    3000      6920378 :                 if (dump_enabled_p ())
    3001          938 :                   dump_printf (MSG_NOTE, "%d ", j);
    3002              :               }
    3003        35566 :           if (dump_enabled_p ())
    3004          345 :             dump_printf (MSG_NOTE, "\n");
    3005              :           /* After swapping some operands we lost track whether an
    3006              :              operand has any pattern defs so be conservative here.  */
    3007        67903 :           if (oprnds_info[0]->any_pattern || oprnds_info[1]->any_pattern)
    3008         3273 :             oprnds_info[0]->any_pattern = oprnds_info[1]->any_pattern = true;
    3009              :           /* And try again with scratch 'matches' ... */
    3010        35566 :           bool *tem = XALLOCAVEC (bool, group_size);
    3011        35566 :           if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
    3012              :                                             group_size, &this_max_nunits,
    3013              :                                             tem, limit,
    3014              :                                             &this_tree_size, bst_map)) != NULL)
    3015              :             {
    3016         5592 :               oprnd_info->def_stmts = vNULL;
    3017         5592 :               children.safe_push (child);
    3018         5592 :               continue;
    3019              :             }
    3020              :         }
    3021       447368 : fail:
    3022              : 
    3023              :       /* If the SLP build failed and we analyze a basic-block
    3024              :          simply treat nodes we fail to build as externally defined
    3025              :          (and thus build vectors from the scalar defs).
    3026              :          The cost model will reject outright expensive cases.
    3027              :          ???  This doesn't treat cases where permutation ultimately
    3028              :          fails (or we don't try permutation below).  Ideally we'd
    3029              :          even compute a permutation that will end up with the maximum
    3030              :          SLP tree size...  */
    3031       447368 :       if (is_a <bb_vec_info> (vinfo)
    3032              :           /* ???  Rejecting patterns this way doesn't work.  We'd have to
    3033              :              do extra work to cancel the pattern so the uses see the
    3034              :              scalar version.  */
    3035       394201 :           && !is_pattern_stmt_p (stmt_info)
    3036       817622 :           && !oprnd_info->any_pattern)
    3037              :         {
    3038              :           /* But if there's a leading vector sized set of matching stmts
    3039              :              fail here so we can split the group.  This matches the condition
    3040              :              vect_analyze_slp_instance uses.  */
    3041              :           /* ???  We might want to split here and combine the results to support
    3042              :              multiple vector sizes better.  */
    3043       580447 :           for (j = 0; j < group_size; ++j)
    3044       580447 :             if (!matches[j])
    3045              :               break;
    3046       369993 :           if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype))
    3047       369964 :               && vect_slp_can_convert_to_external (oprnd_info->def_stmts))
    3048              :             {
    3049       359772 :               if (dump_enabled_p ())
    3050          501 :                 dump_printf_loc (MSG_NOTE, vect_location,
    3051              :                                  "Building vector operands from scalars\n");
    3052       359772 :               this_tree_size++;
    3053       359772 :               child = vect_create_new_slp_node (oprnd_info->ops);
    3054       359772 :               children.safe_push (child);
    3055       359772 :               oprnd_info->ops = vNULL;
    3056       359772 :               continue;
    3057              :             }
    3058              :         }
    3059              : 
    3060        87596 :       gcc_assert (child == NULL);
    3061        98481 :       FOR_EACH_VEC_ELT (children, j, child)
    3062        10885 :         if (child)
    3063        10885 :           vect_free_slp_tree (child);
    3064        87596 :       vect_free_oprnd_info (oprnds_info);
    3065        87596 :       return NULL;
    3066              :     }
    3067              : 
    3068      2680433 :   vect_free_oprnd_info (oprnds_info);
    3069              : 
    3070              :   /* If all children of this node are built up from uniform scalars,
    3071              :      or if it needs more than one possibly expensive vector construction,
    3072              :      then just throw it away, causing it to be built up from scalars.
    3073              :      The exception is the SLP node for the vector store.  */
    3074      2680433 :   if (is_a <bb_vec_info> (vinfo)
    3075      1090288 :       && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
    3076              :       /* ???  Rejecting patterns this way doesn't work.  We'd have to
    3077              :          do extra work to cancel the pattern so the uses see the
    3078              :          scalar version.  */
    3079      3113888 :       && !is_pattern_stmt_p (stmt_info))
    3080              :     {
    3081              :       slp_tree child;
    3082              :       unsigned j;
    3083              :       bool all_uniform_p = true;
    3084              :       unsigned n_vector_builds = 0;
    3085      1231550 :       FOR_EACH_VEC_ELT (children, j, child)
    3086              :         {
    3087       823332 :           if (!child)
    3088              :             ;
    3089       823332 :           else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
    3090              :             all_uniform_p = false;
    3091       587755 :           else if (!vect_slp_tree_uniform_p (child))
    3092              :             {
    3093       447893 :               all_uniform_p = false;
    3094       447893 :               if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
    3095       413831 :                 n_vector_builds++;
    3096              :             }
    3097              :         }
    3098       408218 :       if (all_uniform_p
    3099       408218 :           || n_vector_builds > 1
    3100       692787 :           || (n_vector_builds == children.length ()
    3101        30290 :               && is_a <gphi *> (stmt_info->stmt)))
    3102              :         {
    3103              :           /* Roll back.  */
    3104       128445 :           matches[0] = false;
    3105       408342 :           FOR_EACH_VEC_ELT (children, j, child)
    3106       279897 :             if (child)
    3107       279897 :               vect_free_slp_tree (child);
    3108              : 
    3109       128445 :           if (dump_enabled_p ())
    3110          129 :             dump_printf_loc (MSG_NOTE, vect_location,
    3111              :                              "Building parent vector operands from "
    3112              :                              "scalars instead\n");
    3113       128445 :           return NULL;
    3114              :         }
    3115              :     }
    3116              : 
    3117      2551988 :   *tree_size += this_tree_size + 1;
    3118      2551988 :   *max_nunits = this_max_nunits;
    3119              : 
    3120      2551988 :   if (two_operators)
    3121              :     {
    3122              :       /* ???  We'd likely want to either cache in bst_map sth like
    3123              :          { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
    3124              :          the true { a+b, a+b, a+b, a+b } ... but there we don't have
    3125              :          explicit stmts to put in so the keying on 'stmts' doesn't
    3126              :          work (but we have the same issue with nodes that use 'ops').  */
    3127              : 
    3128         5908 :       if (has_two_operators_perm)
    3129              :         {
    3130           22 :           slp_tree child = children[0];
    3131           22 :           children.truncate (0);
    3132           66 :           for (i = 0; i < 2; i++)
    3133              :             {
    3134           44 :               slp_tree pnode
    3135           44 :                 = vect_create_new_slp_node (two_op_scalar_stmts[i], 2);
    3136           44 :               SLP_TREE_CODE (pnode) = VEC_PERM_EXPR;
    3137           44 :               SLP_TREE_VECTYPE (pnode) = vectype;
    3138           44 :               SLP_TREE_CHILDREN (pnode).quick_push (child);
    3139           44 :               SLP_TREE_CHILDREN (pnode).quick_push (child);
    3140           44 :               lane_permutation_t& perm = SLP_TREE_LANE_PERMUTATION (pnode);
    3141           44 :               children.safe_push (pnode);
    3142              : 
    3143          476 :               for (unsigned j = 0; j < stmts.length (); j++)
    3144          432 :                 perm.safe_push (std::make_pair (0, two_op_perm_indices[i][j]));
    3145              :             }
    3146              : 
    3147           22 :           SLP_TREE_REF_COUNT (child) += 4;
    3148              :         }
    3149              : 
    3150         5908 :       slp_tree one = new _slp_tree;
    3151         5908 :       slp_tree two = new _slp_tree;
    3152         5908 :       SLP_TREE_DEF_TYPE (one) = vect_internal_def;
    3153         5908 :       SLP_TREE_DEF_TYPE (two) = vect_internal_def;
    3154         5908 :       SLP_TREE_VECTYPE (one) = vectype;
    3155         5908 :       SLP_TREE_VECTYPE (two) = vectype;
    3156         5908 :       SLP_TREE_CHILDREN (one).safe_splice (children);
    3157         5908 :       SLP_TREE_CHILDREN (two).safe_splice (children);
    3158         5908 :       slp_tree child;
    3159        23634 :       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
    3160        11818 :         SLP_TREE_REF_COUNT (child)++;
    3161              : 
    3162              :       /* Here we record the original defs since this
    3163              :          node represents the final lane configuration.  */
    3164         5908 :       node = vect_create_new_slp_node (node, stmts, 2);
    3165         5908 :       SLP_TREE_VECTYPE (node) = vectype;
    3166         5908 :       SLP_TREE_CODE (node) = VEC_PERM_EXPR;
    3167         5908 :       SLP_TREE_CHILDREN (node).quick_push (one);
    3168         5908 :       SLP_TREE_CHILDREN (node).quick_push (two);
    3169         5908 :       enum tree_code code0 = ERROR_MARK;
    3170         5908 :       enum tree_code ocode = ERROR_MARK;
    3171         5908 :       if (gassign *stmt = dyn_cast <gassign *> (stmts[0]->stmt))
    3172         5906 :         code0 = gimple_assign_rhs_code (stmt);
    3173         5908 :       stmt_vec_info ostmt_info;
    3174         5908 :       unsigned j = 0;
    3175        22009 :       FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
    3176              :         {
    3177        16101 :           int op = 0;
    3178        16101 :           if (gassign *ostmt = dyn_cast <gassign *> (ostmt_info->stmt))
    3179              :             {
    3180        16097 :               if (gimple_assign_rhs_code (ostmt) != code0)
    3181              :                 {
    3182         8083 :                   ocode = gimple_assign_rhs_code (ostmt);
    3183              :                   op = 1;
    3184              :                   j = i;
    3185              :                 }
    3186              :             }
    3187              :           else
    3188              :             {
    3189            8 :               if (gimple_call_combined_fn (stmts[0]->stmt)
    3190            4 :                   != gimple_call_combined_fn (ostmt_info->stmt))
    3191              :                 {
    3192            2 :                   op = 1;
    3193            2 :                   j = i;
    3194              :                 }
    3195              :             }
    3196        16101 :           SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (op, i));
    3197              :         }
    3198         5908 :       SLP_TREE_CODE (one) = code0;
    3199         5908 :       SLP_TREE_CODE (two) = ocode;
    3200         5908 :       SLP_TREE_LANES (one) = stmts.length ();
    3201         5908 :       SLP_TREE_LANES (two) = stmts.length ();
    3202         5908 :       SLP_TREE_REPRESENTATIVE (one) = stmts[0];
    3203         5908 :       SLP_TREE_REPRESENTATIVE (two) = stmts[j];
    3204              : 
    3205         5908 :       return node;
    3206              :     }
    3207              : 
    3208      2546080 :   node = vect_create_new_slp_node (node, stmts, nops);
    3209      2546080 :   SLP_TREE_VECTYPE (node) = vectype;
    3210      2546080 :   SLP_TREE_CHILDREN (node).splice (children);
    3211      2546080 :   SLP_TREE_GS_SCALE (node) = gs_scale;
    3212      2546080 :   SLP_TREE_GS_BASE (node) = gs_base;
    3213      2546080 :   if (reduc_idx != -1)
    3214              :     {
    3215        80848 :       gcc_assert (STMT_VINFO_REDUC_IDX (stmt_info) != -1
    3216              :                   || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
    3217              :                   || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def);
    3218        80848 :       SLP_TREE_REDUC_IDX (node) = reduc_idx;
    3219        80848 :       node->cycle_info.id = SLP_TREE_CHILDREN (node)[reduc_idx]->cycle_info.id;
    3220              :     }
    3221              :   /* When reaching the reduction PHI, create a vect_reduc_info.  */
    3222      2465232 :   else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
    3223      2465232 :             || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
    3224      2465232 :            && is_a <gphi *> (STMT_VINFO_STMT (stmt_info)))
    3225              :     {
    3226        71331 :       loop_vec_info loop_vinfo = as_a <loop_vec_info> (vinfo);
    3227        71331 :       gcc_assert (STMT_VINFO_REDUC_IDX (stmt_info) == -1);
    3228        71331 :       node->cycle_info.id = loop_vinfo->reduc_infos.length ();
    3229        71331 :       vect_reduc_info reduc_info = new vect_reduc_info_s ();
    3230        71331 :       loop_vinfo->reduc_infos.safe_push (reduc_info);
    3231        71331 :       stmt_vec_info reduc_phi = stmt_info;
    3232              :       /* ???  For double reductions vect_is_simple_reduction stores the
    3233              :          reduction type and code on the inner loop header PHI.  */
    3234        71331 :       if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
    3235              :         {
    3236          319 :           use_operand_p use_p;
    3237          319 :           gimple *use_stmt;
    3238          319 :           bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
    3239              :                                      &use_p, &use_stmt);
    3240          319 :           gcc_assert (res);
    3241          319 :           reduc_phi = loop_vinfo->lookup_stmt (use_stmt);
    3242              :         }
    3243        71331 :       VECT_REDUC_INFO_DEF_TYPE (reduc_info) = STMT_VINFO_DEF_TYPE (stmt_info);
    3244        71331 :       VECT_REDUC_INFO_TYPE (reduc_info) = STMT_VINFO_REDUC_TYPE (reduc_phi);
    3245        71331 :       VECT_REDUC_INFO_CODE (reduc_info) = STMT_VINFO_REDUC_CODE (reduc_phi);
    3246        71331 :       VECT_REDUC_INFO_FN (reduc_info) = IFN_LAST;
    3247              :     }
    3248              :   return node;
    3249      8304087 : }
    3250              : 
    3251              : /* Dump a single SLP tree NODE.  */
    3252              : 
    3253              : static void
    3254       438634 : vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
    3255              :                      slp_tree node)
    3256              : {
    3257       438634 :   unsigned i, j;
    3258       438634 :   slp_tree child;
    3259       438634 :   stmt_vec_info stmt_info;
    3260       438634 :   tree op;
    3261              : 
    3262       438634 :   dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
    3263       438634 :   dump_user_location_t user_loc = loc.get_user_location ();
    3264       438634 :   dump_printf_loc (metadata, user_loc,
    3265              :                    "node%s %p (max_nunits=" HOST_WIDE_INT_PRINT_UNSIGNED
    3266              :                    ", refcnt=%u)",
    3267       438634 :                    SLP_TREE_DEF_TYPE (node) == vect_external_def
    3268              :                    ? " (external)"
    3269              :                    : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
    3270       423395 :                       ? " (constant)"
    3271              :                       : ""), (void *) node,
    3272       438634 :                    estimated_poly_value (node->max_nunits),
    3273              :                                          SLP_TREE_REF_COUNT (node));
    3274       438634 :   if (SLP_TREE_VECTYPE (node))
    3275       372258 :     dump_printf (metadata, " %T", SLP_TREE_VECTYPE (node));
    3276       438634 :   dump_printf (metadata, "%s",
    3277       438634 :                node->avoid_stlf_fail ? " (avoid-stlf-fail)" : "");
    3278       438634 :   if (node->cycle_info.id != -1 || node->cycle_info.reduc_idx != -1)
    3279        23089 :     dump_printf (metadata, " cycle %d, link %d", node->cycle_info.id,
    3280              :                  node->cycle_info.reduc_idx);
    3281       438634 :   dump_printf (metadata, "\n");
    3282       438634 :   if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
    3283              :     {
    3284       356896 :       if (SLP_TREE_PERMUTE_P (node))
    3285        13548 :         dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
    3286              :       else
    3287       343348 :         dump_printf_loc (metadata, user_loc, "op template: %G",
    3288       343348 :                          SLP_TREE_REPRESENTATIVE (node)->stmt);
    3289              :     }
    3290       438634 :   if (SLP_TREE_SCALAR_STMTS (node).exists ())
    3291       854804 :     FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    3292       505949 :       if (stmt_info)
    3293       500668 :         dump_printf_loc (metadata, user_loc, "\t%sstmt %u %G",
    3294       500668 :                          STMT_VINFO_LIVE_P (stmt_info) ? "[l] " : "",
    3295              :                          i, stmt_info->stmt);
    3296              :       else
    3297         5281 :         dump_printf_loc (metadata, user_loc, "\tstmt %u ---\n", i);
    3298              :   else
    3299              :     {
    3300        89779 :       dump_printf_loc (metadata, user_loc, "\t{ ");
    3301       287496 :       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
    3302       107938 :         dump_printf (metadata, "%T%s ", op,
    3303       107938 :                      i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
    3304        89779 :       dump_printf (metadata, "}\n");
    3305              :     }
    3306       438634 :   if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
    3307              :     {
    3308        62702 :       dump_printf_loc (metadata, user_loc, "\tload permutation {");
    3309       205772 :       FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
    3310        80368 :         dump_printf (dump_kind, " %u", j);
    3311        62702 :       dump_printf (dump_kind, " }\n");
    3312              :     }
    3313       438634 :   if (SLP_TREE_LANE_PERMUTATION (node).exists ())
    3314              :     {
    3315        13556 :       dump_printf_loc (metadata, user_loc, "\tlane permutation {");
    3316        64464 :       for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
    3317        37352 :         dump_printf (dump_kind, " %u[%u]",
    3318        37352 :                      SLP_TREE_LANE_PERMUTATION (node)[i].first,
    3319        37352 :                      SLP_TREE_LANE_PERMUTATION (node)[i].second);
    3320        13556 :       dump_printf (dump_kind, " }%s\n",
    3321        13556 :                    node->ldst_lanes ? " (load-lanes)" : "");
    3322              :     }
    3323       438634 :   if (SLP_TREE_CHILDREN (node).is_empty ())
    3324       166609 :     return;
    3325       272025 :   dump_printf_loc (metadata, user_loc, "\tchildren");
    3326       990121 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    3327       446071 :     dump_printf (dump_kind, " %p", (void *)child);
    3328       272025 :   dump_printf (dump_kind, "%s\n",
    3329       272025 :                node->ldst_lanes && !SLP_TREE_LANE_PERMUTATION (node).exists ()
    3330              :                ? " (store-lanes)" : "");
    3331              : }
    3332              : 
    3333              : DEBUG_FUNCTION void
    3334            0 : debug (slp_tree node)
    3335              : {
    3336            0 :   debug_dump_context ctx;
    3337            0 :   vect_print_slp_tree (MSG_NOTE,
    3338            0 :                        dump_location_t::from_location_t (UNKNOWN_LOCATION),
    3339              :                        node);
    3340            0 : }
    3341              : 
    3342              : /* Recursive helper for the dot producer below.  */
    3343              : 
    3344              : static void
    3345            0 : dot_slp_tree (FILE *f, slp_tree node, hash_set<slp_tree> &visited)
    3346              : {
    3347            0 :   if (visited.add (node))
    3348              :     return;
    3349              : 
    3350            0 :   fprintf (f, "\"%p\" [label=\"", (void *)node);
    3351            0 :   vect_print_slp_tree (MSG_NOTE,
    3352            0 :                        dump_location_t::from_location_t (UNKNOWN_LOCATION),
    3353              :                        node);
    3354            0 :   fprintf (f, "\"];\n");
    3355              : 
    3356              : 
    3357            0 :   for (slp_tree child : SLP_TREE_CHILDREN (node))
    3358            0 :     fprintf (f, "\"%p\" -> \"%p\";", (void *)node, (void *)child);
    3359              : 
    3360            0 :   for (slp_tree child : SLP_TREE_CHILDREN (node))
    3361            0 :     if (child)
    3362            0 :       dot_slp_tree (f, child, visited);
    3363              : }
    3364              : 
    3365              : DEBUG_FUNCTION void
    3366            0 : dot_slp_tree (const char *fname, slp_tree node)
    3367              : {
    3368            0 :   FILE *f = fopen (fname, "w");
    3369            0 :   fprintf (f, "digraph {\n");
    3370            0 :   fflush (f);
    3371            0 :     {
    3372            0 :       debug_dump_context ctx (f);
    3373            0 :       hash_set<slp_tree> visited;
    3374            0 :       dot_slp_tree (f, node, visited);
    3375            0 :     }
    3376            0 :   fflush (f);
    3377            0 :   fprintf (f, "}\n");
    3378            0 :   fclose (f);
    3379            0 : }
    3380              : 
    3381              : DEBUG_FUNCTION void
    3382            0 : dot_slp_tree (const char *fname, const vec<slp_instance> &slp_instances)
    3383              : {
    3384            0 :   FILE *f = fopen (fname, "w");
    3385            0 :   fprintf (f, "digraph {\n");
    3386            0 :   fflush (f);
    3387            0 :     {
    3388            0 :       debug_dump_context ctx (f);
    3389            0 :       hash_set<slp_tree> visited;
    3390            0 :       for (auto inst : slp_instances)
    3391            0 :         dot_slp_tree (f, SLP_INSTANCE_TREE (inst), visited);
    3392            0 :     }
    3393            0 :   fflush (f);
    3394            0 :   fprintf (f, "}\n");
    3395            0 :   fclose (f);
    3396            0 : }
    3397              : 
    3398              : /* Dump the SLP graph rooted at NODE using flags specified in DUMP_KIND.  */
    3399              : 
    3400              : static void
    3401       477768 : vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
    3402              :                       slp_tree node, hash_set<slp_tree> &visited)
    3403              : {
    3404       477768 :   unsigned i;
    3405       477768 :   slp_tree child;
    3406              : 
    3407       477768 :   if (visited.add (node))
    3408       477768 :     return;
    3409              : 
    3410       438184 :   vect_print_slp_tree (dump_kind, loc, node);
    3411              : 
    3412      1321925 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    3413       445557 :     if (child)
    3414       403596 :       vect_print_slp_graph (dump_kind, loc, child, visited);
    3415              : }
    3416              : 
    3417              : static void
    3418        45709 : vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
    3419              :                       slp_tree entry)
    3420              : {
    3421        45709 :   hash_set<slp_tree> visited;
    3422        45709 :   vect_print_slp_graph (dump_kind, loc, entry, visited);
    3423        45709 : }
    3424              : 
    3425              : DEBUG_FUNCTION void
    3426            0 : debug (slp_instance instance)
    3427              : {
    3428            0 :   debug_dump_context ctx;
    3429            0 :   vect_print_slp_graph (MSG_NOTE,
    3430            0 :                         dump_location_t::from_location_t (UNKNOWN_LOCATION),
    3431              :                         SLP_INSTANCE_TREE (instance));
    3432            0 : }
    3433              : 
    3434              : /* Mark the tree rooted at NODE with PURE_SLP.  */
    3435              : 
    3436              : static void
    3437      2325104 : vect_mark_slp_stmts (vec_info *vinfo, slp_tree node,
    3438              :                      hash_set<slp_tree> &visited)
    3439              : {
    3440      2325104 :   int i;
    3441      2325104 :   stmt_vec_info stmt_info;
    3442      2325104 :   slp_tree child;
    3443              : 
    3444      2325104 :   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    3445              :     return;
    3446              : 
    3447      1368327 :   if (visited.add (node))
    3448              :     return;
    3449              : 
    3450      4266228 :   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    3451      3004134 :     if (stmt_info)
    3452              :       {
    3453      3004134 :         STMT_SLP_TYPE (stmt_info) = pure_slp;
    3454              :         /* ???  For .MASK_LOAD and .MASK_STORE detected as load/store-lanes
    3455              :            when there is the mask_conversion pattern applied we have lost the
    3456              :            alternate lanes of the uniform mask which nevertheless
    3457              :            have separate pattern defs.  To not confuse hybrid
    3458              :            analysis we mark those as covered as well here.  */
    3459      3004134 :         if (node->ldst_lanes)
    3460      3004134 :           if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
    3461            0 :             if (gimple_call_internal_p (call, IFN_MASK_LOAD)
    3462            0 :                 || gimple_call_internal_p (call, IFN_MASK_STORE))
    3463              :               {
    3464            0 :                 tree mask = gimple_call_arg (call,
    3465              :                                              internal_fn_mask_index
    3466            0 :                                              (gimple_call_internal_fn (call)));
    3467            0 :                 if (TREE_CODE (mask) == SSA_NAME)
    3468            0 :                   if (stmt_vec_info mask_info = vinfo->lookup_def (mask))
    3469              :                     {
    3470            0 :                       mask_info = vect_stmt_to_vectorize (mask_info);
    3471            0 :                       STMT_SLP_TYPE (mask_info) = pure_slp;
    3472              :                     }
    3473              :               }
    3474              :       }
    3475              : 
    3476      2811520 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    3477      1549426 :     if (child)
    3478      1549426 :       vect_mark_slp_stmts (vinfo, child, visited);
    3479              : }
    3480              : 
    3481              : static void
    3482       775678 : vect_mark_slp_stmts (vec_info *vinfo, slp_tree node)
    3483              : {
    3484       775678 :   hash_set<slp_tree> visited;
    3485       775678 :   vect_mark_slp_stmts (vinfo, node, visited);
    3486       775678 : }
    3487              : 
    3488              : /* Mark the statements of the tree rooted at NODE as relevant (vect_used).  */
    3489              : 
    3490              : static void
    3491      2325104 : vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
    3492              : {
    3493      2325104 :   int i;
    3494      2325104 :   stmt_vec_info stmt_info;
    3495      2325104 :   slp_tree child;
    3496              : 
    3497      2325104 :   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    3498              :     return;
    3499              : 
    3500      1368327 :   if (visited.add (node))
    3501              :     return;
    3502              : 
    3503      4266228 :   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    3504      3004134 :     if (stmt_info)
    3505              :       {
    3506      3004134 :         gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
    3507              :                     || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
    3508      3004134 :         STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
    3509              :       }
    3510              : 
    3511      2811520 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    3512      1549426 :     if (child)
    3513      1549426 :       vect_mark_slp_stmts_relevant (child, visited);
    3514              : }
    3515              : 
    3516              : static void
    3517       775678 : vect_mark_slp_stmts_relevant (slp_tree node)
    3518              : {
    3519       775678 :   hash_set<slp_tree> visited;
    3520       775678 :   vect_mark_slp_stmts_relevant (node, visited);
    3521       775678 : }
    3522              : 
    3523              : 
    3524              : /* Gather loads in the SLP graph NODE and populate the INST loads array.  */
    3525              : 
    3526              : static void
    3527      9207471 : vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
    3528              :                        hash_set<slp_tree> &visited)
    3529              : {
    3530      9207471 :   if (!node || visited.add (node))
    3531      1409878 :     return;
    3532              : 
    3533      7797593 :   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    3534              :     return;
    3535              : 
    3536      5684240 :   if (!SLP_TREE_PERMUTE_P (node))
    3537              :     {
    3538      5507132 :       stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
    3539      5507132 :       if (STMT_VINFO_DATA_REF (stmt_info)
    3540      2449845 :           && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
    3541      1360004 :         loads.safe_push (node);
    3542              :     }
    3543              : 
    3544              :   unsigned i;
    3545              :   slp_tree child;
    3546     12846482 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    3547      7162242 :     vect_gather_slp_loads (loads, child, visited);
    3548              : }
    3549              : 
    3550              : 
    3551              : /* Find the last scalar stmt in NODE.  */
    3552              : 
    3553              : stmt_vec_info
    3554      2717662 : vect_find_last_scalar_stmt_in_slp (slp_tree node)
    3555              : {
    3556      2717662 :   stmt_vec_info last = NULL;
    3557      2717662 :   stmt_vec_info stmt_vinfo;
    3558              : 
    3559      9902102 :   for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
    3560      7184440 :     if (stmt_vinfo)
    3561              :       {
    3562      7184440 :         stmt_vinfo = vect_orig_stmt (stmt_vinfo);
    3563      7184440 :         last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
    3564              :       }
    3565              : 
    3566      2717662 :   return last;
    3567              : }
    3568              : 
    3569              : /* Find the first stmt in NODE.  */
    3570              : 
    3571              : stmt_vec_info
    3572       530923 : vect_find_first_scalar_stmt_in_slp (slp_tree node)
    3573              : {
    3574       530923 :   stmt_vec_info first = NULL;
    3575       530923 :   stmt_vec_info stmt_vinfo;
    3576              : 
    3577      1796305 :   for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
    3578      1265382 :     if (stmt_vinfo)
    3579              :       {
    3580      1262688 :         stmt_vinfo = vect_orig_stmt (stmt_vinfo);
    3581      1262688 :         if (!first
    3582      1262688 :             || get_later_stmt (stmt_vinfo, first) == first)
    3583              :           first = stmt_vinfo;
    3584              :       }
    3585              : 
    3586       530923 :   return first;
    3587              : }
    3588              : 
    3589              : /* Splits a group of stores, currently beginning at FIRST_VINFO, into
    3590              :    two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
    3591              :    (also containing the first GROUP1_SIZE stmts, since stores are
    3592              :    consecutive), the second containing the remainder.
    3593              :    Return the first stmt in the second group.  */
    3594              : 
    3595              : static stmt_vec_info
    3596       156088 : vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
    3597              : {
    3598       156088 :   gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
    3599       156088 :   gcc_assert (group1_size > 0);
    3600       156088 :   int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
    3601       156088 :   gcc_assert (group2_size > 0);
    3602       156088 :   DR_GROUP_SIZE (first_vinfo) = group1_size;
    3603              : 
    3604       156088 :   stmt_vec_info stmt_info = first_vinfo;
    3605       522407 :   for (unsigned i = group1_size; i > 1; i--)
    3606              :     {
    3607       366319 :       stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
    3608       366319 :       gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
    3609              :     }
    3610              :   /* STMT_INFO is now the last element of the first group.  */
    3611       156088 :   stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
    3612       156088 :   DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
    3613              : 
    3614       156088 :   DR_GROUP_SIZE (group2) = group2_size;
    3615       436335 :   for (stmt_info = group2; stmt_info;
    3616       280247 :        stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
    3617              :     {
    3618       280247 :       DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
    3619       280247 :       gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
    3620              :     }
    3621              : 
    3622              :   /* For the second group, the DR_GROUP_GAP is that before the original group,
    3623              :      plus skipping over the first vector.  */
    3624       156088 :   DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
    3625              : 
    3626              :   /* DR_GROUP_GAP of the first group now has to skip over the second group too.  */
    3627       156088 :   DR_GROUP_GAP (first_vinfo) += group2_size;
    3628              : 
    3629       156088 :   if (dump_enabled_p ())
    3630           61 :     dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
    3631              :                      group1_size, group2_size);
    3632              : 
    3633       156088 :   return group2;
    3634              : }
    3635              : 
    3636              : /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
    3637              :    statements and a vector of NUNITS elements, i.e. the common multiple of the two divided by GROUP_SIZE.  */
    3638              : 
    3639              : static poly_uint64
    3640      3673452 : calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
    3641              : {
    3642      3673452 :   return exact_div (common_multiple (nunits, group_size), group_size);
    3643              : }
    3644              : 
    3645              : /* Helper that checks whether ROOT is a load node: a non-permute internal def whose representative is a grouped read.  */
    3646              : 
    3647              : static inline bool
    3648           54 : vect_is_slp_load_node  (slp_tree root)
    3649              : {
    3650           54 :   return (!SLP_TREE_PERMUTE_P (root)
    3651           54 :           && SLP_TREE_DEF_TYPE (root) == vect_internal_def
    3652           48 :           && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
    3653           94 :           && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root))));
    3654              : }
    3655              : 
    3656              : 
    3657              : /* Helper function of optimize_load_redistribution that performs the operation
    3658              :    recursively.  Returns the load node that should replace ROOT, or NULL if there is none; LOAD_MAP caches the result per node.  */
    3659              : 
    3660              : static slp_tree
    3661        20132 : optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
    3662              :                                 vec_info *vinfo, unsigned int group_size,
    3663              :                                 hash_map<slp_tree, slp_tree> *load_map,
    3664              :                                 slp_tree root)
    3665              : {
    3666        20132 :   if (slp_tree *leader = load_map->get (root))
    3667         3576 :     return *leader;
    3668              : 
    3669        16556 :   slp_tree node;
    3670        16556 :   unsigned i;
    3671              : 
    3672              :   /* For now, we don't know anything about externals so do not do anything.  */
    3673        16556 :   if (!root || SLP_TREE_DEF_TYPE (root) != vect_internal_def)
    3674              :     return NULL;
    3675        12002 :   else if (SLP_TREE_PERMUTE_P (root))
    3676              :     {
    3677              :       /* First convert this node into a load node and add it to the leaves
    3678              :          list and flatten the permute from a lane to a load one.  If it's
    3679              :          unneeded it will be elided later.  */
    3680           34 :       vec<stmt_vec_info> stmts;
    3681           34 :       stmts.create (SLP_TREE_LANES (root));
    3682           34 :       lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
    3683           74 :       for (unsigned j = 0; j < lane_perm.length (); j++)
    3684              :         {
    3685           54 :           std::pair<unsigned, unsigned> perm = lane_perm[j];
    3686           54 :           node = SLP_TREE_CHILDREN (root)[perm.first];
    3687              : 
    3688           54 :           if (!vect_is_slp_load_node (node)
    3689           54 :               || SLP_TREE_CHILDREN (node).exists ())
    3690              :             {
    3691           14 :               stmts.release ();
    3692           14 :               goto next;
    3693              :             }
    3694              : 
    3695           40 :           stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
    3696              :         }
    3697              : 
    3698           20 :       if (dump_enabled_p ())
    3699            0 :         dump_printf_loc (MSG_NOTE, vect_location,
    3700              :                          "converting stmts on permute node %p\n",
    3701              :                          (void *) root);
    3702              : 
    3703           20 :       bool *matches = XALLOCAVEC (bool, group_size);
    3704           20 :       poly_uint64 max_nunits = 1;
    3705           20 :       unsigned tree_size = 0, limit = 1;
    3706           20 :       node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
    3707              :                                   matches, &limit, &tree_size, bst_map);
    3708           20 :       if (!node)
    3709            0 :         stmts.release ();
    3710              : 
    3711           20 :       load_map->put (root, node);
    3712           20 :       return node;
    3713              :     }
    3714              : 
    3715        11968 : next:
    3716        11982 :   load_map->put (root, NULL);
    3717              : 
    3718        28363 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
    3719              :     {
    3720        16381 :       slp_tree value
    3721        16381 :         = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
    3722              :                                           node);
    3723        16381 :       if (value)
    3724              :         {
    3725           20 :           SLP_TREE_REF_COUNT (value)++;
    3726           20 :           SLP_TREE_CHILDREN (root)[i] = value;
    3727              :           /* ???  We know the original leaves of the replaced nodes will
    3728              :              be referenced by bst_map, only the permutes created by
    3729              :              pattern matching are not.  */
    3730           20 :           if (SLP_TREE_REF_COUNT (node) == 1)
    3731           20 :             load_map->remove (node);
    3732           20 :           vect_free_slp_tree (node);
    3733              :         }
    3734              :     }
    3735              : 
    3736              :   return NULL;
    3737              : }
    3738              : 
    3739              : /* Temporary workaround for loads not being CSEd during SLP build.  This
    3740              :    function will traverse the children of the SLP tree rooted in ROOT and find
    3741              :    VEC_PERM nodes that blend vectors from multiple nodes that all read from the
    3742              :    same DR such that the final operation is equal to a permuted load.  Such
    3743              :    NODES are then directly converted into LOADS themselves.  The nodes are
    3744              :    CSEd using BST_MAP; LOAD_MAP records the nodes already processed.  */
    3745              : 
    3746              : static void
    3747         2835 : optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
    3748              :                               vec_info *vinfo, unsigned int group_size,
    3749              :                               hash_map<slp_tree, slp_tree> *load_map,
    3750              :                               slp_tree root)
    3751              : {
    3752         2835 :   slp_tree node;
    3753         2835 :   unsigned i;
    3754              : 
    3755         6586 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
    3756              :     {
    3757         3751 :       slp_tree value
    3758         3751 :         = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
    3759              :                                           node);
    3760         3751 :       if (value)
    3761              :         {
    3762            0 :           SLP_TREE_REF_COUNT (value)++;
    3763            0 :           SLP_TREE_CHILDREN (root)[i] = value;
    3764              :           /* ???  We know the original leaves of the replaced nodes will
    3765              :              be referenced by bst_map, only the permutes created by
    3766              :              pattern matching are not.  */
    3767            0 :           if (SLP_TREE_REF_COUNT (node) == 1)
    3768            0 :             load_map->remove (node);
    3769            0 :           vect_free_slp_tree (node);
    3770              :         }
    3771              :     }
    3772         2835 : }
    3773              : 
    3774              : /* Helper function of vect_match_slp_patterns.
    3775              : 
    3776              :    Attempts to match patterns against the slp tree rooted in REF_NODE using
    3777              :    VINFO.  Patterns are matched in post-order traversal; nodes already in VISITED are skipped.
    3778              : 
    3779              :    Returns true if at least one pattern matched anywhere in the tree (the matchers
    3780              :    receive REF_NODE and so may update the node it points to), false otherwise.  */
    3781              : 
    3782              : static bool
    3783      5467318 : vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
    3784              :                            slp_tree_to_load_perm_map_t *perm_cache,
    3785              :                            slp_compat_nodes_map_t *compat_cache,
    3786              :                            hash_set<slp_tree> *visited)
    3787              : {
    3788      5467318 :   unsigned i;
    3789      5467318 :   slp_tree node = *ref_node;
    3790      5467318 :   bool found_p = false;
    3791      5467318 :   if (!node || visited->add (node))
    3792       723008 :     return false;
    3793              : 
    3794              :   slp_tree child;
    3795      8763405 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    3796      4019095 :     found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
    3797              :                                           vinfo, perm_cache, compat_cache,
    3798              :                                           visited);
    3799              : 
    3800     14232930 :   for (unsigned x = 0; x < num__slp_patterns; x++)
    3801              :     {
    3802      9488620 :       vect_pattern *pattern
    3803      9488620 :         = slp_patterns[x] (perm_cache, compat_cache, ref_node);
    3804      9488620 :       if (pattern)
    3805              :         {
    3806         1081 :           pattern->build (vinfo);
    3807         1081 :           delete pattern;
    3808         1081 :           found_p = true;
    3809              :         }
    3810              :     }
    3811              : 
    3812              :   return found_p;
    3813              : }
    3814              : 
    3815              : /* Applies pattern matching to the SLP tree of INSTANCE using
    3816              :    vec_info VINFO.
    3817              : 
    3818              :    Returns true if any pattern matched.  Patterns are tried in order and multiple
    3819              :    patterns may match.  */
    3820              : 
    3821              : static bool
    3822      1448223 : vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
    3823              :                          hash_set<slp_tree> *visited,
    3824              :                          slp_tree_to_load_perm_map_t *perm_cache,
    3825              :                          slp_compat_nodes_map_t *compat_cache)
    3826              : {
    3827      1448223 :   DUMP_VECT_SCOPE ("vect_match_slp_patterns");
    3828      1448223 :   slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
    3829              : 
    3830      1448223 :   if (dump_enabled_p ())
    3831        29623 :     dump_printf_loc (MSG_NOTE, vect_location,
    3832              :                      "Analyzing SLP tree %p for patterns\n",
    3833        29623 :                      (void *) SLP_INSTANCE_TREE (instance));
    3834              : 
    3835      1448223 :   return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, compat_cache,
    3836      1448223 :                                     visited);
    3837              : }
    3838              : 
    3839              : /* STMT_INFO is a store group of size GROUP_SIZE that we are considering
    3840              :    splitting at NEW_GROUP_SIZE and vectorizing with VECTYPE that might be NULL.  MASKED_P indicates whether
    3841              :    the stores are masked.
    3842              :    Return true if we could use IFN_STORE_LANES instead and if that appears
    3843              :    to be the better approach.  */
    3844              : 
    3845              : static bool
    3846         4866 : vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
    3847              :                                tree vectype, bool masked_p,
    3848              :                                unsigned int group_size,
    3849              :                                unsigned int new_group_size)
    3850              : {
    3851         4866 :   if (!vectype)
    3852              :     {
    3853         4866 :       tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
    3854         4866 :       vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
    3855              :     }
    3856         4866 :   if (!vectype)
    3857              :     return false;
    3858              :   /* Allow the split if one of the two new groups would operate on full
    3859              :      vectors *within* rather than across one scalar loop iteration.
    3860              :      This is purely a heuristic, but it should work well for group
    3861              :      sizes of 3 and 4, where the possible splits are:
    3862              : 
    3863              :        3->2+1:  OK if the vector has exactly two elements
    3864              :        4->2+2:  Likewise
    3865              :        4->3+1:  Less clear-cut.  */
    3866         4866 :   if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
    3867         2537 :       || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
    3868         2346 :     return false;
    3869         2520 :   return vect_store_lanes_supported (vectype, group_size, masked_p) != IFN_LAST;
    3870              : }
    3871              : 
    3872              : /* Analyze an SLP instance starting from a group of grouped stores.  Call
    3873              :    vect_build_slp_tree to build a tree of packed stmts if possible.
    3874              :    Return FALSE if it's impossible to SLP any stmt in the loop.  */
    3875              : 
    3876              : static bool
    3877              : vect_analyze_slp_instance (vec_info *vinfo,
    3878              :                            scalar_stmts_to_slp_tree_map_t *bst_map,
    3879              :                            stmt_vec_info stmt_info, slp_instance_kind kind,
    3880              :                            unsigned max_tree_size, unsigned *limit,
    3881              :                            bool force_single_lane);
    3882              : 
    3883              : /* Build an interleaving scheme for the store sources RHS_NODES from
    3884              :    SCALAR_STMTS.  MAX_NUNITS is recorded on every node created.  Returns the new node built for SCALAR_STMTS.  */
    3885              : 
    3886              : static slp_tree
    3887         6204 : vect_build_slp_store_interleaving (vec<slp_tree> &rhs_nodes,
    3888              :                                    vec<stmt_vec_info> &scalar_stmts,
    3889              :                                    poly_uint64 max_nunits)
    3890              : {
    3891         6204 :   unsigned int group_size = scalar_stmts.length ();
    3892        12408 :   slp_tree node = vect_create_new_slp_node (scalar_stmts,
    3893         6204 :                                             SLP_TREE_CHILDREN
    3894              :                                               (rhs_nodes[0]).length ());
    3895         6204 :   SLP_TREE_VECTYPE (node) = SLP_TREE_VECTYPE (rhs_nodes[0]);
    3896         6204 :   node->max_nunits = max_nunits;
    3897         6204 :   for (unsigned l = 0;
    3898        12435 :        l < SLP_TREE_CHILDREN (rhs_nodes[0]).length (); ++l)
    3899              :     {
    3900              :       /* And a permute merging child L of all RHS SLP trees.  */
    3901         6231 :       slp_tree perm = vect_create_new_slp_node (rhs_nodes.length (),
    3902         6231 :                                                 VEC_PERM_EXPR);
    3903         6231 :       SLP_TREE_CHILDREN (node).quick_push (perm);
    3904         6231 :       SLP_TREE_LANE_PERMUTATION (perm).create (group_size);
    3905         6231 :       SLP_TREE_VECTYPE (perm) = SLP_TREE_VECTYPE (node);
    3906         6231 :       perm->max_nunits = max_nunits;
    3907         6231 :       SLP_TREE_LANES (perm) = group_size;
    3908              :       /* ???  We should set this NULL but that's not expected.  */
    3909         6231 :       SLP_TREE_REPRESENTATIVE (perm)
    3910         6231 :         = SLP_TREE_REPRESENTATIVE (SLP_TREE_CHILDREN (rhs_nodes[0])[l]);
    3911        24558 :       for (unsigned j = 0; j < rhs_nodes.length (); ++j)
    3912              :         {
    3913        18327 :           SLP_TREE_CHILDREN (perm)
    3914        18327 :             .quick_push (SLP_TREE_CHILDREN (rhs_nodes[j])[l]);
    3915        18327 :           SLP_TREE_CHILDREN (rhs_nodes[j])[l]->refcnt++;
    3916        18327 :           for (unsigned k = 0;
    3917        38665 :                k < SLP_TREE_SCALAR_STMTS (rhs_nodes[j]).length (); ++k)
    3918              :             {
    3919              :               /* ???  We should populate SLP_TREE_SCALAR_STMTS
    3920              :                  or SLP_TREE_SCALAR_OPS but then we might have
    3921              :                  a mix of both in our children.  */
    3922        20338 :               SLP_TREE_LANE_PERMUTATION (perm)
    3923        20338 :                 .quick_push (std::make_pair (j, k));
    3924              :             }
    3925              :         }
    3926              : 
    3927              :       /* Now we have a single permute node but we cannot code-generate
    3928              :          the case with more than two inputs.
    3929              :          Perform pairwise reduction, reducing the two inputs
    3930              :          with the least number of lanes to one and then repeat until
    3931              :          we end up with two inputs.  That scheme makes sure we end
    3932              :          up with permutes satisfying the restriction of requiring at
    3933              :          most two vector inputs to produce a single vector output
    3934              :          when the number of lanes is even.  */
    3935        12096 :       while (SLP_TREE_CHILDREN (perm).length () > 2)
    3936              :         {
    3937              :           /* When we have three equal sized groups left the pairwise
    3938              :              reduction does not result in a scheme that avoids using
    3939              :              three vectors.  Instead merge the first two groups
    3940              :              to the final size with do-not-care elements (chosen
    3941              :              from the first group) and then merge with the third.
    3942              :                   { A0, B0,  x, A1, B1,  x, ... }
    3943              :                -> { A0, B0, C0, A1, B1, C1, ... }
    3944              :              This handles group size of three (and at least
    3945              :              power-of-two multiples of that).  */
    3946         5865 :           if (SLP_TREE_CHILDREN (perm).length () == 3
    3947         3022 :               && (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[0])
    3948         3022 :                   == SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[1]))
    3949         5865 :               && (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[0])
    3950         2280 :                   == SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[2])))
    3951              :             {
    3952         2084 :               int ai = 0;
    3953         2084 :               int bi = 1;
    3954         2084 :               slp_tree a = SLP_TREE_CHILDREN (perm)[ai];
    3955         2084 :               slp_tree b = SLP_TREE_CHILDREN (perm)[bi];
    3956         2084 :               unsigned n = SLP_TREE_LANES (perm);
    3957              : 
    3958         2084 :               slp_tree permab = vect_create_new_slp_node (2, VEC_PERM_EXPR);
    3959         2084 :               SLP_TREE_LANES (permab) = n;
    3960         2084 :               SLP_TREE_LANE_PERMUTATION (permab).create (n);
    3961         2084 :               SLP_TREE_VECTYPE (permab) = SLP_TREE_VECTYPE (perm);
    3962         2084 :               permab->max_nunits = max_nunits;
    3963              :               /* ???  Should be NULL but that's not expected.  */
    3964         2084 :               SLP_TREE_REPRESENTATIVE (permab) = SLP_TREE_REPRESENTATIVE (perm);
    3965         2084 :               SLP_TREE_CHILDREN (permab).quick_push (a);
    3966         4179 :               for (unsigned k = 0; k < SLP_TREE_LANES (a); ++k)
    3967         2095 :                 SLP_TREE_LANE_PERMUTATION (permab)
    3968         2095 :                   .quick_push (std::make_pair (0, k));
    3969         2084 :               SLP_TREE_CHILDREN (permab).quick_push (b);
    3970         4179 :               for (unsigned k = 0; k < SLP_TREE_LANES (b); ++k)
    3971         2095 :                 SLP_TREE_LANE_PERMUTATION (permab)
    3972         2095 :                   .quick_push (std::make_pair (1, k));
    3973              :               /* Push the do-not-care lanes, taken from the first group A.  */
    3974         4179 :               for (unsigned k = 0; k < SLP_TREE_LANES (a); ++k)
    3975         2095 :                 SLP_TREE_LANE_PERMUTATION (permab)
    3976         2095 :                   .quick_push (std::make_pair (0, k));
    3977              : 
    3978              :               /* Put the merged node into 'perm', in place of a.  */
    3979         2084 :               SLP_TREE_CHILDREN (perm)[ai] = permab;
    3980              :               /* Adjust the references to b in the permutation
    3981              :                  of perm and to the later children which we'll
    3982              :                  remove.  */
    3983         8369 :               for (unsigned k = 0; k < SLP_TREE_LANES (perm); ++k)
    3984              :                 {
    3985         6285 :                   std::pair<unsigned, unsigned> &p
    3986         6285 :                     = SLP_TREE_LANE_PERMUTATION (perm)[k];
    3987         6285 :                   if (p.first == (unsigned) bi)
    3988              :                     {
    3989         2095 :                       p.first = ai;
    3990         2095 :                       p.second += SLP_TREE_LANES (a);
    3991              :                     }
    3992         4190 :                   else if (p.first > (unsigned) bi)
    3993         2095 :                     p.first--;
    3994              :                 }
    3995         2084 :               SLP_TREE_CHILDREN (perm).ordered_remove (bi);
    3996         2084 :               break;
    3997              :             }
    3998              : 
    3999              :           /* Pick the two nodes with the least number of lanes,
    4000              :              prefer the earliest candidate and maintain ai < bi.  */
    4001              :           int ai = -1;
    4002              :           int bi = -1;
    4003        33069 :           for (unsigned ci = 0; ci < SLP_TREE_CHILDREN (perm).length (); ++ci)
    4004              :             {
    4005        29288 :               if (ai == -1)
    4006         3781 :                 ai = ci;
    4007        25507 :               else if (bi == -1)
    4008         3781 :                 bi = ci;
    4009        21726 :               else if ((SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ci])
    4010        21726 :                         < SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ai]))
    4011        21726 :                        || (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ci])
    4012        17812 :                            < SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[bi])))
    4013              :                 {
    4014         8714 :                   if (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ai])
    4015         4357 :                       <= SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[bi]))
    4016         2074 :                     bi = ci;
    4017              :                   else
    4018              :                     {
    4019         2283 :                       ai = bi;
    4020         2283 :                       bi = ci;
    4021              :                     }
    4022              :                 }
    4023              :             }
    4024              : 
    4025              :           /* Produce a merge of nodes ai and bi.  */
    4026         3781 :           slp_tree a = SLP_TREE_CHILDREN (perm)[ai];
    4027         3781 :           slp_tree b = SLP_TREE_CHILDREN (perm)[bi];
    4028         3781 :           unsigned n = SLP_TREE_LANES (a) + SLP_TREE_LANES (b);
    4029         3781 :           slp_tree permab = vect_create_new_slp_node (2, VEC_PERM_EXPR);
    4030         3781 :           SLP_TREE_LANES (permab) = n;
    4031         3781 :           SLP_TREE_LANE_PERMUTATION (permab).create (n);
    4032         3781 :           SLP_TREE_VECTYPE (permab) = SLP_TREE_VECTYPE (perm);
    4033         3781 :           permab->max_nunits = max_nunits;
    4034              :           /* ???  Should be NULL but that's not expected.  */
    4035         3781 :           SLP_TREE_REPRESENTATIVE (permab) = SLP_TREE_REPRESENTATIVE (perm);
    4036         3781 :           SLP_TREE_CHILDREN (permab).quick_push (a);
    4037         9886 :           for (unsigned k = 0; k < SLP_TREE_LANES (a); ++k)
    4038         6105 :             SLP_TREE_LANE_PERMUTATION (permab)
    4039         6105 :               .quick_push (std::make_pair (0, k));
    4040         3781 :           SLP_TREE_CHILDREN (permab).quick_push (b);
    4041         9398 :           for (unsigned k = 0; k < SLP_TREE_LANES (b); ++k)
    4042         5617 :             SLP_TREE_LANE_PERMUTATION (permab)
    4043         5617 :               .quick_push (std::make_pair (1, k));
    4044              : 
    4045              :           /* Put the merged node into 'perm', in place of a.  */
    4046         3781 :           SLP_TREE_CHILDREN (perm)[ai] = permab;
    4047              :           /* Adjust the references to b in the permutation
    4048              :              of perm and to the later children which we'll
    4049              :              remove.  */
    4050        52693 :           for (unsigned k = 0; k < SLP_TREE_LANES (perm); ++k)
    4051              :             {
    4052        48912 :               std::pair<unsigned, unsigned> &p
    4053        48912 :                 = SLP_TREE_LANE_PERMUTATION (perm)[k];
    4054        48912 :               if (p.first == (unsigned) bi)
    4055              :                 {
    4056         5617 :                   p.first = ai;
    4057         5617 :                   p.second += SLP_TREE_LANES (a);
    4058              :                 }
    4059        43295 :               else if (p.first > (unsigned) bi)
    4060        17862 :                 p.first--;
    4061              :             }
    4062         3781 :           SLP_TREE_CHILDREN (perm).ordered_remove (bi);
    4063              :         }
    4064              :     }
    4065              : 
    4066         6204 :   return node;
    4067              : }
    4068              : 
    4069              : /* Analyze an SLP instance starting from SCALAR_STMTS which are a group
    4070              :    of KIND.  Return true if successful.  SCALAR_STMTS is owned by this
    4071              :    function, REMAIN and ROOT_STMT_INFOS ownership is transferred back to
    4072              :    the caller upon failure.  */
    4073              : 
    4074              : static bool
    4075      1795331 : vect_build_slp_instance (vec_info *vinfo,
    4076              :                          slp_instance_kind kind,
    4077              :                          vec<stmt_vec_info> &scalar_stmts,
    4078              :                          vec<stmt_vec_info> &root_stmt_infos,
    4079              :                          vec<tree> &remain,
    4080              :                          unsigned max_tree_size, unsigned *limit,
    4081              :                          scalar_stmts_to_slp_tree_map_t *bst_map,
    4082              :                          bool force_single_lane)
    4083              : {
    4084              :   /* If there's no budget left bail out early.  */
    4085      1795331 :   if (*limit == 0)
    4086              :     {
    4087        27205 :       scalar_stmts.release ();
    4088        27205 :       return false;
    4089              :     }
    4090              : 
    4091      1768126 :   if (kind == slp_inst_kind_ctor)
    4092              :     {
    4093        12899 :       if (dump_enabled_p ())
    4094           86 :         dump_printf_loc (MSG_NOTE, vect_location,
    4095              :                          "Analyzing vectorizable constructor: %G\n",
    4096           43 :                          root_stmt_infos[0]->stmt);
    4097              :     }
    4098      1755227 :   else if (kind == slp_inst_kind_gcond)
    4099              :     {
    4100       272720 :       if (dump_enabled_p ())
    4101         5558 :         dump_printf_loc (MSG_NOTE, vect_location,
    4102              :                          "Analyzing vectorizable control flow: %G",
    4103         2779 :                          root_stmt_infos[0]->stmt);
    4104              :     }
    4105              : 
    4106      1768126 :   if (dump_enabled_p ())
    4107              :     {
    4108        24809 :       dump_printf_loc (MSG_NOTE, vect_location,
    4109              :                        "Starting SLP discovery for\n");
    4110        52961 :       for (unsigned i = 0; i < scalar_stmts.length (); ++i)
    4111        56304 :         dump_printf_loc (MSG_NOTE, vect_location,
    4112        28152 :                          "  %G", scalar_stmts[i]->stmt);
    4113              :     }
    4114              : 
    4115              :   /* Build the tree for the SLP instance.  */
    4116      1768126 :   unsigned int group_size = scalar_stmts.length ();
    4117      1768126 :   bool *matches = XALLOCAVEC (bool, group_size);
    4118      1768126 :   poly_uint64 max_nunits = 1;
    4119      1768126 :   unsigned tree_size = 0;
    4120              : 
    4121      1768126 :   slp_tree node = NULL;
    4122      1768126 :   if (group_size > 1 && force_single_lane)
    4123              :     {
    4124            0 :       matches[0] = true;
    4125            0 :       matches[1] = false;
    4126              :     }
    4127              :   else
    4128      1768126 :     node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
    4129              :                                 &max_nunits, matches, limit,
    4130              :                                 &tree_size, bst_map);
    4131      1768126 :   if (node != NULL)
    4132              :     {
    4133              :       /* Calculate the unrolling factor based on the smallest type.  */
    4134       701136 :       poly_uint64 unrolling_factor
    4135       701136 :         = calculate_unrolling_factor (max_nunits, group_size);
    4136              : 
    4137       701136 :       if (maybe_ne (unrolling_factor, 1U)
    4138       701136 :           && is_a <bb_vec_info> (vinfo))
    4139              :         {
    4140            0 :           unsigned HOST_WIDE_INT const_max_nunits;
    4141            0 :           if (!max_nunits.is_constant (&const_max_nunits)
    4142            0 :               || const_max_nunits > group_size)
    4143              :             {
    4144            0 :               if (dump_enabled_p ())
    4145            0 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    4146              :                                  "Build SLP failed: store group "
    4147              :                                  "size not a multiple of the vector size "
    4148              :                                  "in basic block SLP\n");
    4149            0 :               vect_free_slp_tree (node);
    4150            0 :               return false;
    4151              :             }
    4152              :           /* Fatal mismatch.  */
    4153            0 :           if (dump_enabled_p ())
    4154            0 :             dump_printf_loc (MSG_NOTE, vect_location,
    4155              :                              "SLP discovery succeeded but node needs "
    4156              :                              "splitting\n");
    4157            0 :           memset (matches, true, group_size);
    4158            0 :           matches[group_size / const_max_nunits * const_max_nunits] = false;
    4159            0 :           vect_free_slp_tree (node);
    4160              :         }
    4161              :       else
    4162              :         {
    4163              :           /* Create a new SLP instance.  */
    4164       701136 :           slp_instance new_instance = XNEW (class _slp_instance);
    4165       701136 :           SLP_INSTANCE_TREE (new_instance) = node;
    4166       701136 :           SLP_INSTANCE_LOADS (new_instance) = vNULL;
    4167       701136 :           SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
    4168       701136 :           SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
    4169       701136 :           SLP_INSTANCE_KIND (new_instance) = kind;
    4170       701136 :           new_instance->reduc_phis = NULL;
    4171       701136 :           new_instance->cost_vec = vNULL;
    4172       701136 :           new_instance->subgraph_entries = vNULL;
    4173              : 
    4174       701136 :           if (dump_enabled_p ())
    4175        21827 :             dump_printf_loc (MSG_NOTE, vect_location,
    4176              :                              "SLP size %u vs. limit %u.\n",
    4177              :                              tree_size, max_tree_size);
    4178              : 
    4179       701136 :           vinfo->slp_instances.safe_push (new_instance);
    4180              : 
    4181              :           /* ???  We've replaced the old SLP_INSTANCE_GROUP_SIZE with
    4182              :              the number of scalar stmts in the root in a few places.
    4183              :              Verify that assumption holds.  */
    4184      1402272 :           gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
    4185              :                         .length () == group_size);
    4186              : 
    4187       701136 :           if (dump_enabled_p ())
    4188              :             {
    4189        21827 :               if (kind == slp_inst_kind_reduc_group)
    4190         1431 :                 dump_printf_loc (MSG_NOTE, vect_location,
    4191              :                                  "SLP discovery of size %d reduction group "
    4192              :                                  "succeeded\n", group_size);
    4193        21827 :               dump_printf_loc (MSG_NOTE, vect_location,
    4194              :                                "Final SLP tree for instance %p:\n",
    4195              :                                (void *) new_instance);
    4196        21827 :               vect_print_slp_graph (MSG_NOTE, vect_location,
    4197              :                                     SLP_INSTANCE_TREE (new_instance));
    4198              :             }
    4199              : 
    4200       701136 :           return true;
    4201              :         }
    4202              :     }
    4203              :   /* Failed to SLP.  */
    4204              : 
    4205              :   /* While we arrive here even with slp_inst_kind_store we should only
    4206              :      do so for group_size == 1.  The code to split store groups is only
    4207              :      in vect_analyze_slp_instance now.  */
    4208      1066990 :   gcc_assert (kind != slp_inst_kind_store || group_size == 1);
    4209              : 
    4210              :   /* Free the allocated memory.  */
    4211      1066990 :   scalar_stmts.release ();
    4212              : 
    4213              :   /* Failed to SLP.  */
    4214      1066990 :   if (dump_enabled_p ())
    4215         2982 :     dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
    4216              :   return false;
    4217              : }
    4218              : 
    4219              : /* Analyze an SLP instance starting from SCALAR_STMT, the start of a reduction
    4220              :    chain.  Call vect_build_slp_tree with BST_MAP and LIMIT to build a tree of
    4221              :    packed stmts if possible.  Push the instance to VINFO and return TRUE on success, else return FALSE.  */
    4222              : 
    4223              : static bool
    4224        42784 : vect_analyze_slp_reduc_chain (loop_vec_info vinfo,
    4225              :                               scalar_stmts_to_slp_tree_map_t *bst_map,
    4226              :                               stmt_vec_info scalar_stmt,
    4227              :                               unsigned max_tree_size, unsigned *limit)
    4228              : {
    4229        42784 :   vec<stmt_vec_info> scalar_stmts = vNULL;
    4230              : 
    4231        42784 :   bool fail = false;
    4232              :   /* ???  We could leave operation code checking to SLP discovery.  */
    4233        42784 :   code_helper code = STMT_VINFO_REDUC_CODE (STMT_VINFO_REDUC_DEF
    4234              :                                               (vect_orig_stmt (scalar_stmt)));
    4235        42784 :   bool first = true;
    4236        42784 :   stmt_vec_info next_stmt = scalar_stmt;
    4237        47928 :   do
    4238              :     {
    4239        47928 :       stmt_vec_info stmt = next_stmt;
    4240        47928 :       gimple_match_op op;
    4241        47928 :       if (!gimple_extract_op (STMT_VINFO_STMT (stmt), &op))
    4242            0 :         gcc_unreachable ();
    4243        95856 :       tree reduc_def = gimple_arg (STMT_VINFO_STMT (stmt),
    4244        47928 :                                    STMT_VINFO_REDUC_IDX (stmt));
    4245        47928 :       next_stmt = vect_stmt_to_vectorize (vinfo->lookup_def (reduc_def));
    4246        47928 :       gcc_assert (is_a <gphi *> (STMT_VINFO_STMT (next_stmt))
    4247              :                   || STMT_VINFO_REDUC_IDX (next_stmt) != -1);
    4248        51332 :       if (!gimple_extract_op (STMT_VINFO_STMT (vect_orig_stmt (stmt)), &op))
    4249            0 :         gcc_unreachable ();
    4250        47928 :       if (CONVERT_EXPR_CODE_P (op.code)
    4251         2149 :           && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0]))
    4252        50065 :           && (first
    4253         1058 :               || is_a <gphi *> (STMT_VINFO_STMT (next_stmt))))
    4254              :         ;
    4255        45793 :       else if (code != op.code)
    4256              :         {
    4257         1718 :           fail = true;
    4258         1718 :           break;
    4259              :         }
    4260              :       else
    4261        44075 :         scalar_stmts.safe_push (stmt);
    4262        46210 :       first = false;
    4263              :     }
    4264        46210 :   while (!is_a <gphi *> (STMT_VINFO_STMT (next_stmt)));
    4265        42784 :   if (fail)
    4266         1718 :     return false;
    4267              : 
    4268              :   /* Remember a stmt with the actual reduction operation, the first one collected.  */
    4269        41066 :   stmt_vec_info reduc_scalar_stmt = scalar_stmts[0];
    4270              : 
    4271              :   /* When the SSA def chain through reduc-idx does not form a natural
    4272              :      reduction chain try to linearize an associative operation manually.  */
    4273        41066 :   if (scalar_stmts.length () == 1
    4274        39411 :       && code.is_tree_code ()
    4275        36025 :       && associative_tree_code ((tree_code)code)
    4276              :       /* We may not associate if a fold-left reduction is required.  */
    4277        76220 :       && !needs_fold_left_reduction_p (TREE_TYPE (gimple_get_lhs
    4278              :                                                     (reduc_scalar_stmt->stmt)),
    4279              :                                        code))
    4280              :     {
    4281        33330 :       auto_vec<chain_op_t> chain;
    4282        33330 :       auto_vec<std::pair<tree_code, gimple *> > worklist;
    4283        33330 :       gimple *op_stmt = NULL, *other_op_stmt = NULL;
    4284        33330 :       vect_slp_linearize_chain (vinfo, worklist, chain, (tree_code)code,
    4285        33330 :                                 scalar_stmts[0]->stmt, op_stmt, other_op_stmt,
    4286              :                                 NULL);
    4287              : 
    4288        33330 :       scalar_stmts.truncate (0);
    4289        33330 :       stmt_vec_info tail = NULL;
    4290       165891 :       for (auto el : chain)
    4291              :         {
    4292        66583 :           if (el.dt == vect_external_def
    4293        66583 :               || el.dt == vect_constant_def
    4294        66583 :               || el.code != (tree_code) code)
    4295              :             {
    4296          682 :               scalar_stmts.release ();
    4297          682 :               return false;
    4298              :             }
    4299        65901 :           stmt_vec_info stmt = vinfo->lookup_def (el.op);
    4300        65901 :           if (STMT_VINFO_REDUC_IDX (stmt) != -1
    4301        64934 :               || STMT_VINFO_REDUC_DEF (stmt))
    4302              :             {
    4303        32824 :               gcc_assert (tail == NULL);
    4304        32824 :               tail = stmt;
    4305        32824 :               continue;
    4306              :             }
    4307        33077 :           scalar_stmts.safe_push (stmt);
    4308              :         }
    4309        32648 :       gcc_assert (tail);
    4310              : 
    4311              :       /* When this linearization didn't produce a chain see if stripping
    4312              :          a wrapping sign conversion produces one.  */
    4313        32648 :       if (scalar_stmts.length () == 1
    4314        32648 :           && (code == PLUS_EXPR || code == MULT_EXPR || code == BIT_IOR_EXPR
    4315              :               || code == BIT_AND_EXPR || code == BIT_XOR_EXPR))
    4316              :         {
    4317        31358 :           gimple *stmt = scalar_stmts[0]->stmt;
    4318        31358 :           if (!is_gimple_assign (stmt)
    4319        30318 :               || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt))
    4320         3917 :               || TREE_CODE (gimple_assign_rhs1 (stmt)) != SSA_NAME
    4321        35275 :               || !tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
    4322         3917 :                                          TREE_TYPE (gimple_assign_rhs1 (stmt))))
    4323              :             {
    4324        29878 :               scalar_stmts.release ();
    4325        29878 :               return false;
    4326              :             }
    4327         1480 :           stmt = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (stmt));
    4328         1480 :           if (!is_gimple_assign (stmt)
    4329         1480 :               || gimple_assign_rhs_code (stmt) != (tree_code)code)
    4330              :             {
    4331         1462 :               scalar_stmts.release ();
    4332         1462 :               return false;
    4333              :             }
    4334           18 :           chain.truncate (0);
    4335           18 :           vect_slp_linearize_chain (vinfo, worklist, chain, (tree_code)code,
    4336              :                                     stmt, op_stmt, other_op_stmt, NULL);
    4337              : 
    4338           18 :           scalar_stmts.truncate (0);
    4339           18 :           tail = NULL;
    4340           88 :           for (auto el : chain)
    4341              :             {
    4342           42 :               if (el.dt == vect_external_def
    4343           42 :                   || el.dt == vect_constant_def
    4344           42 :                   || el.code != (tree_code) code)
    4345              :                 {
    4346            8 :                   scalar_stmts.release ();
    4347            8 :                   return false;
    4348              :                 }
    4349           34 :               stmt_vec_info stmt = vinfo->lookup_def (el.op);
    4350           34 :               if (STMT_VINFO_REDUC_IDX (stmt) != -1
    4351           34 :                   || STMT_VINFO_REDUC_DEF (stmt))
    4352              :                 {
    4353            0 :                   gcc_assert (tail == NULL);
    4354            0 :                   tail = stmt;
    4355            0 :                   continue;
    4356              :                 }
    4357           34 :               scalar_stmts.safe_push (stmt);
    4358              :             }
    4359              :           /* Unlike the above this does not include the reduction SSA
    4360              :              cycle.  */
    4361           10 :           gcc_assert (!tail);
    4362              :         }
    4363              : 
    4364         1300 :       if (scalar_stmts.length () < 2)
    4365              :         {
    4366         1207 :           scalar_stmts.release ();
    4367         1207 :           return false;
    4368              :         }
    4369              : 
    4370           93 :       if (dump_enabled_p ())
    4371              :         {
    4372           34 :           dump_printf_loc (MSG_NOTE, vect_location,
    4373              :                            "Starting SLP discovery of reduction chain for\n");
    4374          140 :           for (unsigned i = 0; i < scalar_stmts.length (); ++i)
    4375          212 :             dump_printf_loc (MSG_NOTE, vect_location,
    4376          106 :                              "  %G", scalar_stmts[i]->stmt);
    4377              :         }
    4378              : 
    4379           93 :       unsigned int group_size = scalar_stmts.length ();
    4380           93 :       bool *matches = XALLOCAVEC (bool, group_size);
    4381           93 :       poly_uint64 max_nunits = 1;
    4382           93 :       unsigned tree_size = 0;
    4383           93 :       slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
    4384              :                                            &max_nunits, matches, limit,
    4385           93 :                                            &tree_size, bst_map);
    4386           93 :       if (!node)
    4387              :         {
    4388           37 :           scalar_stmts.release ();
    4389           37 :           return false;
    4390              :         }
    4391              : 
    4392           56 :       unsigned cycle_id = vinfo->reduc_infos.length ();
    4393           56 :       vect_reduc_info reduc_info = new vect_reduc_info_s ();
    4394           56 :       vinfo->reduc_infos.safe_push (reduc_info);
    4395           56 :       VECT_REDUC_INFO_DEF_TYPE (reduc_info) = STMT_VINFO_DEF_TYPE (next_stmt);
    4396           56 :       VECT_REDUC_INFO_TYPE (reduc_info) = STMT_VINFO_REDUC_TYPE (next_stmt);
    4397           56 :       VECT_REDUC_INFO_CODE (reduc_info) = STMT_VINFO_REDUC_CODE (next_stmt);
    4398           56 :       VECT_REDUC_INFO_FN (reduc_info) = IFN_LAST;
    4399           56 :       reduc_info->is_reduc_chain = true;
    4400              : 
    4401              :       /* Build the node for the PHI and possibly the conversions, all tagged with CYCLE_ID.  */
    4402           56 :       slp_tree phis = vect_create_new_slp_node (2, ERROR_MARK);
    4403           56 :       SLP_TREE_REPRESENTATIVE (phis) = next_stmt;
    4404           56 :       phis->cycle_info.id = cycle_id;
    4405           56 :       SLP_TREE_LANES (phis) = group_size;
    4406           56 :       if (reduc_scalar_stmt == scalar_stmt)
    4407           52 :         SLP_TREE_VECTYPE (phis) = SLP_TREE_VECTYPE (node);
    4408              :       else
    4409            4 :         SLP_TREE_VECTYPE (phis)
    4410            4 :           = signed_or_unsigned_type_for (TYPE_UNSIGNED
    4411              :                                            (TREE_TYPE (gimple_get_lhs
    4412              :                                                          (scalar_stmt->stmt))),
    4413              :                                          SLP_TREE_VECTYPE (node));
    4414              :       /* ???  vect_cse_slp_nodes cannot cope with cycles without any
    4415              :          SLP_TREE_SCALAR_STMTS.  */
    4416           56 :       SLP_TREE_SCALAR_STMTS (phis).create (group_size);
    4417          235 :       for (unsigned i = 0; i < group_size; ++i)
    4418          179 :         SLP_TREE_SCALAR_STMTS (phis).quick_push (next_stmt);
    4419              : 
    4420           56 :       slp_tree op_input = phis;
    4421           56 :       if (reduc_scalar_stmt != scalar_stmt)
    4422              :         {
    4423            4 :           slp_tree conv = vect_create_new_slp_node (1, ERROR_MARK);
    4424            4 :           SLP_TREE_REPRESENTATIVE (conv)
    4425            4 :             = vinfo->lookup_def (gimple_arg (reduc_scalar_stmt->stmt,
    4426            4 :                                              STMT_VINFO_REDUC_IDX
    4427              :                                                (reduc_scalar_stmt)));
    4428            4 :           SLP_TREE_CHILDREN (conv).quick_push (phis);
    4429            4 :           conv->cycle_info.id = cycle_id;
    4430            4 :           SLP_TREE_REDUC_IDX (conv) = 0;
    4431            4 :           SLP_TREE_LANES (conv) = group_size;
    4432            4 :           SLP_TREE_VECTYPE (conv) = SLP_TREE_VECTYPE (node);
    4433            4 :           SLP_TREE_SCALAR_STMTS (conv) = vNULL;
    4434            4 :           op_input = conv;
    4435              :         }
    4436              : 
    4437           56 :       slp_tree reduc = vect_create_new_slp_node (2, ERROR_MARK);
    4438           56 :       SLP_TREE_REPRESENTATIVE (reduc) = reduc_scalar_stmt;
    4439           56 :       SLP_TREE_CHILDREN (reduc).quick_push (op_input);
    4440           56 :       SLP_TREE_CHILDREN (reduc).quick_push (node);
    4441           56 :       reduc->cycle_info.id = cycle_id;
    4442           56 :       SLP_TREE_REDUC_IDX (reduc) = 0;
    4443           56 :       SLP_TREE_LANES (reduc) = group_size;
    4444           56 :       SLP_TREE_VECTYPE (reduc) = SLP_TREE_VECTYPE (node);
    4445              :       /* ???  For the reduction epilogue we need a live lane.  */
    4446           56 :       SLP_TREE_SCALAR_STMTS (reduc).create (group_size);
    4447           56 :       SLP_TREE_SCALAR_STMTS (reduc).quick_push (reduc_scalar_stmt);
    4448          179 :       for (unsigned i = 1; i < group_size; ++i)
    4449          123 :         SLP_TREE_SCALAR_STMTS (reduc).quick_push (NULL);
    4450              : 
    4451           56 :       if (reduc_scalar_stmt != scalar_stmt)
    4452              :         {
    4453            4 :           slp_tree conv = vect_create_new_slp_node (1, ERROR_MARK);
    4454            4 :           SLP_TREE_REPRESENTATIVE (conv) = scalar_stmt;
    4455            4 :           SLP_TREE_CHILDREN (conv).quick_push (reduc);
    4456            4 :           conv->cycle_info.id = cycle_id;
    4457            4 :           SLP_TREE_REDUC_IDX (conv) = 0;
    4458            4 :           SLP_TREE_LANES (conv) = group_size;
    4459            4 :           SLP_TREE_VECTYPE (conv) = SLP_TREE_VECTYPE (phis);
    4460              :           /* ???  For the reduction epilogue we need a live lane.  */
    4461            4 :           SLP_TREE_SCALAR_STMTS (conv).create (group_size);
    4462            4 :           SLP_TREE_SCALAR_STMTS (conv).quick_push (scalar_stmt);
    4463            8 :           for (unsigned i = 1; i < group_size; ++i)
    4464            4 :             SLP_TREE_SCALAR_STMTS (conv).quick_push (NULL);
    4465            4 :           reduc = conv;
    4466              :         }
    4467              : 
    4468           56 :       edge le = loop_latch_edge (LOOP_VINFO_LOOP (vinfo));
    4469           56 :       SLP_TREE_CHILDREN (phis).quick_push (NULL);
    4470           56 :       SLP_TREE_CHILDREN (phis).quick_push (NULL);
    4471           56 :       SLP_TREE_CHILDREN (phis)[le->dest_idx] = reduc;
    4472           56 :       SLP_TREE_REF_COUNT (reduc)++;
    4473              : 
    4474              :       /* Create a new SLP instance.  */
    4475           56 :       slp_instance new_instance = XNEW (class _slp_instance);
    4476           56 :       SLP_INSTANCE_TREE (new_instance) = reduc;
    4477           56 :       SLP_INSTANCE_LOADS (new_instance) = vNULL;
    4478           56 :       SLP_INSTANCE_ROOT_STMTS (new_instance) = vNULL;
    4479           56 :       SLP_INSTANCE_REMAIN_DEFS (new_instance) = vNULL;
    4480           56 :       SLP_INSTANCE_KIND (new_instance) = slp_inst_kind_reduc_chain;
    4481           56 :       new_instance->reduc_phis = NULL;
    4482           56 :       new_instance->cost_vec = vNULL;
    4483           56 :       new_instance->subgraph_entries = vNULL;
    4484              : 
    4485           56 :       vinfo->slp_instances.safe_push (new_instance);
    4486              : 
    4487           56 :       if (dump_enabled_p ())
    4488              :         {
    4489           24 :           dump_printf_loc (MSG_NOTE, vect_location,
    4490              :                            "Final SLP tree for instance %p:\n",
    4491              :                            (void *) new_instance);
    4492           24 :           vect_print_slp_graph (MSG_NOTE, vect_location,
    4493              :                                 SLP_INSTANCE_TREE (new_instance));
    4494              :         }
    4495              : 
    4496           56 :       return true;
    4497        33330 :     }
    4498              : 
    4499         7736 :   if (scalar_stmts.length () <= 1)
    4500              :     {
    4501         6081 :       scalar_stmts.release ();
    4502         6081 :       return false;
    4503              :     }
    4504              : 
    4505         1655 :   scalar_stmts.reverse ();
    4506         1655 :   stmt_vec_info reduc_phi_info = next_stmt;
    4507              : 
    4508              :   /* The instance built below has neither root stmts nor remaining defs.  */
    4509         1655 :   vec<stmt_vec_info> root_stmt_infos = vNULL;
    4510         1655 :   vec<tree> remain = vNULL;
    4511              : 
    4512         1655 :   if (dump_enabled_p ())
    4513              :     {
    4514          180 :       dump_printf_loc (MSG_NOTE, vect_location,
    4515              :                        "Starting SLP discovery of reduction chain for\n");
    4516          966 :       for (unsigned i = 0; i < scalar_stmts.length (); ++i)
    4517         1572 :         dump_printf_loc (MSG_NOTE, vect_location,
    4518          786 :                          "  %G", scalar_stmts[i]->stmt);
    4519              :     }
    4520              : 
    4521              :   /* Build the tree for the SLP instance.  */
    4522         1655 :   unsigned int group_size = scalar_stmts.length ();
    4523         1655 :   bool *matches = XALLOCAVEC (bool, group_size);
    4524         1655 :   poly_uint64 max_nunits = 1;
    4525         1655 :   unsigned tree_size = 0;
    4526              : 
    4527              :   /* ???  We need this only for SLP discovery, it is reset right afterwards.  */
    4528         6315 :   for (unsigned i = 0; i < scalar_stmts.length (); ++i)
    4529         4660 :     REDUC_GROUP_FIRST_ELEMENT (scalar_stmts[i]) = scalar_stmts[0];
    4530              : 
    4531         1655 :   slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
    4532              :                                        &max_nunits, matches, limit,
    4533         1655 :                                        &tree_size, bst_map);
    4534              : 
    4535         6315 :   for (unsigned i = 0; i < scalar_stmts.length (); ++i)
    4536         4660 :     REDUC_GROUP_FIRST_ELEMENT (scalar_stmts[i]) = NULL;
    4537              : 
    4538         1655 :   if (node != NULL)
    4539              :     {
    4540              :       /* Create a new SLP instance.  */
    4541         1395 :       slp_instance new_instance = XNEW (class _slp_instance);
    4542         1395 :       SLP_INSTANCE_TREE (new_instance) = node;
    4543         1395 :       SLP_INSTANCE_LOADS (new_instance) = vNULL;
    4544         1395 :       SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
    4545         1395 :       SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
    4546         1395 :       SLP_INSTANCE_KIND (new_instance) = slp_inst_kind_reduc_chain;
    4547         1395 :       new_instance->reduc_phis = NULL;
    4548         1395 :       new_instance->cost_vec = vNULL;
    4549         1395 :       new_instance->subgraph_entries = vNULL;
    4550              : 
    4551         1395 :       vect_reduc_info reduc_info = info_for_reduction (vinfo, node);
    4552         1395 :       reduc_info->is_reduc_chain = true;
    4553              : 
    4554         1395 :       if (dump_enabled_p ())
    4555          135 :         dump_printf_loc (MSG_NOTE, vect_location,
    4556              :                          "SLP size %u vs. limit %u.\n",
    4557              :                          tree_size, max_tree_size);
    4558              : 
    4559              :       /* Fixup SLP reduction chains.  If this is a reduction chain with
    4560              :          a conversion in front amend the SLP tree with a node for that.  */
    4561         1395 :       gimple *scalar_def = STMT_VINFO_REDUC_DEF (reduc_phi_info)->stmt;
    4562         1395 :       if (is_gimple_assign (scalar_def)
    4563         1395 :           && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (scalar_def)))
    4564              :         {
    4565           28 :           stmt_vec_info conv_info = vect_stmt_to_vectorize
    4566           28 :                                         (STMT_VINFO_REDUC_DEF (reduc_phi_info));
    4567           28 :           scalar_stmts = vNULL;
    4568           28 :           scalar_stmts.create (group_size);
    4569           90 :           for (unsigned i = 0; i < group_size; ++i)
    4570           62 :             scalar_stmts.quick_push (conv_info);
    4571           28 :           slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
    4572           28 :           SLP_TREE_VECTYPE (conv)
    4573           28 :             = get_vectype_for_scalar_type (vinfo,
    4574           28 :                                            TREE_TYPE
    4575              :                                              (gimple_assign_lhs (scalar_def)),
    4576              :                                            group_size);
    4577           28 :           SLP_TREE_REDUC_IDX (conv) = 0;
    4578           28 :           conv->cycle_info.id = node->cycle_info.id;
    4579           28 :           SLP_TREE_CHILDREN (conv).quick_push (node);
    4580           28 :           SLP_INSTANCE_TREE (new_instance) = conv;
    4581              :         }
    4582              :       /* Fill the backedge child of the PHI SLP node.  The
    4583              :          general matching code cannot find it because the
    4584              :          scalar code does not reflect how we vectorize the
    4585              :          reduction.  */
    4586         1395 :       use_operand_p use_p;
    4587         1395 :       imm_use_iterator imm_iter;
    4588         1395 :       class loop *loop = LOOP_VINFO_LOOP (vinfo);
    4589         6670 :       FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
    4590              :                              gimple_get_lhs (scalar_def))
    4591              :         /* There are exactly two non-debug uses, the reduction
    4592              :            PHI and the loop-closed PHI node.  */
    4593         3880 :         if (!is_gimple_debug (USE_STMT (use_p))
    4594         3880 :             && gimple_bb (USE_STMT (use_p)) == loop->header)
    4595              :           {
    4596         1395 :             auto_vec<stmt_vec_info, 64> phis (group_size);
    4597         1395 :             stmt_vec_info phi_info = vinfo->lookup_stmt (USE_STMT (use_p));
    4598         5386 :             for (unsigned i = 0; i < group_size; ++i)
    4599         3991 :               phis.quick_push (phi_info);
    4600         1395 :             slp_tree *phi_node = bst_map->get (phis);
    4601         1395 :             unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
    4602         2790 :             SLP_TREE_CHILDREN (*phi_node)[dest_idx]
    4603         1395 :               = SLP_INSTANCE_TREE (new_instance);
    4604         1395 :             SLP_INSTANCE_TREE (new_instance)->refcnt++;
    4605         1395 :           }
    4606              : 
    4607         1395 :       vinfo->slp_instances.safe_push (new_instance);
    4608              : 
    4609              :       /* ???  We've replaced the old SLP_INSTANCE_GROUP_SIZE with
    4610              :          the number of scalar stmts in the root in a few places.
    4611              :          Verify that assumption holds.  */
    4612         2790 :       gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
    4613              :                   .length () == group_size);
    4614              : 
    4615         1395 :       if (dump_enabled_p ())
    4616              :         {
    4617          135 :           dump_printf_loc (MSG_NOTE, vect_location,
    4618              :                            "Final SLP tree for instance %p:\n",
    4619              :                            (void *) new_instance);
    4620          135 :           vect_print_slp_graph (MSG_NOTE, vect_location,
    4621              :                                 SLP_INSTANCE_TREE (new_instance));
    4622              :         }
    4623              : 
    4624         1395 :       return true;
    4625              :     }
    4626              : 
    4627              :   /* Failed to SLP.  */
    4628          260 :   scalar_stmts.release ();
    4629          260 :   if (dump_enabled_p ())
    4630           45 :     dump_printf_loc (MSG_NOTE, vect_location,
    4631              :                      "SLP discovery of reduction chain failed\n");
    4632              :   return false;
    4633              : }
    4634              : 
    4635              : /* Analyze an SLP instance starting from SCALAR_STMTS which are a group
    4636              :    of KIND.  Return true if successful.  */
    4637              : 
    4638              : static bool
    4639        63593 : vect_analyze_slp_reduction (loop_vec_info vinfo,
    4640              :                             stmt_vec_info scalar_stmt,
    4641              :                             unsigned max_tree_size, unsigned *limit,
    4642              :                             scalar_stmts_to_slp_tree_map_t *bst_map,
    4643              :                             bool force_single_lane)
    4644              : {
    4645        63593 :   slp_instance_kind kind = slp_inst_kind_reduc_group;
    4646              : 
    4647              :   /* If there's no budget left bail out early.  */
    4648        63593 :   if (*limit == 0)
    4649              :     return false;
    4650              : 
    4651              :   /* Try to gather a reduction chain.  */
    4652        63593 :   if (! force_single_lane
    4653        43001 :       && STMT_VINFO_DEF_TYPE (scalar_stmt) == vect_reduction_def
    4654       106377 :       && vect_analyze_slp_reduc_chain (vinfo, bst_map, scalar_stmt,
    4655              :                                        max_tree_size, limit))
    4656              :     return true;
    4657              : 
    4658        62142 :   vec<stmt_vec_info> scalar_stmts;
    4659        62142 :   scalar_stmts.create (1);
    4660        62142 :   scalar_stmts.quick_push (scalar_stmt);
    4661              : 
    4662        62142 :   if (dump_enabled_p ())
    4663              :     {
    4664         3338 :       dump_printf_loc (MSG_NOTE, vect_location,
    4665              :                        "Starting SLP discovery for\n");
    4666         6676 :       for (unsigned i = 0; i < scalar_stmts.length (); ++i)
    4667         6676 :         dump_printf_loc (MSG_NOTE, vect_location,
    4668         3338 :                          "  %G", scalar_stmts[i]->stmt);
    4669              :     }
    4670              : 
    4671              :   /* Build the tree for the SLP instance.  */
    4672        62142 :   unsigned int group_size = scalar_stmts.length ();
    4673        62142 :   bool *matches = XALLOCAVEC (bool, group_size);
    4674        62142 :   poly_uint64 max_nunits = 1;
    4675        62142 :   unsigned tree_size = 0;
    4676              : 
    4677        62142 :   slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
    4678              :                                        &max_nunits, matches, limit,
    4679              :                                        &tree_size, bst_map);
    4680        62142 :   if (node != NULL)
    4681              :     {
    4682              :       /* Create a new SLP instance.  */
    4683        59557 :       slp_instance new_instance = XNEW (class _slp_instance);
    4684        59557 :       SLP_INSTANCE_TREE (new_instance) = node;
    4685        59557 :       SLP_INSTANCE_LOADS (new_instance) = vNULL;
    4686        59557 :       SLP_INSTANCE_ROOT_STMTS (new_instance) = vNULL;
    4687        59557 :       SLP_INSTANCE_REMAIN_DEFS (new_instance) = vNULL;
    4688        59557 :       SLP_INSTANCE_KIND (new_instance) = kind;
    4689        59557 :       new_instance->reduc_phis = NULL;
    4690        59557 :       new_instance->cost_vec = vNULL;
    4691        59557 :       new_instance->subgraph_entries = vNULL;
    4692              : 
    4693        59557 :       if (dump_enabled_p ())
    4694         3222 :         dump_printf_loc (MSG_NOTE, vect_location,
    4695              :                          "SLP size %u vs. limit %u.\n",
    4696              :                          tree_size, max_tree_size);
    4697              : 
    4698        59557 :       vinfo->slp_instances.safe_push (new_instance);
    4699              : 
    4700              :       /* ???  We've replaced the old SLP_INSTANCE_GROUP_SIZE with
    4701              :          the number of scalar stmts in the root in a few places.
    4702              :          Verify that assumption holds.  */
    4703       119114 :       gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
    4704              :                   .length () == group_size);
    4705              : 
    4706        59557 :       if (dump_enabled_p ())
    4707              :         {
    4708         3222 :           dump_printf_loc (MSG_NOTE, vect_location,
    4709              :                            "Final SLP tree for instance %p:\n",
    4710              :                            (void *) new_instance);
    4711         3222 :           vect_print_slp_graph (MSG_NOTE, vect_location,
    4712              :                                 SLP_INSTANCE_TREE (new_instance));
    4713              :         }
    4714              : 
    4715        59557 :       return true;
    4716              :     }
    4717              :   /* Failed to SLP.  */
    4718              : 
    4719              :   /* Free the allocated memory.  */
    4720         2585 :   scalar_stmts.release ();
    4721              : 
    4722              :   /* Failed to SLP.  */
    4723         2585 :   if (dump_enabled_p ())
    4724          116 :     dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
    4725              :   return false;
    4726              : }
    4727              : 
    4728              : /* Analyze a single SLP reduction group.  If successful add a SLP instance
    4729              :    for it and return true, otherwise return false and have *MATCHES
    4730              :    populated.  */
    4731              : 
    4732              : static bool
    4733        18143 : vect_analyze_slp_reduction_group (loop_vec_info loop_vinfo,
    4734              :                                   vec<stmt_vec_info> scalar_stmts,
    4735              :                                   scalar_stmts_to_slp_tree_map_t *bst_map,
    4736              :                                   unsigned max_tree_size, unsigned *limit,
    4737              :                                   bool *matches)
    4738              : {
    4739              :   /* Try to form a reduction group.  */
    4740        18143 :   unsigned int group_size = scalar_stmts.length ();
    4741        18143 :   if (!matches)
    4742         7417 :     matches = XALLOCAVEC (bool, group_size);
    4743        18143 :   poly_uint64 max_nunits = 1;
    4744        18143 :   unsigned tree_size = 0;
    4745        18143 :   slp_tree node = vect_build_slp_tree (loop_vinfo, scalar_stmts,
    4746              :                                        group_size,
    4747              :                                        &max_nunits, matches, limit,
    4748              :                                        &tree_size, bst_map);
    4749        18143 :   if (!node)
    4750              :     return false;
    4751              : 
    4752              :   /* Create a new SLP instance.  */
    4753         8601 :   slp_instance new_instance = XNEW (class _slp_instance);
    4754         8601 :   SLP_INSTANCE_TREE (new_instance) = node;
    4755         8601 :   SLP_INSTANCE_LOADS (new_instance) = vNULL;
    4756         8601 :   SLP_INSTANCE_ROOT_STMTS (new_instance) = vNULL;
    4757         8601 :   SLP_INSTANCE_REMAIN_DEFS (new_instance) = vNULL;
    4758         8601 :   SLP_INSTANCE_KIND (new_instance) = slp_inst_kind_reduc_group;
    4759         8601 :   new_instance->reduc_phis = NULL;
    4760         8601 :   new_instance->cost_vec = vNULL;
    4761         8601 :   new_instance->subgraph_entries = vNULL;
    4762              : 
    4763         8601 :   if (dump_enabled_p ())
    4764          544 :     dump_printf_loc (MSG_NOTE, vect_location,
    4765              :                      "SLP size %u vs. limit %u.\n",
    4766              :                      tree_size, max_tree_size);
    4767              : 
    4768         8601 :   loop_vinfo->slp_instances.safe_push (new_instance);
    4769              : 
    4770              :   /* ???  We've replaced the old SLP_INSTANCE_GROUP_SIZE with
    4771              :      the number of scalar stmts in the root in a few places.
    4772              :      Verify that assumption holds.  */
    4773        17202 :   gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
    4774              :               .length () == group_size);
    4775              : 
    4776         8601 :   if (dump_enabled_p ())
    4777              :     {
    4778          544 :       dump_printf_loc (MSG_NOTE, vect_location,
    4779              :                        "SLP discovery of size %d reduction group "
    4780              :                        "succeeded\n", group_size);
    4781          544 :       dump_printf_loc (MSG_NOTE, vect_location,
    4782              :                        "Final SLP tree for instance %p:\n",
    4783              :                        (void *) new_instance);
    4784          544 :       vect_print_slp_graph (MSG_NOTE, vect_location,
    4785              :                             SLP_INSTANCE_TREE (new_instance));
    4786              :     }
    4787              : 
    4788              :   return true;
    4789              : }
    4790              : 
    4791              : /* Analyze reductions in LOOP_VINFO and populate SLP instances
    4792              :    accordingly.  Returns false if something fails.  */
    4793              : 
    4794              : static bool
    4795       423314 : vect_analyze_slp_reductions (loop_vec_info loop_vinfo,
    4796              :                              unsigned max_tree_size, unsigned *limit,
    4797              :                              scalar_stmts_to_slp_tree_map_t *bst_map,
    4798              :                              bool force_single_lane)
    4799              : {
    4800       470668 :   if (loop_vinfo->reductions.is_empty ())
    4801              :     return true;
    4802              : 
    4803              :   /* Collect reduction statements we can combine into
    4804              :      a SLP reduction.  */
    4805        53144 :   vec<stmt_vec_info> scalar_stmts;
    4806        53144 :   scalar_stmts.create (loop_vinfo->reductions.length ());
    4807       234289 :   for (auto next_info : loop_vinfo->reductions)
    4808              :     {
    4809        74857 :       next_info = vect_stmt_to_vectorize (next_info);
    4810        74857 :       if ((STMT_VINFO_RELEVANT_P (next_info)
    4811           14 :            || STMT_VINFO_LIVE_P (next_info))
    4812              :           /* ???  Make sure we didn't skip a conversion around a
    4813              :              reduction path.  In that case we'd have to reverse
    4814              :              engineer that conversion stmt following the chain using
    4815              :              reduc_idx and from the PHI using reduc_def.  */
    4816        74843 :           && (STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def
    4817        74843 :               || (STMT_VINFO_DEF_TYPE (next_info)
    4818              :                   == vect_double_reduction_def)))
    4819              :         {
    4820              :           /* Do not discover SLP reductions combining lane-reducing
    4821              :              ops, that will fail later.  */
    4822        74843 :           if (!force_single_lane
    4823        74843 :               && !lane_reducing_stmt_p (STMT_VINFO_STMT (next_info)))
    4824        53816 :             scalar_stmts.quick_push (next_info);
    4825              :           /* Do SLP discovery for single-lane reductions.  */
    4826        21027 :           else if (! vect_analyze_slp_reduction (loop_vinfo, next_info,
    4827              :                                                  max_tree_size, limit,
    4828              :                                                  bst_map,
    4829              :                                                  force_single_lane))
    4830              :             {
    4831            0 :               scalar_stmts.release ();
    4832            0 :               return false;
    4833              :             }
    4834              :         }
    4835              :     }
    4836              : 
    4837        53144 :   if (scalar_stmts.length () > 1)
    4838              :     {
    4839              :       /* Try to form a reduction group.  */
    4840         3331 :       unsigned int group_size = scalar_stmts.length ();
    4841         3331 :       bool *matches = XALLOCAVEC (bool, group_size);
    4842         3331 :       if (vect_analyze_slp_reduction_group (loop_vinfo, scalar_stmts, bst_map,
    4843              :                                             max_tree_size, limit, matches))
    4844         3227 :         return true;
    4845              : 
    4846              :       /* When analysis as a single SLP reduction group failed try to
    4847              :          form sub-groups by collecting matching lanes.  Do not recurse
    4848              :          that on failure (to limit compile-time costs), but recurse
    4849              :          for the initial non-matching parts.  Everything not covered
    4850              :          by a sub-group gets single-reduction treatment.  */
    4851         2418 :       vec<stmt_vec_info> cands = vNULL;
    4852         7521 :       while (matches[0])
    4853              :         {
    4854         7417 :           cands.truncate (0);
    4855         7417 :           cands.reserve (group_size, true);
    4856        58074 :           for (unsigned i = 0; i < group_size; ++i)
    4857        50657 :             if (matches[i])
    4858        12395 :               cands.quick_push (scalar_stmts[i]);
    4859              : 
    4860              :           /* Try to form a reduction group.  */
    4861         7417 :           if (vect_analyze_slp_reduction_group (loop_vinfo, cands, bst_map,
    4862              :                                                 max_tree_size, limit, NULL))
    4863         5396 :             cands = vNULL;
    4864              :           else
    4865              :             {
    4866              :               /* Do SLP discovery for single-lane reductions.  */
    4867        12272 :               for (auto stmt_info : cands)
    4868         6231 :                 if (! vect_analyze_slp_reduction (loop_vinfo,
    4869              :                                                   vect_stmt_to_vectorize
    4870              :                                                     (stmt_info),
    4871              :                                                   max_tree_size, limit,
    4872              :                                                   bst_map, force_single_lane))
    4873              :                   {
    4874           22 :                     scalar_stmts.release ();
    4875           22 :                     cands.release ();
    4876           22 :                     return false;
    4877              :                   }
    4878              :             }
    4879              :           /* Remove the handled stmts from scalar_stmts and try again,
    4880              :              possibly repeating the above with updated matches[].  */
    4881              :           unsigned j = 0;
    4882        57990 :           for (unsigned i = 0; i < group_size; ++i)
    4883        50595 :             if (!matches[i])
    4884              :               {
    4885        38235 :                 scalar_stmts[j] = scalar_stmts[i];
    4886        38235 :                 ++j;
    4887              :               }
    4888         7395 :           scalar_stmts.truncate (j);
    4889         7395 :           group_size = scalar_stmts.length ();
    4890         7395 :           if (vect_analyze_slp_reduction_group (loop_vinfo, scalar_stmts,
    4891              :                                                 bst_map, max_tree_size, limit,
    4892              :                                                 matches))
    4893              :             return true;
    4894              :         }
    4895              :     }
    4896              :   /* Do SLP discovery for single-lane reductions.  */
    4897       183523 :   for (auto stmt_info : scalar_stmts)
    4898        36335 :     if (! vect_analyze_slp_reduction (loop_vinfo,
    4899              :                                       vect_stmt_to_vectorize (stmt_info),
    4900              :                                       max_tree_size, limit,
    4901              :                                       bst_map, force_single_lane))
    4902              :       {
    4903         2563 :         scalar_stmts.release ();
    4904         2563 :         return false;
    4905              :       }
    4906              : 
    4907        47354 :   scalar_stmts.release ();
    4908        47354 :   return true;
    4909              : }
    4910              : 
    4911              : /* Analyze an SLP instance starting from a group of grouped stores.  Call
    4912              :    vect_build_slp_tree to build a tree of packed stmts if possible.
    4913              :    Return FALSE if it's impossible to SLP any stmt in the group.  */
    4914              : 
    4915              : static bool
    4916      1083263 : vect_analyze_slp_instance (vec_info *vinfo,
    4917              :                            scalar_stmts_to_slp_tree_map_t *bst_map,
    4918              :                            stmt_vec_info stmt_info,
    4919              :                            slp_instance_kind kind,
    4920              :                            unsigned max_tree_size, unsigned *limit,
    4921              :                            bool force_single_lane)
    4922              : {
    4923      1083263 :   vec<stmt_vec_info> scalar_stmts;
    4924              : 
    4925      1083263 :   if (is_a <bb_vec_info> (vinfo))
    4926      1059535 :     vect_location = stmt_info->stmt;
    4927              : 
    4928      1083263 :   gcc_assert (kind == slp_inst_kind_store);
    4929              : 
    4930              :   /* Collect the stores and store them in scalar_stmts.  */
    4931      1083263 :   scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
    4932      1083263 :   stmt_vec_info next_info = stmt_info;
    4933      5376605 :   while (next_info)
    4934              :     {
    4935      3210079 :       scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
    4936      3210079 :       next_info = DR_GROUP_NEXT_ELEMENT (next_info);
    4937              :     }
    4938              : 
    4939      1083263 :   vec<stmt_vec_info> root_stmt_infos = vNULL;
    4940      1083263 :   vec<tree> remain = vNULL;
    4941              : 
    4942              :   /* Build the tree for the SLP instance.  */
    4943              : 
    4944              :   /* If there's no budget left bail out early.  */
    4945      1083263 :   if (*limit == 0)
    4946              :     return false;
    4947              : 
    4948      1083240 :   if (dump_enabled_p ())
    4949              :     {
    4950         4109 :       dump_printf_loc (MSG_NOTE, vect_location,
    4951              :                        "Starting SLP discovery for\n");
    4952        23674 :       for (unsigned i = 0; i < scalar_stmts.length (); ++i)
    4953        39130 :         dump_printf_loc (MSG_NOTE, vect_location,
    4954        19565 :                          "  %G", scalar_stmts[i]->stmt);
    4955              :     }
    4956              : 
    4957              :   /* Build the tree for the SLP instance.  */
    4958      1083240 :   unsigned int group_size = scalar_stmts.length ();
    4959      1083240 :   bool *matches = XALLOCAVEC (bool, group_size);
    4960      1083240 :   poly_uint64 max_nunits = 1;
    4961      1083240 :   unsigned tree_size = 0;
    4962      1083240 :   unsigned i;
    4963              : 
    4964      1083240 :   slp_tree node = NULL;
    4965      1083240 :   if (group_size > 1 && force_single_lane)
    4966              :     {
    4967         1498 :       matches[0] = true;
    4968         1498 :       matches[1] = false;
    4969              :     }
    4970              :   else
    4971      1081742 :     node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
    4972              :                                 &max_nunits, matches, limit,
    4973              :                                 &tree_size, bst_map);
    4974      1083240 :   if (node != NULL)
    4975              :     {
    4976              :       /* Calculate the unrolling factor based on the smallest type.  */
    4977       674116 :       poly_uint64 unrolling_factor
    4978       674116 :         = calculate_unrolling_factor (max_nunits, group_size);
    4979              : 
    4980       674116 :       if (maybe_ne (unrolling_factor, 1U)
    4981       674116 :           && is_a <bb_vec_info> (vinfo))
    4982              :         {
    4983            0 :           unsigned HOST_WIDE_INT const_max_nunits;
    4984            0 :           if (!max_nunits.is_constant (&const_max_nunits)
    4985            0 :               || const_max_nunits > group_size)
    4986              :             {
    4987            0 :               if (dump_enabled_p ())
    4988            0 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    4989              :                                  "Build SLP failed: store group "
    4990              :                                  "size not a multiple of the vector size "
    4991              :                                  "in basic block SLP\n");
    4992            0 :               vect_free_slp_tree (node);
    4993            0 :               return false;
    4994              :             }
    4995              :           /* Fatal mismatch.  */
    4996            0 :           if (dump_enabled_p ())
    4997            0 :             dump_printf_loc (MSG_NOTE, vect_location,
    4998              :                              "SLP discovery succeeded but node needs "
    4999              :                              "splitting\n");
    5000            0 :           memset (matches, true, group_size);
    5001            0 :           matches[group_size / const_max_nunits * const_max_nunits] = false;
    5002            0 :           vect_free_slp_tree (node);
    5003              :         }
    5004              :       else
    5005              :         {
    5006              :           /* Create a new SLP instance.  */
    5007       674116 :           slp_instance new_instance = XNEW (class _slp_instance);
    5008       674116 :           SLP_INSTANCE_TREE (new_instance) = node;
    5009       674116 :           SLP_INSTANCE_LOADS (new_instance) = vNULL;
    5010       674116 :           SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
    5011       674116 :           SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
    5012       674116 :           SLP_INSTANCE_KIND (new_instance) = kind;
    5013       674116 :           new_instance->reduc_phis = NULL;
    5014       674116 :           new_instance->cost_vec = vNULL;
    5015       674116 :           new_instance->subgraph_entries = vNULL;
    5016              : 
    5017       674116 :           if (dump_enabled_p ())
    5018         3126 :             dump_printf_loc (MSG_NOTE, vect_location,
    5019              :                              "SLP size %u vs. limit %u.\n",
    5020              :                              tree_size, max_tree_size);
    5021              : 
    5022       674116 :           vinfo->slp_instances.safe_push (new_instance);
    5023              : 
    5024              :           /* ???  We've replaced the old SLP_INSTANCE_GROUP_SIZE with
    5025              :              the number of scalar stmts in the root in a few places.
    5026              :              Verify that assumption holds.  */
    5027      1348232 :           gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
    5028              :                         .length () == group_size);
    5029              : 
    5030       674116 :           if (dump_enabled_p ())
    5031              :             {
    5032         3126 :               dump_printf_loc (MSG_NOTE, vect_location,
    5033              :                                "Final SLP tree for instance %p:\n",
    5034              :                                (void *) new_instance);
    5035         3126 :               vect_print_slp_graph (MSG_NOTE, vect_location,
    5036              :                                     SLP_INSTANCE_TREE (new_instance));
    5037              :             }
    5038              : 
    5039       674116 :           return true;
    5040              :         }
    5041              :     }
    5042              :   /* Failed to SLP.  */
    5043              : 
    5044              :   /* Try to break the group up into pieces.  */
    5045       409124 :   if (*limit > 0 && kind == slp_inst_kind_store)
    5046              :     {
    5047              :       /* ???  We could delay all the actual splitting of store-groups
    5048              :          until after SLP discovery of the original group completed.
    5049              :          Then we can recurse to vect_build_slp_instance directly.  */
    5050      1071610 :       for (i = 0; i < group_size; i++)
    5051      1071610 :         if (!matches[i])
    5052              :           break;
    5053              : 
    5054              :       /* For basic block SLP, try to break the group up into multiples of
    5055              :          a vector size.  */
    5056       409123 :       if (is_a <bb_vec_info> (vinfo)
    5057       409123 :           && (i > 1 && i < group_size))
    5058              :         {
    5059              :           /* Free the allocated memory.  */
    5060       153686 :           scalar_stmts.release ();
    5061              : 
    5062       153686 :           tree scalar_type
    5063       153686 :             = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
    5064       307372 :           tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
    5065       153686 :                                                       1 << floor_log2 (i));
    5066       153686 :           unsigned HOST_WIDE_INT const_nunits;
    5067       153686 :           if (vectype
    5068       153686 :               && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
    5069              :             {
    5070              :               /* Split into two groups at the first vector boundary.  */
    5071       153686 :               gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
    5072       153686 :               unsigned group1_size = i & ~(const_nunits - 1);
    5073              : 
    5074       153686 :               if (dump_enabled_p ())
    5075           59 :                 dump_printf_loc (MSG_NOTE, vect_location,
    5076              :                                  "Splitting SLP group at stmt %u\n", i);
    5077       153686 :               stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
    5078              :                                                                group1_size);
    5079       153686 :               bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
    5080              :                                                     kind, max_tree_size,
    5081              :                                                     limit, false);
    5082              :               /* Split the rest at the failure point and possibly
    5083              :                  re-analyze the remaining matching part if it has
    5084              :                  at least two lanes.  */
    5085       153686 :               if (group1_size < i
    5086         5271 :                   && (i + 1 < group_size
    5087         2901 :                       || i - group1_size > 1))
    5088              :                 {
    5089         2402 :                   stmt_vec_info rest2 = rest;
    5090         2402 :                   rest = vect_split_slp_store_group (rest, i - group1_size);
    5091         2402 :                   if (i - group1_size > 1)
    5092           61 :                     res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
    5093              :                                                       kind, max_tree_size,
    5094              :                                                       limit, false);
    5095              :                 }
    5096              :               /* Re-analyze the non-matching tail if it has at least
    5097              :                  two lanes.  */
    5098       153686 :               if (i + 1 < group_size)
    5099        21730 :                 res |= vect_analyze_slp_instance (vinfo, bst_map,
    5100              :                                                   rest, kind, max_tree_size,
    5101              :                                                   limit, false);
    5102       153686 :               return res;
    5103              :             }
    5104              :         }
    5105              : 
    5106              :       /* For loop vectorization split the RHS into arbitrary pieces of
    5107              :          size >= 1.  */
    5108       255437 :       else if (is_a <loop_vec_info> (vinfo)
    5109       255437 :                && (group_size != 1 && i < group_size))
    5110              :         {
    5111         6434 :           gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
    5112           28 :           bool masked_p = call
    5113           28 :               && gimple_call_internal_p (call)
    5114           28 :               && internal_fn_mask_index (gimple_call_internal_fn (call)) != -1;
    5115              :           /* There are targets that cannot do even/odd interleaving schemes
    5116              :              so they absolutely need to use load/store-lanes.  For now
    5117              :              force single-lane SLP for them - they would be happy with
    5118              :              uniform power-of-two lanes (but depending on element size),
    5119              :              but even if we can use 'i' as indicator we would need to
    5120              :              backtrack when later lanes fail to discover with the same
    5121              :              granularity.  We cannot turn any of strided or scatter store
    5122              :              into store-lanes.  */
    5123              :           /* ???  If this is not in sync with what get_load_store_type
    5124              :              later decides the SLP representation is not good for other
    5125              :              store vectorization methods.  */
    5126         6434 :           bool want_store_lanes
    5127         6434 :             = (! STMT_VINFO_GATHER_SCATTER_P (stmt_info)
    5128         6434 :                && ! STMT_VINFO_STRIDED_P (stmt_info)
    5129         4893 :                && ! STMT_VINFO_SLP_VECT_ONLY (stmt_info)
    5130         4889 :                && compare_step_with_zero (vinfo, stmt_info) > 0
    5131        11300 :                && vect_slp_prefer_store_lanes_p (vinfo, stmt_info, NULL_TREE,
    5132        12868 :                                                  masked_p, group_size, i));
    5133         6434 :           if (want_store_lanes || force_single_lane)
    5134              :             i = 1;
    5135              : 
    5136              :           /* A fatal discovery fail doesn't always mean single-lane SLP
    5137              :              isn't a possibility, so try.  */
    5138         4936 :           if (i == 0)
    5139              :             i = 1;
    5140              : 
    5141         6434 :           if (dump_enabled_p ())
    5142          882 :             dump_printf_loc (MSG_NOTE, vect_location,
    5143              :                              "Splitting SLP group at stmt %u\n", i);
    5144              : 
    5145              :           /* Analyze the stored values and pinch them together with
    5146              :              a permute node so we can preserve the whole store group.  */
    5147         6434 :           auto_vec<slp_tree> rhs_nodes;
    5148         6434 :           poly_uint64 max_nunits = 1;
    5149              : 
    5150         6434 :           unsigned int rhs_common_nlanes = 0;
    5151         6434 :           unsigned int start = 0, end = i;
    5152        29167 :           while (start < group_size)
    5153              :             {
    5154        22963 :               gcc_assert (end - start >= 1);
    5155        22963 :               vec<stmt_vec_info> substmts;
    5156        22963 :               substmts.create (end - start);
    5157        69463 :               for (unsigned j = start; j < end; ++j)
    5158        46500 :                 substmts.quick_push (scalar_stmts[j]);
    5159        22963 :               max_nunits = 1;
    5160        22963 :               node = vect_build_slp_tree (vinfo, substmts, end - start,
    5161              :                                           &max_nunits,
    5162              :                                           matches, limit, &tree_size, bst_map);
    5163        22963 :               if (node)
    5164              :                 {
    5165        18270 :                   rhs_nodes.safe_push (node);
    5166        18270 :                   vect_update_max_nunits (&max_nunits, node->max_nunits);
    5167        18270 :                   if (start == 0)
    5168         6208 :                     rhs_common_nlanes = SLP_TREE_LANES (node);
    5169        12062 :                   else if (rhs_common_nlanes != SLP_TREE_LANES (node))
    5170         1267 :                     rhs_common_nlanes = 0;
    5171        18270 :                   start = end;
    5172        18270 :                   if (want_store_lanes || force_single_lane)
    5173         4532 :                     end = start + 1;
    5174              :                   else
    5175              :                     end = group_size;
    5176              :                 }
    5177              :               else
    5178              :                 {
    5179         4693 :                   substmts.release ();
    5180         4693 :                   if (end - start == 1)
    5181              :                     {
    5182              :                       /* Single-lane discovery failed.  Free resources.  */
    5183          244 :                       for (auto node : rhs_nodes)
    5184            6 :                         vect_free_slp_tree (node);
    5185          230 :                       scalar_stmts.release ();
    5186          230 :                       if (dump_enabled_p ())
    5187           38 :                         dump_printf_loc (MSG_NOTE, vect_location,
    5188              :                                          "SLP discovery failed\n");
    5189          230 :                       return false;
    5190              :                     }
    5191              : 
    5192              :                   /* ???  It really happens that we soft-fail SLP
    5193              :                      build at a mismatch but the matching part hard-fails
    5194              :                      later.  As we know we arrived here with a group
    5195              :                      larger than one try a group of size one!  */
    5196         4463 :                   if (!matches[0])
    5197           42 :                     end = start + 1;
    5198              :                   else
    5199         9934 :                     for (unsigned j = start; j < end; j++)
    5200         9934 :                       if (!matches[j - start])
    5201              :                         {
    5202              :                           end = j;
    5203              :                           break;
    5204              :                         }
    5205              :                 }
    5206              :             }
    5207              : 
    5208              :           /* Now re-assess whether we want store lanes in case the
    5209              :              discovery ended up producing all single-lane RHSs.  */
    5210         6204 :           if (! want_store_lanes
    5211         6204 :               && rhs_common_nlanes == 1
    5212         5339 :               && ! STMT_VINFO_GATHER_SCATTER_P (stmt_info)
    5213         5339 :               && ! STMT_VINFO_STRIDED_P (stmt_info)
    5214         4052 :               && ! STMT_VINFO_SLP_VECT_ONLY (stmt_info)
    5215         4049 :               && compare_step_with_zero (vinfo, stmt_info) > 0
    5216        10242 :               && (vect_store_lanes_supported (SLP_TREE_VECTYPE (rhs_nodes[0]),
    5217              :                                               group_size, masked_p)
    5218              :                   != IFN_LAST))
    5219              :             want_store_lanes = true;
    5220              : 
    5221              :           /* Now we assume we can build the root SLP node from all stores.  */
    5222         6204 :           if (want_store_lanes)
    5223              :             {
    5224              :               /* For store-lanes feed the store node with all RHS nodes
    5225              :                  in order.  */
    5226            0 :               node = vect_create_new_slp_node (scalar_stmts,
    5227            0 :                                                SLP_TREE_CHILDREN
    5228              :                                                  (rhs_nodes[0]).length ());
    5229            0 :               SLP_TREE_VECTYPE (node) = SLP_TREE_VECTYPE (rhs_nodes[0]);
    5230            0 :               node->max_nunits = max_nunits;
    5231            0 :               node->ldst_lanes = true;
    5232            0 :               SLP_TREE_CHILDREN (node)
    5233            0 :                 .reserve_exact (SLP_TREE_CHILDREN (rhs_nodes[0]).length ()
    5234            0 :                                 + rhs_nodes.length () - 1);
    5235              :               /* First store value and possibly mask.  */
    5236            0 :               SLP_TREE_CHILDREN (node)
    5237            0 :                 .splice (SLP_TREE_CHILDREN (rhs_nodes[0]));
    5238              :               /* Rest of the store values.  All mask nodes are the same,
    5239              :                  this should be guaranteed by dataref group discovery.  */
    5240            0 :               for (unsigned j = 1; j < rhs_nodes.length (); ++j)
    5241            0 :                 SLP_TREE_CHILDREN (node)
    5242            0 :                   .quick_push (SLP_TREE_CHILDREN (rhs_nodes[j])[0]);
    5243            0 :               for (slp_tree child : SLP_TREE_CHILDREN (node))
    5244            0 :                 child->refcnt++;
    5245              :             }
    5246              :           else
    5247         6204 :             node = vect_build_slp_store_interleaving (rhs_nodes, scalar_stmts,
    5248              :                                                       max_nunits);
    5249              : 
    5250        24468 :           while (!rhs_nodes.is_empty ())
    5251        18264 :             vect_free_slp_tree (rhs_nodes.pop ());
    5252              : 
    5253              :           /* Create a new SLP instance.  */
    5254         6204 :           slp_instance new_instance = XNEW (class _slp_instance);
    5255         6204 :           SLP_INSTANCE_TREE (new_instance) = node;
    5256         6204 :           SLP_INSTANCE_LOADS (new_instance) = vNULL;
    5257         6204 :           SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
    5258         6204 :           SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
    5259         6204 :           SLP_INSTANCE_KIND (new_instance) = kind;
    5260         6204 :           new_instance->reduc_phis = NULL;
    5261         6204 :           new_instance->cost_vec = vNULL;
    5262         6204 :           new_instance->subgraph_entries = vNULL;
    5263              : 
    5264         6204 :           if (dump_enabled_p ())
    5265          844 :             dump_printf_loc (MSG_NOTE, vect_location,
    5266              :                              "SLP size %u vs. limit %u.\n",
    5267              :                              tree_size, max_tree_size);
    5268              : 
    5269         6204 :           vinfo->slp_instances.safe_push (new_instance);
    5270              : 
    5271              :           /* ???  We've replaced the old SLP_INSTANCE_GROUP_SIZE with
    5272              :              the number of scalar stmts in the root in a few places.
    5273              :              Verify that assumption holds.  */
    5274        12408 :           gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
    5275              :                         .length () == group_size);
    5276              : 
    5277         6204 :           if (dump_enabled_p ())
    5278              :             {
    5279          844 :               dump_printf_loc (MSG_NOTE, vect_location,
    5280              :                                "Final SLP tree for instance %p:\n",
    5281              :                                (void *) new_instance);
    5282          844 :               vect_print_slp_graph (MSG_NOTE, vect_location,
    5283              :                                     SLP_INSTANCE_TREE (new_instance));
    5284              :             }
    5285         6204 :           return true;
    5286         6434 :         }
    5287              :       else
    5288              :         /* Free the allocated memory.  */
    5289       249003 :         scalar_stmts.release ();
    5290              : 
    5291              :       /* Even though the first vector did not all match, we might be able to SLP
    5292              :          (some) of the remainder.  FORNOW ignore this possibility.  */
    5293              :     }
    5294              :   else
    5295              :     /* Free the allocated memory.  */
    5296            1 :     scalar_stmts.release ();
    5297              : 
    5298              :   /* Failed to SLP.  */
    5299       249004 :   if (dump_enabled_p ())
    5300           42 :     dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
    5301              :   return false;
    5302              : }
    5303              : 
    5304              : /* qsort comparator for SLP load nodes, keeping loads of a group together.  */
    5305              : 
    5306              : static int
    5307      2243707 : vllp_cmp (const void *a_, const void *b_)
    5308              : {
    5309      2243707 :   const slp_tree a = *(const slp_tree *)a_;
    5310      2243707 :   const slp_tree b = *(const slp_tree *)b_;
    5311      2243707 :   stmt_vec_info a0 = SLP_TREE_SCALAR_STMTS (a)[0];
    5312      2243707 :   stmt_vec_info b0 = SLP_TREE_SCALAR_STMTS (b)[0];
    5313      2243707 :   if (STMT_VINFO_GROUPED_ACCESS (a0)
    5314      1374376 :       && STMT_VINFO_GROUPED_ACCESS (b0)
    5315      3557681 :       && DR_GROUP_FIRST_ELEMENT (a0) == DR_GROUP_FIRST_ELEMENT (b0))
    5316              :     {
    5317              :       /* Same group: the node using more lanes sorts first.  */
    5318       296477 :       if (SLP_TREE_LANES (a) < SLP_TREE_LANES (b))
    5319              :         return 1;
    5320       290446 :       else if (SLP_TREE_LANES (a) > SLP_TREE_LANES (b))
    5321              :         return -1;
    5322              :       else
    5323              :         {
    5324              :           /* Same lane count: a node without load permutation sorts first,
    5325              :              otherwise break the tie at the first lane that differs.  */
    5326       283788 :           if (!SLP_TREE_LOAD_PERMUTATION (a).exists ()
    5327       283788 :               && !SLP_TREE_LOAD_PERMUTATION (b).exists ())
    5328              :             return 0;
    5329       283788 :           else if (SLP_TREE_LOAD_PERMUTATION (a).exists ()
    5330       283788 :                    && !SLP_TREE_LOAD_PERMUTATION (b).exists ())
    5331              :             return 1;
    5332       281194 :           else if (!SLP_TREE_LOAD_PERMUTATION (a).exists ()
    5333       281194 :                    && SLP_TREE_LOAD_PERMUTATION (b).exists ())
    5334              :             return -1;
    5335              :           else
    5336              :             {
    5337       276414 :               for (unsigned i = 0; i < SLP_TREE_LANES (a); ++i)
    5338       276414 :                 if (SLP_TREE_LOAD_PERMUTATION (a)[i]
    5339       276414 :                     != SLP_TREE_LOAD_PERMUTATION (b)[i])
    5340              :                   {
    5341              :                     /* An in-place lane (perm[i] == i) sorts first, matching
    5342              :                        the no-permutation case above; else the lower index.  */
    5343       275582 :                     if (SLP_TREE_LOAD_PERMUTATION (a)[i] == i)
    5344              :                       return -1;
    5345       167937 :                     else if (SLP_TREE_LOAD_PERMUTATION (b)[i] == i)
    5346              :                       return 1;
    5347        88830 :                     else if (SLP_TREE_LOAD_PERMUTATION (a)[i]
    5348        88830 :                              < SLP_TREE_LOAD_PERMUTATION (b)[i])
    5349              :                       return -1;
    5350              :                     else
    5351              :                       return 1;
    5352              :                   }
    5353              :               return 0;
    5354              :             }
    5355              :         }
    5356              :     }
    5357              :   else /* Different groups or non-groups.  */
    5358              :     {
    5359              :       /* Compare groups by their first element to keep them together.  */
    5360      1947230 :       if (STMT_VINFO_GROUPED_ACCESS (a0))
    5361      1947230 :         a0 = DR_GROUP_FIRST_ELEMENT (a0);
    5362      1947230 :       if (STMT_VINFO_GROUPED_ACCESS (b0))
    5363      1947230 :         b0 = DR_GROUP_FIRST_ELEMENT (b0);
    5364      1947230 :       if (a0 == b0)
    5365              :         return 0;
    5366              :       /* Break the tie using the stmt UID, which is asserted to differ.  */
    5367      1947110 :       else if (gimple_uid (STMT_VINFO_STMT (a0))
    5368      1947110 :                < gimple_uid (STMT_VINFO_STMT (b0)))
    5369              :         return -1;
    5370              :       else
    5371              :         {
    5372       856954 :           gcc_assert (gimple_uid (STMT_VINFO_STMT (a0))
    5373              :                       != gimple_uid (STMT_VINFO_STMT (b0)));
    5374              :           return 1;
    5375              :         }
    5376              :     }
    5377              : }
    5378              : 
    5379              : /* Return true if the load permutation of NODE exists, is non-empty and is
    5380              :    consecutive starting with value START_VAL in the first element.  A
    5381              :    START_VAL of UINT_MAX (not given) means use the first element's value.  */
    5382              : 
    5383              : bool
    5384       544354 : vect_load_perm_consecutive_p (slp_tree node, unsigned start_val)
    5385              : {
    5386       544354 :   load_permutation_t perm = SLP_TREE_LOAD_PERMUTATION (node);
    5387              : 
    5388       544354 :   if (!perm.exists () || !perm.length ())
    5389              :     return false;
    5390              : 
    5391       544354 :   if (start_val == UINT_MAX)
    5392        74008 :     start_val = perm[0];
    5393              : 
    5394      1076385 :   for (unsigned int i = 0; i < perm.length (); i++)
    5395       550147 :     if (perm[i] != start_val + (unsigned int) i)
    5396              :       return false;
    5397              : 
    5398              :   return true;
    5399              : }
    5400              : 
    5401              : /* Process the set of LOADS that are all from the same dataref group.  */
    5402              : 
    5403              : static void
    5404       151287 : vect_lower_load_permutations (loop_vec_info loop_vinfo,
    5405              :                               scalar_stmts_to_slp_tree_map_t *bst_map,
    5406              :                               const array_slice<slp_tree> &loads,
    5407              :                               bool force_single_lane)
    5408              : {
    5409              :   /* We at this point want to lower without a fixed VF or vector
    5410              :      size in mind which means we cannot actually compute whether we
    5411              :      need three or more vectors for a load permutation yet.  So always
    5412              :      lower.  */
    5413       151287 :   stmt_vec_info first
    5414       151287 :     = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (loads[0])[0]);
    5415       151287 :   unsigned group_lanes = DR_GROUP_SIZE (first);
    5416              : 
    5417              :   /* Verify if all load permutations can be implemented with a suitably
    5418              :      large element load-lanes operation.  */
    5419       151287 :   unsigned ld_lanes_lanes = SLP_TREE_LANES (loads[0]);
    5420       151287 :   if (STMT_VINFO_STRIDED_P (first)
    5421       149173 :       || compare_step_with_zero (loop_vinfo, first) <= 0
    5422       146842 :       || exact_log2 (ld_lanes_lanes) == -1
    5423              :       /* ???  For now only support the single-lane case as there is
    5424              :          missing support on the store-lane side and code generation
    5425              :          isn't up to the task yet.  */
    5426       144732 :       || ld_lanes_lanes != 1
    5427       288884 :       || vect_load_lanes_supported (SLP_TREE_VECTYPE (loads[0]),
    5428              :                                     group_lanes / ld_lanes_lanes,
    5429              :                                     false) == IFN_LAST)
    5430              :     ld_lanes_lanes = 0;
    5431              :   else
    5432              :     /* Verify the loads access the same number of lanes aligned to
    5433              :        ld_lanes_lanes.  */
    5434            0 :     for (slp_tree load : loads)
    5435              :       {
    5436            0 :         if (SLP_TREE_LANES (load) != ld_lanes_lanes)
    5437              :           {
    5438              :             ld_lanes_lanes = 0;
    5439              :             break;
    5440              :           }
    5441            0 :         unsigned first = SLP_TREE_LOAD_PERMUTATION (load)[0];
    5442            0 :         if (first % ld_lanes_lanes != 0)
    5443              :           {
    5444              :             ld_lanes_lanes = 0;
    5445              :             break;
    5446              :           }
    5447            0 :         if (!vect_load_perm_consecutive_p (load))
    5448              :           {
    5449              :             ld_lanes_lanes = 0;
    5450              :             break;
    5451              :           }
    5452              :       }
    5453              : 
    5454              :   /* Only a power-of-two number of lanes matches interleaving with N levels.
    5455              :      ???  An even number of lanes could be reduced to 1<<ceil_log2(N)-1 lanes
    5456              :      at each step.  */
    5457       249117 :   if (ld_lanes_lanes == 0 && exact_log2 (group_lanes) == -1 && group_lanes != 3)
    5458              :     return;
    5459              : 
    5460       238047 :   for (slp_tree load : loads)
    5461              :     {
    5462              :       /* Leave masked or gather loads alone for now.  */
    5463       168889 :       if (!SLP_TREE_CHILDREN (load).is_empty ())
    5464        48193 :         continue;
    5465              : 
    5466              :       /* For single-element interleaving spanning multiple vectors avoid
    5467              :          lowering, we want to use VMAT_ELEMENTWISE later.  */
    5468       168883 :       if (ld_lanes_lanes == 0
    5469       168883 :           && SLP_TREE_LANES (load) == 1
    5470       155744 :           && !DR_GROUP_NEXT_ELEMENT (first)
    5471       247122 :           && maybe_gt (group_lanes,
    5472              :                        TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (load))))
    5473        50420 :         return;
    5474              : 
    5475              :       /* We want to pattern-match special cases here and keep those
    5476              :          alone.  Candidates are splats and load-lane.  */
    5477              : 
    5478              :       /* We need to lower only loads of less than half of the groups
    5479              :          lanes, including duplicate lanes.  Note this leaves nodes
    5480              :          with a non-1:1 load permutation around instead of canonicalizing
    5481              :          those into a load and a permute node.  Removing this early
    5482              :          check would do such canonicalization.  */
    5483       118463 :       if (SLP_TREE_LANES (load) >= (group_lanes + 1) / 2
    5484        44809 :           && ld_lanes_lanes == 0)
    5485        44809 :         continue;
    5486              : 
    5487              :       /* Build the permute to get the original load permutation order.  */
    5488        73654 :       bool contiguous = vect_load_perm_consecutive_p (load);
    5489        73654 :       lane_permutation_t final_perm;
    5490        73654 :       final_perm.create (SLP_TREE_LANES (load));
    5491       147966 :       for (unsigned i = 0; i < SLP_TREE_LANES (load); ++i)
    5492       148624 :         final_perm.quick_push (
    5493        74312 :           std::make_pair (0, SLP_TREE_LOAD_PERMUTATION (load)[i]));
    5494              : 
    5495              :       /* When the load permutation accesses a contiguous unpermuted,
    5496              :          power-of-two aligned and sized chunk leave the load alone.
    5497              :          We can likely (re-)load it more efficiently rather than
    5498              :          extracting it from the larger load.
    5499              :          ???  Long-term some of the lowering should move to where
    5500              :          the vector types involved are fixed.  */
    5501        77032 :       if (!force_single_lane
    5502        73654 :           && ld_lanes_lanes == 0
    5503        48916 :           && contiguous
    5504        48679 :           && (SLP_TREE_LANES (load) > 1 || loads.size () == 1)
    5505         6373 :           && pow2p_hwi (SLP_TREE_LANES (load))
    5506         6337 :           && pow2p_hwi (group_lanes)
    5507         3378 :           && SLP_TREE_LOAD_PERMUTATION (load)[0] % SLP_TREE_LANES (load) == 0
    5508        77032 :           && group_lanes % SLP_TREE_LANES (load) == 0)
    5509              :         {
    5510         3378 :           final_perm.release ();
    5511         3378 :           continue;
    5512              :         }
    5513              : 
    5514              :       /* First build (and possibly re-use) a load node for the
    5515              :          unpermuted group.  Gaps in the middle and on the end are
    5516              :          represented with NULL stmts.  */
    5517        70276 :       vec<stmt_vec_info> stmts;
    5518        70276 :       stmts.create (group_lanes);
    5519       245345 :       for (stmt_vec_info s = first; s; s = DR_GROUP_NEXT_ELEMENT (s))
    5520              :         {
    5521       175069 :           if (s != first)
    5522       108876 :             for (unsigned i = 1; i < DR_GROUP_GAP (s); ++i)
    5523         4083 :               stmts.quick_push (NULL);
    5524       175069 :           stmts.quick_push (s);
    5525              :         }
    5526       131526 :       for (unsigned i = 0; i < DR_GROUP_GAP (first); ++i)
    5527        61250 :         stmts.quick_push (NULL);
    5528        70276 :       poly_uint64 max_nunits = 1;
    5529        70276 :       bool *matches = XALLOCAVEC (bool, group_lanes);
    5530        70276 :       unsigned limit = 1;
    5531        70276 :       unsigned tree_size = 0;
    5532        70276 :       slp_tree l0 = vect_build_slp_tree (loop_vinfo, stmts,
    5533              :                                          group_lanes,
    5534              :                                          &max_nunits, matches, &limit,
    5535        70276 :                                          &tree_size, bst_map);
    5536        70276 :       gcc_assert (!SLP_TREE_LOAD_PERMUTATION (l0).exists ());
    5537              : 
    5538        70276 :       if (ld_lanes_lanes != 0)
    5539              :         {
    5540              :           /* ???  If this is not in sync with what get_load_store_type
    5541              :              later decides the SLP representation is not good for other
    5542              :              store vectorization methods.  */
    5543            0 :           l0->ldst_lanes = true;
    5544            0 :           load->ldst_lanes = true;
    5545              :         }
    5546              : 
    5547       217494 :       while (1)
    5548              :         {
    5549       143885 :           unsigned group_lanes = SLP_TREE_LANES (l0);
    5550       143885 :           if (ld_lanes_lanes != 0
    5551       143885 :               || SLP_TREE_LANES (load) >= (group_lanes + 1) / 2)
    5552              :             break;
    5553              : 
    5554              :           /* Try to lower by reducing the group to half its size using an
    5555              :              interleaving scheme.  For this try to compute whether all
    5556              :              elements needed for this load are in even or odd elements of
    5557              :              an even/odd decomposition with N consecutive elements.
    5558              :              Thus { e, e, o, o, e, e, o, o } would be an even/odd decomposition
    5559              :              with N == 2.  */
    5560              :           /* ???  Only an even number of lanes can be handled this way, but the
    5561              :              fallback below could work for any number.  We have to make sure
    5562              :              to round up in that case.  */
    5563        73609 :           gcc_assert ((group_lanes & 1) == 0 || group_lanes == 3);
    5564         9807 :           unsigned even = 0, odd = 0;
    5565         9807 :           if ((group_lanes & 1) == 0)
    5566              :             {
    5567         9807 :               even = (1 << ceil_log2 (group_lanes)) - 1;
    5568         9807 :               odd = even;
    5569        39899 :               for (auto l : final_perm)
    5570              :                 {
    5571        10478 :                   even &= ~l.second;
    5572        10478 :                   odd &= l.second;
    5573              :                 }
    5574              :             }
    5575              : 
    5576              :           /* Now build an even or odd extraction from the unpermuted load.  */
    5577        73609 :           lane_permutation_t perm;
    5578        73609 :           perm.create ((group_lanes + 1) / 2);
    5579        73609 :           unsigned even_level = even ? 1 << ctz_hwi (even) : 0;
    5580        73609 :           unsigned odd_level = odd ? 1 << ctz_hwi (odd) : 0;
    5581        73609 :           if (even_level
    5582         9051 :               && group_lanes % (2 * even_level) == 0
    5583              :               /* ???  When code generating permutes we do not try to pun
    5584              :                  to larger component modes so level != 1 isn't a natural
    5585              :                  even/odd extract.  Prefer one if possible.  */
    5586         9051 :               && (even_level == 1 || !odd_level || odd_level != 1))
    5587              :             {
    5588              :               /* { 0, 1, ... 4, 5 ..., } */
    5589        33232 :               for (unsigned i = 0; i < group_lanes / 2 / even_level; ++i)
    5590        52520 :                 for (unsigned j = 0; j < even_level; ++j)
    5591        26430 :                   perm.quick_push (std::make_pair (0, 2 * i * even_level + j));
    5592              :             }
    5593        64558 :           else if (odd_level)
    5594              :             {
    5595              :               /* { ..., 2, 3, ... 6, 7 } */
    5596         2635 :               gcc_assert (group_lanes % (2 * odd_level) == 0);
    5597        11413 :               for (unsigned i = 0; i < group_lanes / 2 / odd_level; ++i)
    5598        17610 :                 for (unsigned j = 0; j < odd_level; ++j)
    5599         8832 :                   perm.quick_push
    5600         8832 :                     (std::make_pair (0, (2 * i + 1) * odd_level + j));
    5601              :             }
    5602              :           else
    5603              :             {
    5604              :               /* As fallback extract all used lanes and fill to half the
    5605              :                  group size by repeating the last element.
    5606              :                  ???  This is quite a bad strategy for re-use - we could
    5607              :                  brute force our way to find more optimal filling lanes to
    5608              :                  maximize re-use when looking at all loads from the group.  */
    5609        63832 :               auto_bitmap l;
    5610       255384 :               for (auto p : final_perm)
    5611        63888 :                 bitmap_set_bit (l, p.second);
    5612        63832 :               unsigned i = 0;
    5613        63832 :               bitmap_iterator bi;
    5614       127720 :               EXECUTE_IF_SET_IN_BITMAP (l, 0, i, bi)
    5615        63888 :                   perm.quick_push (std::make_pair (0, i));
    5616       255480 :               while (perm.length () < (group_lanes + 1) / 2)
    5617        63908 :                 perm.quick_push (perm.last ());
    5618        63832 :             }
    5619              : 
    5620              :           /* Update final_perm with the intermediate permute.  */
    5621       147889 :           for (unsigned i = 0; i < final_perm.length (); ++i)
    5622              :             {
    5623        74280 :               unsigned l = final_perm[i].second;
    5624        74280 :               unsigned j;
    5625        81480 :               for (j = 0; j < perm.length (); ++j)
    5626        81480 :                 if (perm[j].second == l)
    5627              :                   {
    5628        74280 :                     final_perm[i].second = j;
    5629        74280 :                     break;
    5630              :                   }
    5631        74280 :               gcc_assert (j < perm.length ());
    5632              :             }
    5633              : 
    5634              :           /* And create scalar stmts.  */
    5635        73609 :           vec<stmt_vec_info> perm_stmts;
    5636        73609 :           perm_stmts.create (perm.length ());
    5637       236667 :           for (unsigned i = 0; i < perm.length (); ++i)
    5638       163058 :             perm_stmts.quick_push (SLP_TREE_SCALAR_STMTS (l0)[perm[i].second]);
    5639              : 
    5640        73609 :           slp_tree p = vect_create_new_slp_node (1, VEC_PERM_EXPR);
    5641        73609 :           SLP_TREE_CHILDREN (p).quick_push (l0);
    5642        73609 :           SLP_TREE_LANE_PERMUTATION (p) = perm;
    5643        73609 :           SLP_TREE_VECTYPE (p) = SLP_TREE_VECTYPE (load);
    5644        73609 :           SLP_TREE_LANES (p) = perm.length ();
    5645        73609 :           SLP_TREE_REPRESENTATIVE (p) = SLP_TREE_REPRESENTATIVE (load);
    5646              :           /* ???  As we have scalar stmts for this intermediate permute we
    5647              :              could CSE it via bst_map but we do not want to pick up
    5648              :              another SLP node with a load permutation.  We instead should
    5649              :              have a "local" CSE map here.  */
    5650        73609 :           SLP_TREE_SCALAR_STMTS (p) = perm_stmts;
    5651              : 
    5652              :           /* We now have a node for (group_lanes + 1) / 2 lanes.  */
    5653        73609 :           l0 = p;
    5654        73609 :         }
    5655              : 
    5656              :       /* And finally from the ordered reduction node create the
    5657              :          permute to shuffle the lanes into the original load-permutation
    5658              :          order.  We replace the original load node with this.  */
    5659        70276 :       SLP_TREE_CODE (load) = VEC_PERM_EXPR;
    5660        70276 :       SLP_TREE_LOAD_PERMUTATION (load).release ();
    5661        70276 :       SLP_TREE_LANE_PERMUTATION (load) = final_perm;
    5662        70276 :       SLP_TREE_CHILDREN (load).create (1);
    5663        70276 :       SLP_TREE_CHILDREN (load).quick_push (l0);
    5664              :     }
    5665              : }
    5666              : 
    5667              : /* Transform SLP loads in the SLP graph created by SLP discovery to
    5668              :    group loads from the same group and lower load permutations that
    5669              :    are unlikely to be supported into a series of permutes.
    5670              :    In the degenerate case of having only single-lane SLP instances
    5671              :    this should result in a series of permute nodes emulating an
    5672              :    interleaving scheme.
                      :
                      :    The load nodes of all SLP instances of LOOP_VINFO are gathered,
                      :    sorted with vllp_cmp and then split into runs of consecutive nodes
                      :    whose first scalar stmts are grouped accesses with the same
                      :    DR_GROUP_FIRST_ELEMENT.  Each run whose first load is a grouped
                      :    access - including the final run left open when the loop ends - is
                      :    passed as an array slice, together with BST_MAP and
                      :    FORCE_SINGLE_LANE, to the overload of this function that takes a
                      :    slice of loads.  Runs starting with a non-grouped load are left
                      :    untouched.  Nothing is done when no loads are found.  */
    5673              : 
    5674              : static void
    5675       405823 : vect_lower_load_permutations (loop_vec_info loop_vinfo,
    5676              :                               scalar_stmts_to_slp_tree_map_t *bst_map,
    5677              :                               bool force_single_lane)
    5678              : {
    5679              :   /* Gather and sort loads across all instances.  VISITED is shared
                      :      between the vect_gather_slp_loads calls for all instances.  */
    5680       405823 :   hash_set<slp_tree> visited;
    5681       405823 :   auto_vec<slp_tree> loads;
    5682      1879773 :   for (auto inst : loop_vinfo->slp_instances)
    5683       664226 :     vect_gather_slp_loads (loads, SLP_INSTANCE_TREE (inst), visited);
    5684       405823 :   if (loads.is_empty ())
    5685        71505 :     return;
    5686       334318 :   loads.qsort (vllp_cmp);
    5687              : 
    5688              :   /* Now process each dataref group separately.  FIRSTI is the index of
                      :      the first load of the run currently being collected; a run is
                      :      closed as soon as loads[i] does not belong to the same group as
                      :      loads[firsti].  */
    5689       334318 :   unsigned firsti = 0;
    5690       621891 :   for (unsigned i = 1; i < loads.length (); ++i)
    5691              :     {
    5692       287573 :       slp_tree first = loads[firsti];
    5693       287573 :       slp_tree next = loads[i];
    5694       287573 :       stmt_vec_info a0 = SLP_TREE_SCALAR_STMTS (first)[0];
    5695       287573 :       stmt_vec_info b0 = SLP_TREE_SCALAR_STMTS (next)[0];
    5696       287573 :       if (STMT_VINFO_GROUPED_ACCESS (a0)
    5697       145112 :           && STMT_VINFO_GROUPED_ACCESS (b0)
    5698       419718 :           && DR_GROUP_FIRST_ELEMENT (a0) == DR_GROUP_FIRST_ELEMENT (b0))
    5699        54268 :         continue;
    5700              :       /* Now we have one or multiple SLP loads of the same group from
    5701              :          firsti to i - 1.  Only grouped accesses are lowered; in either
                      :          case the next run starts at I.  */
    5702       233305 :       if (STMT_VINFO_GROUPED_ACCESS (a0))
    5703        90844 :         vect_lower_load_permutations (loop_vinfo, bst_map,
    5704        90844 :                                       make_array_slice (&loads[firsti],
    5705              :                                                         i - firsti),
    5706              :                                       force_single_lane);
    5707              :       firsti = i;
    5708              :     }
    5709       668636 :   if (firsti < loads.length ()
    5710       668636 :       && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (loads[firsti])[0]))
    5711        60443 :     vect_lower_load_permutations (loop_vinfo, bst_map,
    5712        60443 :                                   make_array_slice (&loads[firsti],
    5713        60443 :                                                     loads.length () - firsti),
    5714              :                                   force_single_lane);
    5715       405823 : }
    5716              : 
    5717              : /* Check if there are stmts in VINFO that can be vectorized using SLP.  Build
    5718              :    SLP trees of packed scalar stmts if SLP is possible.
                      :
                      :    MAX_TREE_SIZE is forwarded to the SLP instance builders and also
                      :    seeds the local LIMIT they receive.  FORCE_SINGLE_LANE is forwarded
                      :    to SLP discovery (except for basic-block roots, which always pass
                      :    false) and to the lowering of load permutations.
                      :
                      :    When VINFO is a loop_vec_info any failed instance build makes the
                      :    function release BST_MAP and return an opt_result failure
                      :    ("SLP build failed."); when it is a bb_vec_info failed roots are
                      :    simply skipped.  Otherwise success is returned after SLP pattern
                      :    matching, the per-instance load/store-lanes re-discovery check and,
                      :    for loops, the lowering of load permutations.  */
    5719              : 
    5720              : opt_result
    5721      1036359 : vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size,
    5722              :                   bool force_single_lane)
    5723              : {
    5724      1036359 :   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
    5725      1036359 :   unsigned int i;
    5726      1036359 :   stmt_vec_info first_element;
    5727      1036359 :   slp_instance instance;
    5728              : 
    5729      1036359 :   DUMP_VECT_SCOPE ("vect_analyze_slp");
    5730              : 
    5731      1036359 :   unsigned limit = max_tree_size;
    5732              : 
    5733      1036359 :   scalar_stmts_to_slp_tree_map_t *bst_map
    5734      1036359 :     = new scalar_stmts_to_slp_tree_map_t ();
    5735              : 
    5736              :   /* Find SLP sequences starting from groups of grouped stores.  */
    5737      2980266 :   FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
    5738       907786 :     if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
    5739              :                                      slp_inst_kind_store, max_tree_size, &limit,
    5740              :                                      force_single_lane)
    5741       907786 :         && loop_vinfo)
    5742              :       {
    5743          238 :         release_scalar_stmts_to_slp_tree_map (bst_map);
    5744          238 :         return opt_result::failure_at (vect_location, "SLP build failed.\n");
    5745              :       }
    5746              : 
    5747              :   /* For loops also start SLP discovery from non-grouped stores.  */
    5748      1036121 :   if (loop_vinfo)
    5749              :     {
    5750              :       data_reference_p dr;
    5751      1371732 :       FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr)
    5752       948418 :         if (DR_IS_WRITE (dr))
    5753              :           {
    5754       287920 :             stmt_vec_info stmt_info = vinfo->lookup_dr (dr)->stmt;
    5755              :             /* Grouped stores are already handled above.  */
    5756       287920 :             if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
    5757        76098 :               continue;
    5758       211822 :             vec<stmt_vec_info> stmts;
    5759       211822 :             vec<stmt_vec_info> roots = vNULL;
    5760       211822 :             vec<tree> remain = vNULL;
    5761       211822 :             stmts.create (1);
    5762       211822 :             stmts.quick_push (stmt_info);
    5763       211822 :             if (! vect_build_slp_instance (vinfo, slp_inst_kind_store,
    5764              :                                            stmts, roots, remain, max_tree_size,
    5765              :                                            &limit, bst_map, force_single_lane))
    5766              :               {
    5767         3718 :                 release_scalar_stmts_to_slp_tree_map (bst_map);
    5768         3718 :                 return opt_result::failure_at (vect_location,
    5769              :                                                "SLP build failed.\n");
    5770              :               }
    5771              :           }
    5772              : 
    5773              :       stmt_vec_info stmt_info;
    5774       423354 :       FOR_EACH_VEC_ELT (LOOP_VINFO_ALTERNATE_DEFS (loop_vinfo), i, stmt_info)
    5775              :         {
    5776           20 :           vec<stmt_vec_info> stmts;
    5777           20 :           vec<stmt_vec_info> roots = vNULL;
    5778           20 :           vec<tree> remain = vNULL;
    5779           20 :           stmts.create (1);
    5780           20 :           stmts.quick_push (stmt_info);
    5781           20 :           if (! vect_build_slp_instance (vinfo, slp_inst_kind_store,
    5782              :                                          stmts, roots, remain, max_tree_size,
    5783              :                                          &limit, bst_map, force_single_lane))
    5784              :             {
    5785            0 :               release_scalar_stmts_to_slp_tree_map (bst_map);
    5786            0 :               return opt_result::failure_at (vect_location,
    5787              :                                              "SLP build failed.\n");
    5788              :             }
    5789              :         }
    5790              :     }
    5791              : 
    5792      1032403 :   if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
    5793              :     {
    5794      1813553 :       for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
    5795              :         {
    5796      1204464 :           vect_location = bb_vinfo->roots[i].roots[0]->stmt;
    5797              :           /* Apply patterns.  */
    5798      3767026 :           for (unsigned j = 0; j < bb_vinfo->roots[i].stmts.length (); ++j)
    5799      5125124 :             bb_vinfo->roots[i].stmts[j]
    5800      2636047 :               = vect_stmt_to_vectorize (bb_vinfo->roots[i].stmts[j]);
    5801      1204464 :           if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
    5802      1204464 :                                        bb_vinfo->roots[i].stmts,
    5803      1204464 :                                        bb_vinfo->roots[i].roots,
    5804      1204464 :                                        bb_vinfo->roots[i].remain,
    5805              :                                        max_tree_size, &limit, bst_map, false))
    5806              :             {
    5807       127167 :               bb_vinfo->roots[i].roots = vNULL;
    5808       127167 :               bb_vinfo->roots[i].remain = vNULL;
    5809              :             }
    5810      1204464 :           bb_vinfo->roots[i].stmts = vNULL;
    5811              :         }
    5812              :     }
    5813              : 
    5814      1032403 :   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
    5815              :     {
    5816              :       /* Find SLP sequences starting from groups of reductions.  */
    5817       423314 :       if (!vect_analyze_slp_reductions (loop_vinfo, max_tree_size, &limit,
    5818              :                                         bst_map, force_single_lane))
    5819              :         {
    5820         2585 :           release_scalar_stmts_to_slp_tree_map (bst_map);
    5821         2585 :           return opt_result::failure_at (vect_location, "SLP build failed.\n");
    5822              :         }
    5823              : 
    5824              :       /* Make sure to vectorize only-live stmts, usually inductions.  */
    5825      1925740 :       for (edge e : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
    5826      1271841 :         for (auto gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi);
    5827       598796 :              gsi_next (&gsi))
    5828              :           {
    5829       608288 :             gphi *lc_phi = *gsi;
    5830       608288 :             tree def = gimple_phi_arg_def_from_edge (lc_phi, e);
    5831       608288 :             stmt_vec_info stmt_info;
    5832       608288 :             if (TREE_CODE (def) == SSA_NAME
    5833       497150 :                 && !virtual_operand_p (def)
    5834       268710 :                 && (stmt_info = loop_vinfo->lookup_def (def))
    5835       237887 :                 && ((stmt_info = vect_stmt_to_vectorize (stmt_info)), true)
    5836       237887 :                 && STMT_VINFO_RELEVANT (stmt_info) == vect_used_only_live
    5837       178157 :                 && STMT_VINFO_LIVE_P (stmt_info)
    5838       178157 :                 && !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info))
    5839       714672 :                 && STMT_VINFO_REDUC_IDX (stmt_info) == -1)
    5840              :               {
    5841       106305 :                 vec<stmt_vec_info> stmts;
    5842       106305 :                 vec<stmt_vec_info> roots = vNULL;
    5843       106305 :                 vec<tree> remain = vNULL;
    5844       106305 :                 stmts.create (1);
    5845       106305 :                 stmts.quick_push (vect_stmt_to_vectorize (stmt_info));
    5846       106305 :                 if (! vect_build_slp_instance (vinfo,
    5847              :                                                slp_inst_kind_reduc_group,
    5848              :                                                stmts, roots, remain,
    5849              :                                                max_tree_size, &limit,
    5850              :                                                bst_map, force_single_lane))
    5851              :                   {
    5852         9492 :                     release_scalar_stmts_to_slp_tree_map (bst_map);
    5853         9492 :                     return opt_result::failure_at (vect_location,
    5854              :                                                    "SLP build failed.\n");
    5855              :                   }
    5856              :               }
    5857         9492 :           }
    5858              : 
    5859              :       /* Find SLP sequences starting from gconds.  */
    5860      1108605 :       for (auto cond : LOOP_VINFO_LOOP_CONDS (loop_vinfo))
    5861              :         {
    5862       274446 :           auto cond_info = loop_vinfo->lookup_stmt (cond);
    5863              : 
    5864       274446 :           cond_info = vect_stmt_to_vectorize (cond_info);
    5865       274446 :           vec<stmt_vec_info> roots = vNULL;
    5866       274446 :           roots.safe_push (cond_info);
    5867       274446 :           gimple *stmt = STMT_VINFO_STMT (cond_info);
    5868       274446 :           tree args0 = gimple_cond_lhs (stmt);
    5869       274446 :           tree args1 = gimple_cond_rhs (stmt);
    5870              : 
    5871              :           /* These should be enforced by cond lowering, but if it failed
    5872              :              bail.  */
    5873       274446 :           if (gimple_cond_code (stmt) != NE_EXPR
    5874       273368 :               || TREE_TYPE (args0) != boolean_type_node
    5875       547166 :               || !integer_zerop (args1))
    5876              :             {
    5877         1726 :               roots.release ();
    5878         1726 :               release_scalar_stmts_to_slp_tree_map (bst_map);
    5879         1726 :               return opt_result::failure_at (vect_location,
    5880              :                                              "SLP build failed.\n");
    5881              :             }
    5882              : 
    5883              :           /* An argument without a loop def will be codegened from vectorizing the
    5884              :              root gcond itself.  As such we don't need to try to build an SLP tree
    5885              :              from them.  It's highly likely that the resulting SLP tree here if both
    5886              :              arguments have a def will be incompatible, but we rely on it being split
    5887              :              later on.  */
    5888       272720 :           auto varg = loop_vinfo->lookup_def (args0);
    5889       272720 :           vec<stmt_vec_info> stmts;
    5890       272720 :           vec<tree> remain = vNULL;
    5891       272720 :           stmts.create (1);
    5892       272720 :           stmts.quick_push (vect_stmt_to_vectorize (varg));
    5893              : 
    5894       272720 :           if (! vect_build_slp_instance (vinfo, slp_inst_kind_gcond,
    5895              :                                          stmts, roots, remain,
    5896              :                                          max_tree_size, &limit,
    5897              :                                          bst_map, force_single_lane))
    5898              :             {
    5899         3688 :               roots.release ();
    5900         3688 :               release_scalar_stmts_to_slp_tree_map (bst_map);
    5901         3688 :               return opt_result::failure_at (vect_location,
    5902              :                                              "SLP build failed.\n");
    5903              :             }
    5904              :         }
    5905              :     }
    5906              : 
    5907      1014912 :   hash_set<slp_tree> visited_patterns;
    5908      1014912 :   slp_tree_to_load_perm_map_t perm_cache;
    5909      1014912 :   slp_compat_nodes_map_t compat_cache;
    5910              : 
    5911              :   /* See if any patterns can be found in the SLP tree.  */
    5912      1014912 :   bool pattern_found = false;
    5913      3478047 :   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
    5914      1448223 :     pattern_found |= vect_match_slp_patterns (instance, vinfo,
    5915              :                                               &visited_patterns, &perm_cache,
    5916              :                                               &compat_cache);
    5917              : 
    5918              :   /* If any were found optimize permutations of loads.  */
    5919      1014912 :   if (pattern_found)
    5920              :     {
    5921          202 :       hash_map<slp_tree, slp_tree> load_map;
    5922         3239 :       FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
    5923              :         {
    5924         2835 :           slp_tree root = SLP_INSTANCE_TREE (instance);
    5925         2835 :           optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
    5926              :                                         &load_map, root);
    5927              :         }
    5928          202 :     }
    5929              : 
    5930              :   /* Check whether we should force some SLP instances to use load/store-lanes
    5931              :      and do so by forcing SLP re-discovery with single lanes.  We used
    5932              :      to cancel SLP when this applied to all instances in a loop but now
    5933              :      we decide this per SLP instance.  It's important to do this only
    5934              :      after SLP pattern recognition.  */
    5935      1014912 :   if (is_a <loop_vec_info> (vinfo))
    5936      1070049 :     FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
    5937       664226 :       if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
    5938       229875 :           && !SLP_INSTANCE_TREE (instance)->ldst_lanes)
    5939              :         {
    5940       229875 :           slp_tree slp_root = SLP_INSTANCE_TREE (instance);
    5941       229875 :           unsigned int group_size = SLP_TREE_LANES (slp_root);
    5942       229875 :           tree vectype = SLP_TREE_VECTYPE (slp_root);
    5943              : 
    5944       229875 :           stmt_vec_info rep_info = SLP_TREE_REPRESENTATIVE (slp_root);
    5945       229875 :           gimple *rep = STMT_VINFO_STMT (rep_info);
    5946       229875 :           bool masked = (is_gimple_call (rep)
    5947         1366 :                          && gimple_call_internal_p (rep)
    5948       231221 :                          && internal_fn_mask_index
    5949         1346 :                               (gimple_call_internal_fn (rep)) != -1);
    5950       229855 :           if (!STMT_VINFO_GROUPED_ACCESS (rep_info)
    5951        23466 :               || slp_root->ldst_lanes
    5952       253341 :               || (vect_store_lanes_supported (vectype, group_size, masked)
    5953              :                   == IFN_LAST))
    5954       229875 :             continue;
    5955              : 
    5956            0 :           auto_vec<slp_tree> loads;
    5957            0 :           hash_set<slp_tree> visited;
    5958            0 :           vect_gather_slp_loads (loads, slp_root, visited);
    5959              : 
    5960              :           /* Check whether any load in the SLP instance is possibly
    5961              :              permuted.  */
    5962            0 :           bool loads_permuted = false;
    5963            0 :           slp_tree load_node;
    5964            0 :           unsigned j;
    5965            0 :           FOR_EACH_VEC_ELT (loads, j, load_node)
    5966              :             {
    5967            0 :               if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
    5968            0 :                 continue;
    5969              :               unsigned k;
    5970              :               stmt_vec_info load_info;
    5971            0 :               FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), k, load_info)
    5972            0 :                 if (SLP_TREE_LOAD_PERMUTATION (load_node)[k] != k)
    5973              :                   {
    5974              :                     loads_permuted = true;
    5975              :                     break;
    5976              :                   }
    5977              :             }
    5978              : 
    5979              :           /* If the loads and stores can use load/store-lanes force re-discovery
    5980              :              with single lanes.  */
    5981            0 :           if (loads_permuted)
    5982              :             {
    5983            0 :               bool can_use_lanes = true;
    5984              :               bool prefer_load_lanes = false;
    5985            0 :               FOR_EACH_VEC_ELT (loads, j, load_node)
    5986            0 :                 if (STMT_VINFO_GROUPED_ACCESS
    5987              :                       (SLP_TREE_REPRESENTATIVE (load_node)))
    5988              :                   {
    5989            0 :                     stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
    5990              :                         (SLP_TREE_REPRESENTATIVE (load_node));
    5991            0 :                     rep = STMT_VINFO_STMT (stmt_vinfo);
                      :                     /* Compare against -1 exactly as done for the store
                      :                        above: -1, not 0, is the "no mask operand" result
                      :                        of internal_fn_mask_index, so using the index
                      :                        directly as a truth value would be wrong.  */
    5992            0 :                     masked = (is_gimple_call (rep)
    5993            0 :                               && gimple_call_internal_p (rep)
    5994            0 :                               && internal_fn_mask_index
    5995            0 :                                    (gimple_call_internal_fn (rep)) != -1);
    5996              :                     /* Use SLP for strided accesses (or if we can't
    5997              :                        load-lanes).  */
    5998            0 :                     if (STMT_VINFO_STRIDED_P (stmt_vinfo)
    5999            0 :                         || compare_step_with_zero (vinfo, stmt_vinfo) <= 0
    6000            0 :                         || vect_load_lanes_supported
    6001            0 :                              (SLP_TREE_VECTYPE (load_node),
    6002            0 :                               DR_GROUP_SIZE (stmt_vinfo), masked) == IFN_LAST
    6003              :                         /* ???  During SLP re-discovery with a single lane
    6004              :                            a masked grouped load will appear permuted and
    6005              :                            discovery will fail.  We have to rework this
    6006              :                            on the discovery side - for now avoid ICEing.  */
    6007            0 :                         || masked)
    6008              :                       {
    6009              :                         can_use_lanes = false;
    6010              :                         break;
    6011              :                       }
    6012              :                     /* Make sure that the target would prefer store-lanes
    6013              :                        for at least one of the loads.
    6014              : 
    6015              :                        ??? Perhaps we should instead require this for
    6016              :                        all loads?  */
    6017            0 :                     prefer_load_lanes
    6018              :                       = (prefer_load_lanes
    6019            0 :                          || SLP_TREE_LANES (load_node) == group_size
    6020            0 :                          || (vect_slp_prefer_store_lanes_p
    6021            0 :                              (vinfo, stmt_vinfo,
    6022              :                               SLP_TREE_VECTYPE (load_node), masked,
    6023              :                               group_size, SLP_TREE_LANES (load_node))));
    6024              :                   }
    6025              : 
    6026            0 :               if (can_use_lanes && prefer_load_lanes)
    6027              :                 {
    6028            0 :                   if (dump_enabled_p ())
    6029            0 :                     dump_printf_loc (MSG_NOTE, vect_location,
    6030              :                                      "SLP instance %p can use load/store-lanes,"
    6031              :                                      " re-discovering with single-lanes\n",
    6032              :                                      (void *) instance);
    6033              : 
    6034            0 :                   stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (slp_root);
    6035              : 
    6036            0 :                   vect_free_slp_instance (instance);
    6037            0 :                   limit = max_tree_size;
    6038            0 :                   bool res = vect_analyze_slp_instance (vinfo, bst_map,
    6039              :                                                         stmt_info,
    6040              :                                                         slp_inst_kind_store,
    6041              :                                                         max_tree_size, &limit,
    6042              :                                                         true);
    6043            0 :                   gcc_assert (res);
    6044            0 :                   auto new_inst = LOOP_VINFO_SLP_INSTANCES (vinfo).pop ();
    6045            0 :                   LOOP_VINFO_SLP_INSTANCES (vinfo)[i] = new_inst;
    6046              :                 }
    6047              :             }
    6048            0 :         }
    6049              : 
    6050              :   /* When we end up with load permutations that we cannot possibly handle,
    6051              :      like those requiring three vector inputs, lower them using interleaving
    6052              :      like schemes.  */
    6053      1014912 :   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
    6054              :     {
    6055       405823 :       vect_lower_load_permutations (loop_vinfo, bst_map, force_single_lane);
    6056       405823 :       if (dump_enabled_p ())
    6057              :         {
    6058        19245 :           dump_printf_loc (MSG_NOTE, vect_location,
    6059              :                            "SLP graph after lowering permutations:\n");
    6060        19245 :           hash_set<slp_tree> visited;
    6061        86042 :           FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
    6062        28332 :             vect_print_slp_graph (MSG_NOTE, vect_location,
    6063              :                                   SLP_INSTANCE_TREE (instance), visited);
    6064        19245 :         }
    6065              :     }
    6066              : 
    6067      1014912 :   release_scalar_stmts_to_slp_tree_map (bst_map);
    6068              : 
    6069      1014912 :   if (pattern_found && dump_enabled_p ())
    6070              :     {
    6071           14 :       dump_printf_loc (MSG_NOTE, vect_location,
    6072              :                        "Pattern matched SLP tree\n");
    6073           14 :       hash_set<slp_tree> visited;
    6074           74 :       FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
    6075           32 :         vect_print_slp_graph (MSG_NOTE, vect_location,
    6076              :                               SLP_INSTANCE_TREE (instance), visited);
    6077           14 :     }
    6078              : 
    6079      1014912 :   return opt_result::success ();
    6080      1014912 : }
    6081              : 
    6082              : /* Estimates the cost of inserting layout changes into the SLP graph.
    6083              :    It can also say that the insertion is impossible.  */
    6084              : 
    6085              : struct slpg_layout_cost
    6086              : {
    6087      9641099 :   slpg_layout_cost () = default;
    6088              :   slpg_layout_cost (sreal, bool);
    6089              : 
    6090       453358 :   static slpg_layout_cost impossible () { return { sreal::max (), 0 }; }
    6091      4991049 :   bool is_possible () const { return depth != sreal::max (); }
    6092              : 
    6093              :   bool operator== (const slpg_layout_cost &) const;
    6094              :   bool operator!= (const slpg_layout_cost &) const;
    6095              : 
    6096              :   bool is_better_than (const slpg_layout_cost &, bool) const;
    6097              : 
    6098              :   void add_parallel_cost (const slpg_layout_cost &);
    6099              :   void add_serial_cost (const slpg_layout_cost &);
    6100              :   void split (unsigned int);
    6101              : 
    6102              :   /* The longest sequence of layout changes needed during any traversal
    6103              :      of the partition dag, weighted by execution frequency.
    6104              : 
    6105              :      This is the most important metric when optimizing for speed, since
    6106              :      it helps to ensure that we keep the number of operations on
    6107              :      critical paths to a minimum.  */
    6108              :   sreal depth = 0;
    6109              : 
    6110              :   /* An estimate of the total number of operations needed.  It is weighted by
    6111              :      execution frequency when optimizing for speed but not when optimizing for
    6112              :      size.  In order to avoid double-counting, a node with a fanout of N will
    6113              :      distribute 1/N of its total cost to each successor.
    6114              : 
    6115              :      This is the most important metric when optimizing for size, since
    6116              :      it helps to keep the total number of operations to a minimum.  */
    6117              :   sreal total = 0;
    6118              : };
    6119              : 
    6120              : /* Construct costs for a node with weight WEIGHT.  A higher weight
    6121              :    indicates more frequent execution.  IS_FOR_SIZE is true if we are
    6122              :    optimizing for size rather than speed.  */
    6123              : 
    6124      1172300 : slpg_layout_cost::slpg_layout_cost (sreal weight, bool is_for_size)
    6125      1173168 :   : depth (weight), total (is_for_size && weight > 0 ? 1 : weight)
    6126              : {
    6127      1172300 : }
    6128              : 
    6129              : bool
    6130            0 : slpg_layout_cost::operator== (const slpg_layout_cost &other) const
    6131              : {
    6132            0 :   return depth == other.depth && total == other.total;
    6133              : }
    6134              : 
    6135              : bool
    6136            0 : slpg_layout_cost::operator!= (const slpg_layout_cost &other) const
    6137              : {
    6138            0 :   return !operator== (other);
    6139              : }
    6140              : 
    6141              : /* Return true if these costs are better than OTHER.  IS_FOR_SIZE is
    6142              :    true if we are optimizing for size rather than speed.  */
    6143              : 
    6144              : bool
    6145       292991 : slpg_layout_cost::is_better_than (const slpg_layout_cost &other,
    6146              :                                   bool is_for_size) const
    6147              : {
    6148       292991 :   if (is_for_size)
    6149              :     {
    6150          382 :       if (total != other.total)
    6151          159 :         return total < other.total;
    6152          223 :       return depth < other.depth;
    6153              :     }
    6154              :   else
    6155              :     {
    6156       292609 :       if (depth != other.depth)
    6157       125223 :         return depth < other.depth;
    6158       167386 :       return total < other.total;
    6159              :     }
    6160              : }
    6161              : 
    6162              : /* Increase the costs to account for something with cost INPUT_COST
    6163              :    happening in parallel with the current costs.  */
    6164              : 
    6165              : void
    6166       346132 : slpg_layout_cost::add_parallel_cost (const slpg_layout_cost &input_cost)
    6167              : {
    6168       346132 :   depth = std::max (depth, input_cost.depth);
    6169       346132 :   total += input_cost.total;
    6170       346132 : }
    6171              : 
    6172              : /* Increase the costs to account for something with cost OTHER
    6173              :    happening in series with the current costs.  */
    6174              : 
    6175              : void
    6176      1412031 : slpg_layout_cost::add_serial_cost (const slpg_layout_cost &other)
    6177              : {
    6178      1412031 :   depth += other.depth;
    6179      1412031 :   total += other.total;
    6180      1412031 : }
    6181              : 
    6182              : /* Split the total cost among TIMES successors or predecessors.  */
    6183              : 
    6184              : void
    6185      1161954 : slpg_layout_cost::split (unsigned int times)
    6186              : {
    6187      1161954 :   if (times > 1)
    6188       483821 :     total /= times;
    6189      1161954 : }
    6190              : 
    6191              : /* Information about one node in the SLP graph, for use during
    6192              :    vect_optimize_slp_pass.  */
    6193              : 
    6194              : struct slpg_vertex
    6195              : {
    6196      9102782 :   slpg_vertex (slp_tree node_) : node (node_) {}
    6197              : 
    6198              :   /* The node itself.  */
    6199              :   slp_tree node;
    6200              : 
    6201              :   /* Which partition the node belongs to, or -1 if none.  Nodes outside of
    6202              :      partitions are flexible; they can have whichever layout consumers
    6203              :      want them to have.  */
    6204              :   int partition = -1;
    6205              : 
    6206              :   /* The number of nodes that directly use the result of this one
    6207              :      (i.e. the number of nodes that count this one as a child).  */
    6208              :   unsigned int out_degree = 0;
    6209              : 
    6210              :   /* The execution frequency of the node.  */
    6211              :   sreal weight = 0;
    6212              : 
    6213              :   /* The total execution frequency of all nodes that directly use the
    6214              :      result of this one.  */
    6215              :   sreal out_weight = 0;
    6216              : };
    6217              : 
    6218              : /* Information about one partition of the SLP graph, for use during
    6219              :    vect_optimize_slp_pass.  */
    6220              : 
    6221              : struct slpg_partition_info
    6222              : {
    6223              :   /* The nodes in the partition occupy indices [NODE_BEGIN, NODE_END)
    6224              :      of m_partitioned_nodes.  */
    6225              :   unsigned int node_begin = 0;
    6226              :   unsigned int node_end = 0;
    6227              : 
    6228              :   /* Which layout we've chosen to use for this partition, or -1 if
    6229              :      we haven't picked one yet.  */
    6230              :   int layout = -1;
    6231              : 
    6232              :   /* The number of predecessors and successors in the partition dag.
    6233              :      The predecessors always have lower partition numbers and the
    6234              :      successors always have higher partition numbers.
    6235              : 
    6236              :      Note that the directions of these edges are not necessarily the
    6237              :      same as in the data flow graph.  For example, if an SCC has separate
    6238              :      partitions for an inner loop and an outer loop, the inner loop's
    6239              :      partition will have at least two incoming edges from the outer loop's
    6240              :      partition: one for a live-in value and one for a live-out value.
    6241              :      In data flow terms, one of these edges would also be from the outer loop
    6242              :      to the inner loop, but the other would be in the opposite direction.  */
    6243              :   unsigned int in_degree = 0;
    6244              :   unsigned int out_degree = 0;
    6245              : };
    6246              : 
    6247              : /* Information about the costs of using a particular layout for a
    6248              :    particular partition.  It can also say that the combination is
    6249              :    impossible.  */
    6250              : 
    6251              : struct slpg_partition_layout_costs
    6252              : {
    6253      1429544 :   bool is_possible () const { return internal_cost.is_possible (); }
    6254        50930 :   void mark_impossible () { internal_cost = slpg_layout_cost::impossible (); }
    6255              : 
    6256              :   /* The costs inherited from predecessor partitions.  */
    6257              :   slpg_layout_cost in_cost;
    6258              : 
    6259              :   /* The inherent cost of the layout within the node itself.  For example,
    6260              :      this is nonzero for a load if choosing a particular layout would require
    6261              :      the load to permute the loaded elements.  It is nonzero for a
    6262              :      VEC_PERM_EXPR if the permutation cannot be eliminated or converted
    6263              :      to full-vector moves.  */
    6264              :   slpg_layout_cost internal_cost;
    6265              : 
    6266              :   /* The costs inherited from successor partitions.  */
    6267              :   slpg_layout_cost out_cost;
    6268              : };
    6269              : 
    6270              : /* This class tries to optimize the layout of vectors in order to avoid
    6271              :    unnecessary shuffling.  At the moment, the set of possible layouts is
    6272              :    restricted to bijective permutations.
    6273              : 
    6274              :    The goal of the pass depends on whether we're optimizing for size or
    6275              :    for speed.  When optimizing for size, the goal is to reduce the overall
    6276              :    number of layout changes (including layout changes implied by things
    6277              :    like load permutations).  When optimizing for speed, the goal is to
    6278              :    reduce the maximum latency attributable to layout changes on any
    6279              :    non-cyclical path through the data flow graph.
    6280              : 
    6281              :    For example, when optimizing a loop nest for speed, we will prefer
    6282              :    to make layout changes outside of a loop rather than inside of a loop,
    6283              :    and will prefer to make layout changes in parallel rather than serially,
    6284              :    even if that increases the overall number of layout changes.
    6285              : 
    6286              :    The high-level procedure is:
    6287              : 
    6288              :    (1) Build a graph in which edges go from uses (parents) to definitions
    6289              :        (children).
    6290              : 
    6291              :    (2) Divide the graph into a dag of strongly-connected components (SCCs).
    6292              : 
    6293              :    (3) When optimizing for speed, partition the nodes in each SCC based
    6294              :        on their containing cfg loop.  When optimizing for size, treat
    6295              :        each SCC as a single partition.
    6296              : 
    6297              :        This gives us a dag of partitions.  The goal is now to assign a
    6298              :        layout to each partition.
    6299              : 
    6300              :    (4) Construct a set of vector layouts that are worth considering.
    6301              :        Record which nodes must keep their current layout.
    6302              : 
    6303              :    (5) Perform a forward walk over the partition dag (from loads to stores)
    6304              :        accumulating the "forward" cost of using each layout.  When visiting
    6305              :        each partition, assign a tentative choice of layout to the partition
    6306              :        and use that choice when calculating the cost of using a different
    6307              :        layout in successor partitions.
    6308              : 
    6309              :    (6) Perform a backward walk over the partition dag (from stores to loads),
    6310              :        accumulating the "backward" cost of using each layout.  When visiting
    6311              :        each partition, make a final choice of layout for that partition based
    6312              :        on the accumulated forward costs (from (5)) and backward costs
    6313              :        (from (6)).
    6314              : 
    6315              :    (7) Apply the chosen layouts to the SLP graph.
    6316              : 
    6317              :    For example, consider the SLP statements:
    6318              : 
    6319              :    S1:      a_1 = load
    6320              :        loop:
    6321              :    S2:      a_2 = PHI<a_1, a_3>
    6322              :    S3:      b_1 = load
    6323              :    S4:      a_3 = a_2 + b_1
    6324              :        exit:
    6325              :    S5:      a_4 = PHI<a_3>
    6326              :    S6:      store a_4
    6327              : 
    6328              :    S2 and S4 form an SCC and are part of the same loop.  Every other
    6329              :    statement is in a singleton SCC.  In this example there is a one-to-one
    6330              :    mapping between SCCs and partitions and the partition dag looks like this:
    6331              : 
    6332              :         S1     S3
    6333              :          \     /
    6334              :           S2+S4
    6335              :             |
    6336              :            S5
    6337              :             |
    6338              :            S6
    6339              : 
    6340              :    S2, S3 and S4 will have a higher execution frequency than the other
    6341              :    statements, so when optimizing for speed, the goal is to avoid any
    6342              :    layout changes:
    6343              : 
    6344              :    - within S3
    6345              :    - within S2+S4
    6346              :    - on the S3->S2+S4 edge
    6347              : 
    6348              :    For example, if S3 was originally a reversing load, the goal of the
    6349              :    pass is to make it an unreversed load and change the layout on the
    6350              :    S1->S2+S4 and S2+S4->S5 edges to compensate.  (Changing the layout
    6351              :    on S1->S2+S4 and S5->S6 would also be acceptable.)
    6352              : 
    6353              :    The difference between SCCs and partitions becomes important if we
    6354              :    add an outer loop:
    6355              : 
    6356              :    S1:      a_1 = ...
    6357              :        loop1:
    6358              :    S2:      a_2 = PHI<a_1, a_6>
    6359              :    S3:      b_1 = load
    6360              :    S4:      a_3 = a_2 + b_1
    6361              :        loop2:
    6362              :    S5:      a_4 = PHI<a_3, a_5>
    6363              :    S6:      c_1 = load
    6364              :    S7:      a_5 = a_4 + c_1
    6365              :        exit2:
    6366              :    S8:      a_6 = PHI<a_5>
    6367              :    S9:      store a_6
    6368              :        exit1:
    6369              : 
    6370              :    Here, S2, S4, S5, S7 and S8 form a single SCC.  However, when optimizing
    6371              :    for speed, we usually do not want restrictions in the outer loop to "infect"
    6372              :    the decision for the inner loop.  For example, if an outer-loop node
    6373              :    in the SCC contains a statement with a fixed layout, that should not
    6374              :    prevent the inner loop from using a different layout.  Conversely,
    6375              :    the inner loop should not dictate a layout to the outer loop: if the
    6376              :    outer loop does a lot of computation, then it may not be efficient to
    6377              :    do all of that computation in the inner loop's preferred layout.
    6378              : 
    6379              :    So when optimizing for speed, we partition the SCC into S2+S4+S8 (outer)
    6380              :    and S5+S7 (inner).  We also try to arrange partitions so that:
    6381              : 
    6382              :    - the partition for an outer loop comes before the partition for
    6383              :      an inner loop
    6384              : 
    6385              :    - if a sibling loop A dominates a sibling loop B, A's partition
    6386              :      comes before B's
    6387              : 
    6388              :    This gives the following partition dag for the example above:
    6389              : 
    6390              :         S1        S3
    6391              :          \        /
    6392              :           S2+S4+S8   S6
    6393              :            |   \\    /
    6394              :            |    S5+S7
    6395              :            |
    6396              :           S9
    6397              : 
    6398              :    There are two edges from S2+S4+S8 to S5+S7: one for the edge S4->S5 and
    6399              :    one for a reversal of the edge S7->S8.
    6400              : 
    6401              :    The backward walk picks a layout for S5+S7 before S2+S4+S8.  The choice
    6402              :    for S2+S4+S8 therefore has to balance the cost of using the outer loop's
    6403              :    preferred layout against the cost of changing the layout on entry to the
    6404              :    inner loop (S4->S5) and on exit from the inner loop (S7->S8 reversed).
    6405              : 
    6406              :    Although this works well when optimizing for speed, it has the downside
    6407              :    when optimizing for size that the choice of layout for S5+S7 is completely
    6408              :    independent of S9, which lessens the chance of reducing the overall number
    6409              :    of permutations.  We therefore do not partition SCCs when optimizing
    6410              :    for size.
    6411              : 
    6412              :    To give a concrete example of the difference between optimizing
    6413              :    for size and speed, consider:
    6414              : 
    6415              :    a[0] = (b[1] << c[3]) - d[1];
    6416              :    a[1] = (b[0] << c[2]) - d[0];
    6417              :    a[2] = (b[3] << c[1]) - d[3];
    6418              :    a[3] = (b[2] << c[0]) - d[2];
    6419              : 
    6420              :    There are three different layouts here: one for a, one for b and d,
    6421              :    and one for c.  When optimizing for speed it is better to permute each
    6422              :    of b, c and d into the order required by a, since those permutations
    6423              :    happen in parallel.  But when optimizing for size, it is better to:
    6424              : 
    6425              :    - permute c into the same order as b
    6426              :    - do the arithmetic
    6427              :    - permute the result into the order required by a
    6428              : 
    6429              :    This gives 2 permutations rather than 3.  */
    6430              : 
    6431              : class vect_optimize_slp_pass
    6432              : {
    6433              : public:
    6434       624895 :   vect_optimize_slp_pass (vec_info *vinfo) : m_vinfo (vinfo) {}
    6435              :   void run ();
    6436              : 
    6437              : private:
    6438              :   /* Graph building.  */
    6439              :   struct loop *containing_loop (slp_tree);
    6440              :   bool is_cfg_latch_edge (graph_edge *);
    6441              :   void build_vertices (hash_set<slp_tree> &, slp_tree);
    6442              :   void build_vertices ();
    6443              :   void build_graph ();
    6444              : 
    6445              :   /* Partitioning.  */
    6446              :   void create_partitions ();
    6447              :   template<typename T> void for_each_partition_edge (unsigned int, T);
    6448              : 
    6449              :   /* Layout selection.  */
    6450              :   bool is_compatible_layout (slp_tree, unsigned int);
    6451              :   bool is_compatible_layout (const slpg_partition_info &, unsigned int);
    6452              :   int change_layout_cost (slp_tree, unsigned int, unsigned int);
    6453              :   slpg_partition_layout_costs &partition_layout_costs (unsigned int,
    6454              :                                                        unsigned int);
    6455              :   void change_vec_perm_layout (slp_tree, lane_permutation_t &,
    6456              :                                int, unsigned int);
    6457              :   int internal_node_cost (slp_tree, int, unsigned int);
    6458              :   void start_choosing_layouts ();
    6459              :   bool legitimize ();
    6460              : 
    6461              :   /* Cost propagation.  */
    6462              :   slpg_layout_cost edge_layout_cost (graph_edge *, unsigned int,
    6463              :                                      unsigned int, unsigned int);
    6464              :   slpg_layout_cost total_in_cost (unsigned int);
    6465              :   slpg_layout_cost forward_cost (graph_edge *, unsigned int, unsigned int);
    6466              :   slpg_layout_cost backward_cost (graph_edge *, unsigned int, unsigned int);
    6467              :   void forward_pass ();
    6468              :   void backward_pass ();
    6469              : 
    6470              :   /* Rematerialization.  */
    6471              :   slp_tree get_result_with_layout (slp_tree, unsigned int);
    6472              :   void materialize ();
    6473              : 
    6474              :   /* Clean-up.  */
    6475              :   void remove_redundant_permutations ();
    6476              : 
    6477              :   /* Masked load lanes discovery.  */
    6478              :   void decide_masked_load_lanes ();
    6479              : 
    6480              :   void dump ();
    6481              : 
    6482              :   vec_info *m_vinfo;
    6483              : 
    6484              :   /* True if we should optimize the graph for size, false if we should
    6485              :      optimize it for speed.  (It wouldn't be easy to make this decision
    6486              :      more locally.)  */
    6487              :   bool m_optimize_size;
    6488              : 
    6489              :   /* A graph of all SLP nodes, with edges leading from uses to definitions.
    6490              :      In other words, a node's predecessors are its slp_tree parents and
    6491              :      a node's successors are its slp_tree children.  */
    6492              :   graph *m_slpg = nullptr;
    6493              : 
    6494              :   /* The vertices of M_SLPG, indexed by slp_tree::vertex.  */
    6495              :   auto_vec<slpg_vertex> m_vertices;
    6496              : 
    6497              :   /* The list of all leaves of M_SLPG, such as external definitions, constants,
    6498              :      and loads.  */
    6499              :   auto_vec<int> m_leafs;
    6500              : 
    6501              :   /* This array has one entry for every vector layout that we're considering.
    6502              :      Element 0 is null and indicates "no change".  Other entries describe
    6503              :      permutations that are inherent in the current graph and that we would
    6504              :      like to reverse if possible.
    6505              : 
    6506              :      For example, a permutation { 1, 2, 3, 0 } means that something has
    6507              :      effectively been permuted in that way, such as a load group
    6508              :      { a[1], a[2], a[3], a[0] } (viewed as a permutation of a[0:3]).
    6509              :      We'd then like to apply the reverse permutation { 3, 0, 1, 2 }
    6510              :      in order to put things "back" in order.  */
    6511              :   auto_vec<vec<unsigned> > m_perms;
    6512              : 
    6513              :   /* A partitioning of the nodes for which a layout must be chosen.
    6514              :      Each partition represents an <SCC, cfg loop> pair; that is,
    6515              :      nodes in different SCCs belong to different partitions, and nodes
    6516              :      within an SCC can be further partitioned according to a containing
    6517              :      cfg loop.  Partition <SCC1, L1> comes before <SCC2, L2> if:
    6518              : 
    6519              :      - SCC1 != SCC2 and SCC1 is a predecessor of SCC2 in a forward walk
    6520              :        from leaves (such as loads) to roots (such as stores).
    6521              : 
    6522              :      - SCC1 == SCC2 and L1's header strictly dominates L2's header.  */
    6523              :   auto_vec<slpg_partition_info> m_partitions;
    6524              : 
    6525              :   /* The list of all nodes for which a layout must be chosen.  Nodes for
    6526              :      partition P come before the nodes for partition P+1.  Nodes within a
    6527              :      partition are in reverse postorder.  */
    6528              :   auto_vec<unsigned int> m_partitioned_nodes;
    6529              : 
    6530              :   /* Index P * num-layouts + L contains the cost of using layout L
    6531              :      for partition P.  */
    6532              :   auto_vec<slpg_partition_layout_costs> m_partition_layout_costs;
    6533              : 
    6534              :   /* Index N * num-layouts + L, if nonnull, is a node that provides the
    6535              :      original output of node N adjusted to have layout L.  */
    6536              :   auto_vec<slp_tree> m_node_layouts;
    6537              : };
    6538              : 
    6539              : /* Fill the vertices and leafs vector with all nodes in the SLP graph.
    6540              :    Also record whether we should optimize anything for speed rather
    6541              :    than size.  */
    6542              : 
    6543              : void
    6544      9747330 : vect_optimize_slp_pass::build_vertices (hash_set<slp_tree> &visited,
    6545              :                                         slp_tree node)
    6546              : {
    6547      9747330 :   unsigned i;
    6548      9747330 :   slp_tree child;
    6549              : 
    6550      9747330 :   if (visited.add (node))
    6551      9747330 :     return;
    6552              : 
    6553      9102782 :   if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
    6554              :     {
    6555      7062157 :       basic_block bb = gimple_bb (vect_orig_stmt (rep)->stmt);
    6556      6279703 :       if (optimize_bb_for_speed_p (bb))
    6557      6162465 :         m_optimize_size = false;
    6558              :     }
    6559              : 
    6560      9102782 :   node->vertex = m_vertices.length ();
    6561      9102782 :   m_vertices.safe_push (slpg_vertex (node));
    6562              : 
    6563      9102782 :   bool leaf = true;
    6564      9102782 :   bool force_leaf = false;
    6565     16843072 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    6566      7740290 :     if (child)
    6567              :       {
    6568      6985324 :         leaf = false;
    6569      6985324 :         build_vertices (visited, child);
    6570              :       }
    6571              :     else
    6572              :       force_leaf = true;
    6573              :   /* Since SLP discovery works along use-def edges all cycles have an
    6574              :      entry - but there's the exception of cycles where we do not handle
    6575              :      the entry explicitly (but with a NULL SLP node), like some reductions
    6576              :      and inductions.  Force those SLP PHIs to act as leafs to make them
    6577              :      backwards reachable.  */
    6578      9102782 :   if (leaf || force_leaf)
    6579      4533242 :     m_leafs.safe_push (node->vertex);
    6580              : }
    6581              : 
    6582              : /* Fill the vertices and leafs vector with all nodes in the SLP graph.  */
    6583              : 
    6584              : void
    6585      1249790 : vect_optimize_slp_pass::build_vertices ()
    6586              : {
    6587      1249790 :   hash_set<slp_tree> visited;
    6588      1249790 :   unsigned i;
    6589      1249790 :   slp_instance instance;
    6590      1249790 :   m_vertices.truncate (0);
    6591      1249790 :   m_leafs.truncate (0);
    6592      6511376 :   FOR_EACH_VEC_ELT (m_vinfo->slp_instances, i, instance)
    6593      2762006 :     build_vertices (visited, SLP_INSTANCE_TREE (instance));
    6594      1249790 : }
    6595              : 
    6596              : /* Apply (reverse) bijective PERM to VEC.  */
    6597              : 
    6598              : template <class T>
    6599              : static void
    6600       191963 : vect_slp_permute (vec<unsigned> perm,
    6601              :                   vec<T> &vec, bool reverse)
    6602              : {
    6603       191963 :   auto_vec<T, 64> saved;
    6604       191963 :   saved.create (vec.length ());
    6605       626423 :   for (unsigned i = 0; i < vec.length (); ++i)
    6606       434460 :     saved.quick_push (vec[i]);
    6607              : 
    6608       191963 :   if (reverse)
    6609              :     {
    6610      1242759 :       for (unsigned i = 0; i < vec.length (); ++i)
    6611       433248 :         vec[perm[i]] = saved[i];
    6612       624673 :       for (unsigned i = 0; i < vec.length (); ++i)
    6613       762549 :         gcc_assert (vec[perm[i]] == saved[i]);
    6614              :     }
    6615              :   else
    6616              :     {
    6617         3500 :       for (unsigned i = 0; i < vec.length (); ++i)
    6618         1212 :         vec[i] = saved[perm[i]];
    6619       193175 :       for (unsigned i = 0; i < vec.length (); ++i)
    6620         1818 :         gcc_assert (vec[i] == saved[perm[i]]);
    6621              :     }
    6622       191963 : }
    6623              : 
    6624              : /* Return the cfg loop that contains NODE.  */
    6625              : 
    6626              : struct loop *
    6627      3434447 : vect_optimize_slp_pass::containing_loop (slp_tree node)
    6628              : {
    6629      3434447 :   stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
    6630      3434447 :   if (!rep)
    6631         4608 :     return ENTRY_BLOCK_PTR_FOR_FN (cfun)->loop_father;
    6632      3830219 :   return gimple_bb (vect_orig_stmt (rep)->stmt)->loop_father;
    6633              : }
    6634              : 
    6635              : /* Return true if UD (an edge from a use to a definition) is associated
    6636              :    with a loop latch edge in the cfg.  */
    6637              : 
    6638              : bool
    6639      6985324 : vect_optimize_slp_pass::is_cfg_latch_edge (graph_edge *ud)
    6640              : {
    6641      6985324 :   slp_tree use = m_vertices[ud->src].node;
    6642      6985324 :   slp_tree def = m_vertices[ud->dest].node;
    6643      6985324 :   if ((SLP_TREE_DEF_TYPE (use) != vect_internal_def
    6644      6985324 :        || SLP_TREE_PERMUTE_P (use))
    6645      6693736 :       || SLP_TREE_DEF_TYPE (def) != vect_internal_def)
    6646              :     return false;
    6647              : 
    6648      3881618 :   stmt_vec_info use_rep = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (use));
    6649      3881618 :   return (is_a<gphi *> (use_rep->stmt)
    6650       319332 :           && bb_loop_header_p (gimple_bb (use_rep->stmt))
    6651      4038800 :           && containing_loop (def) == containing_loop (use));
    6652              : }
    6653              : 
    6654              : /* Build the graph.  Mark edges that correspond to cfg loop latch edges with
    6655              :    a nonnull data field.  */
    6656              : 
    6657              : void
    6658      1249790 : vect_optimize_slp_pass::build_graph ()
    6659              : {
    6660      1249790 :   m_optimize_size = true;
    6661      1249790 :   build_vertices ();
    6662              : 
    6663      2499580 :   m_slpg = new_graph (m_vertices.length ());
    6664     12852152 :   for (slpg_vertex &v : m_vertices)
    6665     26865004 :     for (slp_tree child : SLP_TREE_CHILDREN (v.node))
    6666      7740290 :       if (child)
    6667              :         {
    6668      6985324 :           graph_edge *ud = add_edge (m_slpg, v.node->vertex, child->vertex);
    6669      6985324 :           if (is_cfg_latch_edge (ud))
    6670       148710 :             ud->data = this;
    6671              :         }
    6672      1249790 : }
    6673              : 
    6674              : /* Return true if E corresponds to a loop latch edge in the cfg.  */
    6675              : 
    6676              : static bool
    6677      3566748 : skip_cfg_latch_edges (graph_edge *e)
    6678              : {
    6679      3566748 :   return e->data;
    6680              : }
    6681              : 
    6682              : /* Create the node partitions.  */
    6683              : 
    6684              : void
    6685       624895 : vect_optimize_slp_pass::create_partitions ()
    6686              : {
    6687              :   /* Calculate a postorder of the graph, ignoring edges that correspond
    6688              :      to natural latch edges in the cfg.  Reading the vector from the end
    6689              :      to the beginning gives the reverse postorder.  */
    6690       624895 :   auto_vec<int> initial_rpo;
    6691      1249790 :   graphds_dfs (m_slpg, &m_leafs[0], m_leafs.length (), &initial_rpo,
    6692              :                false, NULL, skip_cfg_latch_edges);
    6693      1874685 :   gcc_assert (initial_rpo.length () == m_vertices.length ());
    6694              : 
    6695              :   /* Calculate the strongly connected components of the graph.  */
    6696       624895 :   auto_vec<int> scc_grouping;
    6697       624895 :   unsigned int num_sccs = graphds_scc (m_slpg, NULL, NULL, &scc_grouping);
    6698              : 
    6699              :   /* Create a new index order in which all nodes from the same SCC are
    6700              :      consecutive.  Use scc_pos to record the index of the first node in
    6701              :      each SCC.  */
    6702       624895 :   auto_vec<unsigned int> scc_pos (num_sccs);
    6703       624895 :   int last_component = -1;
    6704       624895 :   unsigned int node_count = 0;
    6705      6425809 :   for (unsigned int node_i : scc_grouping)
    6706              :     {
    6707      4551124 :       if (last_component != m_slpg->vertices[node_i].component)
    6708              :         {
    6709      4459418 :           last_component = m_slpg->vertices[node_i].component;
    6710      8918836 :           gcc_assert (last_component == int (scc_pos.length ()));
    6711      4459418 :           scc_pos.quick_push (node_count);
    6712              :         }
    6713      4551124 :       node_count += 1;
    6714              :     }
    6715      1249790 :   gcc_assert (node_count == initial_rpo.length ()
    6716              :               && last_component + 1 == int (num_sccs));
    6717              : 
    6718              :   /* Use m_partitioned_nodes to group nodes into SCC order, with the nodes
    6719              :      inside each SCC following the RPO we calculated above.  The fact that
    6720              :      we ignored natural latch edges when calculating the RPO should ensure
    6721              :      that, for natural loop nests:
    6722              : 
    6723              :      - the first node that we encounter in a cfg loop is the loop header phi
    6724              :      - the loop header phis are in dominance order
    6725              : 
    6726              :      Arranging for this is an optimization (see below) rather than a
    6727              :      correctness issue.  Unnatural loops with a tangled mess of backedges
    6728              :      will still work correctly, but might give poorer results.
    6729              : 
    6730              :      Also update scc_pos so that it gives 1 + the index of the last node
    6731              :      in the SCC.  */
    6732       624895 :   m_partitioned_nodes.safe_grow (node_count);
    6733      5800914 :   for (unsigned int old_i = initial_rpo.length (); old_i-- > 0;)
    6734              :     {
    6735      4551124 :       unsigned int node_i = initial_rpo[old_i];
    6736      4551124 :       unsigned int new_i = scc_pos[m_slpg->vertices[node_i].component]++;
    6737      4551124 :       m_partitioned_nodes[new_i] = node_i;
    6738              :     }
    6739              : 
    6740              :   /* When optimizing for speed, partition each SCC based on the containing
    6741              :      cfg loop.  The order we constructed above should ensure that, for natural
    6742              :      cfg loops, we'll create sub-SCC partitions for outer loops before
    6743              :      the corresponding sub-SCC partitions for inner loops.  Similarly,
    6744              :      when one sibling loop A dominates another sibling loop B, we should
    6745              :      create a sub-SCC partition for A before a sub-SCC partition for B.
    6746              : 
    6747              :      As above, nothing depends for correctness on whether this achieves
    6748              :      a natural nesting, but we should get better results when it does.  */
    6749      1249790 :   m_partitions.reserve (m_vertices.length ());
    6750       624895 :   unsigned int next_partition_i = 0;
    6751       624895 :   hash_map<struct loop *, int> loop_partitions;
    6752       624895 :   unsigned int rpo_begin = 0;
    6753       624895 :   unsigned int num_partitioned_nodes = 0;
    6754      6334103 :   for (unsigned int rpo_end : scc_pos)
    6755              :     {
    6756      4459418 :       loop_partitions.empty ();
    6757              :       unsigned int partition_i = next_partition_i;
    6758      9010542 :       for (unsigned int rpo_i = rpo_begin; rpo_i < rpo_end; ++rpo_i)
    6759              :         {
    6760              :           /* Handle externals and constants optimistically throughout.
    6761              :              But treat existing vectors as fixed since we do not handle
    6762              :              permuting them.  */
    6763      4551124 :           unsigned int node_i = m_partitioned_nodes[rpo_i];
    6764      4551124 :           auto &vertex = m_vertices[node_i];
    6765      4551124 :           if ((SLP_TREE_DEF_TYPE (vertex.node) == vect_external_def
    6766       494625 :                && !SLP_TREE_VEC_DEFS (vertex.node).exists ())
    6767      4553368 :               || SLP_TREE_DEF_TYPE (vertex.node) == vect_constant_def)
    6768      1406931 :             vertex.partition = -1;
    6769              :           else
    6770              :             {
    6771      3144193 :               bool existed;
    6772      3144193 :               if (m_optimize_size)
    6773        24110 :                 existed = next_partition_i > partition_i;
    6774              :               else
    6775              :                 {
    6776      3120083 :                   struct loop *loop = containing_loop (vertex.node);
    6777      3120083 :                   auto &entry = loop_partitions.get_or_insert (loop, &existed);
    6778      3120083 :                   if (!existed)
    6779      3029361 :                     entry = next_partition_i;
    6780      3120083 :                   partition_i = entry;
    6781              :                 }
    6782      3144193 :               if (!existed)
    6783              :                 {
    6784      3053393 :                   m_partitions.quick_push (slpg_partition_info ());
    6785      3053393 :                   next_partition_i += 1;
    6786              :                 }
    6787      3144193 :               vertex.partition = partition_i;
    6788      3144193 :               num_partitioned_nodes += 1;
    6789      3144193 :               m_partitions[partition_i].node_end += 1;
    6790              :             }
    6791              :         }
    6792      4459418 :       rpo_begin = rpo_end;
    6793              :     }
    6794              : 
    6795              :   /* Assign ranges of consecutive node indices to each partition,
    6796              :      in partition order.  Start with node_end being the same as
    6797              :      node_begin so that the next loop can use it as a counter.  */
    6798       624895 :   unsigned int node_begin = 0;
    6799      4928078 :   for (auto &partition : m_partitions)
    6800              :     {
    6801      3053393 :       partition.node_begin = node_begin;
    6802      3053393 :       node_begin += partition.node_end;
    6803      3053393 :       partition.node_end = partition.node_begin;
    6804              :     }
    6805       624895 :   gcc_assert (node_begin == num_partitioned_nodes);
    6806              : 
    6807              :   /* Finally build the list of nodes in partition order.  */
    6808       624895 :   m_partitioned_nodes.truncate (num_partitioned_nodes);
    6809      5176019 :   for (unsigned int node_i = 0; node_i < m_vertices.length (); ++node_i)
    6810              :     {
    6811      4551124 :       int partition_i = m_vertices[node_i].partition;
    6812      4551124 :       if (partition_i >= 0)
    6813              :         {
    6814      3144193 :           unsigned int order_i = m_partitions[partition_i].node_end++;
    6815      3144193 :           m_partitioned_nodes[order_i] = node_i;
    6816              :         }
    6817              :     }
    6818       624895 : }
    6819              : 
    6820              : /* Look for edges from earlier partitions into node NODE_I and edges from
    6821              :    node NODE_I into later partitions.  Call:
    6822              : 
    6823              :       FN (ud, other_node_i)
    6824              : 
    6825              :    for each such use-to-def edge ud, where other_node_i is the node at the
    6826              :    other end of the edge.  */
    6827              : 
    6828              : template<typename T>
    6829              : void
    6830      3536888 : vect_optimize_slp_pass::for_each_partition_edge (unsigned int node_i, T fn)
    6831              : {
    6832      3536888 :   int partition_i = m_vertices[node_i].partition;
    6833      3536888 :   for (graph_edge *pred = m_slpg->vertices[node_i].pred;
    6834      6015467 :        pred; pred = pred->pred_next)
    6835              :     {
    6836      2478579 :       int src_partition_i = m_vertices[pred->src].partition;
    6837      2478579 :       if (src_partition_i >= 0 && src_partition_i != partition_i)
    6838      2252769 :         fn (pred, pred->src);
    6839              :     }
    6840      3536888 :   for (graph_edge *succ = m_slpg->vertices[node_i].succ;
    6841      7570912 :        succ; succ = succ->succ_next)
    6842              :     {
    6843      4034024 :       int dest_partition_i = m_vertices[succ->dest].partition;
    6844      4034024 :       if (dest_partition_i >= 0 && dest_partition_i != partition_i)
    6845      2274608 :         fn (succ, succ->dest);
    6846              :     }
    6847      3536888 : }
    6848              : 
    6849              : /* Return true if layout LAYOUT_I is compatible with the number of SLP lanes
    6850              :    that NODE would operate on.  This test is independent of NODE's actual
    6851              :    operation.  */
    6852              : 
    6853              : bool
    6854      1584482 : vect_optimize_slp_pass::is_compatible_layout (slp_tree node,
    6855              :                                               unsigned int layout_i)
    6856              : {
    6857      1584482 :   if (layout_i == 0)
    6858              :     return true;
    6859              : 
    6860       918810 :   if (SLP_TREE_LANES (node) != m_perms[layout_i].length ())
    6861        11596 :     return false;
    6862              : 
    6863              :   return true;
    6864              : }
    6865              : 
    6866              : /* Return true if layout LAYOUT_I is compatible with the number of SLP lanes
    6867              :    that NODE would operate on for each NODE in PARTITION.
    6868              :    This test is independent of NODE's actual operations.  */
    6869              : 
    6870              : bool
    6871        17595 : vect_optimize_slp_pass::is_compatible_layout (const slpg_partition_info
    6872              :                                                 &partition,
    6873              :                                               unsigned int layout_i)
    6874              : {
    6875        35424 :   for (unsigned int order_i = partition.node_begin;
    6876        35424 :        order_i < partition.node_end; ++order_i)
    6877              :     {
    6878        17895 :       unsigned int node_i = m_partitioned_nodes[order_i];
    6879        17895 :       auto &vertex = m_vertices[node_i];
    6880              : 
    6881              :       /* The layout is incompatible if it is individually incompatible
    6882              :          with any node in the partition.  */
    6883        17895 :       if (!is_compatible_layout (vertex.node, layout_i))
    6884              :         return false;
    6885              :     }
    6886              :   return true;
    6887              : }
    6888              : 
    6889              : /* Return the cost (in arbitrary units) of going from layout FROM_LAYOUT_I
    6890              :    to layout TO_LAYOUT_I for a node like NODE.  Return -1 if either of the
    6891              :    layouts is incompatible with NODE or if the change is not possible for
    6892              :    some other reason.
    6893              : 
    6894              :    The properties taken from NODE include the number of lanes and the
    6895              :    vector type.  The actual operation doesn't matter.  */
    6896              : 
    6897              : int
    6898       678941 : vect_optimize_slp_pass::change_layout_cost (slp_tree node,
    6899              :                                             unsigned int from_layout_i,
    6900              :                                             unsigned int to_layout_i)
    6901              : {
    6902       678941 :   if (!is_compatible_layout (node, from_layout_i)
    6903       678941 :       || !is_compatible_layout (node, to_layout_i))
    6904          569 :     return -1;
    6905              : 
    6906       678372 :   if (from_layout_i == to_layout_i)
    6907              :     return 0;
    6908              : 
    6909       293386 :   auto_vec<slp_tree, 1> children (1);
    6910       293386 :   children.quick_push (node);
    6911       293386 :   auto_lane_permutation_t perm (SLP_TREE_LANES (node));
    6912       293386 :   if (from_layout_i > 0)
    6913       830284 :     for (unsigned int i : m_perms[from_layout_i])
    6914       365491 :       perm.quick_push ({ 0, i });
    6915              :   else
    6916       448184 :     for (unsigned int i = 0; i < SLP_TREE_LANES (node); ++i)
    6917       309729 :       perm.quick_push ({ 0, i });
    6918       293386 :   if (to_layout_i > 0)
    6919       138882 :     vect_slp_permute (m_perms[to_layout_i], perm, true);
    6920       293386 :   auto count = vectorizable_slp_permutation_1 (m_vinfo, nullptr, node, perm,
    6921              :                                                children, false);
    6922       293386 :   if (count >= 0)
    6923       288858 :     return MAX (count, 1);
    6924              : 
    6925              :   /* ??? In principle we could try changing via layout 0, giving two
    6926              :      layout changes rather than 1.  Doing that would require
    6927              :      corresponding support in get_result_with_layout.  */
    6928              :   return -1;
    6929       293386 : }
    6930              : 
    6931              : /* Return the costs of assigning layout LAYOUT_I to partition PARTITION_I.  */
    6932              : 
    6933              : inline slpg_partition_layout_costs &
    6934       981419 : vect_optimize_slp_pass::partition_layout_costs (unsigned int partition_i,
    6935              :                                                 unsigned int layout_i)
    6936              : {
    6937      1962838 :   return m_partition_layout_costs[partition_i * m_perms.length () + layout_i];
    6938              : }
    6939              : 
    6940              : /* Change PERM in one of two ways:
    6941              : 
    6942              :    - if IN_LAYOUT_I < 0, accept input operand I in the layout that has been
    6943              :      chosen for child I of NODE.
    6944              : 
    6945              :    - if IN_LAYOUT_I >= 0, accept all input operands with that layout.
    6946              : 
    6947              :    In both cases, arrange for the output to have layout OUT_LAYOUT_I.  */
    6948              : 
    6949              : void
    6950        27867 : vect_optimize_slp_pass::
    6951              : change_vec_perm_layout (slp_tree node, lane_permutation_t &perm,
    6952              :                         int in_layout_i, unsigned int out_layout_i)
    6953              : {
    6954       163837 :   for (auto &entry : perm)
    6955              :     {
    6956        80236 :       int this_in_layout_i = in_layout_i;
    6957        80236 :       if (this_in_layout_i < 0)
    6958              :         {
    6959        57281 :           slp_tree in_node = SLP_TREE_CHILDREN (node)[entry.first];
    6960        57281 :           unsigned int in_partition_i = m_vertices[in_node->vertex].partition;
    6961        57281 :           if (in_partition_i == -1u)
    6962          329 :             continue;
    6963        56952 :           this_in_layout_i = m_partitions[in_partition_i].layout;
    6964              :         }
    6965        79907 :       if (this_in_layout_i > 0)
    6966        17441 :         entry.second = m_perms[this_in_layout_i][entry.second];
    6967              :     }
    6968        27867 :   if (out_layout_i > 0)
    6969         6305 :     vect_slp_permute (m_perms[out_layout_i], perm, true);
    6970        27867 : }
    6971              : 
    6972              : /* Check whether the target allows NODE to be rearranged so that the node's
    6973              :    output has layout OUT_LAYOUT_I.  Return the cost of the change if so,
    6974              :    in the same arbitrary units as for change_layout_cost.  Return -1 otherwise.
    6975              : 
    6976              :    If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I < 0, also check whether
    6977              :    NODE can adapt to the layout changes that have (perhaps provisionally)
    6978              :    been chosen for NODE's children, so that no extra permutations are
    6979              :    needed on either the input or the output of NODE.
    6980              : 
    6981              :    If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I >= 0, instead assume
    6982              :    that all inputs will be forced into layout IN_LAYOUT_I beforehand.
    6983              : 
    6984              :    IN_LAYOUT_I has no meaning for other types of node.
    6985              : 
    6986              :    Keeping the node as-is is always valid.  If the target doesn't appear
    6987              :    to support the node as-is, but might realistically support other layouts,
    6988              :    then layout 0 instead has the cost of a worst-case permutation.  On the
    6989              :    one hand, this ensures that every node has at least one valid layout,
    6990              :    avoiding what would otherwise be an awkward special case.  On the other,
    6991              :    it still encourages the pass to change an invalid pre-existing layout
    6992              :    choice into a valid one.  */
    6993              : 
    6994              : int
    6995       208670 : vect_optimize_slp_pass::internal_node_cost (slp_tree node, int in_layout_i,
    6996              :                                             unsigned int out_layout_i)
    6997              : {
    6998       208670 :   const int fallback_cost = 1;
    6999              : 
    7000       208670 :   if (SLP_TREE_PERMUTE_P (node))
    7001              :     {
    7002        23544 :       auto_lane_permutation_t tmp_perm;
    7003        23544 :       tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
    7004              : 
    7005              :       /* Check that the child nodes support the chosen layout.  Checking
    7006              :          the first child is enough, since any second child would have the
    7007              :          same shape.  */
    7008        23544 :       auto first_child = SLP_TREE_CHILDREN (node)[0];
    7009        23544 :       if (in_layout_i > 0
    7010        23544 :           && !is_compatible_layout (first_child, in_layout_i))
    7011              :         return -1;
    7012              : 
    7013        22979 :       change_vec_perm_layout (node, tmp_perm, in_layout_i, out_layout_i);
    7014        45958 :       int count = vectorizable_slp_permutation_1 (m_vinfo, nullptr,
    7015              :                                                   node, tmp_perm,
    7016        22979 :                                                   SLP_TREE_CHILDREN (node),
    7017              :                                                   false);
    7018        22979 :       if (count < 0)
    7019              :         {
    7020         1516 :           if (in_layout_i == 0 && out_layout_i == 0)
    7021              :             {
    7022              :               /* Use the fallback cost if the node could in principle support
    7023              :                  some nonzero layout for both the inputs and the outputs.
    7024              :                  Otherwise assume that the node will be rejected later
    7025              :                  and rebuilt from scalars.  */
    7026          369 :               if (SLP_TREE_LANES (node) == SLP_TREE_LANES (first_child))
    7027              :                 return fallback_cost;
    7028          299 :               return 0;
    7029              :             }
    7030              :           return -1;
    7031              :         }
    7032              : 
    7033              :       /* We currently have no way of telling whether the new layout is cheaper
    7034              :          or more expensive than the old one.  But at least in principle,
    7035              :          it should be worth making zero permutations (whole-vector shuffles)
    7036              :          cheaper than real permutations, in case the pass is able to remove
    7037              :          the latter.  */
    7038        21463 :       return count == 0 ? 0 : 1;
    7039        23544 :     }
    7040              : 
    7041       185126 :   stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
    7042       185126 :   if (rep
    7043       184187 :       && STMT_VINFO_DATA_REF (rep)
    7044        58905 :       && DR_IS_READ (STMT_VINFO_DATA_REF (rep))
    7045       226771 :       && SLP_TREE_LOAD_PERMUTATION (node).exists ())
    7046              :     {
    7047        35339 :       auto_load_permutation_t tmp_perm;
    7048        35339 :       tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
    7049        35339 :       if (out_layout_i > 0)
    7050        12344 :         vect_slp_permute (m_perms[out_layout_i], tmp_perm, true);
    7051              : 
    7052        35339 :       poly_uint64 vf = 1;
    7053        35339 :       if (auto loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo))
    7054         7972 :         vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
    7055        35339 :       unsigned int n_perms;
    7056        35339 :       if (!vect_transform_slp_perm_load_1 (m_vinfo, node, tmp_perm, vNULL,
    7057              :                                            nullptr, vf, true, false, &n_perms))
    7058              :         {
    7059         1501 :           auto rep = SLP_TREE_REPRESENTATIVE (node);
    7060         1501 :           if (out_layout_i == 0)
    7061              :             {
    7062              :               /* Use the fallback cost if the load is an N-to-N permutation.
    7063              :                  Otherwise assume that the node will be rejected later
    7064              :                  and rebuilt from scalars.  */
    7065         1098 :               if (STMT_VINFO_GROUPED_ACCESS (rep)
    7066         2196 :                   && (DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (rep))
    7067         1098 :                       == SLP_TREE_LANES (node)))
    7068          602 :                 return fallback_cost;
    7069              :               return 0;
    7070              :             }
    7071              :           return -1;
    7072              :         }
    7073              : 
    7074              :       /* See the comment above the corresponding VEC_PERM_EXPR handling.  */
    7075        33838 :       return n_perms == 0 ? 0 : 1;
    7076        35339 :     }
    7077              : 
    7078              :   return 0;
    7079              : }
    7080              : 
    7081              : /* Decide which element layouts we should consider using.  Calculate the
    7082              :    weights associated with inserting layout changes on partition edges.
    7083              :    Also mark partitions that cannot change layout, by setting their
    7084              :    layout to zero.  */
    7085              : 
    7086              : void
    7087       624895 : vect_optimize_slp_pass::start_choosing_layouts ()
    7088              : {
    7089              :   /* Used to assign unique permutation indices.  */
    7090       624895 :   using perm_hash = unbounded_hashmap_traits<
    7091              :     vec_free_hash_base<int_hash_base<unsigned>>,
    7092              :     int_hash<int, -1, -2>
    7093              :   >;
    7094       624895 :   hash_map<vec<unsigned>, int, perm_hash> layout_ids;
    7095              : 
    7096              :   /* Layout 0 is "no change".  */
    7097       624895 :   m_perms.safe_push (vNULL);
    7098              : 
    7099              :   /* Create layouts from existing permutations.  */
    7100       624895 :   auto_load_permutation_t tmp_perm;
    7101      5018878 :   for (unsigned int node_i : m_partitioned_nodes)
    7102              :     {
    7103              :       /* Leafs also double as entries to the reverse graph.  Allow the
    7104              :          layout of those to be changed.  */
    7105      3144193 :       auto &vertex = m_vertices[node_i];
    7106      3144193 :       auto &partition = m_partitions[vertex.partition];
    7107      3144193 :       if (!m_slpg->vertices[node_i].succ)
    7108       795747 :         partition.layout = 0;
    7109              : 
    7110              :       /* Loads and VEC_PERM_EXPRs are the only things generating permutes.  */
    7111      3144193 :       slp_tree node = vertex.node;
    7112      3144193 :       stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
    7113      3144193 :       slp_tree child;
    7114      3144193 :       unsigned HOST_WIDE_INT imin, imax = 0;
    7115      3144193 :       bool any_permute = false;
    7116      3144193 :       tmp_perm.truncate (0);
    7117      3144193 :       if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
    7118              :         {
    7119              :           /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the node
    7120              :              unpermuted, record a layout that reverses this permutation.
    7121              : 
    7122              :              We would need more work to cope with loads that are internally
    7123              :              permuted and also have inputs (such as masks for
    7124              :              IFN_MASK_LOADs).  */
    7125       522218 :           gcc_assert (partition.layout == 0 && !m_slpg->vertices[node_i].succ);
    7126       522218 :           if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
    7127              :             {
    7128       357776 :               partition.layout = -1;
    7129      3128082 :               continue;
    7130              :             }
    7131       164442 :           dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
    7132       164442 :           imin = DR_GROUP_SIZE (dr_stmt) + 1;
    7133       164442 :           tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
    7134              :         }
    7135      5128827 :       else if (SLP_TREE_PERMUTE_P (node)
    7136       130324 :                && SLP_TREE_CHILDREN (node).length () == 1
    7137       115123 :                && (child = SLP_TREE_CHILDREN (node)[0])
    7138      2737098 :                && (TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (child))
    7139       115123 :                    .is_constant (&imin)))
    7140              :         {
    7141              :           /* If the child has the same vector size as this node,
    7142              :              reversing the permutation can make the permutation a no-op.
    7143              :              In other cases it can change a true permutation into a
    7144              :              full-vector extract.  */
    7145       115123 :           tmp_perm.reserve (SLP_TREE_LANES (node));
    7146       307321 :           for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
    7147       192198 :             tmp_perm.quick_push (SLP_TREE_LANE_PERMUTATION (node)[j].second);
    7148              :         }
    7149              :       else
    7150      2506852 :         continue;
    7151              : 
    7152       737559 :       for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
    7153              :         {
    7154       457994 :           unsigned idx = tmp_perm[j];
    7155       457994 :           imin = MIN (imin, idx);
    7156       457994 :           imax = MAX (imax, idx);
    7157       457994 :           if (idx - tmp_perm[0] != j)
    7158       132443 :             any_permute = true;
    7159              :         }
    7160              :       /* If the span doesn't match we'd disrupt VF computation, avoid
    7161              :          that for now.  */
    7162       279565 :       if (imax - imin + 1 != SLP_TREE_LANES (node))
    7163        79785 :         continue;
    7164              :       /* If there's no permute no need to split one out.  In this case
    7165              :          we can consider turning a load into a permuted load, if that
    7166              :          turns out to be cheaper than alternatives.  */
    7167       199780 :       if (!any_permute)
    7168              :         {
    7169       183535 :           partition.layout = -1;
    7170       183535 :           continue;
    7171              :         }
    7172              : 
    7173              :       /* For now only handle true permutes, like
    7174              :          vect_attempt_slp_rearrange_stmts did.  This allows us to be lazy
    7175              :          when permuting constants and invariants keeping the permute
    7176              :          bijective.  */
    7177        16245 :       auto_sbitmap load_index (SLP_TREE_LANES (node));
    7178        16245 :       bitmap_clear (load_index);
    7179        62835 :       for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
    7180        46590 :         bitmap_set_bit (load_index, tmp_perm[j] - imin);
    7181              :       unsigned j;
    7182        62159 :       for (j = 0; j < SLP_TREE_LANES (node); ++j)
    7183        46048 :         if (!bitmap_bit_p (load_index, j))
    7184              :           break;
    7185        16245 :       if (j != SLP_TREE_LANES (node))
    7186          134 :         continue;
    7187              : 
    7188        16111 :       vec<unsigned> perm = vNULL;
    7189        16111 :       perm.safe_grow (SLP_TREE_LANES (node), true);
    7190        61924 :       for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
    7191        45813 :         perm[j] = tmp_perm[j] - imin;
    7192              : 
    7193        32222 :       if (int (m_perms.length ()) >= param_vect_max_layout_candidates)
    7194              :         {
    7195              :           /* Continue to use existing layouts, but don't add any more.  */
    7196            0 :           int *entry = layout_ids.get (perm);
    7197            0 :           partition.layout = entry ? *entry : 0;
    7198            0 :           perm.release ();
    7199              :         }
    7200              :       else
    7201              :         {
    7202        16111 :           bool existed;
    7203        16111 :           int &layout_i = layout_ids.get_or_insert (perm, &existed);
    7204        16111 :           if (existed)
    7205         5511 :             perm.release ();
    7206              :           else
    7207              :             {
    7208        10600 :               layout_i = m_perms.length ();
    7209        10600 :               m_perms.safe_push (perm);
    7210              :             }
    7211        16111 :           partition.layout = layout_i;
    7212              :         }
    7213        16245 :     }
    7214              : 
    7215              :   /* Initially assume that every layout is possible and has zero cost
    7216              :      in every partition.  */
    7217       624895 :   m_partition_layout_costs.safe_grow_cleared (m_partitions.length ()
    7218      1249790 :                                               * m_perms.length ());
    7219              : 
    7220              :   /* We have to mark outgoing permutations facing non-associating-reduction
    7221              :      graph entries that are not represented as to be materialized.
    7222              :      slp_inst_kind_bb_reduc currently only covers associatable reductions.  */
    7223      3255688 :   for (slp_instance instance : m_vinfo->slp_instances)
    7224      1381003 :     if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor)
    7225              :       {
    7226         6248 :         unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
    7227         6248 :         m_partitions[m_vertices[node_i].partition].layout = 0;
    7228              :       }
    7229      1374755 :     else if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_reduc_chain)
    7230              :       {
    7231         1399 :         stmt_vec_info stmt_info
    7232         1399 :           = SLP_TREE_REPRESENTATIVE (SLP_INSTANCE_TREE (instance));
    7233         1399 :         vect_reduc_info reduc_info
    7234         1399 :           = info_for_reduction (as_a <loop_vec_info> (m_vinfo),
    7235              :                                 SLP_INSTANCE_TREE (instance));
    7236         1399 :         if (needs_fold_left_reduction_p (TREE_TYPE
    7237              :                                            (gimple_get_lhs (stmt_info->stmt)),
    7238              :                                          VECT_REDUC_INFO_CODE (reduc_info)))
    7239              :           {
    7240           64 :             unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
    7241           64 :             m_partitions[m_vertices[node_i].partition].layout = 0;
    7242              :           }
    7243              :       }
    7244              : 
    7245              :   /* Check which layouts each node and partition can handle.  Calculate the
    7246              :      weights associated with inserting layout changes on edges.  */
    7247      5018878 :   for (unsigned int node_i : m_partitioned_nodes)
    7248              :     {
    7249      3144193 :       auto &vertex = m_vertices[node_i];
    7250      3144193 :       auto &partition = m_partitions[vertex.partition];
    7251      3144193 :       slp_tree node = vertex.node;
    7252              : 
    7253      3144193 :       if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
    7254              :         {
    7255      3139585 :           vertex.weight = vect_slp_node_weight (node);
    7256              : 
    7257              :           /* We do not handle stores with a permutation, so all
    7258              :              incoming permutations must have been materialized.
    7259              : 
    7260              :              We also don't handle masked grouped loads, which lack a
    7261              :              permutation vector.  In this case the memory locations
    7262              :              form an implicit second input to the loads, on top of the
    7263              :              explicit mask input, and the memory input's layout cannot
    7264              :              be changed.
    7265              : 
    7266              :              On the other hand, we do support permuting gather loads and
    7267              :              masked gather loads, where each scalar load is independent
    7268              :              of the others.  This can be useful if the address/index input
    7269              :              benefits from permutation.  */
    7270      3139585 :           if (STMT_VINFO_DATA_REF (rep)
    7271      1627231 :               && STMT_VINFO_GROUPED_ACCESS (rep)
    7272      4208816 :               && !SLP_TREE_LOAD_PERMUTATION (node).exists ())
    7273       904789 :             partition.layout = 0;
    7274              : 
    7275              :           /* We cannot change the layout of an operation whose lanes
    7276              :              are not independent.  Note this is an explicit
    7277              :              negative list since that's much shorter than the respective
    7278              :              positive one but it's critical to keep maintaining it.  */
    7279      3139585 :           if (is_gimple_call (STMT_VINFO_STMT (rep)))
    7280        23350 :             switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep)))
    7281              :               {
    7282         1071 :               case CFN_COMPLEX_ADD_ROT90:
    7283         1071 :               case CFN_COMPLEX_ADD_ROT270:
    7284         1071 :               case CFN_COMPLEX_MUL:
    7285         1071 :               case CFN_COMPLEX_MUL_CONJ:
    7286         1071 :               case CFN_VEC_ADDSUB:
    7287         1071 :               case CFN_VEC_FMADDSUB:
    7288         1071 :               case CFN_VEC_FMSUBADD:
    7289         1071 :                 partition.layout = 0;
    7290              :               default:;
    7291              :               }
    7292              :         }
    7293              : 
    7294      6965685 :       auto process_edge = [&](graph_edge *ud, unsigned int other_node_i)
    7295              :         {
    7296      3821492 :           auto &other_vertex = m_vertices[other_node_i];
    7297              : 
    7298              :           /* Count the number of edges from earlier partitions and the number
    7299              :              of edges to later partitions.  */
    7300      3821492 :           if (other_vertex.partition < vertex.partition)
    7301      1910746 :             partition.in_degree += 1;
    7302              :           else
    7303      1910746 :             partition.out_degree += 1;
    7304              : 
    7305              :           /* If the current node uses the result of OTHER_NODE_I, accumulate
    7306              :              the effects of that.  */
    7307      3821492 :           if (ud->src == int (node_i))
    7308              :             {
    7309      1910746 :               other_vertex.out_weight += vertex.weight;
    7310      1910746 :               other_vertex.out_degree += 1;
    7311              :             }
    7312      6965685 :         };
    7313      3144193 :       for_each_partition_edge (node_i, process_edge);
    7314              :     }
    7315       624895 : }
    7316              : 
    7317              : /* Return the incoming costs for node NODE_I, assuming that each input keeps
    7318              :    its current (provisional) choice of layout.  The inputs do not necessarily
    7319              :    have the same layout as each other.
                      : 
                      :    Only edges whose other end lies in an earlier partition contribute.
                      :    For each such edge, the contribution is that partition's in_cost for
                      :    its current layout, with the matching internal_cost added serially
                      :    and the sum split by the partition's out_degree.  The per-edge
                      :    contributions are combined with add_parallel_cost.  */
    7320              : 
    7321              : slpg_layout_cost
    7322         3116 : vect_optimize_slp_pass::total_in_cost (unsigned int node_i)
    7323              : {
    7324         3116 :   auto &vertex = m_vertices[node_i];
    7325         3116 :   slpg_layout_cost cost;
    7326        11365 :   auto add_cost = [&](graph_edge *, unsigned int other_node_i)
    7327              :     {
    7328         8249 :       auto &other_vertex = m_vertices[other_node_i];
    7329         8249 :       if (other_vertex.partition < vertex.partition)
    7330              :         {
    7331         5228 :           auto &other_partition = m_partitions[other_vertex.partition];
    7332        10456 :           auto &other_costs = partition_layout_costs (other_vertex.partition,
    7333         5228 :                                                       other_partition.layout);
    7334         5228 :           slpg_layout_cost this_cost = other_costs.in_cost;
    7335         5228 :           this_cost.add_serial_cost (other_costs.internal_cost);
    7336         5228 :           this_cost.split (other_partition.out_degree);
    7337         5228 :           cost.add_parallel_cost (this_cost);
    7338              :         }
    7339        11365 :     };
    7340         3116 :   for_each_partition_edge (node_i, add_cost);
    7341         3116 :   return cost;
    7342              : }
    7343              : 
    7344              : /* Return the cost of switching between layout LAYOUT1_I (at node NODE1_I)
    7345              :    and layout LAYOUT2_I on cross-partition use-to-def edge UD.  Return
    7346              :    slpg_layout_cost::impossible () if the change isn't possible.
                      : 
                      :    UD->dest is the definition and UD->src is the use.  LAYOUT1_I is taken
                      :    as the definition's layout when NODE1_I is UD->dest and as the use's
                      :    layout otherwise; LAYOUT2_I is the layout at the other end.  The
                      :    change is impossible when change_layout_cost returns a negative
                      :    factor for the definition node.  */
    7347              : 
    7348              : slpg_layout_cost
    7349       678941 : vect_optimize_slp_pass::
    7350              : edge_layout_cost (graph_edge *ud, unsigned int node1_i, unsigned int layout1_i,
    7351              :                   unsigned int layout2_i)
    7352              : {
    7353       678941 :   auto &def_vertex = m_vertices[ud->dest];
    7354       678941 :   auto &use_vertex = m_vertices[ud->src];
    7355       678941 :   auto def_layout_i = ud->dest == int (node1_i) ? layout1_i : layout2_i;
    7356       678941 :   auto use_layout_i = ud->dest == int (node1_i) ? layout2_i : layout1_i;
    7357       678941 :   auto factor = change_layout_cost (def_vertex.node, def_layout_i,
    7358              :                                     use_layout_i);
    7359       678941 :   if (factor < 0)
    7360         5097 :     return slpg_layout_cost::impossible ();
    7361              : 
    7362              :   /* We have a choice of putting the layout change at the site of the
    7363              :      definition or at the site of the use.  Prefer the former when
    7364              :      optimizing for size or when the execution frequency of the
    7365              :      definition is no greater than the combined execution frequencies of
    7366              :      the uses.  When putting the layout change at the site of the definition,
    7367              :      divvy up the cost among all consumers.  */
    7368       673844 :   if (m_optimize_size || def_vertex.weight <= def_vertex.out_weight)
    7369              :     {
    7370       656852 :       slpg_layout_cost cost = { def_vertex.weight * factor, m_optimize_size };
    7371       656852 :       cost.split (def_vertex.out_degree);
    7372       656852 :       return cost;
    7373              :     }
    7374        16992 :   return { use_vertex.weight * factor, m_optimize_size };  /* Change placed at the use; cost is not split.  */
    7375              : }
    7376              : 
    7377              : /* UD represents a use-def link between FROM_NODE_I and a node in a later
    7378              :    partition; FROM_NODE_I could be the definition node or the use node.
    7379              :    The node at the other end of the link wants to use layout TO_LAYOUT_I.
    7380              :    Return the cost of any necessary fix-ups on edge UD, or return
    7381              :    slpg_layout_cost::impossible () if the change isn't possible.
    7382              : 
    7383              :    At this point, FROM_NODE_I's partition has chosen the cheapest
    7384              :    layout based on the information available so far, but this choice
    7385              :    is only provisional.
                      : 
                      :    The result is the cheaper of two options: FROM_NODE_I's partition
                      :    keeping its current layout, with a layout change on UD; or that
                      :    partition switching to TO_LAYOUT_I itself, if that layout is still
                      :    recorded as possible for it.  In both options the partition's in_cost
                      :    plus internal_cost is split by its out_degree.  If the change on UD
                      :    is impossible and the partition's current layout is 0, the result is
                      :    impossible and the second option is not considered.  */
    7386              : 
    7387              : slpg_layout_cost
    7388       178205 : vect_optimize_slp_pass::forward_cost (graph_edge *ud, unsigned int from_node_i,
    7389              :                                       unsigned int to_layout_i)
    7390              : {
    7391       178205 :   auto &from_vertex = m_vertices[from_node_i];
    7392       178205 :   unsigned int from_partition_i = from_vertex.partition;
    7393       178205 :   slpg_partition_info &from_partition = m_partitions[from_partition_i];
    7394       178205 :   gcc_assert (from_partition.layout >= 0);
    7395              : 
    7396              :   /* First calculate the cost on the assumption that FROM_PARTITION sticks
    7397              :      with its current layout preference.  */
    7398       178205 :   slpg_layout_cost cost = slpg_layout_cost::impossible ();
    7399       178205 :   auto edge_cost = edge_layout_cost (ud, from_node_i,
    7400       178205 :                                      from_partition.layout, to_layout_i);
    7401       178205 :   if (edge_cost.is_possible ())
    7402              :     {
    7403       351064 :       auto &from_costs = partition_layout_costs (from_partition_i,
    7404       175532 :                                                  from_partition.layout);
    7405       175532 :       cost = from_costs.in_cost;
    7406       175532 :       cost.add_serial_cost (from_costs.internal_cost);
    7407       175532 :       cost.split (from_partition.out_degree);
    7408       175532 :       cost.add_serial_cost (edge_cost);
    7409              :     }
    7410         2673 :   else if (from_partition.layout == 0)
    7411              :     /* We must allow the source partition to have layout 0 as a fallback,
    7412              :        in case all other options turn out to be impossible.  */
    7413         2673 :     return cost;
    7414              : 
    7415              :   /* Take the minimum of that cost and the cost that applies if
    7416              :      FROM_PARTITION instead switches to TO_LAYOUT_I.  */
    7417       175532 :   auto &direct_layout_costs = partition_layout_costs (from_partition_i,
    7418              :                                                       to_layout_i);
    7419       175532 :   if (direct_layout_costs.is_possible ())
    7420              :     {
    7421       158970 :       slpg_layout_cost direct_cost = direct_layout_costs.in_cost;
    7422       158970 :       direct_cost.add_serial_cost (direct_layout_costs.internal_cost);
    7423       158970 :       direct_cost.split (from_partition.out_degree);
    7424       158970 :       if (!cost.is_possible ()
    7425       158970 :           || direct_cost.is_better_than (cost, m_optimize_size))
    7426        42131 :         cost = direct_cost;
    7427              :     }
    7428              : 
    7429       175532 :   return cost;
    7430              : }
    7431              : 
    7432              : /* UD represents a use-def link between TO_NODE_I and a node in an earlier
    7433              :    partition; TO_NODE_I could be the definition node or the use node.
    7434              :    The node at the other end of the link wants to use layout FROM_LAYOUT_I;
    7435              :    return the cost of any necessary fix-ups on edge UD, or
    7436              :    slpg_layout_cost::impossible () if the choice cannot be made.
    7437              : 
    7438              :    At this point, TO_NODE_I's partition has a fixed choice of layout.
                      : 
                      :    If TO_NODE_I is the use end of UD and is a permute node, first check
                      :    whether the node itself can absorb the change (internal_node_cost
                      :    returning a nonnegative factor).  In that case the node's weight
                      :    times that factor is added to the partition's out_cost in place of
                      :    the recorded internal_cost.  Otherwise the change is costed on UD via
                      :    edge_layout_cost and added after the partition's out_cost plus
                      :    internal_cost.  In both cases the partition's own share is split by
                      :    its in_degree.  */
    7439              : 
    7440              : slpg_layout_cost
    7441       165372 : vect_optimize_slp_pass::backward_cost (graph_edge *ud, unsigned int to_node_i,
    7442              :                                        unsigned int from_layout_i)
    7443              : {
    7444       165372 :   auto &to_vertex = m_vertices[to_node_i];
    7445       165372 :   unsigned int to_partition_i = to_vertex.partition;
    7446       165372 :   slpg_partition_info &to_partition = m_partitions[to_partition_i];
    7447       165372 :   gcc_assert (to_partition.layout >= 0);
    7448              : 
    7449              :   /* If TO_NODE_I is a VEC_PERM_EXPR consumer, see whether it can be
    7450              :      adjusted for this input having layout FROM_LAYOUT_I.  Assume that
    7451              :      any other inputs keep their current choice of layout.
                      : 
                      :      The defining partition's layout is overridden with FROM_LAYOUT_I
                      :      only for the duration of the internal_node_cost call and is
                      :      restored straight afterwards.  NOTE(review): presumably
                      :      internal_node_cost reads the input partitions' layouts when its
                      :      input-layout argument is -1 -- confirm against its definition.  */
    7452       165372 :   auto &to_costs = partition_layout_costs (to_partition_i,
    7453              :                                            to_partition.layout);
    7454       165372 :   if (ud->src == int (to_node_i)
    7455       165210 :       && SLP_TREE_PERMUTE_P (to_vertex.node))
    7456              :     {
    7457         9275 :       auto &from_partition = m_partitions[m_vertices[ud->dest].partition];
    7458         9275 :       auto old_layout = from_partition.layout;
    7459         9275 :       from_partition.layout = from_layout_i;
    7460        18550 :       int factor = internal_node_cost (to_vertex.node, -1,
    7461         9275 :                                        to_partition.layout);
    7462         9275 :       from_partition.layout = old_layout;
    7463         9275 :       if (factor >= 0)
    7464              :         {
    7465         8643 :           slpg_layout_cost cost = to_costs.out_cost;
    7466        17286 :           cost.add_serial_cost ({ to_vertex.weight * factor,
    7467         8643 :                                   m_optimize_size });
    7468         8643 :           cost.split (to_partition.in_degree);
    7469         8643 :           return cost;
    7470              :         }
    7471              :     }
    7472              : 
    7473              :   /* Compute the cost if we insert any necessary layout change on edge UD.  */
    7474       156729 :   auto edge_cost = edge_layout_cost (ud, to_node_i,
    7475       156729 :                                      to_partition.layout, from_layout_i);
    7476       156729 :   if (edge_cost.is_possible ())
    7477              :     {
    7478       156729 :       slpg_layout_cost cost = to_costs.out_cost;
    7479       156729 :       cost.add_serial_cost (to_costs.internal_cost);
    7480       156729 :       cost.split (to_partition.in_degree);
    7481       156729 :       cost.add_serial_cost (edge_cost);
    7482       156729 :       return cost;
    7483              :     }
    7484              : 
    7485            0 :   return slpg_layout_cost::impossible ();
    7486              : }
    7487              : 
    7488              : /* Make a forward pass through the partitions, accumulating input costs.
    7489              :    Make a tentative (provisional) choice of layout for each partition,
    7490              :    ensuring that this choice still allows later partitions to keep
    7491              :    their original layout.
                      : 
                      :    Partitions are visited in index order.  For each one, every layout
                      :    that is not already marked impossible is tested.  If the partition's
                      :    recorded layout is 0, every other layout is marked impossible.
                      :    Otherwise a layout is marked impossible if any node in the partition
                      :    is incompatible with it, if forward_cost is impossible on an edge
                      :    from an earlier partition, if the change back to layout 0 is
                      :    impossible on an edge to a later partition, or if internal_node_cost
                      :    returns a negative factor.  The cheapest surviving layout is stored
                      :    as the partition's provisional layout; the function asserts that at
                      :    least one layout survives.  */
    7492              : 
    7493              : void
    7494         5313 : vect_optimize_slp_pass::forward_pass ()
    7495              : {
    7496       114876 :   for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
    7497              :        ++partition_i)
    7498              :     {
    7499       109563 :       auto &partition = m_partitions[partition_i];
    7500              : 
    7501              :       /* If the partition consists of a single VEC_PERM_EXPR, precompute
    7502              :          the incoming cost that would apply if every predecessor partition
    7503              :          keeps its current layout.  This is used within the loop below.  */
    7504       109563 :       slpg_layout_cost in_cost;
    7505       109563 :       slp_tree single_node = nullptr;
    7506       109563 :       if (partition.node_end == partition.node_begin + 1)
    7507              :         {
    7508       105721 :           unsigned int node_i = m_partitioned_nodes[partition.node_begin];
    7509       105721 :           single_node = m_vertices[node_i].node;
    7510       105721 :           if (SLP_TREE_PERMUTE_P (single_node))
    7511         3116 :             in_cost = total_in_cost (node_i);
    7512              :         }
    7513              : 
    7514              :       /* Go through the possible layouts.  Decide which ones are valid
    7515              :          for this partition and record which of the valid layouts has
    7516              :          the lowest cost.  */
    7517       109563 :       unsigned int min_layout_i = 0;
    7518       109563 :       slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
    7519       334580 :       for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
    7520              :         {
    7521       225017 :           auto &layout_costs = partition_layout_costs (partition_i, layout_i);
    7522       225017 :           if (!layout_costs.is_possible ())
    7523        50930 :             continue;
    7524              : 
    7525              :           /* If the recorded layout is already 0 then the layout cannot
    7526              :              change.  */
    7527       225017 :           if (partition.layout == 0 && layout_i != 0)
    7528              :             {
    7529        37201 :               layout_costs.mark_impossible ();
    7530        37201 :               continue;
    7531              :             }
    7532              : 
    7533       187816 :           bool is_possible = true;
    7534       380755 :           for (unsigned int order_i = partition.node_begin;
    7535       380755 :                order_i < partition.node_end; ++order_i)
    7536              :             {
    7537       204373 :               unsigned int node_i = m_partitioned_nodes[order_i];
    7538       204373 :               auto &vertex = m_vertices[node_i];
    7539              : 
    7540              :               /* Reject the layout if it is individually incompatible
    7541              :                  with any node in the partition.  */
    7542       204373 :               if (!is_compatible_layout (vertex.node, layout_i))
    7543              :                 {
    7544        10396 :                   is_possible = false;
    7545        11434 :                   break;
    7546              :                 }
    7547              : 
    7548       540786 :               auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
    7549              :                 {
    7550       346809 :                   auto &other_vertex = m_vertices[other_node_i];
    7551       346809 :                   if (other_vertex.partition < vertex.partition)
    7552              :                     {
    7553              :                       /* Accumulate the incoming costs from earlier
    7554              :                          partitions, plus the cost of any layout changes
    7555              :                          on UD itself.  */
    7556       178205 :                       auto cost = forward_cost (ud, other_node_i, layout_i);
    7557       178205 :                       if (!cost.is_possible ())
    7558         2673 :                         is_possible = false;
    7559              :                       else
    7560       175532 :                         layout_costs.in_cost.add_parallel_cost (cost);
    7561              :                     }
    7562              :                   else
    7563              :                     /* Reject the layout if it would make layout 0 impossible
    7564              :                        for later partitions.  This amounts to testing that the
    7565              :                        target supports reversing the layout change on edges
    7566              :                        to later partitions.
    7567              : 
    7568              :                        In principle, it might be possible to push a layout
    7569              :                        change all the way down a graph, so that it never
    7570              :                        needs to be reversed and so that the target doesn't
    7571              :                        need to support the reverse operation.  But it would
    7572              :                        be awkward to bail out if we hit a partition that
    7573              :                        does not support the new layout, especially since
    7574              :                        we are not dealing with a lattice.  */
    7575       168604 :                     is_possible &= edge_layout_cost (ud, other_node_i, 0,
    7576       168604 :                                                      layout_i).is_possible ();
    7577       540786 :                 };
    7578       193977 :               for_each_partition_edge (node_i, add_cost);
    7579              : 
    7580              :               /* Accumulate the cost of using LAYOUT_I within NODE,
    7581              :                  both for the inputs and the outputs.  */
    7582       193977 :               int factor = internal_node_cost (vertex.node, layout_i,
    7583              :                                                layout_i);
    7584       193977 :               if (factor < 0)
    7585              :                 {
    7586         1038 :                   is_possible = false;
    7587         1038 :                   break;
    7588              :                 }
    7589       192939 :               else if (factor)
    7590        31482 :                 layout_costs.internal_cost.add_serial_cost
    7591        31482 :                   ({ vertex.weight * factor, m_optimize_size });
    7592              :             }
    7593       187816 :           if (!is_possible)
    7594              :             {
    7595        13729 :               layout_costs.mark_impossible ();
    7596        13729 :               continue;
    7597              :             }
    7598              : 
    7599              :           /* Combine the incoming and partition-internal costs.  */
    7600       174087 :           slpg_layout_cost combined_cost = layout_costs.in_cost;
    7601       174087 :           combined_cost.add_serial_cost (layout_costs.internal_cost);
    7602              : 
    7603              :           /* If this partition consists of a single VEC_PERM_EXPR, see
    7604              :              if the VEC_PERM_EXPR can be changed to support output layout
    7605              :              LAYOUT_I while keeping all the provisional choices of input
    7606              :              layout.
                      : 
                      :              When that alternative is cheaper, it replaces both the
                      :              combined cost and the in_cost/internal_cost recorded for
                      :              this partition and layout.  */
    7607       174087 :           if (single_node && SLP_TREE_PERMUTE_P (single_node))
    7608              :             {
    7609         5418 :               int factor = internal_node_cost (single_node, -1, layout_i);
    7610         5418 :               if (factor >= 0)
    7611              :                 {
    7612         4973 :                   auto weight = m_vertices[single_node->vertex].weight;
    7613         4973 :                   slpg_layout_cost internal_cost
    7614         4973 :                     = { weight * factor, m_optimize_size };
    7615              : 
    7616         4973 :                   slpg_layout_cost alt_cost = in_cost;
    7617         4973 :                   alt_cost.add_serial_cost (internal_cost);
    7618         4973 :                   if (alt_cost.is_better_than (combined_cost, m_optimize_size))
    7619              :                     {
    7620         1577 :                       combined_cost = alt_cost;
    7621         1577 :                       layout_costs.in_cost = in_cost;
    7622         1577 :                       layout_costs.internal_cost = internal_cost;
    7623              :                     }
    7624              :                 }
    7625              :             }
    7626              : 
    7627              :           /* Record the layout with the lowest cost.  Prefer layout 0 in
    7628              :              the event of a tie between it and another layout.  */
    7629       174087 :           if (!min_layout_cost.is_possible ()
    7630        64524 :               || combined_cost.is_better_than (min_layout_cost,
    7631        64524 :                                                m_optimize_size))
    7632              :             {
    7633       123198 :               min_layout_i = layout_i;
    7634       123198 :               min_layout_cost = combined_cost;
    7635              :             }
    7636              :         }
    7637              : 
    7638              :       /* This loop's handling of earlier partitions should ensure that
    7639              :          choosing the original layout for the current partition is no
    7640              :          less valid than it was in the original graph, even with the
    7641              :          provisional layout choices for those earlier partitions.  */
    7642       109563 :       gcc_assert (min_layout_cost.is_possible ());
    7643       109563 :       partition.layout = min_layout_i;
    7644              :     }
    7645         5313 : }
    7646              : 
    7647              : /* Make a backward pass through the partitions, accumulating output costs.
    7648              :    Make a final choice of layout for each partition.
                      :
                      :    Partitions are visited from the last index down to the first.  For
                      :    every layout that is still possible, edges to later partitions add
                      :    backward_cost to the layout's out_cost, while all other edges only
                      :    check that the layout currently recorded for the other partition
                      :    remains compatible.  A layout that fails either check is marked
                      :    impossible.  The surviving layout with the best combination of
                      :    in_cost, internal_cost and out_cost is stored in the partition.  */
    7649              : 
    7650              : void
    7651         5313 : vect_optimize_slp_pass::backward_pass ()
    7652              : {
    7653       120189 :   for (unsigned int partition_i = m_partitions.length (); partition_i-- > 0;)
    7654              :     {
    7655       109563 :       auto &partition = m_partitions[partition_i];
    7656              : 
    7657       109563 :       unsigned int min_layout_i = 0;
    7658       109563 :       slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
    7659       334580 :       for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
    7660              :         {
    7661       225017 :           auto &layout_costs = partition_layout_costs (partition_i, layout_i);
    7662       225017 :           if (!layout_costs.is_possible ())
    7663        50930 :             continue;
    7664              : 
    7665              :           /* Accumulate the costs from successor partitions.  */
    7666       174087 :           bool is_possible = true;
    7667       364700 :           for (unsigned int order_i = partition.node_begin;
    7668       364700 :                order_i < partition.node_end; ++order_i)
    7669              :             {
    7670       190613 :               unsigned int node_i = m_partitioned_nodes[order_i];
    7671       190613 :               auto &vertex = m_vertices[node_i];
    7672       531388 :               auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
    7673              :                 {
    7674       340775 :                   auto &other_vertex = m_vertices[other_node_i];
    7675       340775 :                   auto &other_partition = m_partitions[other_vertex.partition];
    7676       340775 :                   if (other_vertex.partition > vertex.partition)
    7677              :                     {
    7678              :                       /* Accumulate the incoming costs from later
    7679              :                          partitions, plus the cost of any layout changes
    7680              :                          on UD itself.  */
    7681       165372 :                       auto cost = backward_cost (ud, other_node_i, layout_i);
    7682       165372 :                       if (!cost.is_possible ())
    7683            0 :                         is_possible = false;
    7684              :                       else
    7685       165372 :                         layout_costs.out_cost.add_parallel_cost (cost);
    7686              :                     }
    7687              :                   else
    7688              :                     /* Make sure that earlier partitions can (if necessary
    7689              :                        or beneficial) keep the layout that they chose in
    7690              :                        the forward pass.  This ensures that there is at
    7691              :                        least one valid choice of layout.  */
    7692       175403 :                     is_possible &= edge_layout_cost (ud, other_node_i,
    7693       175403 :                                                      other_partition.layout,
    7694       175403 :                                                      layout_i).is_possible ();
    7695       531388 :                 };
    7696       190613 :               for_each_partition_edge (node_i, add_cost);
    7697              :             }
    7698       174087 :           if (!is_possible)
    7699              :             {
    7700            0 :               layout_costs.mark_impossible ();
    7701            0 :               continue;
    7702              :             }
    7703              : 
    7704              :           /* Locally combine the costs from the forward and backward passes.
    7705              :              (This combined cost is not passed on, since that would lead
    7706              :              to double counting.)  */
    7707       174087 :           slpg_layout_cost combined_cost = layout_costs.in_cost;
    7708       174087 :           combined_cost.add_serial_cost (layout_costs.internal_cost);
    7709       174087 :           combined_cost.add_serial_cost (layout_costs.out_cost);
    7710              : 
    7711              :           /* Record the layout with the lowest cost.  Prefer layout 0 in
    7712              :              the event of a tie between it and another layout.  */
    7713       174087 :           if (!min_layout_cost.is_possible ()
    7714        64524 :               || combined_cost.is_better_than (min_layout_cost,
    7715        64524 :                                                m_optimize_size))
    7716              :             {
    7717       117726 :               min_layout_i = layout_i;
    7718       117726 :               min_layout_cost = combined_cost;
    7719              :             }
    7720              :         }
    7721              : 
    7722       109563 :       gcc_assert (min_layout_cost.is_possible ());
    7723       109563 :       partition.layout = min_layout_i;
    7724              :     }
    7725         5313 : }
    7726              : 
    7727              : /* Return a node that applies layout TO_LAYOUT_I to the original form of NODE.
    7728              :    NODE already has the layout that was selected for its partition.
                      :
                      :    Computed results are stored in m_node_layouts, indexed by NODE's
                      :    vertex and TO_LAYOUT_I, and returned directly on later requests.
                      :    Constant nodes, and external nodes without vector defs, are either
                      :    reused as-is or rebuilt from permuted scalar operands.  Other nodes
                      :    are returned unchanged when their partition already uses
                      :    TO_LAYOUT_I (this path returns before the store into
                      :    m_node_layouts); otherwise a new VEC_PERM_EXPR node is created
                      :    whose vertex is -1 and whose children each have their refcnt
                      :    incremented.  */
    7729              : 
    7730              : slp_tree
    7731       146340 : vect_optimize_slp_pass::get_result_with_layout (slp_tree node,
    7732              :                                                 unsigned int to_layout_i)
    7733              : {
    7734       146340 :   unsigned int result_i = node->vertex * m_perms.length () + to_layout_i;
    7735       146340 :   slp_tree result = m_node_layouts[result_i];
    7736       146340 :   if (result)
    7737              :     return result;
    7738              : 
    7739       145874 :   if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
    7740       145874 :       || (SLP_TREE_DEF_TYPE (node) == vect_external_def
    7741              :           /* We can't permute vector defs in place.  */
    7742        20405 :           && SLP_TREE_VEC_DEFS (node).is_empty ()))
    7743              :     {
    7744              :       /* If the vector is uniform or unchanged, there's nothing to do.  */
    7745        37920 :       if (to_layout_i == 0 || vect_slp_tree_uniform_p (node))
    7746              :         result = node;
    7747              :       else
    7748              :         {
    7749         1996 :           auto scalar_ops = SLP_TREE_SCALAR_OPS (node).copy ();
    7750         1996 :           result = vect_create_new_slp_node (scalar_ops);
    7751         1996 :           vect_slp_permute (m_perms[to_layout_i], scalar_ops, true);
    7752              :         }
    7753              :     }
    7754              :   else
    7755              :     {
    7756       107954 :       unsigned int partition_i = m_vertices[node->vertex].partition;
    7757       107954 :       unsigned int from_layout_i = m_partitions[partition_i].layout;
    7758       107954 :       if (from_layout_i == to_layout_i)
    7759       107418 :         return node;
    7760              : 
    7761              :       /* If NODE is itself a VEC_PERM_EXPR, try to create a parallel
    7762              :          permutation instead of a serial one.  Leave the new permutation
    7763              :          in TMP_PERM on success.  */
    7764          536 :       auto_lane_permutation_t tmp_perm;
    7765          536 :       unsigned int num_inputs = 1;
    7766          536 :       if (SLP_TREE_PERMUTE_P (node))
    7767              :         {
    7768            7 :           tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
    7769            7 :           if (from_layout_i != 0)
    7770            7 :             vect_slp_permute (m_perms[from_layout_i], tmp_perm, false);
    7771            7 :           if (to_layout_i != 0)
    7772            4 :             vect_slp_permute (m_perms[to_layout_i], tmp_perm, true);
    7773            7 :           if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
    7774              :                                               tmp_perm,
    7775            7 :                                               SLP_TREE_CHILDREN (node),
    7776              :                                               false) >= 0)
    7777            7 :             num_inputs = SLP_TREE_CHILDREN (node).length ();
    7778              :           else
    7779            0 :             tmp_perm.truncate (0);
    7780              :         }
    7781              : 
    7782          536 :       if (dump_enabled_p ())
    7783              :         {
    7784           68 :           if (tmp_perm.length () > 0)
    7785            6 :             dump_printf_loc (MSG_NOTE, vect_location,
    7786              :                              "duplicating permutation node %p with"
    7787              :                              " layout %d\n",
    7788              :                              (void *) node, to_layout_i);
    7789              :           else
    7790           62 :             dump_printf_loc (MSG_NOTE, vect_location,
    7791              :                              "inserting permutation node in place of %p\n",
    7792              :                              (void *) node);
    7793              :         }
    7794              : 
    7795          536 :       unsigned int num_lanes = SLP_TREE_LANES (node);
    7796          536 :       result = vect_create_new_slp_node (num_inputs, VEC_PERM_EXPR);
    7797          536 :       if (SLP_TREE_SCALAR_STMTS (node).length ())
    7798              :         {
    7799          535 :           auto &stmts = SLP_TREE_SCALAR_STMTS (result);
    7800          535 :           stmts.safe_splice (SLP_TREE_SCALAR_STMTS (node));
    7801          535 :           if (from_layout_i != 0)
    7802          269 :             vect_slp_permute (m_perms[from_layout_i], stmts, false);
    7803          535 :           if (to_layout_i != 0)
    7804          270 :             vect_slp_permute (m_perms[to_layout_i], stmts, true);
    7805              :         }
    7806          536 :       SLP_TREE_REPRESENTATIVE (result) = SLP_TREE_REPRESENTATIVE (node);
    7807          536 :       SLP_TREE_LANES (result) = num_lanes;
    7808          536 :       SLP_TREE_VECTYPE (result) = SLP_TREE_VECTYPE (node);
    7809          536 :       result->vertex = -1;
    7810              : 
    7811          536 :       auto &lane_perm = SLP_TREE_LANE_PERMUTATION (result);
    7812          536 :       if (tmp_perm.length ())
    7813              :         {
    7814            7 :           lane_perm.safe_splice (tmp_perm);
    7815            7 :           SLP_TREE_CHILDREN (result).safe_splice (SLP_TREE_CHILDREN (node));
    7816              :         }
    7817              :       else
    7818              :         {
    7819          529 :           lane_perm.create (num_lanes);
    7820         1651 :           for (unsigned j = 0; j < num_lanes; ++j)
    7821         1122 :             lane_perm.quick_push ({ 0, j });
    7822          529 :           if (from_layout_i != 0)
    7823          262 :             vect_slp_permute (m_perms[from_layout_i], lane_perm, false);
    7824          529 :           if (to_layout_i != 0)
    7825          267 :             vect_slp_permute (m_perms[to_layout_i], lane_perm, true);
    7826          529 :           SLP_TREE_CHILDREN (result).safe_push (node);
    7827              :         }
    7828         2148 :       for (slp_tree child : SLP_TREE_CHILDREN (result))
    7829          540 :         child->refcnt++;
    7830          536 :     }
    7831        38456 :   m_node_layouts[result_i] = result;
    7832        38456 :   return result;
    7833              : }
    7834              : 
    7835              : /* Apply the chosen vector layouts to the SLP graph.
                      :
                      :    The first walk over m_partitioned_nodes permutes each node's scalar
                      :    statements and its lane or load permutation according to the layout
                      :    of its partition; VEC_PERM_EXPR nodes that can absorb their input
                      :    layouts are recorded in FULLY_FOLDED.  After
                      :    remove_redundant_permutations has run, the second walk skips those
                      :    nodes and replaces every non-null child of the remaining ones with
                      :    the result of get_result_with_layout for the parent's layout.  */
    7836              : 
    7837              : void
    7838        10181 : vect_optimize_slp_pass::materialize ()
    7839              : {
    7840              :   /* We no longer need the costs, so avoid having two O(N * P) arrays
    7841              :      live at the same time.  */
    7842        10181 :   m_partition_layout_costs.release ();
    7843        30543 :   m_node_layouts.safe_grow_cleared (m_vertices.length () * m_perms.length ());
    7844              : 
    7845        20362 :   auto_sbitmap fully_folded (m_vertices.length ());
    7846        10181 :   bitmap_clear (fully_folded);
    7847       157029 :   for (unsigned int node_i : m_partitioned_nodes)
    7848              :     {
    7849       126486 :       auto &vertex = m_vertices[node_i];
    7850       126486 :       slp_tree node = vertex.node;
    7851       126486 :       int layout_i = m_partitions[vertex.partition].layout;
    7852       126486 :       gcc_assert (layout_i >= 0);
    7853              : 
    7854              :       /* Rearrange the scalar statements to match the chosen layout.  */
    7855       126486 :       if (layout_i > 0)
    7856        15730 :         vect_slp_permute (m_perms[layout_i],
    7857        15730 :                           SLP_TREE_SCALAR_STMTS (node), true);
    7858              : 
    7859              :       /* Update load and lane permutations.  */
    7860       126486 :       if (SLP_TREE_PERMUTE_P (node))
    7861              :         {
    7862              :           /* First try to absorb the input vector layouts.  If that fails,
    7863              :              force the inputs to have layout LAYOUT_I too.  We checked that
    7864              :              that was possible before deciding to use nonzero output layouts.
    7865              :              (Note that at this stage we don't really have any guarantee that
    7866              :              the target supports the original VEC_PERM_EXPR.)  */
    7867         4519 :           auto &perm = SLP_TREE_LANE_PERMUTATION (node);
    7868         4519 :           auto_lane_permutation_t tmp_perm;
    7869         4519 :           tmp_perm.safe_splice (perm);
    7870         4519 :           change_vec_perm_layout (node, tmp_perm, -1, layout_i);
    7871         4519 :           if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
    7872              :                                               tmp_perm,
    7873         4519 :                                               SLP_TREE_CHILDREN (node),
    7874              :                                               false) >= 0)
    7875              :             {
    7876         4150 :               if (dump_enabled_p ()
    7877         5042 :                   && !std::equal (tmp_perm.begin (), tmp_perm.end (),
    7878              :                                   perm.begin ()))
    7879           58 :                 dump_printf_loc (MSG_NOTE, vect_location,
    7880              :                                  "absorbing input layouts into %p\n",
    7881              :                                  (void *) node);
    7882        23827 :               std::copy (tmp_perm.begin (), tmp_perm.end (), perm.begin ());
    7883         4150 :               bitmap_set_bit (fully_folded, node_i);
    7884              :             }
    7885              :           else
    7886              :             {
    7887              :               /* Not MSG_MISSED because it would make no sense to users.  */
    7888          369 :               if (dump_enabled_p ())
    7889           46 :                 dump_printf_loc (MSG_NOTE, vect_location,
    7890              :                                  "failed to absorb input layouts into %p\n",
    7891              :                                  (void *) node);
    7892          369 :               change_vec_perm_layout (nullptr, perm, layout_i, layout_i);
    7893              :             }
    7894         4519 :         }
    7895              :       else
    7896              :         {
    7897       121967 :           gcc_assert (!SLP_TREE_LANE_PERMUTATION (node).exists ());
    7898       121967 :           auto &load_perm = SLP_TREE_LOAD_PERMUTATION (node);
    7899       121967 :           if (layout_i > 0)
    7900              :             /* ???  When we handle non-bijective permutes the idea
    7901              :                is that we can force the load-permutation to be
    7902              :                { min, min + 1, min + 2, ... max }.  But then the
    7903              :                scalar defs might no longer match the lane content
    7904              :                which means wrong-code with live lane vectorization.
    7905              :                So we possibly have to have NULL entries for those.  */
    7906        15627 :             vect_slp_permute (m_perms[layout_i], load_perm, true);
    7907              :         }
    7908              :     }
    7909              : 
    7910              :   /* Do this before any nodes disappear, since it involves a walk
    7911              :      over the leaves.  */
    7912        10181 :   remove_redundant_permutations ();
    7913              : 
    7914              :   /* Replace each child with a correctly laid-out version.  */
    7915       157029 :   for (unsigned int node_i : m_partitioned_nodes)
    7916              :     {
    7917              :       /* Skip nodes that have already been handled above.  */
    7918       126486 :       if (bitmap_bit_p (fully_folded, node_i))
    7919         4150 :         continue;
    7920              : 
    7921       122336 :       auto &vertex = m_vertices[node_i];
    7922       122336 :       int in_layout_i = m_partitions[vertex.partition].layout;
    7923       122336 :       gcc_assert (in_layout_i >= 0);
    7924              : 
    7925              :       unsigned j;
    7926              :       slp_tree child;
    7927       363310 :       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (vertex.node), j, child)
    7928              :         {
    7929       149997 :           if (!child)
    7930         3657 :             continue;
    7931              : 
    7932       146340 :           slp_tree new_child = get_result_with_layout (child, in_layout_i);
    7933       146340 :           if (new_child != child)
    7934              :             {
    7935         2741 :               vect_free_slp_tree (child);
    7936         2741 :               SLP_TREE_CHILDREN (vertex.node)[j] = new_child;
    7937         2741 :               new_child->refcnt += 1;
    7938              :             }
    7939              :         }
    7940              :     }
    7941        10181 : }
    7942              : 
    7943              : /* Elide load permutations that are not necessary.  Such permutations might
    7944              :    be pre-existing, rather than created by the layout optimizations.
                      :
                      :    Only the nodes listed in m_leafs are examined, and those without a
                      :    load permutation are skipped.  For basic block vectorization the
                      :    permutation is released when the scalar statements form a subchain
                      :    in which each statement is the DR_GROUP_NEXT_ELEMENT of the
                      :    previous one and has a DR_GROUP_GAP of 1.  For loop vectorization
                      :    it is released only when vect_load_perm_consecutive_p (node, 0)
                      :    holds and the additional vectorization factor, lane count or
                      :    group size and gap checks below pass.  */
    7945              : 
    7946              : void
    7947       624895 : vect_optimize_slp_pass::remove_redundant_permutations ()
    7948              : {
    7949      4141306 :   for (unsigned int node_i : m_leafs)
    7950              :     {
    7951      2266621 :       slp_tree node = m_vertices[node_i].node;
    7952      2266621 :       if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
    7953      1744403 :         continue;
    7954              : 
    7955              :       /* In basic block vectorization we allow any subchain of an interleaving
    7956              :          chain.
    7957              :          FORNOW: not in loop SLP because of realignment complications.  */
    7958       522218 :       if (is_a <bb_vec_info> (m_vinfo))
    7959              :         {
    7960       157899 :           bool subchain_p = true;
    7961              :           stmt_vec_info next_load_info = NULL;
    7962              :           stmt_vec_info load_info;
    7963              :           unsigned j;
    7964       157899 :           FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
    7965              :             {
    7966       128540 :               if (j != 0
    7967       128540 :                   && (next_load_info != load_info
    7968        61094 :                       || ! load_info
    7969        61094 :                       || DR_GROUP_GAP (load_info) != 1))
    7970              :                 {
    7971              :                   subchain_p = false;
    7972              :                   break;
    7973              :                 }
    7974       106027 :               next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
    7975              :             }
    7976        51872 :           if (subchain_p)
    7977              :             {
    7978        29359 :               SLP_TREE_LOAD_PERMUTATION (node).release ();
    7979        29359 :               continue;
    7980              :             }
    7981              :         }
    7982              :       else
    7983              :         {
    7984       470346 :           loop_vec_info loop_vinfo = as_a<loop_vec_info> (m_vinfo);
    7985       470346 :           bool this_load_permuted = !vect_load_perm_consecutive_p (node, 0);
    7986              :           /* When this isn't a grouped access we know it's single element
    7987              :              and contiguous.  */
    7988       470346 :           if (!STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (node)[0]))
    7989              :             {
    7990       357776 :               if (!this_load_permuted
    7991       357776 :                   && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
    7992       357162 :                       || SLP_TREE_LANES (node) == 1))
    7993       357151 :                 SLP_TREE_LOAD_PERMUTATION (node).release ();
    7994       357776 :               continue;
    7995              :             }
    7996       112570 :           stmt_vec_info first_stmt_info
    7997       112570 :             = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
    7998       112971 :           if (!this_load_permuted
    7999              :               /* The load requires permutation when unrolling exposes
    8000              :                  a gap either because the group is larger than the SLP
    8001              :                  group-size or because there is a gap between the groups.  */
    8002       112570 :               && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
    8003        95323 :                   || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
    8004          124 :                       && DR_GROUP_GAP (first_stmt_info) == 0)))
    8005              :             {
    8006          401 :               SLP_TREE_LOAD_PERMUTATION (node).release ();
    8007          401 :               continue;
    8008              :             }
    8009              :         }
    8010              :     }
    8011       624895 : }
    8012              : 
    8013              : /* Print the partition graph and layout information to the dump file.
                      :
                      :    The output lists each permutation with index 1 and above, then for
                      :    every partition its chosen layout, its nodes, its edges and, for
                      :    each layout, either the in, internal, out and combined costs or a
                      :    note that the layout was rejected.  Edges to a vertex in an
                      :    earlier partition are printed with that vertex on the left of the
                      :    arrow; all other edges are printed with it on the right.  All
                      :    output is emitted as MSG_NOTE.  */
    8014              : 
    8015              : void
    8016          659 : vect_optimize_slp_pass::dump ()
    8017              : {
    8018          659 :   dump_printf_loc (MSG_NOTE, vect_location,
    8019              :                    "SLP optimize permutations:\n");
    8020         1331 :   for (unsigned int layout_i = 1; layout_i < m_perms.length (); ++layout_i)
    8021              :     {
    8022          672 :       dump_printf_loc (MSG_NOTE, vect_location, "  %d: { ", layout_i);
    8023          672 :       const char *sep = "";
    8024         5769 :       for (unsigned int idx : m_perms[layout_i])
    8025              :         {
    8026         3753 :           dump_printf (MSG_NOTE, "%s%d", sep, idx);
    8027         3753 :           sep = ", ";
    8028              :         }
    8029          672 :       dump_printf (MSG_NOTE, " }\n");
    8030              :     }
    8031          659 :   dump_printf_loc (MSG_NOTE, vect_location,
    8032              :                    "SLP optimize partitions:\n");
    8033         5420 :   for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
    8034              :        ++partition_i)
    8035              :     {
    8036         4761 :       auto &partition = m_partitions[partition_i];
    8037         4761 :       dump_printf_loc (MSG_NOTE, vect_location,  "  -------------\n");
    8038         4761 :       dump_printf_loc (MSG_NOTE, vect_location,
    8039              :                        "  partition %d (layout %d):\n",
    8040              :                        partition_i, partition.layout);
    8041         4761 :       dump_printf_loc (MSG_NOTE, vect_location, "    nodes:\n");
    8042         9750 :       for (unsigned int order_i = partition.node_begin;
    8043         9750 :            order_i < partition.node_end; ++order_i)
    8044              :         {
    8045         4989 :           auto &vertex = m_vertices[m_partitioned_nodes[order_i]];
    8046         9978 :           dump_printf_loc (MSG_NOTE, vect_location, "      - %p:\n",
    8047         4989 :                            (void *) vertex.node);
    8048         4989 :           dump_printf_loc (MSG_NOTE, vect_location,
    8049              :                            "          weight: %f\n",
    8050              :                            vertex.weight.to_double ());
    8051         4989 :           if (vertex.out_degree)
    8052         3888 :             dump_printf_loc (MSG_NOTE, vect_location,
    8053              :                              "          out weight: %f (degree %d)\n",
    8054              :                              vertex.out_weight.to_double (),
    8055              :                              vertex.out_degree);
    8056         4989 :           if (SLP_TREE_PERMUTE_P (vertex.node))
    8057          492 :             dump_printf_loc (MSG_NOTE, vect_location,
    8058              :                              "          op: VEC_PERM_EXPR\n");
    8059         4497 :           else if (auto rep = SLP_TREE_REPRESENTATIVE (vertex.node))
    8060         4479 :             dump_printf_loc (MSG_NOTE, vect_location,
    8061              :                              "          op template: %G", rep->stmt);
    8062              :         }
    8063         4761 :       dump_printf_loc (MSG_NOTE, vect_location, "    edges:\n");
    8064         9750 :       for (unsigned int order_i = partition.node_begin;
    8065         9750 :            order_i < partition.node_end; ++order_i)
    8066              :         {
    8067         4989 :           unsigned int node_i = m_partitioned_nodes[order_i];
    8068         4989 :           auto &vertex = m_vertices[node_i];
    8069        15041 :           auto print_edge = [&](graph_edge *, unsigned int other_node_i)
    8070              :             {
    8071        10052 :               auto &other_vertex = m_vertices[other_node_i];
    8072        10052 :               if (other_vertex.partition < vertex.partition)
    8073         5026 :                 dump_printf_loc (MSG_NOTE, vect_location,
    8074              :                                  "      - %p [%d] --> %p\n",
    8075         5026 :                                  (void *) other_vertex.node,
    8076              :                                  other_vertex.partition,
    8077         5026 :                                  (void *) vertex.node);
    8078              :               else
    8079         5026 :                 dump_printf_loc (MSG_NOTE, vect_location,
    8080              :                                  "      - %p --> [%d] %p\n",
    8081         5026 :                                  (void *) vertex.node,
    8082              :                                  other_vertex.partition,
    8083         5026 :                                  (void *) other_vertex.node);
    8084        15041 :             };
    8085         4989 :           for_each_partition_edge (node_i, print_edge);
    8086              :         }
    8087              : 
    8088        14482 :       for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
    8089              :         {
    8090         9721 :           auto &layout_costs = partition_layout_costs (partition_i, layout_i);
    8091         9721 :           if (layout_costs.is_possible ())
    8092              :             {
    8093         7976 :               dump_printf_loc (MSG_NOTE, vect_location,
    8094              :                                "    layout %d:%s\n", layout_i,
    8095         7976 :                                partition.layout == int (layout_i)
    8096              :                                ? " (*)" : "");
    8097         7976 :               slpg_layout_cost combined_cost = layout_costs.in_cost;
    8098         7976 :               combined_cost.add_serial_cost (layout_costs.internal_cost);
    8099         7976 :               combined_cost.add_serial_cost (layout_costs.out_cost);
    8100              : #define TEMPLATE "{depth: %f, total: %f}"
    8101         7976 :               dump_printf_loc (MSG_NOTE, vect_location,
    8102              :                                "        " TEMPLATE "\n",
    8103              :                                layout_costs.in_cost.depth.to_double (),
    8104              :                                layout_costs.in_cost.total.to_double ());
    8105         7976 :               dump_printf_loc (MSG_NOTE, vect_location,
    8106              :                                "      + " TEMPLATE "\n",
    8107              :                                layout_costs.internal_cost.depth.to_double (),
    8108              :                                layout_costs.internal_cost.total.to_double ());
    8109         7976 :               dump_printf_loc (MSG_NOTE, vect_location,
    8110              :                                "      + " TEMPLATE "\n",
    8111              :                                layout_costs.out_cost.depth.to_double (),
    8112              :                                layout_costs.out_cost.total.to_double ());
    8113         7976 :               dump_printf_loc (MSG_NOTE, vect_location,
    8114              :                                "      = " TEMPLATE "\n",
    8115              :                                combined_cost.depth.to_double (),
    8116              :                                combined_cost.total.to_double ());
    8117              : #undef TEMPLATE
    8118              :             }
    8119              :           else
    8120         1745 :             dump_printf_loc (MSG_NOTE, vect_location,
    8121              :                              "    layout %d: rejected\n", layout_i);
    8122              :         }
    8123              :     }
    8124          659 : }
    8125              : 
    8126              : /* Masked load lanes discovery.  */
    8127              : 
    8128              : void
    8129       624895 : vect_optimize_slp_pass::decide_masked_load_lanes ()
    8130              : {
    8131      6426343 :   for (auto v : m_vertices)
    8132              :     {
    8133      4551658 :       slp_tree node = v.node;
    8134      4551658 :       if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
    8135      3142474 :           || SLP_TREE_PERMUTE_P (node))
    8136      1540042 :         continue;
    8137      3011616 :       stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
    8138      1511957 :       if (! STMT_VINFO_GROUPED_ACCESS (stmt_info)
    8139              :           /* The mask has to be uniform.  */
    8140       954308 :           || STMT_VINFO_SLP_VECT_ONLY (stmt_info)
    8141       954177 :           || ! is_a <gcall *> (STMT_VINFO_STMT (stmt_info))
    8142      3011701 :           || ! gimple_call_internal_p (STMT_VINFO_STMT (stmt_info),
    8143              :                                        IFN_MASK_LOAD))
    8144      3011583 :         continue;
    8145           33 :       stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
    8146           66 :       if (STMT_VINFO_STRIDED_P (stmt_info)
    8147           33 :           || compare_step_with_zero (m_vinfo, stmt_info) <= 0
    8148           63 :           || vect_load_lanes_supported (SLP_TREE_VECTYPE (node),
    8149           30 :                                         DR_GROUP_SIZE (stmt_info),
    8150              :                                         true) == IFN_LAST)
    8151           33 :         continue;
    8152              : 
    8153              :       /* Uniform masks need to be suitably represented.  */
    8154            0 :       slp_tree mask = SLP_TREE_CHILDREN (node)[0];
    8155            0 :       if (!SLP_TREE_PERMUTE_P (mask)
    8156            0 :           || SLP_TREE_CHILDREN (mask).length () != 1)
    8157            0 :         continue;
    8158            0 :       bool match = true;
    8159            0 :       for (auto perm : SLP_TREE_LANE_PERMUTATION (mask))
    8160            0 :         if (perm.first != 0 || perm.second != 0)
    8161              :           {
    8162              :             match = false;
    8163              :             break;
    8164              :           }
    8165            0 :       if (!match)
    8166            0 :         continue;
    8167              : 
    8168              :       /* Now see if the consumer side matches.  */
    8169            0 :       for (graph_edge *pred = m_slpg->vertices[node->vertex].pred;
    8170            0 :            pred; pred = pred->pred_next)
    8171              :         {
    8172            0 :           slp_tree pred_node = m_vertices[pred->src].node;
    8173              :           /* All consumers should be a permute with a single outgoing lane.  */
    8174            0 :           if (!SLP_TREE_PERMUTE_P (pred_node)
    8175            0 :               || SLP_TREE_LANES (pred_node) != 1)
    8176              :             {
    8177              :               match = false;
    8178              :               break;
    8179              :             }
    8180            0 :           gcc_assert (SLP_TREE_CHILDREN (pred_node).length () == 1);
    8181              :         }
    8182            0 :       if (!match)
    8183            0 :         continue;
    8184              :       /* Now we can mark the nodes as to use load lanes.  */
    8185            0 :       node->ldst_lanes = true;
    8186            0 :       for (graph_edge *pred = m_slpg->vertices[node->vertex].pred;
    8187            0 :            pred; pred = pred->pred_next)
    8188            0 :         m_vertices[pred->src].node->ldst_lanes = true;
    8189              :       /* The catch is we have to massage the mask.  We have arranged
    8190              :          for analyzed uniform masks to be represented by a splat VEC_PERM,
    8191              :          which we can now simply elide as we cannot easily re-do SLP
    8192              :          discovery here.  */
    8193            0 :       slp_tree new_mask = SLP_TREE_CHILDREN (mask)[0];
    8194            0 :       SLP_TREE_REF_COUNT (new_mask)++;
    8195            0 :       SLP_TREE_CHILDREN (node)[0] = new_mask;
    8196            0 :       vect_free_slp_tree (mask);
    8197              :     }
    8198       624895 : }
    8199              : 
    8200              : /* Perform legitimizing attempts.  This is intended to improve the
    8201              :    situation when layout 0 is not valid which is a situation the cost
    8202              :    based propagation does not handle well.
    8203              :    Return true if further layout optimization is possible, false if
    8204              :    the layout configuration should be considered final.  */
    8205              : 
    8206              : bool
    8207        10181 : vect_optimize_slp_pass::legitimize ()
    8208              : {
    8209              :   /* Perform a very simple legitimizing attempt by attempting to choose
    8210              :      a single layout for all partitions that will make all permutations
    8211              :      a noop.  That should also be the optimal layout choice in case
    8212              :      layout zero is legitimate.
    8213              :      ???  Disconnected components of the SLP graph could have distinct
    8214              :      single layouts.  */
    8215        10181 :   int single_layout_i = -1;
    8216        10181 :   unsigned deferred_up_to = -1U;
    8217        30787 :   for (unsigned partition_i = 0; partition_i < m_partitions.length ();
    8218              :        ++partition_i)
    8219              :     {
    8220        25913 :       auto &partition = m_partitions[partition_i];
    8221        25913 :       if (single_layout_i == -1)
    8222              :         {
    8223        13374 :           single_layout_i = partition.layout;
    8224        13374 :           deferred_up_to = partition_i;
    8225              :         }
    8226        12539 :       else if (partition.layout == single_layout_i || partition.layout == -1)
    8227              :         ;
    8228              :       else
    8229              :         single_layout_i = 0;
    8230        22675 :       if (single_layout_i == 0)
    8231              :         return true;
    8232              : 
    8233        20666 :       if (single_layout_i != -1
    8234        20666 :           && !is_compatible_layout (partition, single_layout_i))
    8235              :         return true;
    8236              :     }
    8237              : 
    8238         4874 :   if (single_layout_i <= 0)
    8239              :     return true;
    8240              : 
    8241         4990 :   for (unsigned partition_i = 0; partition_i < deferred_up_to; ++partition_i)
    8242          122 :     if (!is_compatible_layout (m_partitions[partition_i],
    8243              :                                single_layout_i))
    8244              :       return true;
    8245              : 
    8246        12161 :   for (unsigned partition_i = 0; partition_i < m_partitions.length ();
    8247              :        ++partition_i)
    8248              :     {
    8249         7293 :       auto &partition = m_partitions[partition_i];
    8250         7293 :       partition.layout = single_layout_i;
    8251              :     }
    8252              : 
    8253              :   return false;
    8254              : }
    8255              : 
    8256              : /* Main entry point for the SLP graph optimization pass.  */
    8257              : 
    8258              : void
    8259       624895 : vect_optimize_slp_pass::run ()
    8260              : {
    8261       624895 :   build_graph ();
    8262       624895 :   create_partitions ();
    8263       624895 :   start_choosing_layouts ();
    8264       624895 :   if (m_perms.length () > 1)
    8265              :     {
    8266        10181 :       if (legitimize ())
    8267              :         {
    8268         5313 :           forward_pass ();
    8269         5313 :           backward_pass ();
    8270              :         }
    8271        10181 :       if (dump_enabled_p ())
    8272          659 :         dump ();
    8273        10181 :       materialize ();
    8274        41143 :       while (!m_perms.is_empty ())
    8275        20781 :         m_perms.pop ().release ();
    8276              :     }
    8277              :   else
    8278       614714 :     remove_redundant_permutations ();
    8279       624895 :   free_graph (m_slpg);
    8280       624895 :   build_graph ();
    8281       624895 :   decide_masked_load_lanes ();
    8282       624895 :   free_graph (m_slpg);
    8283       624895 : }
    8284              : 
    8285              : /* Apply CSE to NODE and its children using BST_MAP.  */
    8286              : 
    8287              : static void
    8288      4870820 : vect_cse_slp_nodes (scalar_stmts_to_slp_tree_map_t *bst_map, slp_tree& node)
    8289              : {
    8290      4870820 :   bool put_p = false;
    8291      4870820 :   if (SLP_TREE_DEF_TYPE (node) == vect_internal_def
    8292              :       /* Besides some VEC_PERM_EXPR, two-operator nodes also
    8293              :          lack scalar stmts and thus CSE doesn't work via bst_map.  Ideally
    8294              :          we'd have something that works for all internal and external nodes.  */
    8295      4870820 :       && !SLP_TREE_SCALAR_STMTS (node).is_empty ())
    8296              :     {
    8297      3439877 :       slp_tree *leader = bst_map->get (SLP_TREE_SCALAR_STMTS (node));
    8298      3439877 :       if (leader)
    8299              :         {
    8300              :           /* We've visited this node already.  */
    8301       321026 :           if (!*leader || *leader == node)
    8302              :             return;
    8303              : 
    8304         2432 :           if (dump_enabled_p ())
    8305          887 :             dump_printf_loc (MSG_NOTE, vect_location,
    8306              :                              "re-using SLP tree %p for %p\n",
    8307              :                              (void *)*leader, (void *)node);
    8308         2432 :           vect_free_slp_tree (node);
    8309         2432 :           (*leader)->refcnt += 1;
    8310         2432 :           node = *leader;
    8311         2432 :           return;
    8312              :         }
    8313              : 
    8314              :       /* Avoid creating a cycle by populating the map only after recursion.  */
    8315      3118851 :       bst_map->put (SLP_TREE_SCALAR_STMTS (node).copy (), nullptr);
    8316      3118851 :       node->refcnt += 1;
    8317      3118851 :       put_p = true;
    8318              :       /* And recurse.  */
    8319              :     }
    8320              : 
    8321     13422534 :   for (slp_tree &child : SLP_TREE_CHILDREN (node))
    8322      3867294 :     if (child)
    8323      3489817 :       vect_cse_slp_nodes (bst_map, child);
    8324              : 
    8325              :   /* Now record the node for CSE in other siblings.  */
    8326      4549794 :   if (put_p)
    8327      3118851 :     *bst_map->get (SLP_TREE_SCALAR_STMTS (node)) = node;
    8328              : }
    8329              : 
    8330              : /* Optimize the SLP graph of VINFO.  */
    8331              : 
    8332              : void
    8333       968348 : vect_optimize_slp (vec_info *vinfo)
    8334              : {
    8335       968348 :   if (vinfo->slp_instances.is_empty ())
    8336              :     return;
    8337       624895 :   vect_optimize_slp_pass (vinfo).run ();
    8338              : 
    8339              :   /* Apply CSE again to nodes after permute optimization.  */
    8340       624895 :   scalar_stmts_to_slp_tree_map_t *bst_map
    8341       624895 :     = new scalar_stmts_to_slp_tree_map_t ();
    8342              : 
    8343      3255688 :   for (auto inst : vinfo->slp_instances)
    8344      1381003 :     vect_cse_slp_nodes (bst_map, SLP_INSTANCE_TREE (inst));
    8345              : 
    8346       624895 :   release_scalar_stmts_to_slp_tree_map (bst_map);
    8347              : }
    8348              : 
    8349              : /* Gather loads reachable from the individual SLP graph entries.  */
    8350              : 
    8351              : void
    8352       968348 : vect_gather_slp_loads (vec_info *vinfo)
    8353              : {
    8354       968348 :   unsigned i;
    8355       968348 :   slp_instance instance;
    8356      2349351 :   FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
    8357              :     {
    8358      1381003 :       hash_set<slp_tree> visited;
    8359      1381003 :       vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
    8360              :                              SLP_INSTANCE_TREE (instance), visited);
    8361      1381003 :     }
    8362       968348 : }
    8363              : 
    8364              : /* For NODE update VF based on the number of lanes and the vector types
    8365              :    used.  */
    8366              : 
    8367              : static void
    8368      3583313 : vect_update_slp_vf_for_node (slp_tree node, poly_uint64 &vf,
    8369              :                              hash_set<slp_tree> &visited)
    8370              : {
    8371      3583313 :   if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    8372      1287971 :     return;
    8373      2569238 :   if (visited.add (node))
    8374              :     return;
    8375              : 
    8376      8667821 :   for (slp_tree child : SLP_TREE_CHILDREN (node))
    8377      2919087 :     vect_update_slp_vf_for_node (child, vf, visited);
    8378              : 
    8379              :   /* We do not visit SLP nodes for constants or externals - those neither
    8380              :      have a vector type set yet (vectorizable_* does this) nor do they
    8381              :      have max_nunits set.  Instead we rely on internal nodes' max_nunits
    8382              :      to cover constant/external operands.
    8383              :      Note that when we stop using fixed size vectors externs and constants
    8384              :      shouldn't influence the (minimum) vectorization factor, instead
    8385              :      vectorizable_* should honor the vectorization factor when trying to
    8386              :      assign vector types to constants and externals and cause iteration
    8387              :      to a higher vectorization factor when required.  */
    8388      2295342 :   poly_uint64 node_vf
    8389      2295342 :     = calculate_unrolling_factor (node->max_nunits, SLP_TREE_LANES (node));
    8390      2295342 :   vf = force_common_multiple (vf, node_vf);
    8391              : 
    8392              :   /* For permute nodes that are fed from externs or constants we have to
    8393              :      consider their number of lanes as well.  Likewise for store-lanes.  */
    8394      2295342 :   if (SLP_TREE_PERMUTE_P (node) || node->ldst_lanes)
    8395       645924 :     for (slp_tree child : SLP_TREE_CHILDREN (node))
    8396       172008 :       if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
    8397              :         {
    8398         2858 :           poly_uint64 child_vf
    8399         2858 :             = calculate_unrolling_factor (node->max_nunits,
    8400              :                                           SLP_TREE_LANES (child));
    8401         2858 :           vf = force_common_multiple (vf, child_vf);
    8402              :         }
    8403              : }
    8404              : 
    8405              : /* For each possible SLP instance decide whether to SLP it and calculate overall
    8406              :    unrolling factor needed to SLP the loop.  Return TRUE if decided to SLP at
    8407              :    least one instance.  */
    8408              : 
    8409              : bool
    8410       405823 : vect_make_slp_decision (loop_vec_info loop_vinfo)
    8411              : {
    8412       405823 :   unsigned int i;
    8413       405823 :   poly_uint64 unrolling_factor = 1;
    8414       405823 :   const vec<slp_instance> &slp_instances
    8415              :     = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
    8416       405823 :   slp_instance instance;
    8417       405823 :   int decided_to_slp = 0;
    8418              : 
    8419       405823 :   DUMP_VECT_SCOPE ("vect_make_slp_decision");
    8420              : 
    8421       405823 :   hash_set<slp_tree> visited;
    8422      1070049 :   FOR_EACH_VEC_ELT (slp_instances, i, instance)
    8423              :     {
    8424       664226 :       slp_tree root = SLP_INSTANCE_TREE (instance);
    8425              : 
    8426              :       /* All unroll factors have the form:
    8427              : 
    8428              :            GET_MODE_SIZE (vinfo->vector_mode) * X
    8429              : 
    8430              :          for some rational X, so they must have a common multiple.  */
    8431       664226 :       vect_update_slp_vf_for_node (root, unrolling_factor, visited);
    8432              : 
    8433              :       /* If all instances ended up with vector(1) T roots make sure to
    8434              :          not vectorize.  RVV for example relies on loop vectorization
    8435              :          when some instances are essentially kept scalar.  See PR121048.  */
    8436       664226 :       if (SLP_TREE_VECTYPE (root)
    8437       664226 :           && known_gt (TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (root)), 1U))
    8438       546073 :         decided_to_slp++;
    8439              :     }
    8440              : 
    8441       405823 :   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = unrolling_factor;
    8442              : 
    8443       405823 :   if (decided_to_slp && dump_enabled_p ())
    8444              :     {
    8445        18419 :       dump_printf_loc (MSG_NOTE, vect_location,
    8446              :                        "Decided to SLP %d instances. Unrolling factor ",
    8447              :                        decided_to_slp);
    8448        18419 :       dump_dec (MSG_NOTE, unrolling_factor);
    8449        18419 :       dump_printf (MSG_NOTE, "\n");
    8450              :     }
    8451              : 
    8452       405823 :   return (decided_to_slp > 0);
    8453       405823 : }
    8454              : 
    8455              : /* Initialize a bb_vec_info struct for the statements in BBS basic blocks.  */
    8456              : 
    8457      2183824 : _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
    8458              :   : vec_info (vec_info::bb, shared),
    8459      2183824 :     roots (vNULL)
    8460              : {
    8461              :   /* The region we are operating on.  bbs[0] is the entry, excluding
    8462              :      its PHI nodes.  In the future we might want to track an explicit
    8463              :      entry edge to cover bbs[0] PHI nodes and have a region entry
    8464              :      insert location.  */
    8465      2183824 :   bbs = _bbs.address ();
    8466      2183824 :   nbbs = _bbs.length ();
    8467              : 
    8468     17723259 :   for (unsigned i = 0; i < nbbs; ++i)
    8469              :     {
    8470     15539435 :       if (i != 0)
    8471     20291617 :         for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
    8472      6936006 :              gsi_next (&si))
    8473              :           {
    8474      6936006 :             gphi *phi = si.phi ();
    8475      6936006 :             gimple_set_uid (phi, 0);
    8476      6936006 :             add_stmt (phi);
    8477              :           }
    8478     31078870 :       for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
    8479    134980207 :            !gsi_end_p (gsi); gsi_next (&gsi))
    8480              :         {
    8481    119440772 :           gimple *stmt = gsi_stmt (gsi);
    8482    119440772 :           gimple_set_uid (stmt, 0);
    8483    119440772 :           if (is_gimple_debug (stmt))
    8484     74171204 :             continue;
    8485     45269568 :           add_stmt (stmt);
    8486              :         }
    8487              :     }
    8488      2183824 : }
    8489              : 
    8490              : 
    8491              : /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
    8492              :    stmts in the basic block.  */
    8493              : 
    8494      2183824 : _bb_vec_info::~_bb_vec_info ()
    8495              : {
    8496              :   /* Reset region marker.  */
    8497     17723259 :   for (unsigned i = 0; i < nbbs; ++i)
    8498              :     {
    8499     15539435 :       if (i != 0)
    8500     20307345 :         for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
    8501      6951734 :              gsi_next (&si))
    8502              :           {
    8503      6951734 :             gphi *phi = si.phi ();
    8504      6951734 :             gimple_set_uid (phi, -1);
    8505              :           }
    8506     31078870 :       for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
    8507    134931004 :            !gsi_end_p (gsi); gsi_next (&gsi))
    8508              :         {
    8509    119391569 :           gimple *stmt = gsi_stmt (gsi);
    8510    119391569 :           gimple_set_uid (stmt, -1);
    8511              :         }
    8512              :     }
    8513              : 
    8514      3388288 :   for (unsigned i = 0; i < roots.length (); ++i)
    8515              :     {
    8516      1204464 :       roots[i].stmts.release ();
    8517      1204464 :       roots[i].roots.release ();
    8518      1204464 :       roots[i].remain.release ();
    8519              :     }
    8520      2183824 :   roots.release ();
    8521      2183824 : }
    8522              : 
    8523              : /* Subroutine of vect_slp_analyze_node_operations.  Handle the root of NODE,
    8524              :    given that child nodes have already been processed, and that
    8525              :    their def types currently match their SLP node's def type.  */
    8526              : 
    8527              : static bool
    8528      2429037 : vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
    8529              :                                     slp_instance node_instance,
    8530              :                                     stmt_vector_for_cost *cost_vec)
    8531              : {
    8532              :   /* Handle purely internal nodes.  */
    8533      2429037 :   if (SLP_TREE_PERMUTE_P (node))
    8534              :     {
    8535        99113 :       if (!vectorizable_slp_permutation (vinfo, NULL, node, cost_vec))
    8536              :         return false;
    8537              : 
    8538              :       stmt_vec_info slp_stmt_info;
    8539              :       unsigned int i;
    8540       256648 :       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
    8541              :         {
    8542       158864 :           if (slp_stmt_info
    8543       153919 :               && STMT_VINFO_LIVE_P (slp_stmt_info)
    8544       158864 :               && !vectorizable_live_operation (vinfo, slp_stmt_info, node,
    8545              :                                                node_instance, i,
    8546              :                                                false, cost_vec))
    8547              :             return false;
    8548              :         }
    8549        97784 :       SLP_TREE_TYPE (node) = permute_info_type;
    8550        97784 :       return true;
    8551              :     }
    8552              : 
    8553      2329924 :   return vect_analyze_stmt (vinfo, node, node_instance, cost_vec);
    8554              : }
    8555              : 
    8556              : static int
    8557      1845264 : sort_ints (const void *a_, const void *b_)
    8558              : {
    8559      1845264 :   int a = *(const int *)a_;
    8560      1845264 :   int b = *(const int *)b_;
    8561      1845264 :   return a - b;
    8562              : }
    8563              : 
    8564              : /* Verify if we can externalize a set of internal defs.  */
    8565              : 
    8566              : static bool
    8567       379584 : vect_slp_can_convert_to_external (const vec<stmt_vec_info> &stmts)
    8568              : {
    8569              :   /* Constant generation uses get_later_stmt which can only handle
    8570              :      defs from the same BB or a set of defs that can be ordered
    8571              :      with a dominance query.  */
    8572       379584 :   basic_block bb = NULL;
    8573       379584 :   bool all_same = true;
    8574       379584 :   auto_vec<int> bbs;
    8575       759168 :   bbs.reserve_exact (stmts.length ());
    8576      2052826 :   for (stmt_vec_info stmt : stmts)
    8577              :     {
    8578       914074 :       if (!stmt)
    8579              :         return false;
    8580       914074 :       else if (!bb)
    8581       379584 :         bb = gimple_bb (stmt->stmt);
    8582       534490 :       else if (gimple_bb (stmt->stmt) != bb)
    8583       172108 :         all_same = false;
    8584       914074 :       bbs.quick_push (gimple_bb (stmt->stmt)->index);
    8585              :     }
    8586       379584 :   if (all_same)
    8587              :     return true;
    8588              : 
    8589              :   /* Produce a vector of unique BB indexes for the defs.  */
    8590       129040 :   bbs.qsort (sort_ints);
    8591              :   unsigned i, j;
    8592       314220 :   for (i = 1, j = 1; i < bbs.length (); ++i)
    8593       185180 :     if (bbs[i] != bbs[j-1])
    8594       137844 :       bbs[j++] = bbs[i];
    8595       129040 :   gcc_assert (j >= 2);
    8596       129040 :   bbs.truncate (j);
    8597              : 
    8598       258080 :   if (bbs.length () == 2)
    8599       125527 :     return (dominated_by_p (CDI_DOMINATORS,
    8600       125527 :                             BASIC_BLOCK_FOR_FN (cfun, bbs[0]),
    8601       125527 :                             BASIC_BLOCK_FOR_FN (cfun, bbs[1]))
    8602       244320 :             || dominated_by_p (CDI_DOMINATORS,
    8603       118793 :                                BASIC_BLOCK_FOR_FN (cfun, bbs[1]),
    8604       118793 :                                BASIC_BLOCK_FOR_FN (cfun, bbs[0])));
    8605              : 
    8606              :   /* ???  For more than two BBs we can sort the vector and verify the
    8607              :      result is a total order.  But we can't use vec::qsort with a
    8608              :      compare function using a dominance query since there's no way to
    8609              :      signal failure and any fallback for an unordered pair would
    8610              :      fail qsort_chk later.
    8611              :      For now simply hope that ordering by BB index provides the
    8612              :      best candidate total order.  If required we can implement our
    8613              :      own mergesort or export an entry without checking.  */
    8614       395361 :   for (unsigned i = 1; i < bbs.length (); ++i)
    8615        12293 :     if (!dominated_by_p (CDI_DOMINATORS,
    8616        12293 :                          BASIC_BLOCK_FOR_FN (cfun, bbs[i]),
    8617        12293 :                          BASIC_BLOCK_FOR_FN (cfun, bbs[i-1])))
    8618              :       return false;
    8619              : 
    8620              :   return true;
    8621       379584 : }
    8622              : 
    8623              : /* Try to build NODE from scalars, returning true on success.
    8624              :    NODE_INSTANCE is the SLP instance that contains NODE.  */
    8625              : 
    8626              : static bool
    8627       543873 : vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
    8628              :                               slp_instance node_instance)
    8629              : {
    8630       543873 :   stmt_vec_info stmt_info;
    8631       543873 :   unsigned int i;
    8632              : 
    8633       543873 :   if (!is_a <bb_vec_info> (vinfo)
    8634        70703 :       || node == SLP_INSTANCE_TREE (node_instance)
    8635        22344 :       || !SLP_TREE_SCALAR_STMTS (node).exists ()
    8636        22303 :       || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node))
    8637              :       /* Force the mask use to be built from scalars instead.  */
    8638        19998 :       || VECTOR_BOOLEAN_TYPE_P (SLP_TREE_VECTYPE (node))
    8639       563656 :       || !vect_slp_can_convert_to_external (SLP_TREE_SCALAR_STMTS (node)))
    8640       524090 :     return false;
    8641              : 
    8642        19783 :   if (dump_enabled_p ())
    8643           70 :     dump_printf_loc (MSG_NOTE, vect_location,
    8644              :                      "Building vector operands of %p from scalars instead\n",
    8645              :                      (void *) node);
    8646              : 
    8647              :   /* Don't remove and free the child nodes here, since they could be
    8648              :      referenced by other structures.  The analysis and scheduling phases
    8649              :      (need to) ignore child nodes of anything that isn't vect_internal_def.  */
    8650        19783 :   unsigned int group_size = SLP_TREE_LANES (node);
    8651        19783 :   SLP_TREE_DEF_TYPE (node) = vect_external_def;
    8652              :   /* Invariants get their vector type from the uses.  */
    8653        19783 :   SLP_TREE_VECTYPE (node) = NULL_TREE;
    8654        19783 :   SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
    8655        19783 :   SLP_TREE_LOAD_PERMUTATION (node).release ();
    8656        68867 :   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    8657              :     {
    8658        49084 :       tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
    8659        49084 :       SLP_TREE_SCALAR_OPS (node)[i] = lhs;
    8660              :     }
    8661              :   return true;
    8662              : }
    8663              : 
    8664              : /* Return true if all elements of the slice are the same.  */
    8665              : bool
    8666       452321 : vect_scalar_ops_slice::all_same_p () const
    8667              : {
    8668       499556 :   for (unsigned int i = 1; i < length; ++i)
    8669       421818 :     if (!operand_equal_p (op (0), op (i)))
    8670              :       return false;
    8671              :   return true;
    8672              : }
    8673              : 
    8674              : hashval_t
    8675       392222 : vect_scalar_ops_slice_hash::hash (const value_type &s)
    8676              : {
    8677       392222 :   hashval_t hash = 0;
    8678      1516199 :   for (unsigned i = 0; i < s.length; ++i)
    8679      1123977 :     hash = iterative_hash_expr (s.op (i), hash);
    8680       392222 :   return hash;
    8681              : }
    8682              : /* Return true if S1 and S2 have the same length and operand_equal_p
                      :    holds for the operands found at every index of the two slices.  */
    8683              : bool
    8684       213458 : vect_scalar_ops_slice_hash::equal (const value_type &s1,
    8685              :                                    const compare_type &s2)
    8686              : {
    8687       213458 :   if (s1.length != s2.length)
    8688              :     return false;
    8689       370451 :   for (unsigned i = 0; i < s1.length; ++i)
    8690       323861 :     if (!operand_equal_p (s1.op (i), s2.op (i)))
    8691              :       return false;
    8692              :   return true;
    8693              : }
    8694              : 
    8695              : /* Compute the prologue cost for invariant or constant operands represented
    8696              :    by NODE.  One vect_prologue entry is recorded in COST_VEC for each
                      :    vector whose construction is costed: vector_load when NODE is a
                      :    constant def, scalar_to_vec when all elements of the costed slice are
                      :    the same, and vec_construct otherwise.  NODE itself is handed to
                      :    record_stmt_cost at most once, with the first non-vector_load entry.  */
    8697              : 
    8698              : static void
    8699      1035475 : vect_prologue_cost_for_slp (vec_info *vinfo, slp_tree node,
    8700              :                             stmt_vector_for_cost *cost_vec)
    8701              : {
    8702              :   /* There's a special case of an existing vector, that costs nothing.  */
    8703      1035475 :   if (SLP_TREE_SCALAR_OPS (node).length () == 0
    8704      1035475 :       && !SLP_TREE_VEC_DEFS (node).is_empty ())
    8705         1576 :     return;
    8706              :   /* Without looking at the actual initializer a vector of
    8707              :      constants can be implemented as load from the constant pool.
    8708              :      When all elements are the same we can use a splat.  */
    8709      1033899 :   tree vectype = SLP_TREE_VECTYPE (node);
    8710      1033899 :   unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
    8711      1033899 :   unsigned HOST_WIDE_INT const_nunits;
    8712      1033899 :   unsigned nelt_limit;
    8713      1033899 :   unsigned nvectors = vect_get_num_copies (vinfo, node);
    8714      1033899 :   auto ops = &SLP_TREE_SCALAR_OPS (node);
    8715      1033899 :   auto_vec<unsigned int> starts (nvectors);
    8716      1033899 :   if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
    8717      1033899 :       && ! multiple_p (const_nunits, group_size))
    8718              :     {
    8719        62585 :       nelt_limit = const_nunits;
    8720        62585 :       hash_set<vect_scalar_ops_slice_hash> vector_ops;
    8721       258561 :       for (unsigned int i = 0; i < nvectors; ++i)
    8722       195976 :         if (!vector_ops.add ({ ops, i * nelt_limit, nelt_limit }))
    8723       149386 :           starts.quick_push (i * nelt_limit);
    8724        62585 :     }
    8725              :   else
    8726              :     {
    8727              :       /* If either the vector has variable length or the vectors
    8728              :          are composed of repeated whole groups we only need to
    8729              :          cost construction once.  All vectors will be the same.  */
    8730       971314 :       nelt_limit = group_size;
    8731       971314 :       starts.quick_push (0);
    8732              :     }
    8733              :   /* ???  We're just tracking whether vectors in a single node are the same.
    8734              :      Ideally we'd do something more global.  */
    8735      1033899 :   bool passed = false;
    8736      4222397 :   for (unsigned int start : starts)
    8737              :     {
    8738      1120700 :       vect_cost_for_stmt kind;
    8739      1120700 :       if (SLP_TREE_DEF_TYPE (node) == vect_constant_def)
    8740              :         kind = vector_load;
    8741       452321 :       else if (vect_scalar_ops_slice { ops, start, nelt_limit }.all_same_p ())
    8742              :         kind = scalar_to_vec;
    8743              :       else
    8744       374583 :         kind = vec_construct;
    8745              :       /* The target cost hook has no idea which part of the SLP node
    8746              :          we are costing so avoid passing it down more than once.  Pass
    8747              :          it to the first vec_construct or scalar_to_vec part since for those
    8748              :          the x86 backend tries to account for GPR to XMM register moves.  */
    8749      1120700 :       record_stmt_cost (cost_vec, 1, kind, nullptr,
    8750      1120700 :                         (kind != vector_load && !passed) ? node : nullptr,
    8751              :                         vectype, 0, vect_prologue);
    8752      1120700 :       if (kind != vector_load)
    8753       452321 :         passed = true;
    8754              :     }
    8755      1033899 : }
    8756              : 
    8757              : /* Analyze statements contained in SLP tree NODE after recursively analyzing
    8758              :    the subtree.  NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
                      :    VISITED_SET holds the nodes accepted so far and VISITED_VEC records
                      :    them in insertion order so that a failed analysis can remove again
                      :    everything it added; costs are appended to COST_VEC and truncated
                      :    back on failure.
    8759              : 
    8760              :    Return true if the operations are supported.  */
    8761              : 
    8762              : static bool
    8763      4541890 : vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
    8764              :                                   slp_instance node_instance,
    8765              :                                   hash_set<slp_tree> &visited_set,
    8766              :                                   vec<slp_tree> &visited_vec,
    8767              :                                   stmt_vector_for_cost *cost_vec)
    8768              : {
    8769      4541890 :   int i, j;
    8770      4541890 :   slp_tree child;
    8771              : 
    8772              :   /* Assume we can code-generate all invariants.  */
    8773      4541890 :   if (!node
    8774      4220139 :       || SLP_TREE_DEF_TYPE (node) == vect_constant_def
    8775      3504742 :       || SLP_TREE_DEF_TYPE (node) == vect_external_def)
    8776              :     return true;
    8777              : 
    8778      3000487 :   if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
    8779              :     {
    8780            9 :       if (dump_enabled_p ())
    8781            0 :         dump_printf_loc (MSG_NOTE, vect_location,
    8782              :                          "Failed cyclic SLP reference in %p\n", (void *) node);
    8783            9 :       return false;
    8784              :     }
    8785      3000478 :   gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
    8786              : 
    8787              :   /* If we already analyzed the exact same set of scalar stmts we're done.
    8788              :      We share the generated vector stmts for those.  */
    8789      3000478 :   if (visited_set.add (node))
    8790              :     return true;
    8791      2731270 :   visited_vec.safe_push (node);
    8792              :   /* Remember the current sizes of VISITED_VEC and COST_VEC.  The visited
                      :      mark is taken after NODE itself was pushed; the rollback loop below
                      :      compares with >= so NODE is popped together with its subtree.  */
    8793      2731270 :   bool res = true;
    8794      2731270 :   unsigned visited_rec_start = visited_vec.length ();
    8795      2731270 :   unsigned cost_vec_rec_start = cost_vec->length ();
    8796      2731270 :   bool seen_non_constant_child = false;
    8797      5746041 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    8798              :     {
    8799      3316779 :       res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
    8800              :                                               visited_set, visited_vec,
    8801              :                                               cost_vec);
    8802      3316779 :       if (!res)
    8803              :         break;
    8804      3014771 :       if (child && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
    8805      3014771 :         seen_non_constant_child = true;
    8806              :     }
    8807              :   /* We're having difficulties scheduling nodes with just constant
    8808              :      operands and no scalar stmts since we then cannot compute a stmt
    8809              :      insertion place.  */
    8810      2731270 :   if (res
    8811      2731270 :       && !seen_non_constant_child
    8812      2731270 :       && SLP_TREE_SCALAR_STMTS (node).is_empty ())
    8813              :     {
    8814          225 :       if (dump_enabled_p ())
    8815            6 :         dump_printf_loc (MSG_NOTE, vect_location,
    8816              :                          "Cannot vectorize all-constant op node %p\n",
    8817              :                          (void *) node);
    8818              :       res = false;
    8819              :     }
    8820              : 
    8821      2731045 :   if (res)
    8822      2429037 :     res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
    8823              :                                               cost_vec);
    8824              :   /* If analysis failed we have to pop all recursive visited nodes
    8825              :      plus ourselves.  */
    8826      2731270 :   if (!res)
    8827              :     {
    8828      2687000 :       while (visited_vec.length () >= visited_rec_start)
    8829       799627 :         visited_set.remove (visited_vec.pop ());
    8830       543873 :       cost_vec->truncate (cost_vec_rec_start);
    8831              :     }
    8832              : 
    8833              :   /* When the node can be vectorized cost invariant nodes it references.
    8834              :      This is not done in DFS order to allow the referring node
    8835              :      vectorizable_* calls to nail down the invariant nodes vector type
    8836              :      and possibly unshare it if it needs a different vector type than
    8837              :      other referrers.  */
    8838      2731270 :   if (res)
    8839      4910795 :     FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
    8840      2723398 :       if (child
    8841      2466297 :           && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
    8842      2466297 :               || SLP_TREE_DEF_TYPE (child) == vect_external_def)
    8843              :           /* Perform usual caching, note code-generation still
    8844              :              code-gens these nodes multiple times but we expect
    8845              :              to CSE them later.  */
    8846      3824571 :           && !visited_set.add (child))
    8847              :         {
    8848      1076524 :           visited_vec.safe_push (child);
    8849              :           /* ???  After auditing more code paths make a "default"
    8850              :              and push the vector type from NODE to all children
    8851              :              if it is not already set.  */
    8852              :           /* Compute the number of vectors to be generated.  */
    8853      1076524 :           tree vector_type = SLP_TREE_VECTYPE (child);
    8854      1076524 :           if (!vector_type)
    8855              :             {
    8856              :               /* Masked loads can have an undefined (default SSA definition)
    8857              :                  else operand.  We do not need to cost it.  */
    8858        41049 :               vec<tree> ops = SLP_TREE_SCALAR_OPS (child);
    8859        42099 :               if (SLP_TREE_TYPE (node) == load_vec_info_type
    8860        42099 :                   && ((ops.length ()
    8861         1050 :                        && TREE_CODE (ops[0]) == SSA_NAME
    8862            0 :                        && SSA_NAME_IS_DEFAULT_DEF (ops[0])
    8863            0 :                        && VAR_P (SSA_NAME_VAR (ops[0])))
    8864         1050 :                       || SLP_TREE_DEF_TYPE (child) == vect_constant_def))
    8865         1050 :                 continue;
    8866              : 
    8867              :               /* For shifts with a scalar argument we don't need
    8868              :                  to cost or code-generate anything.
    8869              :                  ???  Represent this more explicitly.  */
    8870        39999 :               gcc_assert (SLP_TREE_TYPE (node) == shift_vec_info_type
    8871              :                           && j == 1);
    8872        39999 :               continue;
    8873        39999 :             }
    8874              : 
    8875              :           /* And cost them.  */
    8876      1035475 :           vect_prologue_cost_for_slp (vinfo, child, cost_vec);
    8877              :         }
    8878              : 
    8879              :   /* If this node or any of its children can't be vectorized, try pruning
    8880              :      the tree here rather than felling the whole thing.  */
    8881       543873 :   if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
    8882              :     {
    8883              :       /* We'll need to revisit this for invariant costing and number
    8884              :          of vectorized stmt setting.   */
    8885              :       res = true;
    8886              :     }
    8887              : 
    8888              :   return res;
    8889              : }
    8890              : 
    8891              : /* Given a definition DEF, analyze if it will have any live scalar use after
    8892              :    performing SLP vectorization whose information is represented by BB_VINFO,
    8893              :    and record result into hash map SCALAR_USE_MAP as cache for later fast
    8894              :    check.  If recursion DEPTH exceeds a limit, stop analysis and make a
    8895              :    conservative assumption.  Return 0 if no scalar use, 1 if there is, -1
    8896              :    means recursion is limited.  */
    8897              : 
    8898              : static int
    8899       582084 : vec_slp_has_scalar_use (bb_vec_info bb_vinfo, tree def,
    8900              :                         hash_map<tree, int> &scalar_use_map,
    8901              :                         int depth = 0)
    8902              : {
    8903       582084 :   const int depth_limit = 3;
    8904       582084 :   imm_use_iterator use_iter;
    8905       582084 :   gimple *use_stmt;
    8906              : 
    8907       582084 :   if (int *res = scalar_use_map.get (def))
    8908        25386 :     return *res;
    8909              : 
    8910       556698 :   int scalar_use = 1;
    8911              : 
    8912      1822214 :   FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
    8913              :     {
    8914       838250 :       if (is_gimple_debug (use_stmt))
    8915       183602 :         continue;
    8916              : 
    8917       654648 :       stmt_vec_info use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
    8918              : 
    8919       654648 :       if (!use_stmt_info)
    8920              :         break;
    8921              : 
    8922       657915 :       if (PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
    8923       521398 :         continue;
    8924              : 
    8925              :       /* Do not step forward when encountering a PHI statement, since it may
    8926              :          involve a cyclic reference and cause infinite recursive invocation.  */
    8927       127194 :       if (gimple_code (use_stmt) == GIMPLE_PHI)
    8928              :         break;
    8929              : 
    8930              :       /* When pattern recognition is involved, a statement whose definition is
    8931              :          consumed in some pattern, may not be included in the final replacement
    8932              :          pattern statements, so would be skipped when building SLP graph.
    8933              : 
    8934              :          * Original
    8935              :           char a_c = *(char *) a;
    8936              :           char b_c = *(char *) b;
    8937              :           unsigned short a_s = (unsigned short) a_c;
    8938              :           int a_i = (int) a_s;
    8939              :           int b_i = (int) b_c;
    8940              :           int r_i = a_i - b_i;
    8941              : 
    8942              :          * After pattern replacement
    8943              :           a_s = (unsigned short) a_c;
    8944              :           a_i = (int) a_s;
    8945              : 
    8946              :           patt_b_s = (unsigned short) b_c;    // b_i = (int) b_c
    8947              :           patt_b_i = (int) patt_b_s;          // b_i = (int) b_c
    8948              : 
    8949              :           patt_r_s = widen_minus(a_c, b_c);   // r_i = a_i - b_i
    8950              :           patt_r_i = (int) patt_r_s;          // r_i = a_i - b_i
    8951              : 
    8952              :          The definitions of a_i(original statement) and b_i(pattern statement)
    8953              :          are related to, but actually not part of widen_minus pattern.
    8954              :          Vectorizing the pattern does not cause these definition statements to
    8955              :          be marked as PURE_SLP.  For this case, we need to recursively check
    8956              :          whether their uses are all absorbed into vectorized code.  But there
    8957              :          is an exception that some use may participate in a vectorized
    8958              :          operation via an external SLP node containing that use as an element.
    8959              :          The parameter "scalar_use_map" tags such kind of SSA as having scalar
    8960              :          use in advance.  */
    8961       107944 :       tree lhs = gimple_get_lhs (use_stmt);
    8962              : 
    8963       107944 :       if (!lhs || TREE_CODE (lhs) != SSA_NAME)
    8964              :         break;
    8965              : 
    8966        73217 :       if (depth_limit && depth >= depth_limit)
    8967         8937 :         return -1;
    8968              : 
    8969        64280 :       if ((scalar_use = vec_slp_has_scalar_use (bb_vinfo, lhs, scalar_use_map,
    8970              :                                                 depth + 1)))
    8971              :         break;
    8972         8937 :     }
    8973              :   /* NOTE(review): this relies on end_imm_use_stmt_p being true only when
                      :      the walk above ran to completion, i.e. no use made it break out;
                      :      only then is DEF considered free of scalar uses.  */
    8974       547761 :   if (end_imm_use_stmt_p (&use_iter))
    8975       427266 :     scalar_use = 0;
    8976              : 
    8977              :   /* If recursion is limited, do not cache result for non-root defs.  */
    8978       547761 :   if (!depth || scalar_use >= 0)
    8979              :     {
    8980       529887 :       bool added = scalar_use_map.put (def, scalar_use);
    8981       529887 :       gcc_assert (!added);
    8982              :     }
    8983              : 
    8984       547761 :   return scalar_use;
    8985              : }
    8986              : 
    8987              : /* Mark lanes of NODE that are live outside of the basic-block vectorized
    8988              :    region and that can be vectorized using vectorizable_live_operation
    8989              :    with STMT_VINFO_LIVE_P.  Not handled live operations will cause the
    8990              :    scalar code computing it to be retained.  SCALAR_USE_MAP is the cache
                      :    handed to vec_slp_has_scalar_use, SVISITED collects the scalar stmts
                      :    that are skipped when seen again and VISITED the SLP nodes already
                      :    walked.  Recurses into the internal-def children of NODE.  */
    8991              : 
    8992              : static void
    8993       909370 : vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
    8994              :                              slp_instance instance,
    8995              :                              stmt_vector_for_cost *cost_vec,
    8996              :                              hash_map<tree, int> &scalar_use_map,
    8997              :                              hash_set<stmt_vec_info> &svisited,
    8998              :                              hash_set<slp_tree> &visited)
    8999              : {
    9000       909370 :   if (visited.add (node))
    9001        41636 :     return;
    9002              : 
    9003       867734 :   unsigned i;
    9004       867734 :   stmt_vec_info stmt_info;
    9005       867734 :   stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
    9006      3142192 :   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    9007              :     {
    9008      2274458 :       if (!stmt_info || svisited.contains (stmt_info))
    9009        30788 :         continue;
    9010      2252531 :       stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
    9011      2252531 :       if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
    9012        11959 :           && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
    9013              :         /* Only the pattern root stmt computes the original scalar value.  */
    9014         8861 :         continue;
    9015      2243670 :       bool mark_visited = true;
    9016      2243670 :       gimple *orig_stmt = orig_stmt_info->stmt;
    9017      2243670 :       ssa_op_iter op_iter;
    9018      2243670 :       def_operand_p def_p;
    9019      5005144 :       FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
    9020              :         {
    9021       517804 :           if (vec_slp_has_scalar_use (bb_vinfo, DEF_FROM_PTR (def_p),
    9022              :                                       scalar_use_map))
    9023              :             {
    9024        93938 :               STMT_VINFO_LIVE_P (stmt_info) = true;
    9025        93938 :               if (vectorizable_live_operation (bb_vinfo, stmt_info, node,
    9026              :                                                instance, i, false, cost_vec))
    9027              :                 /* ???  So we know we can vectorize the live stmt from one SLP
    9028              :                    node.  If we cannot do so from all or none consistently
    9029              :                    we'd have to record which SLP node (and lane) we want to
    9030              :                    use for the live operation.  So make sure we can
    9031              :                    code-generate from all nodes.  */
    9032              :                 mark_visited = false;
    9033              :               else
    9034            0 :                 STMT_VINFO_LIVE_P (stmt_info) = false;
    9035              :             }
    9036              : 
    9037              :           /* We have to verify whether we can insert the lane extract
    9038              :              before all uses.  The following is a conservative approximation.
    9039              :              We cannot put this into vectorizable_live_operation because
    9040              :              iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
    9041              :              doesn't work.
    9042              :              Note that while the fact that we emit code for loads at the
    9043              :              first load should make this a non-problem, leaves we construct
    9044              :              from scalars are vectorized after the last scalar def.
    9045              :              ???  If we'd actually compute the insert location during
    9046              :              analysis we could use sth less conservative than the last
    9047              :              scalar stmt in the node for the dominance check.  */
    9048              :           /* ???  What remains is "live" uses in vector CTORs in the same
    9049              :              SLP graph which is where those uses can end up code-generated
    9050              :              right after their definition instead of close to their original
    9051              :              use.  But that would restrict us to code-generate lane-extracts
    9052              :              from the latest stmt in a node.  So we compensate for this
    9053              :              during code-generation, simply not replacing uses for those
    9054              :              hopefully rare cases.  */
    9055       517804 :           imm_use_iterator use_iter;
    9056       517804 :           gimple *use_stmt;
    9057       517804 :           stmt_vec_info use_stmt_info;
    9058              : 
    9059       517804 :           if (STMT_VINFO_LIVE_P (stmt_info))
    9060       626756 :             FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
    9061       438880 :               if (!is_gimple_debug (use_stmt)
    9062       330276 :                   && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
    9063       320787 :                       || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
    9064       621082 :                   && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
    9065              :                 {
    9066        17552 :                   if (dump_enabled_p ())
    9067           57 :                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    9068              :                                      "Cannot determine insertion place for "
    9069              :                                      "lane extract\n");
    9070        17552 :                   STMT_VINFO_LIVE_P (stmt_info) = false;
    9071        17552 :                   mark_visited = true;
    9072        93938 :                 }
    9073              :         }
    9074      2243670 :       if (mark_visited)
    9075      2164312 :         svisited.add (stmt_info);
    9076              :     }
    9077              : 
    9078              :   slp_tree child;
    9079      2506636 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    9080       877332 :     if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
    9081       232880 :       vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance, cost_vec,
    9082              :                                    scalar_use_map, svisited, visited);
    9083              : }
    9084              : 
    9085              : /* Traverse all slp instances of BB_VINFO, and mark lanes of every node that
    9086              :    are live outside of the basic-block vectorized region and that can be
    9087              :    vectorized using vectorizable_live_operation with STMT_VINFO_LIVE_P.  */
    9088              : 
    9089              : static void
    9090       263794 : vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo)
    9091              : {
    9092       263794 :   if (bb_vinfo->slp_instances.is_empty ())
    9093        29661 :     return;
    9094              : 
    9095       234133 :   hash_set<stmt_vec_info> svisited;
    9096       234133 :   hash_set<slp_tree> visited;
    9097       234133 :   hash_map<tree, int> scalar_use_map;
    9098       234133 :   auto_vec<slp_tree> worklist;
    9099              :   /* Seed SCALAR_USE_MAP with the SSA names among the remaining defs of
                      :      BB reduction instances, tagged as having a scalar use, and queue
                      :      each distinct instance root for the graph walk below.  */
    9100      1378889 :   for (slp_instance instance : bb_vinfo->slp_instances)
    9101              :     {
    9102       676490 :       if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc)
    9103        58673 :         for (tree op : SLP_INSTANCE_REMAIN_DEFS (instance))
    9104        16736 :           if (TREE_CODE (op) == SSA_NAME)
    9105        14100 :             scalar_use_map.put (op, 1);
    9106       676490 :       if (!visited.add (SLP_INSTANCE_TREE (instance)))
    9107       674406 :         worklist.safe_push (SLP_INSTANCE_TREE (instance));
    9108              :     }
    9109              :   /* Walk every reachable node once.  SSA names that are operands of an
                      :      external node are tagged as having a scalar use; for all other
                      :      nodes the not yet visited children are queued.  The worklist is
                      :      not empty on entry since the first instance root is always new.  */
    9110      1510824 :   do
    9111              :     {
    9112      1510824 :       slp_tree node = worklist.pop ();
    9113              : 
    9114      1510824 :       if (SLP_TREE_DEF_TYPE (node) == vect_external_def)
    9115              :         {
    9116      1542444 :           for (tree op : SLP_TREE_SCALAR_OPS (node))
    9117       681340 :             if (TREE_CODE (op) == SSA_NAME)
    9118       460542 :               scalar_use_map.put (op, 1);
    9119              :         }
    9120              :       else
    9121              :         {
    9122      3623700 :           for (slp_tree child : SLP_TREE_CHILDREN (node))
    9123       877308 :             if (child && !visited.add (child))
    9124       836418 :               worklist.safe_push (child);
    9125              :         }
    9126              :     }
    9127      3021648 :   while (!worklist.is_empty ());
    9128              :   /* VISITED is emptied so the per-instance marking walk can reuse it.  */
    9129       234133 :   visited.empty ();
    9130              : 
    9131      1378889 :   for (slp_instance instance : bb_vinfo->slp_instances)
    9132              :     {
    9133       676490 :       vect_location = instance->location ();
    9134       676490 :       vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
    9135              :                                    instance, &instance->cost_vec,
    9136              :                                    scalar_use_map, svisited, visited);
    9137              :     }
    9138       234133 : }
    9139              : 
    9140              : /* Determine whether we can vectorize the reduction epilogue for INSTANCE.
                      :    The reduction code is taken from the first root stmt of INSTANCE and
                      :    the vector type from its SLP tree root.  On success the epilogue cost
                      :    is recorded in COST_VEC and true is returned; otherwise nothing is
                      :    recorded and false is returned.  */
    9141              : 
    9142              : static bool
    9143        73907 : vectorizable_bb_reduc_epilogue (slp_instance instance,
    9144              :                                 stmt_vector_for_cost *cost_vec)
    9145              : {
    9146        73907 :   gassign *stmt = as_a <gassign *> (instance->root_stmts[0]->stmt);
    9147        73907 :   enum tree_code reduc_code = gimple_assign_rhs_code (stmt);
    9148        73907 :   if (reduc_code == MINUS_EXPR)
    9149            0 :     reduc_code = PLUS_EXPR;
    9150        73907 :   internal_fn reduc_fn;
    9151        73907 :   tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
    9152        73907 :   if (!vectype
    9153        73895 :       || !reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
    9154        73895 :       || reduc_fn == IFN_LAST
    9155        73895 :       || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH)
    9156       108781 :       || !useless_type_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
    9157        34874 :                                      TREE_TYPE (vectype)))
    9158              :     {
    9159        49311 :       if (dump_enabled_p ())
    9160          271 :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
    9161              :                          "not vectorized: basic block reduction epilogue "
    9162              :                          "operation unsupported.\n");
    9163        49311 :       return false;
    9164              :     }
    9165              : 
    9166              :   /* There's no way to cost a horizontal vector reduction via REDUC_FN so
    9167              :      cost log2 vector operations plus shuffles and one extraction.  */
    9168        24596 :   unsigned steps = floor_log2 (vect_nunits_for_cost (vectype));
    9169        24596 :   record_stmt_cost (cost_vec, steps, vector_stmt, instance->root_stmts[0],
    9170              :                     vectype, 0, vect_body);
    9171        24596 :   record_stmt_cost (cost_vec, steps, vec_perm, instance->root_stmts[0],
    9172              :                     vectype, 0, vect_body);
    9173        24596 :   record_stmt_cost (cost_vec, 1, vec_to_scalar, instance->root_stmts[0],
    9174              :                     vectype, 0, vect_body);
    9175              : 
    9176              :   /* Since we replace all stmts of a possibly longer scalar reduction
    9177              :      chain account for the extra scalar stmts for that.  */
    9178        24596 :   if (!instance->remain_defs.is_empty ())
    9179        19766 :     record_stmt_cost (cost_vec, instance->remain_defs.length (), scalar_stmt,
    9180         9883 :                       instance->root_stmts[0], 0, vect_body);
    9181              :   return true;
    9182              : }
    9183              : 
    9184              : /* Prune from ROOTS all stmts that are computed as part of lanes of NODE
    9185              :    and recurse to children.  Nodes that are not internal defs are skipped
                      :    and VISITED keeps a node from being walked more than once.  The stmt
                      :    removed from ROOTS for a lane is vect_orig_stmt of the lane's stmt.  */
    9186              : 
    9187              : static void
    9188       183720 : vect_slp_prune_covered_roots (slp_tree node, hash_set<stmt_vec_info> &roots,
    9189              :                               hash_set<slp_tree> &visited)
    9190              : {
    9191       183720 :   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
    9192       183720 :       || visited.add (node))
    9193        81426 :     return;
    9194              : 
    9195              :   stmt_vec_info stmt;
    9196              :   unsigned i;
    9197       350674 :   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    9198       248380 :     if (stmt)
    9199       252600 :       roots.remove (vect_orig_stmt (stmt));
    9200              : 
    9201              :   slp_tree child;
    9202       226731 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    9203       124437 :     if (child)
    9204       123109 :       vect_slp_prune_covered_roots (child, roots, visited);
    9205              : }
    9206              : 
    9207              : /* Analyze the operations of the SLP instances of VINFO.  A loop_vec_info fails on the first
    9208              :    unsupported instance; otherwise such instances are removed.  Return true if any remain.  */
    9209              : 
    9210              : bool
    9211       605944 : vect_slp_analyze_operations (vec_info *vinfo)
    9212              : {
    9213       605944 :   slp_instance instance;
    9214       605944 :   int i;
    9215              : 
    9216       605944 :   DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
    9217              : 
    9218       605944 :   hash_set<slp_tree> visited;
    9219      1601898 :   for (i = 0; vinfo->slp_instances.iterate (i, &instance); )  /* I advances only when the instance is kept.  */
    9220              :     {
    9221      1225111 :       auto_vec<slp_tree> visited_vec;
    9222      1225111 :       stmt_vector_for_cost cost_vec;
    9223      1225111 :       cost_vec.create (2);
    9224      1225111 :       if (is_a <bb_vec_info> (vinfo))
    9225       775678 :         vect_location = instance->location ();
    9226      1225111 :       if (!vect_slp_analyze_node_operations (vinfo,
    9227              :                                              SLP_INSTANCE_TREE (instance),
    9228              :                                              instance, visited, visited_vec,
    9229              :                                              &cost_vec)
    9230              :           /* CTOR instances require vectorized defs for the SLP tree root.  */
    9231      1003020 :           || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor
    9232         5521 :               && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
    9233              :                   != vect_internal_def
    9234              :                   /* Make sure we vectorized with the expected type.  */
    9235         5521 :                   || !useless_type_conversion_p
    9236         5521 :                         (TREE_TYPE (TREE_TYPE (gimple_assign_rhs1
    9237              :                                               (instance->root_stmts[0]->stmt))),
    9238         5521 :                          TREE_TYPE (SLP_TREE_VECTYPE
    9239              :                                             (SLP_INSTANCE_TREE (instance))))))
    9240              :           /* Check we can vectorize the reduction.  */
    9241      1003005 :           || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc
    9242        73907 :               && !vectorizable_bb_reduc_epilogue (instance, &cost_vec))
    9243              :           /* Check we can vectorize the gcond.  */
    9244      2178805 :           || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_gcond
    9245        57416 :               && !vectorizable_early_exit (as_a <loop_vec_info> (vinfo),
    9246        57416 :                                            SLP_INSTANCE_ROOT_STMTS (instance)[0],
    9247              :                                            NULL,
    9248              :                                            SLP_INSTANCE_TREE (instance),
    9249              :                                            &cost_vec)))
    9250              :         {
    9251       326842 :           cost_vec.release ();
    9252       326842 :           slp_tree node = SLP_INSTANCE_TREE (instance);
    9253       326842 :           stmt_vec_info stmt_info;  /* Only used for the dump messages below.  */
    9254       326842 :           if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
    9255       252124 :             stmt_info = SLP_INSTANCE_ROOT_STMTS (instance)[0];
    9256        74718 :           else if (!SLP_TREE_SCALAR_STMTS (node).is_empty ()
    9257        74718 :                    && SLP_TREE_SCALAR_STMTS (node)[0])
    9258              :             stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
    9259              :           else
    9260            0 :             stmt_info = SLP_TREE_REPRESENTATIVE (node);
    9261       326842 :           if (is_a <loop_vec_info> (vinfo))
    9262              :             {
    9263       229157 :               if (dump_enabled_p ())
    9264         6343 :                 dump_printf_loc (MSG_NOTE, vect_location,
    9265              :                                  "unsupported SLP instance starting from: %G",
    9266              :                                  stmt_info->stmt);
    9267       229157 :               return false;
    9268              :             }
    9269        97685 :           if (dump_enabled_p ())
    9270          325 :             dump_printf_loc (MSG_NOTE, vect_location,
    9271              :                              "removing SLP instance operations starting from: %G",
    9272              :                              stmt_info->stmt);
    9273       435459 :           while (!visited_vec.is_empty ())  /* Reset every node recorded in VISITED_VEC and drop it from VISITED.  */
    9274              :             {
    9275       337774 :               slp_tree node = visited_vec.pop ();
    9276       337774 :               SLP_TREE_TYPE (node) = undef_vec_info_type;
    9277       337774 :               if (node->data)
    9278              :                 {
    9279        12269 :                   delete node->data;
    9280        12269 :                   node->data = nullptr;
    9281              :                 }
    9282       337774 :               visited.remove (node);
    9283              :             }
    9284        97685 :           vect_free_slp_instance (instance);
    9285        97685 :           vinfo->slp_instances.ordered_remove (i);
    9286              :         }
    9287              :       else
    9288              :         {
    9289       898269 :           i++;
    9290       898269 :           if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
    9291              :             {
    9292       220276 :               add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
    9293       220276 :               cost_vec.release ();
    9294              :             }
    9295              :           else
    9296              :             /* For BB vectorization remember the SLP graph entry
    9297              :                cost for later.  */
    9298       677993 :             instance->cost_vec = cost_vec;
    9299              :         }
    9300      1225111 :     }
    9301              : 
    9302              :   /* Now look for SLP instances with a root that are covered by other
    9303              :      instances and remove them.  */
    9304       376787 :   hash_set<stmt_vec_info> roots;
    9305      1585610 :   for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
    9306       864117 :     if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
    9307        32081 :       roots.add (SLP_INSTANCE_ROOT_STMTS (instance)[0]);
    9308       376787 :   if (!roots.is_empty ())
    9309              :     {
    9310        12431 :       visited.empty ();  /* Reuse VISITED for the pruning walk below.  */
    9311        73042 :       for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
    9312        60611 :         vect_slp_prune_covered_roots (SLP_INSTANCE_TREE (instance), roots,
    9313              :                                       visited);
    9314        73042 :       for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
    9315        60611 :         if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()
    9316        32081 :             && !roots.contains (SLP_INSTANCE_ROOT_STMTS (instance)[0]))  /* Root was pruned, so it is covered.  */
    9317              :           {
    9318         1503 :             stmt_vec_info root = SLP_INSTANCE_ROOT_STMTS (instance)[0];
    9319         1503 :             if (dump_enabled_p ())
    9320           20 :               dump_printf_loc (MSG_NOTE, vect_location,
    9321              :                                "removing SLP instance operations starting "
    9322              :                                "from: %G", root->stmt);
    9323         1503 :             vect_free_slp_instance (instance);
    9324         1503 :             vinfo->slp_instances.ordered_remove (i);
    9325              :           }
    9326              :         else
    9327        59108 :           ++i;
    9328              :     }
    9329              : 
    9330              :   /* Compute vectorizable live stmts.  */
    9331       376787 :   if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
    9332       263794 :     vect_bb_slp_mark_live_stmts (bb_vinfo);
    9333              : 
    9334       753574 :   return !vinfo->slp_instances.is_empty ();
    9335       982731 : }
    9336              : 
    9337              : /* Return the ultimate leader of INSTANCE, i.e. the instance INSTANCE_LEADER maps to
    9338              :    itself, and make every leader entry walked on the way point directly at it.  */
    9339              : 
    9340              : static slp_instance
    9341       742064 : get_ultimate_leader (slp_instance instance,
    9342              :                      hash_map<slp_instance, slp_instance> &instance_leader)
    9343              : {
    9344       742064 :   auto_vec<slp_instance *, 8> chain;
    9345       742064 :   slp_instance *tem;
    9346       819669 :   while (*(tem = instance_leader.get (instance)) != instance)  /* Assumes each instance on the chain has an entry.  */
    9347              :     {
    9348        77605 :       chain.safe_push (tem);
    9349        77605 :       instance = *tem;
    9350              :     }
    9351       819669 :   while (!chain.is_empty ())
    9352        77605 :     *chain.pop () = instance;  /* Shortcut the walked entry to the ultimate leader.  */
    9353       742064 :   return instance;
    9354       742064 : }
    9355              : 
    9356              : namespace {
    9357              : /* Subroutine of vect_bb_partition_graph_r.  Map KEY to INSTANCE in KEY_TO_INSTANCE.
    9358              :    If KEY was previously mapped to a different instance, make INSTANCE the leader of
    9359              :    that instance's ultimate leader.  Return true if KEY was already in KEY_TO_INSTANCE.
    9360              : 
    9361              :    INSTANCE_LEADER is as for get_ultimate_leader.  */
    9362              : 
    9363              : template<typename T>
    9364              : bool
    9365      3285934 : vect_map_to_instance (slp_instance instance, T key,
    9366              :                       hash_map<T, slp_instance> &key_to_instance,
    9367              :                       hash_map<slp_instance, slp_instance> &instance_leader)
    9368              : {
    9369              :   bool existed_p;
    9370      3285934 :   slp_instance &key_instance = key_to_instance.get_or_insert (key, &existed_p);
    9371      3285934 :   if (!existed_p)
    9372              :     ;  /* Fresh key: there is no previous mapping to merge with.  */
    9373       174715 :   else if (key_instance != instance)
    9374              :     {
    9375              :       /* If we're running into a previously marked key make us the
    9376              :          leader of the current ultimate leader.  This keeps the
    9377              :          leader chain acyclic and works even when the current instance
    9378              :          connects two previously independent graph parts.  */
    9379        65574 :       slp_instance key_leader
    9380        65574 :         = get_ultimate_leader (key_instance, instance_leader);
    9381        65574 :       if (key_leader != instance)
    9382        19507 :         instance_leader.put (key_leader, instance);
    9383              :     }
    9384      3285934 :   key_instance = instance;  /* KEY now maps to INSTANCE in every case.  */
    9385      3285934 :   return existed_p;
    9386              : }
    9387              : }
    9388              : 
    9389              : /* Worker of vect_bb_partition_graph.  Map NODE and its scalar stmts to INSTANCE and recurse on internal-def children.  */
    9390              : 
    9391              : static void
    9392       909370 : vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
    9393              :                            slp_instance instance, slp_tree node,
    9394              :                            hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
    9395              :                            hash_map<slp_tree, slp_instance> &node_to_instance,
    9396              :                            hash_map<slp_instance, slp_instance> &instance_leader)
    9397              : {
    9398       909370 :   stmt_vec_info stmt_info;
    9399       909370 :   unsigned i;
    9400              : 
    9401      3285934 :   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    9402      2376564 :     if (stmt_info)
    9403      2376564 :       vect_map_to_instance (instance, stmt_info, stmt_to_instance,
    9404              :                             instance_leader);
    9405              : 
    9406       909370 :   if (vect_map_to_instance (instance, node, node_to_instance,
    9407              :                             instance_leader))
    9408       909370 :     return;  /* NODE was mapped before; presumably its children were walked then.  */
    9409              : 
    9410              :   slp_tree child;
    9411      1745066 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    9412       877332 :     if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
    9413       232880 :       vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
    9414              :                                  node_to_instance, instance_leader);
    9415              : }
    9416              : 
    9417              : /* Partition the SLP graph of BB_VINFO into pieces that can be costed independently.  */
    9418              : 
    9419              : static void
    9420       234133 : vect_bb_partition_graph (bb_vec_info bb_vinfo)
    9421              : {
    9422       234133 :   DUMP_VECT_SCOPE ("vect_bb_partition_graph");
    9423              : 
    9424              :   /* First walk the SLP graph assigning each involved scalar stmt a
    9425              :      corresponding SLP graph entry and upon visiting a previously
    9426              :      marked stmt, make the stmt's leader the current SLP graph entry.  */
    9427       234133 :   hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
    9428       234133 :   hash_map<slp_tree, slp_instance> node_to_instance;
    9429       234133 :   hash_map<slp_instance, slp_instance> instance_leader;
    9430       234133 :   slp_instance instance;
    9431       910623 :   for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
    9432              :     {
    9433       676490 :       instance_leader.put (instance, instance);  /* Each instance starts out as its own leader.  */
    9434       676490 :       vect_bb_partition_graph_r (bb_vinfo,
    9435              :                                  instance, SLP_INSTANCE_TREE (instance),
    9436              :                                  stmt_to_instance, node_to_instance,
    9437              :                                  instance_leader);
    9438              :     }
    9439              : 
    9440              :   /* Then collect the entries of each independent subgraph in its leader's subgraph_entries.  */
    9441      1144756 :   for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
    9442              :     {
    9443       676490 :       slp_instance leader = get_ultimate_leader (instance, instance_leader);
    9444       676490 :       leader->subgraph_entries.safe_push (instance);
    9445       676490 :       if (dump_enabled_p ()
    9446       676490 :           && leader != instance)
    9447           69 :         dump_printf_loc (MSG_NOTE, vect_location,
    9448              :                          "instance %p is leader of %p\n",
    9449              :                          (void *) leader, (void *) instance);
    9450              :     }
    9451       234133 : }
    9452              : 
    9453              : /* Walk the SLP graph from NODE.  Add the scalar stmts of internal-def nodes to VSTMTS and, for
    9454              :    other nodes, the stmts VINFO->lookup_def finds for their scalar ops to ESTMTS.  */
    9455              : 
    9456              : static void
    9457      1540098 : vect_slp_gather_vectorized_scalar_stmts (vec_info *vinfo, slp_tree node,
    9458              :                                          hash_set<slp_tree> &visited,
    9459              :                                          hash_set<stmt_vec_info> &vstmts,
    9460              :                                          hash_set<stmt_vec_info> &estmts)
    9461              : {
    9462      1540098 :   int i;
    9463      1540098 :   stmt_vec_info stmt_info;
    9464      1540098 :   slp_tree child;
    9465              : 
    9466      1540098 :   if (visited.add (node))
    9467        41573 :     return;
    9468              : 
    9469      1498525 :   if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
    9470              :     {
    9471      3081903 :       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    9472      2223238 :         if (stmt_info)
    9473      2223238 :           vstmts.add (stmt_info);
    9474              : 
    9475      3119980 :       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    9476       867080 :         if (child)
    9477       867080 :           vect_slp_gather_vectorized_scalar_stmts (vinfo, child, visited,
    9478              :                                                    vstmts, estmts);
    9479              :     }
    9480              :   else
    9481      3588950 :     for (tree def : SLP_TREE_SCALAR_OPS (node))  /* Children of such nodes are not walked.  */
    9482              :       {
    9483      1670418 :         stmt_vec_info def_stmt = vinfo->lookup_def (def);
    9484      1670418 :         if (def_stmt)
    9485       332354 :           estmts.add (def_stmt);
    9486              :       }
    9487              : }
    9488              : 
    9489              : 
    9490              : /* Compute the scalar cost of the SLP node NODE and its children
    9491              :    and record it in COST_VEC.  Do not account defs that are marked in LIFE
    9492              :    and update LIFE according to uses of NODE.  */
    9493              : 
    9494              : static void
    9495       899457 : vect_bb_slp_scalar_cost (vec_info *vinfo,
    9496              :                          slp_tree node, vec<bool, va_heap> *life,
    9497              :                          stmt_vector_for_cost *cost_vec,
    9498              :                          hash_set<stmt_vec_info> &vectorized_scalar_stmts,
    9499              :                          hash_set<stmt_vec_info> &scalar_stmts_in_externs,
    9500              :                          hash_set<slp_tree> &visited)
    9501              : {
    9502       899457 :   unsigned i;
    9503       899457 :   stmt_vec_info stmt_info;
    9504       899457 :   slp_tree child;
    9505              : 
    9506       899457 :   if (visited.add (node))
    9507        40775 :     return;
    9508              : 
    9509      3081954 :   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    9510              :     {
    9511      2223272 :       ssa_op_iter op_iter;
    9512      2223272 :       def_operand_p def_p;
    9513              : 
    9514      2255446 :       if (!stmt_info
    9515      2223272 :           || (*life)[i]
    9516              :           /* Defs also used in external nodes are not in the
    9517              :              vectorized_scalar_stmts set as they need to be preserved.
    9518              :              Honor that.  */
    9519      4417371 :           || scalar_stmts_in_externs.contains (stmt_info))
    9520       106016 :         continue;
    9521              : 
    9522      2191098 :       stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
    9523      2191098 :       gimple *orig_stmt = orig_stmt_info->stmt;
    9524              : 
    9525              :       /* If there is a non-vectorized use of the defs then the scalar
    9526              :          stmt is kept live in which case we do not account it or any
    9527              :          required defs in the SLP children in the scalar cost.  This
    9528              :          way we make the vectorization more costly when compared to
    9529              :          the scalar cost.  */
    9530      2191098 :       if (!STMT_VINFO_LIVE_P (stmt_info))
    9531              :         {
    9532      2121600 :           auto_vec<gimple *, 8> worklist;
    9533      2121600 :           hash_set<gimple *> *worklist_visited = NULL;  /* Allocated lazily, freed at next_lane.  */
    9534      2121600 :           worklist.quick_push (orig_stmt);
    9535      2127088 :           do
    9536              :             {
    9537      2127088 :               gimple *work_stmt = worklist.pop ();
    9538      4657081 :               FOR_EACH_PHI_OR_STMT_DEF (def_p, work_stmt, op_iter, SSA_OP_DEF)
    9539              :                 {
    9540       423722 :                   imm_use_iterator use_iter;
    9541       423722 :                   gimple *use_stmt;
    9542      1054986 :                   FOR_EACH_IMM_USE_STMT (use_stmt, use_iter,
    9543              :                                          DEF_FROM_PTR (def_p))
    9544       652081 :                     if (!is_gimple_debug (use_stmt))
    9545              :                       {
    9546       499020 :                         stmt_vec_info use_stmt_info
    9547       499020 :                           = vinfo->lookup_stmt (use_stmt);
    9548       499020 :                         if (!use_stmt_info
    9549       499020 :                             || !vectorized_scalar_stmts.contains (use_stmt_info))
    9550              :                           {
    9551        26411 :                             if (use_stmt_info
    9552        23331 :                                 && STMT_VINFO_IN_PATTERN_P (use_stmt_info))
    9553              :                               {
    9554              :                                 /* For stmts participating in patterns we have
    9555              :                                    to check its uses recursively.  */
    9556         5594 :                                 if (!worklist_visited)
    9557         4243 :                                   worklist_visited = new hash_set<gimple *> ();
    9558         5594 :                                 if (!worklist_visited->add (use_stmt))
    9559         5594 :                                   worklist.safe_push (use_stmt);
    9560         5594 :                                 continue;
    9561              :                               }
    9562        20817 :                             (*life)[i] = true;
    9563        20817 :                             goto next_lane;
    9564              :                           }
    9565       423722 :                       }
    9566              :                 }
    9567              :             }
    9568      4212542 :           while (!worklist.is_empty ());
    9569      2100783 : next_lane:
    9570      2121600 :           if (worklist_visited)
    9571         4243 :             delete worklist_visited;
    9572      2121600 :           if ((*life)[i])
    9573        20817 :             continue;
    9574      2121600 :         }
    9575              : 
    9576              :       /* Count scalar stmts only once.  */
    9577      2170281 :       if (gimple_visited_p (orig_stmt))
    9578        24979 :         continue;
    9579      2145302 :       gimple_set_visited (orig_stmt, true);
    9580              : 
    9581      2145302 :       vect_cost_for_stmt kind;
    9582      2145302 :       if (STMT_VINFO_DATA_REF (orig_stmt_info))
    9583              :         {
    9584      1945652 :           data_reference_p dr = STMT_VINFO_DATA_REF (orig_stmt_info);
    9585      1945652 :           tree base = get_base_address (DR_REF (dr));
    9586              :           /* When the scalar access is to a non-global not address-taken
    9587              :              decl that is not BLKmode assume we can access it with a single
    9588              :              non-load/store instruction.  */
    9589      1945652 :           if (DECL_P (base)
    9590      1498243 :               && !is_global_var (base)
    9591      1421967 :               && !TREE_ADDRESSABLE (base)
    9592      2495046 :               && DECL_MODE (base) != BLKmode)
    9593              :             kind = scalar_stmt;
    9594      1802085 :           else if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
    9595              :             kind = scalar_load;
    9596              :           else
    9597      1576212 :             kind = scalar_store;
    9598              :         }
    9599       199650 :       else if (vect_nop_conversion_p (orig_stmt_info))
    9600        20035 :         continue;
    9601              :       /* For single-argument PHIs assume coalescing which means zero cost
    9602              :          for the scalar and the vector PHIs.  This avoids artificially
    9603              :          favoring the vector path (but may pessimize it in some cases).  */
    9604       179615 :       else if (is_a <gphi *> (orig_stmt_info->stmt)
    9605       179615 :                && gimple_phi_num_args
    9606        83457 :                     (as_a <gphi *> (orig_stmt_info->stmt)) == 1)
    9607         8011 :         continue;
    9608              :       else
    9609              :         kind = scalar_stmt;
    9610      2117256 :       record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
    9611              :                         SLP_TREE_VECTYPE (node), 0, vect_body);
    9612              :     }
    9613              : 
    9614      1717364 :   auto_vec<bool, 20> subtree_life;  /* Scratch copy of LIFE, reused for each child.  */
    9615      2480170 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    9616              :     {
    9617       867104 :       if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
    9618              :         {
    9619              :           /* Do not directly pass LIFE to the recursive call, copy it to
    9620              :              confine changes in the callee to the current child/subtree.  */
    9621       226439 :           if (SLP_TREE_PERMUTE_P (node))
    9622              :             {
    9623         3496 :               subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
    9624        12240 :               for (unsigned j = 0;
    9625        12240 :                    j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
    9626              :                 {
    9627         8744 :                   auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
    9628         8744 :                   if (perm.first == i)
    9629         4600 :                     subtree_life[perm.second] = (*life)[j];  /* Copy LIFE[J] to lane perm.second of child perm.first.  */
    9630              :                 }
    9631              :             }
    9632              :           else
    9633              :             {
    9634       222943 :               gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
    9635       222943 :               subtree_life.safe_splice (*life);
    9636              :             }
    9637       226439 :           vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
    9638              :                                    vectorized_scalar_stmts,
    9639              :                                    scalar_stmts_in_externs, visited);
    9640       226439 :           subtree_life.truncate (0);
    9641              :         }
    9642              :     }
    9643              : }
    9644              : 
    9645              : /* Comparator for the loop-index sorted cost vectors: three-way compare on the pairs' first member.  */
    9646              : 
    9647              : static int
    9648     17645886 : li_cost_vec_cmp (const void *a_, const void *b_)
    9649              : {
    9650     17645886 :   auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
    9651     17645886 :   auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
    9652     17645886 :   if (a->first < b->first)
    9653              :     return -1;
    9654     16884865 :   else if (a->first == b->first)
    9655     16212843 :     return 0;  /* The second member never takes part in the ordering.  */
    9656              :   return 1;
    9657              : }
    9658              : 
    9659              : /* Check if vectorization of the basic block is profitable for the
    9660              :    subgraph denoted by SLP_INSTANCES.  */
    9661              : 
    9662              : static bool
    9663       653648 : vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
    9664              :                                     vec<slp_instance> slp_instances,
    9665              :                                     loop_p orig_loop)
    9666              : {
    9667       653648 :   slp_instance instance;
    9668       653648 :   int i;
    9669       653648 :   unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
    9670       653648 :   unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
    9671              : 
    9672       653648 :   if (dump_enabled_p ())
    9673              :     {
    9674           96 :       dump_printf_loc (MSG_NOTE, vect_location, "Costing subgraph: \n");
    9675           96 :       hash_set<slp_tree> visited;
    9676          387 :       FOR_EACH_VEC_ELT (slp_instances, i, instance)
    9677           99 :         vect_print_slp_graph (MSG_NOTE, vect_location,
    9678              :                               SLP_INSTANCE_TREE (instance), visited);
    9679           96 :     }
    9680              : 
    9681              :   /* Compute the set of scalar stmts we know will go away 'locally' when
    9682              :      vectorizing.  This used to be tracked with just PURE_SLP_STMT but that's
    9683              :      not accurate for nodes promoted extern late or for scalar stmts that
    9684              :      are used both in extern defs and in vectorized defs.  */
    9685       653648 :   hash_set<stmt_vec_info> vectorized_scalar_stmts;
    9686       653648 :   hash_set<stmt_vec_info> scalar_stmts_in_externs;
    9687       653648 :   hash_set<slp_tree> visited;
    9688      1326666 :   FOR_EACH_VEC_ELT (slp_instances, i, instance)
    9689              :     {
    9690       673018 :       vect_slp_gather_vectorized_scalar_stmts (bb_vinfo,
    9691              :                                                SLP_INSTANCE_TREE (instance),
    9692              :                                                visited,
    9693              :                                                vectorized_scalar_stmts,
    9694              :                                                scalar_stmts_in_externs);
    9695       781332 :       for (stmt_vec_info rstmt : SLP_INSTANCE_ROOT_STMTS (instance))
    9696        51642 :         vectorized_scalar_stmts.add (rstmt);
    9697              :     }
    9698              :   /* Scalar stmts used as defs in external nodes need to be preserved, so
    9699              :      remove them from vectorized_scalar_stmts.  */
    9700       950439 :   for (stmt_vec_info stmt : scalar_stmts_in_externs)
    9701       296791 :     vectorized_scalar_stmts.remove (stmt);
    9702              : 
    9703              :   /* Calculate scalar cost and sum the cost for the vector stmts
    9704              :      previously collected.  */
    9705       653648 :   stmt_vector_for_cost scalar_costs = vNULL;
    9706       653648 :   stmt_vector_for_cost vector_costs = vNULL;
    9707       653648 :   visited.empty ();
    9708      1326666 :   FOR_EACH_VEC_ELT (slp_instances, i, instance)
    9709              :     {
    9710       673018 :       auto_vec<bool, 20> life;
    9711       673018 :       life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
    9712              :                               true);
    9713       673018 :       if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
    9714        56672 :         record_stmt_cost (&scalar_costs,
    9715        28336 :                           SLP_INSTANCE_ROOT_STMTS (instance).length (),
    9716              :                           scalar_stmt,
    9717        28336 :                           SLP_INSTANCE_ROOT_STMTS (instance)[0], 0, vect_body);
    9718       673018 :       vect_bb_slp_scalar_cost (bb_vinfo,
    9719              :                                SLP_INSTANCE_TREE (instance),
    9720              :                                &life, &scalar_costs, vectorized_scalar_stmts,
    9721              :                                scalar_stmts_in_externs, visited);
    9722       673018 :       vector_costs.safe_splice (instance->cost_vec);
    9723       673018 :       instance->cost_vec.release ();
    9724       673018 :     }
    9725              : 
    9726       653648 :   if (dump_enabled_p ())
    9727           96 :     dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
    9728              : 
    9729              :   /* When costing non-loop vectorization we need to consider each covered
    9730              :      loop independently and make sure vectorization is profitable.  For
    9731              :      now we assume a loop may be not entered or executed an arbitrary
    9732              :      number of iterations (???  static information can provide more
    9733              :      precise info here) which means we can simply cost each containing
    9734              :      loop's stmts separately.  */
    9735              : 
    9736              :   /* First produce cost vectors sorted by loop index.  */
    9737       653648 :   auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
    9738       653648 :     li_scalar_costs (scalar_costs.length ());
    9739       653648 :   auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
    9740       653648 :     li_vector_costs (vector_costs.length ());
    9741       653648 :   stmt_info_for_cost *cost;
    9742      2799240 :   FOR_EACH_VEC_ELT (scalar_costs, i, cost)
    9743              :     {
    9744      2145592 :       unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
    9745      2145592 :       li_scalar_costs.quick_push (std::make_pair (l, cost));
    9746              :     }
    9747              :   /* Use the loop of the first scalar cost as fallback in case the first
    9748              :      vector_costs entry does not have a stmt_info associated with it.  */
    9749       653648 :   unsigned l = li_scalar_costs[0].first;
    9750      2412521 :   FOR_EACH_VEC_ELT (vector_costs, i, cost)
    9751              :     {
    9752              :       /* We inherit from the previous COST, invariants, externals and
    9753              :          extracts immediately follow the cost for the related stmt.  */
    9754      1758873 :       if (cost->stmt_info)
    9755      1044776 :         l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
    9756      1758873 :       li_vector_costs.quick_push (std::make_pair (l, cost));
    9757              :     }
    9758       653648 :   li_scalar_costs.qsort (li_cost_vec_cmp);
    9759       653648 :   li_vector_costs.qsort (li_cost_vec_cmp);
    9760              : 
    9761              :   /* Now cost the portions individually.  */
    9762              :   unsigned vi = 0;
    9763              :   unsigned si = 0;
    9764      1135742 :   bool profitable = true;
    9765      1135742 :   while (si < li_scalar_costs.length ()
    9766      1794210 :          && vi < li_vector_costs.length ())
    9767              :     {
    9768       658468 :       unsigned sl = li_scalar_costs[si].first;
    9769       658468 :       unsigned vl = li_vector_costs[vi].first;
    9770       658468 :       if (sl != vl)
    9771              :         {
    9772         1253 :           if (dump_enabled_p ())
    9773            0 :             dump_printf_loc (MSG_NOTE, vect_location,
    9774              :                              "Scalar %d and vector %d loop part do not "
    9775              :                              "match up, skipping scalar part\n", sl, vl);
    9776              :           /* Skip the scalar part, assuming zero cost on the vector side.  */
    9777         2744 :           do
    9778              :             {
    9779         2744 :               si++;
    9780              :             }
    9781         2744 :           while (si < li_scalar_costs.length ()
    9782         4876 :                  && li_scalar_costs[si].first == sl);
    9783         1253 :           continue;
    9784              :         }
    9785              : 
    9786       657215 :       class vector_costs *scalar_target_cost_data = init_cost (bb_vinfo, true);
    9787      2125731 :       do
    9788              :         {
    9789      2125731 :           add_stmt_cost (scalar_target_cost_data, li_scalar_costs[si].second);
    9790      2125731 :           si++;
    9791              :         }
    9792      2125731 :       while (si < li_scalar_costs.length ()
    9793      4259004 :              && li_scalar_costs[si].first == sl);
    9794       657215 :       scalar_target_cost_data->finish_cost (nullptr);
    9795       657215 :       scalar_cost = (scalar_target_cost_data->body_cost ()
    9796       657215 :                      * param_vect_scalar_cost_multiplier) / 100;
    9797              : 
    9798              :       /* Complete the target-specific vector cost calculation.  */
    9799       657215 :       class vector_costs *vect_target_cost_data = init_cost (bb_vinfo, false);
    9800      1724812 :       do
    9801              :         {
    9802      1724812 :           add_stmt_cost (vect_target_cost_data, li_vector_costs[vi].second);
    9803      1724812 :           vi++;
    9804              :         }
    9805      1724812 :       while (vi < li_vector_costs.length ()
    9806      3458299 :              && li_vector_costs[vi].first == vl);
    9807       657215 :       vect_target_cost_data->finish_cost (scalar_target_cost_data);
    9808       657215 :       vec_prologue_cost = vect_target_cost_data->prologue_cost ();
    9809       657215 :       vec_inside_cost = vect_target_cost_data->body_cost ();
    9810       657215 :       vec_epilogue_cost = vect_target_cost_data->epilogue_cost ();
    9811       657215 :       delete scalar_target_cost_data;
    9812       657215 :       delete vect_target_cost_data;
    9813              : 
    9814       657215 :       vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
    9815              : 
    9816       657215 :       if (dump_enabled_p ())
    9817              :         {
    9818           96 :           dump_printf_loc (MSG_NOTE, vect_location,
    9819              :                            "Cost model analysis for part in loop %d:\n", sl);
    9820           96 :           dump_printf (MSG_NOTE, "  Vector cost: %d\n",
    9821              :                        vec_inside_cost + vec_outside_cost);
    9822           96 :           dump_printf (MSG_NOTE, "  Scalar cost: %d\n", scalar_cost);
    9823              :         }
    9824              : 
    9825              :       /* Vectorization is not profitable if its cost is more than the cost of
    9826              :          the scalar version.  Note that we err on the vector side for equal
    9827              :          cost because the cost estimate is otherwise quite pessimistic
    9828              :          (constant uses are free on the scalar side but cost a load on the
    9829              :          vector side for example).  */
    9830       657215 :       if (vec_outside_cost + vec_inside_cost > scalar_cost)
    9831              :         {
    9832              :           profitable = false;
    9833              :           break;
    9834              :         }
    9835              :     }
    9836      1130909 :   if (profitable && vi < li_vector_costs.length ())
    9837              :     {
    9838         1177 :       if (dump_enabled_p ())
    9839           12 :         dump_printf_loc (MSG_NOTE, vect_location,
    9840              :                          "Excess vector cost for part in loop %d:\n",
    9841            6 :                          li_vector_costs[vi].first);
    9842              :       profitable = false;
    9843              :     }
    9844              : 
    9845              :   /* Unset visited flag.  This is delayed when the subgraph is profitable
    9846              :      and we process the loop for remaining unvectorized if-converted code.  */
    9847       653648 :   if (!orig_loop || !profitable)
    9848      2797953 :     FOR_EACH_VEC_ELT (scalar_costs, i, cost)
    9849      2144394 :       gimple_set_visited  (cost->stmt_info->stmt, false);
    9850              : 
    9851       653648 :   scalar_costs.release ();
    9852       653648 :   vector_costs.release ();
    9853              : 
    9854       653648 :   return profitable;
    9855       653648 : }
    9856              : 
    9857              : /* qsort comparator for lane defs.  */
    9858              : 
    9859              : static int
    9860           40 : vld_cmp (const void *a_, const void *b_)
    9861              : {
    9862           40 :   auto *a = (const std::pair<unsigned, tree> *)a_;
    9863           40 :   auto *b = (const std::pair<unsigned, tree> *)b_;
    9864           40 :   return a->first - b->first;
    9865              : }
    9866              : 
    9867              : /* Return true if USE_STMT is a vector lane insert into VEC and set
    9868              :    *THIS_LANE to the lane number that is set.  */
    9869              : 
    9870              : static bool
    9871          248 : vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
    9872              : {
    9873          248 :   gassign *use_ass = dyn_cast <gassign *> (use_stmt);
    9874           91 :   if (!use_ass
    9875           91 :       || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
    9876           22 :       || (vec
    9877           22 :           ? gimple_assign_rhs1 (use_ass) != vec
    9878           24 :           : ((vec = gimple_assign_rhs1 (use_ass)), false))
    9879           46 :       || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
    9880           46 :                                      TREE_TYPE (gimple_assign_rhs2 (use_ass)))
    9881           46 :       || !constant_multiple_p
    9882           46 :             (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
    9883           92 :              tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
    9884              :              this_lane))
    9885          202 :     return false;
    9886              :   return true;
    9887              : }
    9888              : 
    9889              : /* Find any vectorizable constructors, lane-insert sequences and association
    9890              :    chains and add them as SLP roots to the roots array of BB_VINFO.  */
    9891              : 
    9892              : static void
    9893      2183824 : vect_slp_check_for_roots (bb_vec_info bb_vinfo)
    9894              : {
    9895     17723259 :   for (unsigned i = 0; i < bb_vinfo->nbbs; ++i)
    9896     31078870 :     for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
    9897    134980207 :          !gsi_end_p (gsi); gsi_next (&gsi))
    9898              :     {
    9899    119440772 :       gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
    9900              :       /* This can be used to start SLP discovery for BB early breaks
    9901              :          when we get that far.  */
    9902    119440772 :       if (!assign)
    9903    178835507 :         continue;
    9904              : 
    9905     30937072 :       tree rhs = gimple_assign_rhs1 (assign);
    9906     30937072 :       enum tree_code code = gimple_assign_rhs_code (assign);
    9907     30937072 :       use_operand_p use_p;
    9908     30937072 :       gimple *use_stmt;
    9909     30937072 :       if (code == CONSTRUCTOR)
    9910              :         {
    9911      1563735 :           if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
    9912        63705 :               || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
    9913        92969 :                            CONSTRUCTOR_NELTS (rhs))
    9914        42794 :               || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
    9915      1606525 :               || uniform_vector_p (rhs))
    9916      1550838 :             continue;
    9917              : 
    9918              :           unsigned j;
    9919              :           tree val;
    9920        64357 :           FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
    9921        51460 :             if (TREE_CODE (val) != SSA_NAME
    9922        51460 :                 || !bb_vinfo->lookup_def (val))
    9923              :               break;
    9924        31628 :           if (j != CONSTRUCTOR_NELTS (rhs))
    9925         2917 :             continue;
    9926              : 
    9927        12897 :           vec<stmt_vec_info> roots = vNULL;
    9928        12897 :           roots.safe_push (bb_vinfo->lookup_stmt (assign));
    9929        12897 :           vec<stmt_vec_info> stmts;
    9930        12897 :           stmts.create (CONSTRUCTOR_NELTS (rhs));
    9931        72760 :           FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
    9932        46966 :             stmts.quick_push
    9933        46966 :               (vect_stmt_to_vectorize (bb_vinfo->lookup_def (val)));
    9934        12897 :           bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
    9935        12897 :                                                stmts, roots));
    9936              :         }
    9937     29373337 :       else if (code == BIT_INSERT_EXPR
    9938          926 :                && VECTOR_TYPE_P (TREE_TYPE (rhs))
    9939          608 :                && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
    9940          608 :                && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
    9941          605 :                && integer_zerop (gimple_assign_rhs3 (assign))
    9942          341 :                && useless_type_conversion_p
    9943          341 :                     (TREE_TYPE (TREE_TYPE (rhs)),
    9944          341 :                      TREE_TYPE (gimple_assign_rhs2 (assign)))
    9945     29373959 :                && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
    9946              :         {
    9947              :           /* We start to match on insert to lane zero but since the
    9948              :              inserts need not be ordered we'd have to search both
    9949              :              the def and the use chains.  */
    9950          215 :           tree vectype = TREE_TYPE (rhs);
    9951          215 :           unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
    9952          215 :           auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
    9953          215 :           auto_sbitmap lanes (nlanes);
    9954          215 :           bitmap_clear (lanes);
    9955          215 :           bitmap_set_bit (lanes, 0);
    9956          215 :           tree def = gimple_assign_lhs (assign);
    9957          215 :           lane_defs.quick_push
    9958          215 :                       (std::make_pair (0, gimple_assign_rhs2 (assign)));
    9959          215 :           unsigned lanes_found = 1;
    9960              :           /* Start with the use chains, the last stmt will be the root.  */
    9961          215 :           stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
    9962          215 :           vec<stmt_vec_info> roots = vNULL;
    9963          215 :           roots.safe_push (last);
    9964          217 :           do
    9965              :             {
    9966          217 :               use_operand_p use_p;
    9967          217 :               gimple *use_stmt;
    9968          217 :               if (!single_imm_use (def, &use_p, &use_stmt))
    9969              :                 break;
    9970          211 :               unsigned this_lane;
    9971          211 :               if (!bb_vinfo->lookup_stmt (use_stmt)
    9972          211 :                   || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
    9973          233 :                   || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
    9974              :                 break;
    9975           22 :               if (bitmap_bit_p (lanes, this_lane))
    9976              :                 break;
    9977            2 :               lanes_found++;
    9978            2 :               bitmap_set_bit (lanes, this_lane);
    9979            2 :               gassign *use_ass = as_a <gassign *> (use_stmt);
    9980            2 :               lane_defs.quick_push (std::make_pair
    9981            2 :                                      (this_lane, gimple_assign_rhs2 (use_ass)));
    9982            2 :               last = bb_vinfo->lookup_stmt (use_ass);
    9983            2 :               roots.safe_push (last);
    9984            2 :               def = gimple_assign_lhs (use_ass);
    9985              :             }
    9986            2 :           while (lanes_found < nlanes);
    9987          215 :           if (roots.length () > 1)
    9988            2 :             std::swap(roots[0], roots[roots.length () - 1]);
    9989          215 :           if (lanes_found < nlanes)
    9990              :             {
    9991              :               /* Now search the def chain.  */
    9992          215 :               def = gimple_assign_rhs1 (assign);
    9993          217 :               do
    9994              :                 {
    9995          217 :                   if (TREE_CODE (def) != SSA_NAME
    9996          217 :                       || !has_single_use (def))
    9997              :                     break;
    9998           56 :                   gimple *def_stmt = SSA_NAME_DEF_STMT (def);
    9999           56 :                   unsigned this_lane;
   10000           56 :                   if (!bb_vinfo->lookup_stmt (def_stmt)
   10001           37 :                       || !vect_slp_is_lane_insert (def_stmt,
   10002              :                                                    NULL_TREE, &this_lane)
   10003           80 :                       || !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
   10004              :                     break;
   10005           24 :                   if (bitmap_bit_p (lanes, this_lane))
   10006              :                     break;
   10007            4 :                   lanes_found++;
   10008            4 :                   bitmap_set_bit (lanes, this_lane);
   10009            8 :                   lane_defs.quick_push (std::make_pair
   10010            4 :                                           (this_lane,
   10011            4 :                                            gimple_assign_rhs2 (def_stmt)));
   10012            4 :                   roots.safe_push (bb_vinfo->lookup_stmt (def_stmt));
   10013            4 :                   def = gimple_assign_rhs1 (def_stmt);
   10014              :                 }
   10015            4 :               while (lanes_found < nlanes);
   10016              :             }
   10017          215 :           if (lanes_found == nlanes)
   10018              :             {
   10019              :               /* Sort lane_defs by the lane index and register the root.  */
   10020            2 :               lane_defs.qsort (vld_cmp);
   10021            2 :               vec<stmt_vec_info> stmts;
   10022            2 :               stmts.create (nlanes);
   10023           10 :               for (unsigned i = 0; i < nlanes; ++i)
   10024            8 :                 stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
   10025            2 :               bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
   10026            2 :                                                    stmts, roots));
   10027              :             }
   10028              :           else
   10029          213 :             roots.release ();
   10030          215 :         }
   10031     29373122 :       else if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
   10032     28387558 :                && (associative_tree_code (code) || code == MINUS_EXPR)
   10033              :                /* ???  This pessimizes a two-element reduction.  PR54400.
   10034              :                   ???  In-order reduction could be handled if we only
   10035              :                   traverse one operand chain in vect_slp_linearize_chain.  */
   10036     33320528 :                && !needs_fold_left_reduction_p (TREE_TYPE (rhs), code)
   10037              :                /* Ops with constants at the tail can be stripped here.  */
   10038      5811026 :                && TREE_CODE (rhs) == SSA_NAME
   10039      5748632 :                && TREE_CODE (gimple_assign_rhs2 (assign)) == SSA_NAME
   10040              :                /* Should be the chain end.  */
   10041     31643800 :                && (!single_imm_use (gimple_assign_lhs (assign),
   10042              :                                     &use_p, &use_stmt)
   10043      1752004 :                    || !is_gimple_assign (use_stmt)
   10044      1191498 :                    || (gimple_assign_rhs_code (use_stmt) != code
   10045       883057 :                        && ((code != PLUS_EXPR && code != MINUS_EXPR)
   10046       500242 :                            || (gimple_assign_rhs_code (use_stmt)
   10047       500242 :                                != (code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR))))))
   10048              :         {
   10049              :           /* We start the match at the end of a possible association
   10050              :              chain.  */
   10051      1863620 :           auto_vec<chain_op_t> chain;
   10052      1863620 :           auto_vec<std::pair<tree_code, gimple *> > worklist;
   10053      1863620 :           auto_vec<gimple *> chain_stmts;
   10054      1863620 :           gimple *code_stmt = NULL, *alt_code_stmt = NULL;
   10055      1863620 :           if (code == MINUS_EXPR)
   10056       304307 :             code = PLUS_EXPR;
   10057      1863620 :           internal_fn reduc_fn;
   10058      2140889 :           if (!reduction_fn_for_scalar_code (code, &reduc_fn)
   10059      1863620 :               || reduc_fn == IFN_LAST)
   10060       277269 :             continue;
   10061      1586351 :           vect_slp_linearize_chain (bb_vinfo, worklist, chain, code, assign,
   10062              :                                     /* ??? */
   10063              :                                     code_stmt, alt_code_stmt, &chain_stmts);
   10064      3172702 :           if (chain.length () > 1)
   10065              :             {
   10066              :               /* Sort the chain according to def_type and operation.  */
   10067      1586351 :               chain.sort (dt_sort_cmp, bb_vinfo);
   10068              :               /* ???  Now we'd want to strip externals and constants
   10069              :                  but record those to be handled in the epilogue.  */
   10070              :               /* ???  For now do not allow mixing ops or externs/constants.  */
   10071      1586351 :               bool invalid = false;
   10072      1586351 :               unsigned remain_cnt = 0;
   10073      1586351 :               unsigned last_idx = 0;
   10074      4786594 :               for (unsigned i = 0; i < chain.length (); ++i)
   10075              :                 {
   10076      3527511 :                   if (chain[i].code != code)
   10077              :                     {
   10078              :                       invalid = true;
   10079              :                       break;
   10080              :                     }
   10081      3200243 :                   if (chain[i].dt != vect_internal_def
   10082              :                       /* Avoid stmts where the def is not the LHS, like
   10083              :                          ASMs.  */
   10084      6172153 :                       || (gimple_get_lhs (bb_vinfo->lookup_def
   10085      2971910 :                                                       (chain[i].op)->stmt)
   10086      2971910 :                           != chain[i].op))
   10087       231277 :                     remain_cnt++;
   10088              :                   else
   10089              :                     last_idx = i;
   10090              :                 }
   10091              :               /* Make sure to have an even number of lanes as we later do
   10092              :                  all-or-nothing discovery, not trying to split further.  */
   10093      1586351 :               if ((chain.length () - remain_cnt) & 1)
   10094       184779 :                 remain_cnt++;
   10095      1586351 :               if (!invalid && chain.length () - remain_cnt > 1)
   10096              :                 {
   10097      1191565 :                   vec<stmt_vec_info> stmts;
   10098      1191565 :                   vec<tree> remain = vNULL;
   10099      1191565 :                   stmts.create (chain.length ());
   10100      1191565 :                   if (remain_cnt > 0)
   10101       110112 :                     remain.create (remain_cnt);
   10102      3828602 :                   for (unsigned i = 0; i < chain.length (); ++i)
   10103              :                     {
   10104      2637037 :                       stmt_vec_info stmt_info;
   10105      2637037 :                       if (chain[i].dt == vect_internal_def
   10106      2600203 :                           && ((stmt_info = bb_vinfo->lookup_def (chain[i].op)),
   10107      2600203 :                               gimple_get_lhs (stmt_info->stmt) == chain[i].op)
   10108      5237156 :                           && (i != last_idx
   10109      1191565 :                               || (stmts.length () & 1)))
   10110      2515588 :                         stmts.quick_push (stmt_info);
   10111              :                       else
   10112       121449 :                         remain.quick_push (chain[i].op);
   10113              :                     }
   10114      1191565 :                   vec<stmt_vec_info> roots;
   10115      1191565 :                   roots.create (chain_stmts.length ());
   10116      2637037 :                   for (unsigned i = 0; i < chain_stmts.length (); ++i)
   10117      1445472 :                     roots.quick_push (bb_vinfo->lookup_stmt (chain_stmts[i]));
   10118      1191565 :                   bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_bb_reduc,
   10119      1191565 :                                                        stmts, roots, remain));
   10120              :                 }
   10121              :             }
   10122      1863620 :         }
   10123              :     }
   10124      2183824 : }
   10125              : 
   10126              : /* Walk the grouped store chains and replace entries with their
   10127              :    pattern variant if any.  */
   10128              : 
   10129              : static void
   10130       609089 : vect_fixup_store_groups_with_patterns (vec_info *vinfo)
   10131              : {
   10132       609089 :   stmt_vec_info first_element;
   10133       609089 :   unsigned i;
   10134              : 
   10135      1493147 :   FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
   10136              :     {
   10137              :       /* We also have CTORs in this array.  */
   10138       884058 :       if (!STMT_VINFO_GROUPED_ACCESS (first_element))
   10139            0 :         continue;
   10140       884058 :       if (STMT_VINFO_IN_PATTERN_P (first_element))
   10141              :         {
   10142          254 :           stmt_vec_info orig = first_element;
   10143          254 :           first_element = STMT_VINFO_RELATED_STMT (first_element);
   10144          254 :           DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
   10145          254 :           DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
   10146          254 :           DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
   10147          254 :           DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
   10148          254 :           vinfo->grouped_stores[i] = first_element;
   10149              :         }
   10150       884058 :       stmt_vec_info prev = first_element;
   10151      2482831 :       while (DR_GROUP_NEXT_ELEMENT (prev))
   10152              :         {
   10153      1598773 :           stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
   10154      1598773 :           if (STMT_VINFO_IN_PATTERN_P (elt))
   10155              :             {
   10156          893 :               stmt_vec_info orig = elt;
   10157          893 :               elt = STMT_VINFO_RELATED_STMT (elt);
   10158          893 :               DR_GROUP_NEXT_ELEMENT (prev) = elt;
   10159          893 :               DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
   10160          893 :               DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
   10161              :             }
   10162      1598773 :           DR_GROUP_FIRST_ELEMENT (elt) = first_element;
   10163      1598773 :           prev = elt;
   10164              :         }
   10165              :     }
   10166       609089 : }
   10167              : 
   10168              : /* Check if the region described by BB_VINFO can be vectorized, returning
   10169              :    true if so.  When returning false, set FATAL to true if the same failure
   10170              :    would prevent vectorization at other vector sizes, false if it is still
   10171              :    worth trying other sizes.  N_STMTS is the number of statements in the
   10172              :    region.  */
   10173              : 
   10174              : static bool
   10175      2183824 : vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
   10176              :                        vec<int> *dataref_groups)
   10177              : {
   10178      2183824 :   DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
   10179              : 
   10180      2183824 :   slp_instance instance;
   10181      2183824 :   int i;
   10182              : 
   10183              :   /* The first group of checks is independent of the vector size.  */
   10184      2183824 :   fatal = true;
   10185              : 
   10186              :   /* Analyze the data references.  */
   10187              : 
   10188      2183824 :   if (!vect_analyze_data_refs (bb_vinfo, NULL))
   10189              :     {
   10190            0 :       if (dump_enabled_p ())
   10191            0 :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   10192              :                          "not vectorized: unhandled data-ref in basic "
   10193              :                          "block.\n");
   10194            0 :       return false;
   10195              :     }
   10196              : 
   10197      2183824 :   if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
   10198              :     {
   10199            0 :      if (dump_enabled_p ())
   10200            0 :        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   10201              :                         "not vectorized: unhandled data access in "
   10202              :                         "basic block.\n");
   10203            0 :       return false;
   10204              :     }
   10205              : 
   10206      2183824 :   vect_slp_check_for_roots (bb_vinfo);
   10207              : 
   10208              :   /* If there are no grouped stores and no constructors in the region
   10209              :      there is no need to continue with pattern recog as vect_analyze_slp
   10210              :      will fail anyway.  */
   10211      2183824 :   if (bb_vinfo->grouped_stores.is_empty ()
   10212      1843395 :       && bb_vinfo->roots.is_empty ())
   10213              :     {
   10214      1574735 :       if (dump_enabled_p ())
   10215         1022 :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   10216              :                          "not vectorized: no grouped stores in "
   10217              :                          "basic block.\n");
   10218      1574735 :       return false;
   10219              :     }
   10220              : 
   10221              :   /* The rest of the analysis below depends on the vector size in some way.  */
   10222       609089 :   fatal = false;
   10223              : 
   10224       609089 :   vect_pattern_recog (bb_vinfo);
   10225              : 
   10226              :   /* Update store groups from pattern processing.  */
   10227       609089 :   vect_fixup_store_groups_with_patterns (bb_vinfo);
   10228              : 
   10229              :   /* Check the SLP opportunities in the basic block, analyze and build SLP
   10230              :      trees.  */
   10231       609089 :   if (!vect_analyze_slp (bb_vinfo, n_stmts, false))
   10232              :     {
   10233            0 :       if (dump_enabled_p ())
   10234              :         {
   10235            0 :           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   10236              :                            "Failed to SLP the basic block.\n");
   10237            0 :           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   10238              :                            "not vectorized: failed to find SLP opportunities "
   10239              :                            "in basic block.\n");
   10240              :         }
   10241            0 :       return false;
   10242              :     }
   10243              : 
   10244              :   /* Optimize permutations.  */
   10245       609089 :   vect_optimize_slp (bb_vinfo);
   10246              : 
   10247              :   /* Gather the loads reachable from the SLP graph entries.  */
   10248       609089 :   vect_gather_slp_loads (bb_vinfo);
   10249              : 
   10250       609089 :   vect_record_base_alignments (bb_vinfo);
   10251              : 
   10252              :   /* Analyze and verify the alignment of data references and the
   10253              :      dependence in the SLP instances.  */
   10254      1393086 :   for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
   10255              :     {
   10256       783997 :       vect_location = instance->location ();
   10257       783997 :       if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
   10258       783997 :           || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
   10259              :         {
   10260         8319 :           slp_tree node = SLP_INSTANCE_TREE (instance);
   10261         8319 :           stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
   10262         8319 :           if (dump_enabled_p ())
   10263            4 :             dump_printf_loc (MSG_NOTE, vect_location,
   10264              :                              "removing SLP instance operations starting from: %G",
   10265              :                              stmt_info->stmt);
   10266         8319 :           vect_free_slp_instance (instance);
   10267         8319 :           BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
   10268         8319 :           continue;
   10269         8319 :         }
   10270              : 
   10271              :       /* Mark all the statements that we want to vectorize as pure SLP and
   10272              :          relevant.  */
   10273       775678 :       vect_mark_slp_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance));
   10274       775678 :       vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
   10275       775678 :       unsigned j;
   10276       775678 :       stmt_vec_info root;
   10277              :       /* Likewise consider instance root stmts as vectorized.  */
   10278      1712751 :       FOR_EACH_VEC_ELT (SLP_INSTANCE_ROOT_STMTS (instance), j, root)
   10279       161395 :         STMT_SLP_TYPE (root) = pure_slp;
   10280              : 
   10281       775678 :       i++;
   10282              :     }
   10283      2213485 :   if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
   10284              :     return false;
   10285              : 
   10286       263794 :   if (!vect_slp_analyze_operations (bb_vinfo))
   10287              :     {
   10288        29661 :       if (dump_enabled_p ())
   10289           81 :         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   10290              :                          "not vectorized: bad operation in basic block.\n");
   10291        29661 :       return false;
   10292              :     }
   10293              : 
   10294       234133 :   vect_bb_partition_graph (bb_vinfo);
   10295              : 
   10296       234133 :   return true;
   10297              : }
   10298              : 
   10299              : /* Subroutine of vect_slp_bb.  Try to vectorize the statements for all
   10300              :    basic blocks in BBS, returning true on success.
   10301              :    The region has N_STMTS statements and has the datarefs given by DATAREFS.  */
   10302              : 
   10303              : static bool
   10304      1864745 : vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
   10305              :                  vec<int> *dataref_groups, unsigned int n_stmts,
   10306              :                  loop_p orig_loop)
   10307              : {
   10308      1864745 :   bb_vec_info bb_vinfo;
   10309      1864745 :   auto_vector_modes vector_modes;
   10310              : 
   10311              :   /* Autodetect first vector size we try.  */
   10312      1864745 :   machine_mode next_vector_mode = VOIDmode;
   10313      1864745 :   targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
   10314      1864745 :   unsigned int mode_i = 0;
   10315              : 
   10316      1864745 :   vec_info_shared shared;
   10317              : 
   10318      1864745 :   machine_mode autodetected_vector_mode = VOIDmode;
   10319      2502903 :   while (1)
   10320              :     {
   10321      2183824 :       bool vectorized = false;
   10322      2183824 :       bool fatal = false;
   10323      2183824 :       bb_vinfo = new _bb_vec_info (bbs, &shared);
   10324              : 
   10325      2183824 :       bool first_time_p = shared.datarefs.is_empty ();
   10326      2183824 :       BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
   10327      2183824 :       if (first_time_p)
   10328      1887021 :         bb_vinfo->shared->save_datarefs ();
   10329              :       else
   10330       296803 :         bb_vinfo->shared->check_datarefs ();
   10331      2183824 :       bb_vinfo->vector_mode = next_vector_mode;
   10332              : 
   10333      2183824 :       if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
   10334              :         {
   10335       234133 :           if (dump_enabled_p ())
   10336              :             {
   10337         1498 :               dump_printf_loc (MSG_NOTE, vect_location,
   10338              :                                "***** Analysis succeeded with vector mode"
   10339          749 :                                " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
   10340          749 :               dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
   10341              :             }
   10342              : 
   10343       234133 :           bb_vinfo->shared->check_datarefs ();
   10344              : 
   10345       234133 :           bool force_clear = false;
   10346       234133 :           auto_vec<slp_instance> profitable_subgraphs;
   10347      1378889 :           for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo))
   10348              :             {
   10349       676490 :               if (instance->subgraph_entries.is_empty ())
   10350       216565 :                 continue;
   10351              : 
   10352       656983 :               dump_user_location_t saved_vect_location = vect_location;
   10353       656983 :               vect_location = instance->location ();
   10354       656983 :               if (!unlimited_cost_model (NULL)
   10355      1310631 :                   && !vect_bb_vectorization_profitable_p
   10356       653648 :                         (bb_vinfo, instance->subgraph_entries, orig_loop))
   10357              :                 {
   10358       177551 :                   if (dump_enabled_p ())
   10359           28 :                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   10360              :                                      "not vectorized: vectorization is not "
   10361              :                                      "profitable.\n");
   10362       177551 :                   vect_location = saved_vect_location;
   10363       177551 :                   continue;
   10364              :                 }
   10365              : 
   10366       479432 :               vect_location = saved_vect_location;
   10367       479432 :               if (!dbg_cnt (vect_slp))
   10368              :                 {
   10369            0 :                   force_clear = true;
   10370            0 :                   continue;
   10371              :                 }
   10372              : 
   10373       479432 :               profitable_subgraphs.safe_push (instance);
   10374              :             }
   10375              : 
   10376              :           /* When we're vectorizing an if-converted loop body make sure
   10377              :              we vectorized all if-converted code.  */
   10378       392538 :           if ((!profitable_subgraphs.is_empty () || force_clear) && orig_loop)
   10379              :             {
   10380           97 :               gcc_assert (bb_vinfo->nbbs == 1);
   10381          194 :               for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]);
   10382         4084 :                    !gsi_end_p (gsi); gsi_next (&gsi))
   10383              :                 {
   10384              :                   /* The costing above left us with DCEable vectorized scalar
   10385              :                      stmts having the visited flag set on profitable
   10386              :                      subgraphs.  Do the delayed clearing of the flag here.  */
   10387         3987 :                   if (gimple_visited_p (gsi_stmt (gsi)))
   10388              :                     {
   10389         1172 :                       gimple_set_visited (gsi_stmt (gsi), false);
   10390         1172 :                       continue;
   10391              :                     }
   10392         2815 :                   if (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED)
   10393          813 :                     continue;
   10394              : 
   10395         5859 :                   if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi)))
   10396         2450 :                     if (gimple_assign_rhs_code (ass) == COND_EXPR)
   10397              :                       {
   10398           51 :                         if (!profitable_subgraphs.is_empty ()
   10399           22 :                             && dump_enabled_p ())
   10400            0 :                           dump_printf_loc (MSG_NOTE, vect_location,
   10401              :                                            "not profitable because of "
   10402              :                                            "unprofitable if-converted scalar "
   10403              :                                            "code\n");
   10404           29 :                         profitable_subgraphs.truncate (0);
   10405              :                       }
   10406              :                 }
   10407              :             }
   10408              : 
   10409              :           /* Finally schedule the profitable subgraphs.  */
   10410      1030343 :           for (slp_instance instance : profitable_subgraphs)
   10411              :             {
   10412       479400 :               if (!vectorized && dump_enabled_p ())
   10413          724 :                 dump_printf_loc (MSG_NOTE, vect_location,
   10414              :                                  "Basic block will be vectorized "
   10415              :                                  "using SLP\n");
   10416       479400 :               vectorized = true;
   10417              : 
   10418              :               /* Dump before scheduling as store vectorization will remove
   10419              :                  the original stores and mess with the instance tree
   10420              :                  so querying its location will eventually ICE.  */
   10421       479400 :               if (flag_checking)
   10422      1928229 :                 for (slp_instance sub : instance->subgraph_entries)
   10423       490029 :                   gcc_assert (SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub)));
   10424       479400 :               unsigned HOST_WIDE_INT bytes;
   10425       479400 :               if (dump_enabled_p ())
   10426         3449 :                 for (slp_instance sub : instance->subgraph_entries)
   10427              :                   {
   10428          914 :                     tree vtype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub));
   10429         1828 :                     if (GET_MODE_SIZE (TYPE_MODE (vtype)).is_constant (&bytes))
   10430          914 :                       dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
   10431          914 :                                        sub->location (),
   10432              :                                        "basic block part vectorized using %wu "
   10433              :                                        "byte vectors\n", bytes);
   10434              :                     else
   10435              :                       dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
   10436              :                                        sub->location (),
   10437              :                                        "basic block part vectorized using "
   10438              :                                        "variable length vectors\n");
   10439              :                   }
   10440              : 
   10441       479400 :               dump_user_location_t saved_vect_location = vect_location;
   10442       479400 :               vect_location = instance->location ();
   10443              : 
   10444       479400 :               vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
   10445              : 
   10446       479400 :               vect_location = saved_vect_location;
   10447              :             }
   10448              : 
   10449              : 
   10450              :           /* Generate the invariant statements.  */
   10451       234133 :           if (!gimple_seq_empty_p (bb_vinfo->inv_pattern_def_seq))
   10452              :             {
   10453           23 :               if (dump_enabled_p ())
   10454            0 :                 dump_printf_loc (MSG_NOTE, vect_location,
   10455              :                          "------>generating invariant statements\n");
   10456              : 
   10457           23 :               bb_vinfo->insert_seq_on_entry (NULL,
   10458              :                                              bb_vinfo->inv_pattern_def_seq);
   10459              :             }
   10460       234133 :         }
   10461              :       else
   10462              :         {
   10463      1949691 :           if (dump_enabled_p ())
   10464         1314 :             dump_printf_loc (MSG_NOTE, vect_location,
   10465              :                              "***** Analysis failed with vector mode %s\n",
   10466         1314 :                              GET_MODE_NAME (bb_vinfo->vector_mode));
   10467              :         }
   10468              : 
   10469      2183824 :       if (mode_i == 0)
   10470      1864745 :         autodetected_vector_mode = bb_vinfo->vector_mode;
   10471              : 
   10472      2183824 :       if (!fatal)
   10473      3125653 :         while (mode_i < vector_modes.length ()
   10474      1751984 :                && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
   10475              :           {
   10476       332740 :             if (dump_enabled_p ())
   10477         1650 :               dump_printf_loc (MSG_NOTE, vect_location,
   10478              :                                "***** The result for vector mode %s would"
   10479              :                                " be the same\n",
   10480          825 :                                GET_MODE_NAME (vector_modes[mode_i]));
   10481       332740 :             mode_i += 1;
   10482              :           }
   10483              : 
   10484      2183824 :       delete bb_vinfo;
   10485              : 
   10486      2183824 :       if (mode_i < vector_modes.length ()
   10487      2007392 :           && VECTOR_MODE_P (autodetected_vector_mode)
   10488      1988466 :           && (related_vector_mode (vector_modes[mode_i],
   10489              :                                    GET_MODE_INNER (autodetected_vector_mode))
   10490       994233 :               == autodetected_vector_mode)
   10491      4191216 :           && (related_vector_mode (autodetected_vector_mode,
   10492       516353 :                                    GET_MODE_INNER (vector_modes[mode_i]))
   10493      1032706 :               == vector_modes[mode_i]))
   10494              :         {
   10495       516353 :           if (dump_enabled_p ())
   10496          205 :             dump_printf_loc (MSG_NOTE, vect_location,
   10497              :                              "***** Skipping vector mode %s, which would"
   10498              :                              " repeat the analysis for %s\n",
   10499          205 :                              GET_MODE_NAME (vector_modes[mode_i]),
   10500          205 :                              GET_MODE_NAME (autodetected_vector_mode));
   10501       516353 :           mode_i += 1;
   10502              :         }
   10503              : 
   10504      2183824 :       if (vectorized
   10505      2025441 :           || mode_i == vector_modes.length ()
   10506      1849056 :           || autodetected_vector_mode == VOIDmode
   10507              :           /* If vect_slp_analyze_bb_1 signaled that analysis for all
   10508              :              vector sizes will fail do not bother iterating.  */
   10509      3019721 :           || fatal)
   10510      3729490 :         return vectorized;
   10511              : 
   10512              :       /* Try the next biggest vector size.  */
   10513       319079 :       next_vector_mode = vector_modes[mode_i++];
   10514       319079 :       if (dump_enabled_p ())
   10515          218 :         dump_printf_loc (MSG_NOTE, vect_location,
   10516              :                          "***** Re-trying analysis with vector mode %s\n",
   10517          218 :                          GET_MODE_NAME (next_vector_mode));
   10518       319079 :     }
   10519      1864745 : }
   10520              : 
   10521              : 
   10522              : /* Main entry for the BB vectorizer.  Analyze and transform BBS, returns
   10523              :    true if anything in the basic-block was vectorized.  */
   10524              : 
   10525              : static bool
   10526      1864745 : vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop)
   10527              : {
   10528      1864745 :   vec<data_reference_p> datarefs = vNULL;
   10529      1864745 :   auto_vec<int> dataref_groups;
   10530      1864745 :   int insns = 0;
   10531      1864745 :   int current_group = 0;
   10532              : 
   10533     12493816 :   for (unsigned i = 0; i < bbs.length (); i++)
   10534              :     {
   10535     10629071 :       basic_block bb = bbs[i];
   10536     88546594 :       for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
   10537     77917523 :            gsi_next (&gsi))
   10538              :         {
   10539     77917523 :           gimple *stmt = gsi_stmt (gsi);
   10540     77917523 :           if (is_gimple_debug (stmt))
   10541     48220995 :             continue;
   10542              : 
   10543     29696528 :           insns++;
   10544              : 
   10545     29696528 :           if (gimple_location (stmt) != UNKNOWN_LOCATION)
   10546     26682990 :             vect_location = stmt;
   10547              : 
   10548     29696528 :           if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
   10549              :                                               &dataref_groups, current_group))
   10550      5086097 :             ++current_group;
   10551              :         }
   10552              :       /* New BBs always start a new DR group.  */
   10553     10629071 :       ++current_group;
   10554              :     }
   10555              : 
   10556      1864745 :   return vect_slp_region (bbs, datarefs, &dataref_groups, insns, orig_loop);
   10557      1864745 : }
   10558              : 
   10559              : /* Special entry for the BB vectorizer.  Analyze and transform a single
   10560              :    if-converted BB with ORIG_LOOP's body being the non-if-converted
   10561              :    representation.  Returns true if anything in the basic-block was
   10562              :    vectorized.  */
   10563              : 
   10564              : bool
   10565        19420 : vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop)
   10566              : {
   10567        19420 :   auto_vec<basic_block> bbs;
   10568        19420 :   bbs.safe_push (bb);
   10569        19420 :   return vect_slp_bbs (bbs, orig_loop);
   10570        19420 : }
   10571              : 
   10572              : /* Main entry for the BB vectorizer.  Analyze and transform FUN, returns
   10573              :    true if anything in the function was vectorized.  */
   10574              : 
   10575              : bool
   10576       905454 : vect_slp_function (function *fun)
   10577              : {
   10578       905454 :   bool r = false;
   10579       905454 :   int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
   10580       905454 :   auto_bitmap exit_bbs;
   10581       905454 :   bitmap_set_bit (exit_bbs, EXIT_BLOCK);
   10582       905454 :   edge entry = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (fun));
   10583       905454 :   unsigned n = rev_post_order_and_mark_dfs_back_seme (fun, entry, exit_bbs,
   10584       905454 :                                                       true, rpo, NULL);
   10585              : 
   10586              :   /* For the moment split the function into pieces to avoid making
   10587              :      the iteration on the vector mode moot.  Split at points we know
   10588              :      to not handle well which is CFG merges (SLP discovery doesn't
   10589              :      handle non-loop-header PHIs) and loop exits.  Since pattern
   10590              :      recog requires reverse iteration to visit uses before defs
   10591              :      simply chop RPO into pieces.  */
   10592       905454 :   auto_vec<basic_block> bbs;
   10593     11546033 :   for (unsigned i = 0; i < n; i++)
   10594              :     {
   10595     10640579 :       basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
   10596     10640579 :       bool split = false;
   10597              : 
   10598              :       /* Split when a BB is not dominated by the first block.  */
   10599     20072008 :       if (!bbs.is_empty ()
   10600      9431429 :           && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
   10601              :         {
   10602       656087 :           if (dump_enabled_p ())
   10603          146 :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   10604              :                              "splitting region at dominance boundary bb%d\n",
   10605              :                              bb->index);
   10606              :           split = true;
   10607              :         }
   10608              :       /* Split when the loop determined by the first block
   10609              :          is exited.  This is because we eventually insert
   10610              :          invariants at region begin.  */
   10611     18759834 :       else if (!bbs.is_empty ()
   10612      8775342 :                && bbs[0]->loop_father != bb->loop_father
   10613      2280719 :                && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
   10614              :         {
   10615         3747 :           if (dump_enabled_p ())
   10616            6 :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   10617              :                              "splitting region at loop %d exit at bb%d\n",
   10618            3 :                              bbs[0]->loop_father->num, bb->index);
   10619              :           split = true;
   10620              :         }
   10621      9980745 :       else if (!bbs.is_empty ()
   10622      8771595 :                && bb->loop_father->header == bb
   10623       472342 :                && bb->loop_father->dont_vectorize)
   10624              :         {
   10625         7269 :           if (dump_enabled_p ())
   10626           72 :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   10627              :                              "splitting region at dont-vectorize loop %d "
   10628              :                              "entry at bb%d\n",
   10629              :                              bb->loop_father->num, bb->index);
   10630              :           split = true;
   10631              :         }
   10632              : 
   10633     11307682 :       if (split && !bbs.is_empty ())
   10634              :         {
   10635       667103 :           r |= vect_slp_bbs (bbs, NULL);
   10636       667103 :           bbs.truncate (0);
   10637              :         }
   10638              : 
   10639     10640579 :       if (bbs.is_empty ())
   10640              :         {
   10641              :           /* We need to be able to insert at the head of the region which
   10642              :              we cannot for a region starting with a returns-twice call.  */
   10643      1876253 :           if (gcall *first = safe_dyn_cast <gcall *> (first_stmt (bb)))
   10644       396454 :             if (gimple_call_flags (first) & ECF_RETURNS_TWICE)
   10645              :               {
   10646          301 :                 if (dump_enabled_p ())
   10647            2 :                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   10648              :                                    "skipping bb%d as start of region as it "
   10649              :                                    "starts with returns-twice call\n",
   10650              :                                    bb->index);
   10651        30928 :                 continue;
   10652              :               }
   10653              :           /* If the loop this BB belongs to is marked as not to be vectorized
   10654              :              honor that also for BB vectorization.  */
   10655      1875952 :           if (bb->loop_father->dont_vectorize)
   10656        30627 :             continue;
   10657              :         }
   10658              : 
   10659     10609651 :       bbs.safe_push (bb);
   10660              : 
   10661              :       /* When we have a stmt ending this block and defining a
   10662              :          value we have to insert on edges when inserting after it for
   10663              :          a vector containing its definition.  Avoid this for now.  */
   10664     21219302 :       if (gimple *last = *gsi_last_bb (bb))
   10665      8583263 :         if (gimple_get_lhs (last)
   10666      8583263 :             && is_ctrl_altering_stmt (last))
   10667              :           {
   10668       272775 :             if (dump_enabled_p ())
   10669            2 :               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   10670              :                                "splitting region at control altering "
   10671              :                                "definition %G", last);
   10672       272775 :             r |= vect_slp_bbs (bbs, NULL);
   10673       272775 :             bbs.truncate (0);
   10674              :           }
   10675              :     }
   10676              : 
   10677       905454 :   if (!bbs.is_empty ())
   10678       905447 :     r |= vect_slp_bbs (bbs, NULL);
   10679              : 
   10680       905454 :   free (rpo);
   10681              : 
   10682       905454 :   return r;
   10683       905454 : }
   10684              : 
   10685              : /* Build a variable-length vector in which the elements in ELTS are repeated
   10686              :    to fill NRESULTS vectors of type VECTOR_TYPE.  Store the vectors in
   10687              :    RESULTS and add any new instructions to SEQ.
   10688              : 
   10689              :    The approach we use is:
   10690              : 
   10691              :    (1) Find a vector mode VM with integer elements of mode IM.
   10692              : 
   10693              :    (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
   10694              :        ELTS' has mode IM.  This involves creating NELTS' VIEW_CONVERT_EXPRs
   10695              :        from small vectors to IM.
   10696              : 
   10697              :    (3) Duplicate each ELTS'[I] into a vector of mode VM.
   10698              : 
   10699              :    (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
   10700              :        correct byte contents.
   10701              : 
   10702              :    (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
   10703              : 
   10704              :    We try to find the largest IM for which this sequence works, in order
   10705              :    to cut down on the number of interleaves.  */
   10706              : 
   10707              : void
   10708            0 : duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
   10709              :                           const vec<tree> &elts, unsigned int nresults,
   10710              :                           vec<tree> &results)
   10711              : {
   10712            0 :   unsigned int nelts = elts.length ();
   10713            0 :   tree element_type = TREE_TYPE (vector_type);
   10714              : 
   10715              :   /* (1) Find a vector mode VM with integer elements of mode IM.  */
   10716            0 :   unsigned int nvectors = 1;
   10717            0 :   tree new_vector_type;
   10718            0 :   tree permutes[2];
   10719            0 :   if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
   10720              :                                        &nvectors, &new_vector_type,
   10721              :                                        permutes))
   10722            0 :     gcc_unreachable ();
   10723              : 
   10724              :   /* Get a vector type that holds ELTS[0:NELTS/NELTS'].  */
   10725            0 :   unsigned int partial_nelts = nelts / nvectors;
   10726            0 :   tree partial_vector_type = build_vector_type (element_type, partial_nelts);
   10727              : 
   10728            0 :   tree_vector_builder partial_elts;
   10729            0 :   auto_vec<tree, 32> pieces (nvectors * 2);
   10730            0 :   pieces.quick_grow_cleared (nvectors * 2);
   10731            0 :   for (unsigned int i = 0; i < nvectors; ++i)
   10732              :     {
   10733              :       /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
   10734              :              ELTS' has mode IM.  */
   10735            0 :       partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
   10736            0 :       for (unsigned int j = 0; j < partial_nelts; ++j)
   10737            0 :         partial_elts.quick_push (elts[i * partial_nelts + j]);
   10738            0 :       tree t = gimple_build_vector (seq, &partial_elts);
   10739            0 :       t = gimple_build (seq, VIEW_CONVERT_EXPR,
   10740            0 :                         TREE_TYPE (new_vector_type), t);
   10741              : 
   10742              :       /* (3) Duplicate each ELTS'[I] into a vector of mode VM.  */
   10743            0 :       pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
   10744              :     }
   10745              : 
   10746              :   /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
   10747              :          correct byte contents.
   10748              : 
   10749              :      Conceptually, we need to repeat the following operation log2(nvectors)
   10750              :      times, where hi_start = nvectors / 2:
   10751              : 
   10752              :         out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
   10753              :         out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
   10754              : 
   10755              :      However, if each input repeats every N elements and the VF is
   10756              :      a multiple of N * 2, the HI result is the same as the LO result.
   10757              :      This will be true for the first N1 iterations of the outer loop,
   10758              :      followed by N2 iterations for which both the LO and HI results
   10759              :      are needed.  I.e.:
   10760              : 
   10761              :         N1 + N2 = log2(nvectors)
   10762              : 
   10763              :      Each "N1 iteration" doubles the number of redundant vectors and the
   10764              :      effect of the process as a whole is to have a sequence of nvectors/2**N1
   10765              :      vectors that repeats 2**N1 times.  Rather than generate these redundant
   10766              :      vectors, we halve the number of vectors for each N1 iteration.  */
   10767              :   unsigned int in_start = 0;
   10768              :   unsigned int out_start = nvectors;
   10769              :   unsigned int new_nvectors = nvectors;
   10770            0 :   for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
   10771              :     {
   10772            0 :       unsigned int hi_start = new_nvectors / 2;
   10773            0 :       unsigned int out_i = 0;
   10774            0 :       for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i)
   10775              :         {
   10776            0 :           if ((in_i & 1) != 0
   10777            0 :               && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
   10778              :                              2 * in_repeat))
   10779            0 :             continue;
   10780              : 
   10781            0 :           tree output = make_ssa_name (new_vector_type);
   10782            0 :           tree input1 = pieces[in_start + (in_i / 2)];
   10783            0 :           tree input2 = pieces[in_start + (in_i / 2) + hi_start];
   10784            0 :           gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
   10785              :                                                input1, input2,
   10786              :                                                permutes[in_i & 1]);
   10787            0 :           gimple_seq_add_stmt (seq, stmt);
   10788            0 :           pieces[out_start + out_i] = output;
   10789            0 :           out_i += 1;
   10790              :         }
   10791            0 :       std::swap (in_start, out_start);
   10792            0 :       new_nvectors = out_i;
   10793              :     }
   10794              : 
   10795              :   /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type.  */
   10796            0 :   results.reserve (nresults);
   10797            0 :   for (unsigned int i = 0; i < nresults; ++i)
   10798            0 :     if (i < new_nvectors)
   10799            0 :       results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
   10800            0 :                                         pieces[in_start + i]));
   10801              :     else
   10802            0 :       results.quick_push (results[i - new_nvectors]);
   10803            0 : }
   10804              : 
   10805              : 
   10806              : /* For constant and loop invariant defs in OP_NODE this function creates
   10807              :    vector defs that will be used in the vectorized stmts and stores them
   10808              :    to SLP_TREE_VEC_DEFS of OP_NODE.  New stmts go after the latest def found in a BB region, or else via VINFO->insert_seq_on_entry.  */
   10809              : 
   10810              : static void
   10811       489703 : vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
   10812              : {
   10813       489703 :   unsigned HOST_WIDE_INT nunits;
   10814       489703 :   tree vec_cst;
   10815       489703 :   unsigned j, number_of_places_left_in_vector;
   10816       489703 :   tree vector_type;
   10817       489703 :   tree vop;
   10818       489703 :   int group_size = op_node->ops.length ();
   10819       489703 :   unsigned int vec_num, i;
   10820       489703 :   unsigned number_of_copies = 1;
   10821       489703 :   bool constant_p;
   10822       489703 :   gimple_seq ctor_seq = NULL;
   10823       489703 :   auto_vec<tree, 16> permute_results;
   10824              : 
   10825              :   /* We always want SLP_TREE_VECTYPE (op_node) here correctly set.  */
   10826       489703 :   vector_type = SLP_TREE_VECTYPE (op_node);
   10827              : 
   10828       489703 :   unsigned int number_of_vectors = vect_get_num_copies (vinfo, op_node);
   10829       489703 :   SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
   10830       489703 :   auto_vec<tree> voprnds (number_of_vectors);
   10831              : 
   10832              :   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
   10833              :      created vectors.  It is greater than 1 if unrolling is performed.
   10834              : 
   10835              :      For example, we have two scalar operands, s1 and s2 (e.g., group of
   10836              :      strided accesses of size two), while NUNITS is four (i.e., four scalars
   10837              :      of this type can be packed in a vector).  The output vector will contain
   10838              :      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
   10839              :      will be 2).
   10840              : 
   10841              :      If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
   10842              :      containing the operands.
   10843              : 
   10844              :      For example, NUNITS is four as before, and the group size is 8
   10845              :      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
   10846              :      {s5, s6, s7, s8}.  */
   10847              : 
   10848              :   /* When using duplicate_and_interleave, we just need one element for
   10849              :      each scalar statement.  */
   10850       489703 :   if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
   10851              :     nunits = group_size;
   10852              : 
   10853       489703 :   number_of_copies = nunits * number_of_vectors / group_size;
   10854              : 
   10855       489703 :   number_of_places_left_in_vector = nunits;
   10856       489703 :   constant_p = true;
   10857       489703 :   tree uniform_elt = NULL_TREE;
   10858       489703 :   tree_vector_builder elts (vector_type, nunits, 1);
   10859       489703 :   elts.quick_grow (nunits);
   10860       489703 :   stmt_vec_info insert_after = NULL;
   10861      1466112 :   for (j = 0; j < number_of_copies; j++)
   10862              :     {
   10863       976409 :       tree op;
   10864      3738338 :       for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
   10865              :         {
   10866              :           /* Create 'vect_ = {op0,op1,...,opn}', visiting the ops last to first and filling ELTS back to front.  */
   10867      1785520 :           tree orig_op = op;
   10868      1785520 :           if (number_of_places_left_in_vector == nunits)
   10869              :             uniform_elt = op;  /* First element stored into a fresh vector.  */
   10870      1167382 :           else if (uniform_elt && operand_equal_p (uniform_elt, op))
   10871       745054 :             op = elts[number_of_places_left_in_vector];  /* Reuse the element stored by the previous iteration.  */
   10872              :           else
   10873              :             uniform_elt = NULL_TREE;  /* Elements differ, so the vector is not uniform.  */
   10874      1785520 :           number_of_places_left_in_vector--;
   10875      1785520 :           if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
   10876              :             {
   10877       273196 :               if (CONSTANT_CLASS_P (op))
   10878              :                 {
   10879        99383 :                   if (VECTOR_BOOLEAN_TYPE_P (vector_type))
   10880              :                     {
   10881              :                       /* Can't use VIEW_CONVERT_EXPR for booleans because
   10882              :                          of possibly different sizes of scalar value and
   10883              :                          vector element.  */
   10884           51 :                       if (integer_zerop (op))
   10885           51 :                         op = build_int_cst (TREE_TYPE (vector_type), 0);
   10886            0 :                       else if (integer_onep (op))
   10887            0 :                         op = build_all_ones_cst (TREE_TYPE (vector_type));
   10888              :                       else
   10889            0 :                         gcc_unreachable ();
   10890              :                     }
   10891              :                   else
   10892        99332 :                     op = fold_unary (VIEW_CONVERT_EXPR,
   10893              :                                      TREE_TYPE (vector_type), op);
   10894        99383 :                   gcc_assert (op && CONSTANT_CLASS_P (op));
   10895              :                 }
   10896              :               else
   10897              :                 {
   10898       173813 :                   tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
   10899       173813 :                   gimple *init_stmt;
   10900       173813 :                   if (VECTOR_BOOLEAN_TYPE_P (vector_type))
   10901              :                     {
   10902          403 :                       tree true_val
   10903          403 :                         = build_all_ones_cst (TREE_TYPE (vector_type));
   10904          403 :                       tree false_val
   10905          403 :                         = build_zero_cst (TREE_TYPE (vector_type));
   10906          403 :                       gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
   10907          403 :                       init_stmt = gimple_build_assign (new_temp, COND_EXPR,
   10908              :                                                        op, true_val,
   10909              :                                                        false_val);
   10910              :                     }
   10911              :                   else
   10912              :                     {
   10913       173410 :                       op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
   10914              :                                    op);
   10915       173410 :                       init_stmt
   10916       173410 :                         = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
   10917              :                                                op);
   10918              :                     }
   10919       173813 :                   gimple_seq_add_stmt (&ctor_seq, init_stmt);
   10920       173813 :                   op = new_temp;
   10921              :                 }
   10922              :             }
   10923      1785520 :           elts[number_of_places_left_in_vector] = op;
   10924      1785520 :           if (!CONSTANT_CLASS_P (op))
   10925       316216 :             constant_p = false;
   10926              :           /* For BB vectorization we have to compute an insert location
   10927              :              when a def is inside the analyzed region since we cannot
   10928              :              simply insert at the BB start in this case.  */
   10929      1785520 :           stmt_vec_info opdef;
   10930      1785520 :           if (TREE_CODE (orig_op) == SSA_NAME
   10931       182025 :               && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
   10932       162221 :               && is_a <bb_vec_info> (vinfo)
   10933      1889918 :               && (opdef = vinfo->lookup_def (orig_op)))
   10934              :             {
   10935        85555 :               if (!insert_after)
   10936              :                 insert_after = opdef;
   10937              :               else
   10938        47216 :                 insert_after = get_later_stmt (insert_after, opdef);
   10939              :             }
   10940              : 
   10941      1785520 :           if (number_of_places_left_in_vector == 0)
   10942              :             {
   10943       618138 :               auto type_nunits = TYPE_VECTOR_SUBPARTS (vector_type);
   10944       618138 :               if (uniform_elt)
   10945       646342 :                 vec_cst = gimple_build_vector_from_val (&ctor_seq, vector_type,
   10946       323171 :                                                         elts[0]);
   10947       589934 :               else if (constant_p
   10948       589934 :                        ? multiple_p (type_nunits, nunits)
   10949       108833 :                        : known_eq (type_nunits, nunits))
   10950       294967 :                 vec_cst = gimple_build_vector (&ctor_seq, &elts);
   10951              :               else
   10952              :                 {
   10953            0 :                   if (permute_results.is_empty ())
   10954            0 :                     duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
   10955              :                                               elts, number_of_vectors,
   10956              :                                               permute_results);
   10957            0 :                   vec_cst = permute_results[number_of_vectors - j - 1];
   10958              :                 }
   10959       618138 :               if (!gimple_seq_empty_p (ctor_seq))
   10960              :                 {
   10961       136067 :                   if (insert_after)
   10962              :                     {
   10963        38339 :                       gimple_stmt_iterator gsi;
   10964        38339 :                       if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
   10965              :                         {
   10966          620 :                           gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
   10967          620 :                           gsi_insert_seq_before (&gsi, ctor_seq,
   10968              :                                                  GSI_CONTINUE_LINKING);
   10969              :                         }
   10970        37719 :                       else if (!stmt_ends_bb_p (insert_after->stmt))
   10971              :                         {
   10972        37719 :                           gsi = gsi_for_stmt (insert_after->stmt);
   10973        37719 :                           gsi_insert_seq_after (&gsi, ctor_seq,
   10974              :                                                 GSI_CONTINUE_LINKING);
   10975              :                         }
   10976              :                       else
   10977              :                         {
   10978              :                           /* When we want to insert after a def where the
   10979              :                              defining stmt throws then insert on the fallthru
   10980              :                              edge.  */
   10981            0 :                           edge e = find_fallthru_edge
   10982            0 :                                      (gimple_bb (insert_after->stmt)->succs);
   10983            0 :                           basic_block new_bb
   10984            0 :                             = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
   10985            0 :                           gcc_assert (!new_bb);
   10986              :                         }
   10987              :                     }
   10988              :                   else
   10989        97728 :                     vinfo->insert_seq_on_entry (NULL, ctor_seq);
   10990       136067 :                   ctor_seq = NULL;
   10991              :                 }
   10992       618138 :               voprnds.quick_push (vec_cst);
   10993       618138 :               insert_after = NULL;
   10994       618138 :               number_of_places_left_in_vector = nunits;
   10995       618138 :               constant_p = true;
   10996       618138 :               elts.new_vector (vector_type, nunits, 1);
   10997       618138 :               elts.quick_grow (nunits);
   10998              :             }
   10999              :         }
   11000              :     }
   11001              : 
   11002              :   /* Since the vectors are created in the reverse order, we should invert
   11003              :      them.  */
   11004       489703 :   vec_num = voprnds.length ();
   11005      1107841 :   for (j = vec_num; j != 0; j--)
   11006              :     {
   11007       618138 :       vop = voprnds[j - 1];
   11008       618138 :       SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
   11009              :     }
   11010              : 
   11011              :   /* In case that VF is greater than the unrolling factor needed for the SLP
   11012              :      group of stmts, NUMBER_OF_VECTORS to be created is greater than
   11013              :      NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
   11014              :      to replicate the vectors.  */
   11015       489703 :   while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
   11016       489703 :     for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
   11017              :          i++)
   11018            0 :       SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
   11019       489703 : }
   11020              : 
   11021              : /* Get the scalar definition of the Nth lane from SLP_NODE or NULL_TREE
   11022              :    if there is no definition for it in the scalar IL or it is not known.  Internal defs yield the LHS of lane N's scalar stmt, other defs SLP_TREE_SCALAR_OPS[N].  */
   11023              : 
   11024              : tree
   11025         1909 : vect_get_slp_scalar_def (slp_tree slp_node, unsigned n)
   11026              : {
   11027         1909 :   if (SLP_TREE_DEF_TYPE (slp_node) == vect_internal_def)
   11028              :     {
   11029         1899 :       if (!SLP_TREE_SCALAR_STMTS (slp_node).exists ())
   11030              :         return NULL_TREE;  /* The node has no scalar stmts vector.  */
   11031         1899 :       stmt_vec_info def = SLP_TREE_SCALAR_STMTS (slp_node)[n];
   11032         1899 :       if (!def)
   11033              :         return NULL_TREE;  /* Lane N has no scalar stmt.  */
   11034         1899 :       return gimple_get_lhs (STMT_VINFO_STMT (def));
   11035              :     }
   11036              :   else
   11037           10 :     return SLP_TREE_SCALAR_OPS (slp_node)[n];
   11038              : }
   11039              : 
   11040              : /* Get the Ith vectorized definition from SLP_NODE, i.e. element I of SLP_TREE_VEC_DEFS (SLP_NODE).  */
   11041              : 
   11042              : tree
   11043       145870 : vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
   11044              : {
   11045       145870 :   return SLP_TREE_VEC_DEFS (slp_node)[i];
   11046              : }
   11047              : 
   11048              : /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS, which is created here and then receives a splice of SLP_TREE_VEC_DEFS (SLP_NODE).  */
   11049              : 
   11050              : void
   11051       928640 : vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
   11052              : {
   11053      1857280 :   vec_defs->create (SLP_TREE_VEC_DEFS (slp_node).length ());
   11054       928640 :   vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
   11055       928640 : }
   11056              : 
   11057              : /* Get the vectorized definitions of the first N children of SLP_NODE, pushing one vector of defs per child onto *VEC_OPRNDS.  */
   11058              : 
   11059              : void
   11060         2955 : vect_get_slp_defs (vec_info *,
   11061              :                    slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
   11062              : {
   11063         2955 :   if (n == -1U)  /* -1U requests all children.  */
   11064         2955 :     n = SLP_TREE_CHILDREN (slp_node).length ();
   11065              : 
   11066        10648 :   for (unsigned i = 0; i < n; ++i)
   11067              :     {
   11068         7693 :       slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
   11069         7693 :       vec<tree> vec_defs = vNULL;
   11070         7693 :       vect_get_slp_defs (child, &vec_defs);
   11071         7693 :       vec_oprnds->quick_push (vec_defs);  /* NOTE(review): presumably the caller has reserved space for N entries; confirm.  */
   11072              :     }
   11073         2955 : }
   11074              : 
   11075              : /* A subroutine of vect_transform_slp_perm_load with two extra arguments:
   11076              :    - PERM gives the permutation that the caller wants to use for NODE,
   11077              :      which might be different from SLP_LOAD_PERMUTATION.
   11078              :    - DUMP_P controls whether the function dumps information.  */
   11079              : 
   11080              : static bool
   11081       119662 : vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node,
   11082              :                                 load_permutation_t &perm,
   11083              :                                 const vec<tree> &dr_chain,
   11084              :                                 gimple_stmt_iterator *gsi, poly_uint64 vf,
   11085              :                                 bool analyze_only, bool dump_p,
   11086              :                                 unsigned *n_perms, unsigned int *n_loads,
   11087              :                                 bool dce_chain)
   11088              : {
   11089       119662 :   stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
   11090       119662 :   int vec_index = 0;
   11091       119662 :   tree vectype = SLP_TREE_VECTYPE (node);
   11092       119662 :   unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
   11093       119662 :   unsigned int mask_element;
   11094       119662 :   unsigned dr_group_size;
   11095       119662 :   machine_mode mode;
   11096              : 
   11097       119662 :   if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
   11098              :     {
   11099              :       /* We have both splats of the same non-grouped load and groups
   11100              :          of distinct invariant loads entering here.  */
   11101         1205 :       unsigned max_idx = 0;
   11102         6793 :       for (auto idx : perm)
   11103         3178 :         max_idx = idx > max_idx ? idx : max_idx;
   11104         1205 :       dr_group_size = max_idx + 1;
   11105              :     }
   11106              :   else
   11107              :     {
   11108       118457 :       stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
   11109       118457 :       dr_group_size = DR_GROUP_SIZE (stmt_info);
   11110              :     }
   11111              : 
   11112       119662 :   mode = TYPE_MODE (vectype);
   11113       119662 :   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
   11114       119662 :   unsigned int nstmts = vect_get_num_copies (vinfo, node);
   11115              : 
   11116              :   /* Initialize the vect stmts of NODE to properly insert the generated
   11117              :      stmts later.  */
   11118       119662 :   if (! analyze_only)
   11119        57826 :     for (unsigned i = SLP_TREE_VEC_DEFS (node).length (); i < nstmts; i++)
   11120        22274 :       SLP_TREE_VEC_DEFS (node).quick_push (NULL_TREE);
   11121              : 
   11122              :   /* Generate permutation masks for every NODE. Number of masks for each NODE
   11123              :      is equal to GROUP_SIZE.
   11124              :      E.g., we have a group of three nodes with three loads from the same
   11125              :      location in each node, and the vector size is 4. I.e., we have a
   11126              :      a0b0c0a1b1c1... sequence and we need to create the following vectors:
   11127              :      for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
   11128              :      for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
   11129              :      ...
   11130              : 
   11131              :      The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
   11132              :      The last mask is illegal since we assume two operands for permute
   11133              :      operation, and the mask element values can't be outside that range.
   11134              :      Hence, the last mask must be converted into {2,5,5,5}.
   11135              :      For the first two permutations we need the first and the second input
   11136              :      vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
   11137              :      we need the second and the third vectors: {b1,c1,a2,b2} and
   11138              :      {c2,a3,b3,c3}.  */
   11139              : 
   11140       119662 :   int vect_stmts_counter = 0;
   11141       119662 :   unsigned int index = 0;
   11142       119662 :   int first_vec_index = -1;
   11143       119662 :   int second_vec_index = -1;
   11144       119662 :   bool noop_p = true;
   11145       119662 :   *n_perms = 0;
   11146              : 
   11147       119662 :   vec_perm_builder mask;
   11148       119662 :   unsigned int nelts_to_build;
   11149       119662 :   unsigned int nvectors_per_build;
   11150       119662 :   unsigned int in_nlanes;
   11151       119662 :   bool repeating_p = (group_size == dr_group_size
   11152       151413 :                       && multiple_p (nunits, group_size));
   11153       119662 :   if (repeating_p)
   11154              :     {
   11155              :       /* A single vector contains a whole number of copies of the node, so:
   11156              :          (a) all permutes can use the same mask; and
   11157              :          (b) the permutes only need a single vector input.  */
   11158        29584 :       mask.new_vector (nunits, group_size, 3);
   11159        29584 :       nelts_to_build = mask.encoded_nelts ();
   11160              :       /* It's possible to obtain zero nstmts during analyze_only, so make
   11161              :          it at least one to ensure the later computation for n_perms
   11162              :          proceed.  */
   11163        29584 :       nvectors_per_build = nstmts > 0 ? nstmts : 1;
   11164        29584 :       in_nlanes = dr_group_size * 3;
   11165              :     }
   11166              :   else
   11167              :     {
   11168              :       /* We need to construct a separate mask for each vector statement.  */
   11169        90078 :       unsigned HOST_WIDE_INT const_nunits, const_vf;
   11170        90078 :       if (!nunits.is_constant (&const_nunits)
   11171        90078 :           || !vf.is_constant (&const_vf))
   11172              :         return false;
   11173        90078 :       mask.new_vector (const_nunits, const_nunits, 1);
   11174        90078 :       nelts_to_build = const_vf * group_size;
   11175        90078 :       nvectors_per_build = 1;
   11176        90078 :       in_nlanes = const_vf * dr_group_size;
   11177              :     }
   11178       119662 :   auto_sbitmap used_in_lanes (in_nlanes);
   11179       119662 :   bitmap_clear (used_in_lanes);
   11180       119662 :   auto_bitmap used_defs;
   11181              : 
   11182       119662 :   unsigned int count = mask.encoded_nelts ();
   11183       119662 :   mask.quick_grow (count);
   11184       119662 :   vec_perm_indices indices;
   11185              : 
   11186       636473 :   for (unsigned int j = 0; j < nelts_to_build; j++)
   11187              :     {
   11188       526386 :       unsigned int iter_num = j / group_size;
   11189       526386 :       unsigned int stmt_num = j % group_size;
   11190       526386 :       unsigned int i = (iter_num * dr_group_size + perm[stmt_num]);
   11191       526386 :       bitmap_set_bit (used_in_lanes, i);
   11192       526386 :       if (repeating_p)
   11193              :         {
   11194              :           first_vec_index = 0;
   11195              :           mask_element = i;
   11196              :         }
   11197              :       else
   11198              :         {
   11199              :           /* Enforced before the loop when !repeating_p.  */
   11200       335784 :           unsigned int const_nunits = nunits.to_constant ();
   11201       335784 :           vec_index = i / const_nunits;
   11202       335784 :           mask_element = i % const_nunits;
   11203       335784 :           if (vec_index == first_vec_index
   11204       335784 :               || first_vec_index == -1)
   11205              :             {
   11206              :               first_vec_index = vec_index;
   11207              :             }
   11208       133480 :           else if (vec_index == second_vec_index
   11209       133480 :                    || second_vec_index == -1)
   11210              :             {
   11211       127428 :               second_vec_index = vec_index;
   11212       127428 :               mask_element += const_nunits;
   11213              :             }
   11214              :           else
   11215              :             {
   11216         6052 :               if (dump_p)
   11217          280 :                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   11218              :                                  "permutation requires at "
   11219              :                                  "least three vectors %G",
   11220              :                                  stmt_info->stmt);
   11221         6052 :               gcc_assert (analyze_only);
   11222              :               return false;
   11223              :             }
   11224              : 
   11225       329732 :           gcc_assert (mask_element < 2 * const_nunits);
   11226              :         }
   11227              : 
   11228       520334 :       if (mask_element != index)
   11229       333318 :         noop_p = false;
   11230       520334 :       mask[index++] = mask_element;
   11231              : 
   11232       520334 :       if (index == count)
   11233              :         {
   11234       142308 :           if (!noop_p)
   11235              :             {
   11236       195070 :               indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
   11237       115458 :               if (!can_vec_perm_const_p (mode, mode, indices))
   11238              :                 {
   11239         3523 :                   if (dump_p)
   11240              :                     {
   11241           79 :                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   11242              :                                        "unsupported vect permute { ");
   11243          669 :                       for (i = 0; i < count; ++i)
   11244              :                         {
   11245          590 :                           dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
   11246          590 :                           dump_printf (MSG_MISSED_OPTIMIZATION, " ");
   11247              :                         }
   11248           79 :                       dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
   11249              :                     }
   11250         3523 :                   gcc_assert (analyze_only);
   11251              :                   return false;
   11252              :                 }
   11253              : 
   11254       111935 :               tree mask_vec = NULL_TREE;
   11255       111935 :               if (!analyze_only)
   11256        20634 :                 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
   11257              : 
   11258       111935 :               if (second_vec_index == -1)
   11259        33890 :                 second_vec_index = first_vec_index;
   11260              : 
   11261       225861 :               for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
   11262              :                 {
   11263       113926 :                   ++*n_perms;
   11264       113926 :                   if (analyze_only)
   11265        93009 :                     continue;
   11266              :                   /* Generate the permute statement if necessary.  */
   11267        20917 :                   tree first_vec = dr_chain[first_vec_index + ri];
   11268        20917 :                   tree second_vec = dr_chain[second_vec_index + ri];
   11269        20917 :                   gassign *stmt = as_a<gassign *> (stmt_info->stmt);
   11270        20917 :                   tree perm_dest
   11271        20917 :                     = vect_create_destination_var (gimple_assign_lhs (stmt),
   11272              :                                                    vectype);
   11273        20917 :                   perm_dest = make_ssa_name (perm_dest);
   11274        20917 :                   gimple *perm_stmt
   11275        20917 :                     = gimple_build_assign (perm_dest, VEC_PERM_EXPR, first_vec,
   11276              :                                            second_vec, mask_vec);
   11277        20917 :                   vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
   11278              :                                                gsi);
   11279        20917 :                   if (dce_chain)
   11280              :                     {
   11281        20148 :                       bitmap_set_bit (used_defs, first_vec_index + ri);
   11282        20148 :                       bitmap_set_bit (used_defs, second_vec_index + ri);
   11283              :                     }
   11284              : 
   11285              :                   /* Store the vector statement in NODE.  */
   11286        20917 :                   SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = perm_dest;
   11287              :                 }
   11288              :             }
   11289        26850 :           else if (!analyze_only)
   11290              :             {
   11291         2714 :               for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
   11292              :                 {
   11293         1357 :                   tree first_vec = dr_chain[first_vec_index + ri];
   11294              :                   /* If mask was NULL_TREE generate the requested
   11295              :                      identity transform.  */
   11296         1357 :                   if (dce_chain)
   11297         1356 :                     bitmap_set_bit (used_defs, first_vec_index + ri);
   11298              : 
   11299              :                   /* Store the vector statement in NODE.  */
   11300         1357 :                   SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = first_vec;
   11301              :                 }
   11302              :             }
   11303              : 
   11304              :           index = 0;
   11305              :           first_vec_index = -1;
   11306              :           second_vec_index = -1;
   11307              :           noop_p = true;
   11308              :         }
   11309              :     }
   11310              : 
   11311       110087 :   if (n_loads)
   11312              :     {
   11313        75628 :       if (repeating_p)
   11314        10354 :         *n_loads = nstmts;
   11315              :       else
   11316              :         {
   11317              :           /* Enforced above when !repeating_p.  */
   11318        65274 :           unsigned int const_nunits = nunits.to_constant ();
   11319        65274 :           *n_loads = 0;
   11320        65274 :           bool load_seen = false;
   11321       929794 :           for (unsigned i = 0; i < in_nlanes; ++i)
   11322              :             {
   11323       864520 :               if (i % const_nunits == 0)
   11324              :                 {
   11325       368131 :                   if (load_seen)
   11326       104465 :                     *n_loads += 1;
   11327              :                   load_seen = false;
   11328              :                 }
   11329       864520 :               if (bitmap_bit_p (used_in_lanes, i))
   11330       234285 :                 load_seen = true;
   11331              :             }
   11332        65274 :           if (load_seen)
   11333        42965 :             *n_loads += 1;
   11334              :         }
   11335              :     }
   11336              : 
   11337       110087 :   if (dce_chain)
   11338       209030 :     for (unsigned i = 0; i < dr_chain.length (); ++i)
   11339        72213 :       if (!bitmap_bit_p (used_defs, i))
   11340              :         {
   11341        39350 :           tree def = dr_chain[i];
   11342        39685 :           do
   11343              :             {
   11344        39685 :               gimple *stmt = SSA_NAME_DEF_STMT (def);
   11345        39685 :               if (is_gimple_assign (stmt)
   11346        39685 :                   && (gimple_assign_rhs_code (stmt) == VIEW_CONVERT_EXPR
   11347        39685 :                       || gimple_assign_rhs_code (stmt) == CONSTRUCTOR))
   11348         4916 :                 def = single_ssa_tree_operand (stmt, SSA_OP_USE);
   11349              :               else
   11350              :                 def = NULL;
   11351        39685 :               gimple_stmt_iterator rgsi = gsi_for_stmt (stmt);
   11352        39685 :               gsi_remove (&rgsi, true);
   11353        39685 :               release_defs (stmt);
   11354              :             }
   11355        39685 :           while (def);
   11356              :         }
   11357              : 
   11358              :   return true;
   11359       119662 : }
   11360              : 
   11361              : /* Generate vector permute statements from a list of loads in DR_CHAIN.
   11362              :    If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
   11363              :    permute statements for the SLP node NODE.  Store the number of vector
   11364              :    permute instructions in *N_PERMS and the number of vector load
   11365              :    instructions in *N_LOADS.  If DCE_CHAIN is true, remove all definitions
   11366              :    that were not needed.
                      :
                      :    This is a thin wrapper: it forwards every argument unchanged to
                      :    vect_transform_slp_perm_load_1, supplying NODE's own
                      :    SLP_TREE_LOAD_PERMUTATION as the permutation and the current
                      :    dump_enabled_p () state as the helper's dump flag, and returns
                      :    the helper's result.
                      :
                      :    The helper only writes *N_LOADS after testing N_LOADS for null,
                      :    whereas it increments *N_PERMS without a null check, so N_PERMS
                      :    has to point to valid storage.  */
   11367              : 
   11368              : bool
   11369        84323 : vect_transform_slp_perm_load (vec_info *vinfo,
   11370              :                               slp_tree node, const vec<tree> &dr_chain,
   11371              :                               gimple_stmt_iterator *gsi, poly_uint64 vf,
   11372              :                               bool analyze_only, unsigned *n_perms,
   11373              :                               unsigned int *n_loads, bool dce_chain)
   11374              : {
   11375        84323 :   return vect_transform_slp_perm_load_1 (vinfo, node,
   11376        84323 :                                          SLP_TREE_LOAD_PERMUTATION (node),
   11377              :                                          dr_chain, gsi, vf, analyze_only,
   11378              :                                          dump_enabled_p (), n_perms, n_loads,
   11379        84323 :                                          dce_chain);
   11380              : }
   11381              : 
   11382              : /* Produce the next vector result for SLP permutation NODE by adding a vector
   11383              :    statement at GSI.  If MASK_VEC is nonnull, add:
   11384              : 
   11385              :       <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC>
   11386              : 
   11387              :    otherwise add:
   11388              : 
   11389              :       <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF,
   11390              :                                       { N, N+1, N+2, ... }>
   11391              : 
   11392              :    where N == IDENTITY_OFFSET which is either zero or equal to the
   11393              :    number of elements of the result.
                      :
                      :    Note that in the null-MASK_VEC case no VEC_PERM_EXPR is literally
                      :    built.  IDENTITY_OFFSET is divided (can_div_trunc_p) by the number
                      :    of elements of FIRST_DEF's type, giving a vector number VECNO and
                      :    an element number ELTNO; the division is required to succeed.
                      :    An odd VECNO selects SECOND_DEF, an even one FIRST_DEF.  The
                      :    selected def is then either copied, partially extracted with a
                      :    BIT_FIELD_REF, or (for a result with twice as many elements)
                      :    FIRST_DEF and SECOND_DEF are concatenated with a CONSTRUCTOR.
                      :
                      :    Every statement created here, including any VIEW_CONVERT_EXPR
                      :    conversions of the inputs, is emitted through
                      :    vect_finish_stmt_generation with NULL passed in place of a
                      :    statement info, and the final statement is recorded on NODE with
                      :    push_vec_def.  The result always has type SLP_TREE_VECTYPE (NODE).  */
   11394              : 
   11395              : static void
   11396        31379 : vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
   11397              :                           slp_tree node, tree first_def, tree second_def,
   11398              :                           tree mask_vec, poly_uint64 identity_offset)
   11399              : {
   11400        31379 :   tree vectype = SLP_TREE_VECTYPE (node);
   11401              : 
   11402              :   /* ???  We SLP match existing vector element extracts but
   11403              :      allow punning which we need to re-instantiate at uses
   11404              :      but have no good way of explicitly representing.
                      :
                      :      Concretely: when FIRST_DEF has the same size as VECTYPE but an
                      :      incompatible type, view-convert it to VECTYPE first.  The same
                      :      conversion is applied to SECOND_DEF further down, but only when
                      :      a permute mask is present.
                      :      NOTE(review): the size test guarding the SECOND_DEF conversion
                      :      inspects FIRST_DEF's type rather than SECOND_DEF's; presumably
                      :      this relies on both inputs having the same vector type --
                      :      confirm against the callers.  */
   11405        31379 :   if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)), TYPE_SIZE (vectype))
   11406        31379 :       && !types_compatible_p (TREE_TYPE (first_def), vectype))
   11407              :     {
   11408           14 :       gassign *conv_stmt
   11409           14 :         = gimple_build_assign (make_ssa_name (vectype),
   11410              :                                build1 (VIEW_CONVERT_EXPR, vectype, first_def));
   11411           14 :       vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
   11412           14 :       first_def = gimple_assign_lhs (conv_stmt);
   11413              :     }
   11414        31379 :   gassign *perm_stmt;
   11415        31379 :   tree perm_dest = make_ssa_name (vectype);
   11416        31379 :   if (mask_vec)
   11417              :     {
   11418        28100 :       if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)),
   11419        28100 :                            TYPE_SIZE (vectype))
   11420        28100 :           && !types_compatible_p (TREE_TYPE (second_def), vectype))
   11421              :         {
   11422            8 :           gassign *conv_stmt
   11423            8 :             = gimple_build_assign (make_ssa_name (vectype),
   11424              :                                    build1 (VIEW_CONVERT_EXPR,
   11425              :                                            vectype, second_def));
   11426            8 :           vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
   11427            8 :           second_def = gimple_assign_lhs (conv_stmt);
   11428              :         }
   11429        28100 :       perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
   11430              :                                        first_def, second_def,
   11431              :                                        mask_vec);
   11432              :     }
   11433              :   else
   11434              :     {
   11435         3279 :       auto def_nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (first_def));
   11436         3279 :       unsigned HOST_WIDE_INT vecno;
   11437         3279 :       poly_uint64 eltno;
   11438         3279 :       if (!can_div_trunc_p (poly_uint64 (identity_offset), def_nunits,
   11439              :                             &vecno, &eltno))
   11440              :         gcc_unreachable ();
   11441         3279 :       tree def = vecno & 1 ? second_def : first_def;
   11442         3279 :       if (!types_compatible_p (TREE_TYPE (def), vectype))
   11443              :         {
   11444              :           /* For identity permutes we still need to handle the case
   11445              :              of offset extracts or concats.
                      :
                      :              If VECTYPE has no more elements than the def, extract a
                      :              VECTYPE-sized piece of DEF with a BIT_FIELD_REF positioned
                      :              at ELTNO times the size of DEF's element type.  Otherwise
                      :              the only supported shape is a result with exactly twice
                      :              as many elements as the def, built as a two-element
                      :              CONSTRUCTOR of FIRST_DEF and SECOND_DEF; that form
                      :              requires IDENTITY_OFFSET to be zero.  Anything else is
                      :              treated as unreachable.  */
   11446          261 :           unsigned HOST_WIDE_INT c;
   11447          261 :           if (known_le (TYPE_VECTOR_SUBPARTS (vectype), def_nunits))
   11448              :             {
   11449          257 :               unsigned HOST_WIDE_INT elsz
   11450          257 :                 = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (def))));
   11451          514 :               tree lowpart = build3 (BIT_FIELD_REF, vectype, def,
   11452          257 :                                      TYPE_SIZE (vectype),
   11453          257 :                                      bitsize_int (eltno * elsz));
   11454          257 :               perm_stmt = gimple_build_assign (perm_dest, lowpart);
   11455              :             }
   11456            4 :           else if (constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype),
   11457            4 :                                         def_nunits, &c) && c == 2)
   11458              :             {
   11459            4 :               gcc_assert (known_eq (identity_offset, 0U));
   11460            4 :               tree ctor = build_constructor_va (vectype, 2,
   11461              :                                                 NULL_TREE, first_def,
   11462              :                                                 NULL_TREE, second_def);
   11463            4 :               perm_stmt = gimple_build_assign (perm_dest, ctor);
   11464              :             }
   11465              :           else
   11466            0 :             gcc_unreachable ();
   11467              :         }
   11468              :       else
   11469              :         {
   11470              :           /* We need a copy here in case the def was external.
                      :              The whole def is used as-is, so the element offset
                      :              within it is asserted to be zero.  */
   11471         3018 :           gcc_assert (known_eq (eltno, 0U));
   11472         3018 :           perm_stmt = gimple_build_assign (perm_dest, def);
   11473              :         }
   11474              :     }
   11475        31379 :   vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
   11476              :   /* Store the vector statement in NODE.  */
   11477        31379 :   node->push_vec_def (perm_stmt);
   11478        31379 : }
   11479              : 
   11480              : /* Subroutine of vectorizable_slp_permutation.  Check whether the target
   11481              :    can perform permutation PERM on the (1 or 2) input nodes in CHILDREN.
   11482              :    If GSI is nonnull, emit the permutation there.
   11483              : 
   11484              :    When GSI is null, the only purpose of NODE is to give properties
   11485              :    of the result, such as the vector type and number of SLP lanes.
   11486              :    The node does not need to be a VEC_PERM_EXPR.
   11487              : 
   11488              :    If the target supports the operation, return the number of individual
   11489              :    VEC_PERM_EXPRs needed, otherwise return -1.  Print information to the
   11490              :    dump file if DUMP_P is true.  */
   11491              : 
   11492              : static int
   11493       436481 : vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
   11494              :                                 slp_tree node, lane_permutation_t &perm,
   11495              :                                 vec<slp_tree> &children, bool dump_p)
   11496              : {
   11497       436481 :   tree vectype = SLP_TREE_VECTYPE (node);
   11498              : 
   11499              :   /* ???  We currently only support all same vector input types
   11500              :      while the SLP IL should really do a concat + select and thus accept
   11501              :      arbitrary mismatches.  */
   11502       436481 :   slp_tree child;
   11503       436481 :   unsigned i;
   11504       436481 :   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
   11505       436481 :   bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node));
   11506              :   /* True if we're permuting a single input of 2N vectors down
   11507              :      to N vectors.  This case doesn't generalize beyond 2 since
   11508              :      VEC_PERM_EXPR only takes 2 inputs.  */
   11509       436481 :   bool pack_p = false;
   11510              :   /* If we're permuting inputs of N vectors each into X*N outputs,
   11511              :      this is the value of X, otherwise it is 1.  */
   11512       436481 :   unsigned int unpack_factor = 1;
   11513       436481 :   tree op_vectype = NULL_TREE;
   11514       437666 :   FOR_EACH_VEC_ELT (children, i, child)
   11515       437587 :     if (SLP_TREE_VECTYPE (child))
   11516              :       {
   11517              :         op_vectype = SLP_TREE_VECTYPE (child);
   11518              :         break;
   11519              :       }
   11520       436481 :   if (!op_vectype)
   11521           79 :     op_vectype = vectype;
   11522       932244 :   FOR_EACH_VEC_ELT (children, i, child)
   11523              :     {
   11524       495763 :       if ((SLP_TREE_DEF_TYPE (child) != vect_internal_def
   11525        10086 :            && !vect_maybe_update_slp_op_vectype (child, op_vectype))
   11526       495763 :           || !types_compatible_p (SLP_TREE_VECTYPE (child), op_vectype)
   11527       991526 :           || !types_compatible_p (TREE_TYPE (vectype), TREE_TYPE (op_vectype)))
   11528              :         {
   11529            0 :           if (dump_p)
   11530            0 :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   11531              :                              "Unsupported vector types in lane permutation\n");
   11532            0 :           return -1;
   11533              :         }
   11534       495763 :       auto op_nunits = TYPE_VECTOR_SUBPARTS (op_vectype);
   11535       495763 :       unsigned int this_unpack_factor;
   11536              :       /* Detect permutations of external, pre-existing vectors.  The external
   11537              :          node's SLP_TREE_LANES stores the total number of units in the vector,
   11538              :          or zero if the vector has variable length.
   11539              : 
   11540              :          We are expected to keep the original VEC_PERM_EXPR for such cases.
   11541              :          There is no repetition to model.  */
   11542       495763 :       if (SLP_TREE_DEF_TYPE (child) == vect_external_def
   11543       495763 :           && SLP_TREE_SCALAR_OPS (child).is_empty ())
   11544              :         repeating_p = false;
   11545              :       /* Check whether the input has twice as many lanes per vector.  */
   11546       487791 :       else if (children.length () == 1
   11547       487791 :                && known_eq (SLP_TREE_LANES (child) * nunits,
   11548              :                             SLP_TREE_LANES (node) * op_nunits * 2))
   11549              :         pack_p = true;
   11550              :       /* Check whether the output has N times as many lanes per vector.  */
   11551       495763 :       else if (constant_multiple_p (SLP_TREE_LANES (node) * op_nunits,
   11552       445274 :                                     SLP_TREE_LANES (child) * nunits,
   11553              :                                     &this_unpack_factor)
   11554       410691 :                && (i == 0 || unpack_factor == this_unpack_factor))
   11555              :         unpack_factor = this_unpack_factor;
   11556              :       else
   11557              :         repeating_p = false;
   11558              :     }
   11559              : 
   11560       872962 :   gcc_assert (perm.length () == SLP_TREE_LANES (node));
   11561              : 
   11562              :   /* Load-lanes permute.  This permute only acts as a forwarder to
   11563              :      select the correct vector def of the load-lanes load which
   11564              :      has the permuted vectors in its vector defs like
   11565              :      { v0, w0, r0, v1, w1, r1 ... } for a ld3.  All costs are
   11566              :      accounted for in the costing for the actual load so we
   11567              :      return zero here.  */
   11568       436481 :   if (node->ldst_lanes)
   11569              :     {
   11570            0 :       gcc_assert (children.length () == 1);
   11571            0 :       if (!gsi)
   11572              :         /* This is a trivial op always supported.  */
   11573              :         return 0;
   11574            0 :       slp_tree child = children[0];
   11575            0 :       unsigned vec_idx = (SLP_TREE_LANE_PERMUTATION (node)[0].second
   11576            0 :                           / SLP_TREE_LANES (node));
   11577            0 :       unsigned vec_num = SLP_TREE_LANES (child) / SLP_TREE_LANES (node);
   11578            0 :       unsigned nvectors = vect_get_num_copies (vinfo, node);
   11579            0 :       for (unsigned i = 0; i < nvectors; ++i)
   11580              :         {
   11581            0 :           tree def = SLP_TREE_VEC_DEFS (child)[i * vec_num  + vec_idx];
   11582            0 :           node->push_vec_def (def);
   11583              :         }
   11584              :       return 0;
   11585              :     }
   11586              : 
   11587              :   /* Set REPEATING_P to true if the permutations are cyclical wrt UNPACK_FACTOR
   11588              :      and if we can generate the vectors in a vector-length agnostic way.
   11589              :      This requires UNPACK_STEP == NUNITS / UNPACK_FACTOR to be known at
   11590              :      compile time.
   11591              : 
   11592              :      The significance of UNPACK_STEP is that, when PACK_P is false,
   11593              :      output vector I operates on a window of UNPACK_STEP elements from each
   11594              :      input, starting at lane UNPACK_STEP * (I % UNPACK_FACTOR).  For example,
   11595              :      when UNPACK_FACTOR is 2, the first output vector operates on lanes
   11596              :      [0, NUNITS / 2 - 1] of each input vector and the second output vector
   11597              :      operates on lanes [NUNITS / 2, NUNITS - 1] of each input vector.
   11598              : 
   11599              :      When REPEATING_P is true, NOUTPUTS holds the total number of outputs
   11600              :      that we actually need to generate.  */
   11601       436481 :   uint64_t noutputs = 0;
   11602       436481 :   poly_uint64 unpack_step = 0;
   11603       436481 :   loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo);
   11604       148604 :   if (!linfo
   11605       474632 :       || !multiple_p (nunits, unpack_factor, &unpack_step)
   11606       147704 :       || !constant_multiple_p (LOOP_VINFO_VECT_FACTOR (linfo)
   11607       147704 :                                * SLP_TREE_LANES (node), nunits, &noutputs))
   11608              :     repeating_p = false;
   11609              : 
   11610              :   /* We can handle the conditions described for REPEATING_P above for
   11611              :      both variable- and constant-length vectors.  The fallback requires
   11612              :      us to generate every element of every permute vector explicitly,
   11613              :      which is only possible for constant-length permute vectors.
   11614              : 
   11615              :      Set:
   11616              : 
   11617              :      - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
   11618              :        mask vectors that we want to build.
   11619              : 
   11620              :      - NCOPIES to the number of copies of PERM that we need in order
   11621              :        to build the necessary permute mask vectors.  */
   11622       147704 :   uint64_t npatterns;
   11623       147704 :   unsigned nelts_per_pattern;
   11624       147704 :   uint64_t ncopies;
   11625       147704 :   if (repeating_p)
   11626              :     {
   11627              :       /* We need permute mask vectors that have the form:
   11628              : 
   11629              :            { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }
   11630              : 
   11631              :          In other words, the original n-element permute in PERM is
   11632              :          "unrolled" to fill a full vector.  The stepped vector encoding
   11633              :          that we use for permutes requires 3n elements.  */
   11634       109553 :       npatterns = SLP_TREE_LANES (node);
   11635       109553 :       nelts_per_pattern = ncopies = 3;
   11636              :     }
   11637              :   else
   11638              :     {
   11639              :       /* Calculate every element of every permute mask vector explicitly,
   11640              :          instead of relying on the pattern described above.  */
   11641       326928 :       if (!nunits.is_constant (&npatterns)
   11642       326928 :           || !TYPE_VECTOR_SUBPARTS (op_vectype).is_constant ())
   11643              :         {
   11644              :           if (dump_p)
   11645              :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   11646              :                              "unsupported permutation %p on variable-length"
   11647              :                              " vectors\n", (void *) node);
   11648              :           return -1;
   11649              :         }
   11650       326928 :       nelts_per_pattern = ncopies = 1;
   11651       326928 :       if (linfo && !LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
   11652              :         {
   11653              :           if (dump_p)
   11654              :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   11655              :                              "unsupported permutation %p for variable VF\n",
   11656              :                              (void *) node);
   11657              :           return -1;
   11658              :         }
   11659              :       pack_p = false;
   11660              :       unpack_factor = 1;
   11661              :     }
   11662       436481 :   unsigned olanes = unpack_factor * ncopies * SLP_TREE_LANES (node);
   11663       436481 :   gcc_assert (repeating_p || multiple_p (olanes, nunits));
   11664              : 
   11665              :   /* Compute the { { SLP operand, vector index}, lane } permutation sequence
   11666              :      from the { SLP operand, scalar lane } permutation as recorded in the
   11667              :      SLP node as intermediate step.  This part should already work
   11668              :      with SLP children with arbitrary number of lanes.  */
   11669       436481 :   auto_vec<std::pair<std::pair<unsigned, unsigned>, poly_uint64>> vperm;
   11670       436481 :   auto_vec<poly_uint64> active_lane;
   11671       436481 :   vperm.create (olanes);
   11672       436481 :   active_lane.safe_grow_cleared (children.length (), true);
   11673       879932 :   for (unsigned int ui = 0; ui < unpack_factor; ++ui)
   11674              :     {
   11675      1906260 :       for (unsigned j = 0; j < children.length (); ++j)
   11676       509679 :         active_lane[j] = ui * unpack_step;
   11677      1218750 :       for (unsigned i = 0; i < ncopies; ++i)
   11678              :         {
   11679      4845936 :           for (unsigned pi = 0; pi < perm.length (); ++pi)
   11680              :             {
   11681      1647669 :               std::pair<unsigned, unsigned> p = perm[pi];
   11682      1647669 :               tree vtype = SLP_TREE_VECTYPE (children[p.first]);
   11683      1647669 :               if (repeating_p)
   11684       626427 :                 vperm.quick_push ({{p.first, 0},
   11685       626427 :                                    p.second + active_lane[p.first]});
   11686              :               else
   11687              :                 {
   11688              :                   /* We checked above that the vectors are constant-length.  */
   11689      1021242 :                   unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype)
   11690      1021242 :                     .to_constant ();
   11691      1021242 :                   unsigned lane = active_lane[p.first].to_constant ();
   11692      1021242 :                   unsigned vi = (lane + p.second) / vnunits;
   11693      1021242 :                   unsigned vl = (lane + p.second) % vnunits;
   11694      1021242 :                   vperm.quick_push ({{p.first, vi}, vl});
   11695              :                 }
   11696              :             }
   11697              :           /* Advance to the next group.  */
   11698      1669799 :           for (unsigned j = 0; j < children.length (); ++j)
   11699       894500 :             active_lane[j] += SLP_TREE_LANES (children[j]);
   11700              :         }
   11701              :     }
   11702              : 
   11703       436481 :   if (dump_p)
   11704              :     {
   11705         8827 :       dump_printf_loc (MSG_NOTE, vect_location,
   11706              :                        "vectorizing permutation %p", (void *)node);
   11707        31996 :       for (unsigned i = 0; i < perm.length (); ++i)
   11708        23169 :         dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
   11709         8827 :       if (repeating_p)
   11710         7427 :         dump_printf (MSG_NOTE, " (repeat %d)", SLP_TREE_LANES (node));
   11711         8827 :       dump_printf (MSG_NOTE, "\n");
   11712         8827 :       dump_printf_loc (MSG_NOTE, vect_location, "as");
   11713        88790 :       for (unsigned i = 0; i < vperm.length (); ++i)
   11714              :         {
   11715        79963 :           if (i != 0
   11716        79963 :               && (repeating_p
   11717        53986 :                   ? multiple_p (i, npatterns)
   11718        59505 :                   : multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype))))
   11719        23952 :             dump_printf (MSG_NOTE, ",");
   11720        79963 :           dump_printf (MSG_NOTE, " vops%u[%u][",
   11721        79963 :                        vperm[i].first.first, vperm[i].first.second);
   11722        79963 :           dump_dec (MSG_NOTE, vperm[i].second);
   11723        79963 :           dump_printf (MSG_NOTE, "]");
   11724              :         }
   11725         8827 :       dump_printf (MSG_NOTE, "\n");
   11726              :     }
   11727              : 
   11728              :   /* We can only handle two-vector permutes, everything else should
   11729              :      be lowered on the SLP level.  The following is closely inspired
   11730              :      by vect_transform_slp_perm_load and is supposed to eventually
   11731              :      replace it.
   11732              :      ???   As intermediate step do code-gen in the SLP tree representation
   11733              :      somehow?  */
   11734       436481 :   std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
   11735       436481 :   std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
   11736       436481 :   unsigned int index = 0;
   11737       436481 :   poly_uint64 mask_element;
   11738       436481 :   vec_perm_builder mask;
   11739       436481 :   mask.new_vector (nunits, npatterns, nelts_per_pattern);
   11740       436481 :   unsigned int count = mask.encoded_nelts ();
   11741       436481 :   mask.quick_grow (count);
   11742       436481 :   vec_perm_indices indices;
   11743       436481 :   unsigned nperms = 0;
   11744              :   /* When REPEATING_P is true, we only have UNPACK_FACTOR unique permute
   11745              :      vectors to check during analysis, but we need to generate NOUTPUTS
   11746              :      vectors during transformation.  */
   11747       436481 :   unsigned total_nelts = olanes;
   11748       436481 :   unsigned process_nelts = olanes;
   11749       436481 :   if (repeating_p)
   11750              :     {
   11751       109553 :       total_nelts = (total_nelts / unpack_factor) * noutputs;
   11752       109553 :       if (gsi)
   11753         9879 :         process_nelts = total_nelts;
   11754              :     }
   11755       436481 :   unsigned last_ei = (total_nelts - 1) % process_nelts;
   11756      2093376 :   for (unsigned i = 0; i < process_nelts; ++i)
   11757              :     {
   11758              :       /* VI is the input vector index when generating code for REPEATING_P.  */
   11759      1664637 :       unsigned vi = i / olanes * (pack_p ? 2 : 1);
   11760      1664637 :       unsigned ei = i % olanes;
   11761      1664637 :       mask_element = vperm[ei].second;
   11762      1664637 :       if (pack_p)
   11763              :         {
   11764              :           /* In this case, we have N outputs and the single child provides 2N
   11765              :              inputs.  Output X permutes inputs 2X and 2X+1.
   11766              : 
   11767              :              The mask indices are taken directly from the SLP permutation node.
   11768              :              Index X selects from the first vector if (X / NUNITS) % 2 == 0;
   11769              :              X selects from the second vector otherwise.  These conditions
   11770              :              are only known at compile time for constant-length vectors.  */
   11771              :           first_vec = std::make_pair (0, 0);
   11772              :           second_vec = std::make_pair (0, 1);
   11773              :         }
   11774      1501311 :       else if (first_vec.first == -1U
   11775      1501311 :                || first_vec == vperm[ei].first)
   11776      1306360 :         first_vec = vperm[ei].first;
   11777       194951 :       else if (second_vec.first == -1U
   11778       194951 :                || second_vec == vperm[ei].first)
   11779              :         {
   11780       194563 :           second_vec = vperm[ei].first;
   11781       194563 :           mask_element += nunits;
   11782              :         }
   11783              :       else
   11784              :         {
   11785          388 :           if (dump_p)
   11786            7 :             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
   11787              :                              "permutation requires at "
   11788              :                              "least three vectors\n");
   11789          388 :           gcc_assert (!gsi);
   11790              :           return -1;
   11791              :         }
   11792              : 
   11793      1664249 :       mask[index++] = mask_element;
   11794              : 
   11795      1664249 :       if (index == count)
   11796              :         {
   11797       720210 :           indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
   11798              :                               TYPE_VECTOR_SUBPARTS (op_vectype));
   11799       573853 :           bool identity_p = (indices.series_p (0, 1, mask[0], 1)
   11800       889765 :                              && constant_multiple_p (mask[0], nunits));
   11801       573853 :           machine_mode vmode = TYPE_MODE (vectype);
   11802       573853 :           machine_mode op_vmode = TYPE_MODE (op_vectype);
   11803       573853 :           unsigned HOST_WIDE_INT c;
   11804       573853 :           if ((!identity_p
   11805       533772 :                && !can_vec_perm_const_p (vmode, op_vmode, indices))
   11806       573853 :               || (identity_p
   11807        40081 :                   && !known_le (nunits,
   11808              :                                 TYPE_VECTOR_SUBPARTS (op_vectype))
   11809         7362 :                   && (!constant_multiple_p (nunits,
   11810            8 :                                             TYPE_VECTOR_SUBPARTS (op_vectype),
   11811            8 :                                             &c) || c != 2)))
   11812              :             {
   11813         7354 :               if (dump_p)
   11814              :                 {
   11815          152 :                   dump_printf_loc (MSG_MISSED_OPTIMIZATION,
   11816              :                                    vect_location,
   11817              :                                    "unsupported vect permute { ");
   11818         1586 :                   for (i = 0; i < count; ++i)
   11819              :                     {
   11820         1434 :                       dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
   11821         1434 :                       dump_printf (MSG_MISSED_OPTIMIZATION, " ");
   11822              :                     }
   11823          152 :                   dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
   11824              :                 }
   11825         7354 :               gcc_assert (!gsi);
   11826         7742 :               return -1;
   11827              :             }
   11828              : 
   11829       566499 :           if (!identity_p)
   11830       526418 :             nperms += CEIL (total_nelts, process_nelts) - (ei > last_ei);
   11831       566499 :           if (gsi)
   11832              :             {
   11833        31379 :               if (second_vec.first == -1U)
   11834         7004 :                 second_vec = first_vec;
   11835              : 
   11836        31379 :               slp_tree
   11837        31379 :                 first_node = children[first_vec.first],
   11838        31379 :                 second_node = children[second_vec.first];
   11839              : 
   11840        31379 :               tree mask_vec = NULL_TREE;
   11841        31379 :               if (!identity_p)
   11842        28100 :                 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
   11843              : 
   11844        31379 :               tree first_def
   11845        31379 :                 = vect_get_slp_vect_def (first_node, first_vec.second + vi);
   11846        31379 :               tree second_def
   11847        31379 :                 = vect_get_slp_vect_def (second_node, second_vec.second + vi);
   11848        31379 :               vect_add_slp_permutation (vinfo, gsi, node, first_def,
   11849        31379 :                                         second_def, mask_vec, mask[0]);
   11850              :             }
   11851              : 
   11852              :           index = 0;
   11853              :           first_vec = std::make_pair (-1U, -1U);
   11854              :           second_vec = std::make_pair (-1U, -1U);
   11855              :         }
   11856              :     }
   11857              : 
   11858       428739 :   return nperms;
   11859       436481 : }
   11860              : 
   11861              : /* Vectorize the SLP permutations in NODE as specified
   11862              :    in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
   11863              :    child number and lane number.
   11864              :    Interleaving of two two-lane two-child SLP subtrees (not supported):
   11865              :      [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
   11866              :    A blend of two four-lane two-child SLP subtrees:
   11867              :      [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
   11868              :    Highpart of a four-lane one-child SLP subtree (not supported):
   11869              :      [ { 0, 2 }, { 0, 3 } ]
   11870              :    Only a subset of these is currently supported by the code
                      :    generation below.
                      :
                      :    The actual work is done by vectorizable_slp_permutation_1, which
                      :    returns the number of permutes required or a negative value on
                      :    failure.  Return false on failure and true otherwise.  When GSI
                      :    is null and at least one permute is needed, that many vec_perm
                      :    operations are recorded in COST_VEC as vect_body cost for NODE.  */
   11871              : 
   11872              : bool
   11873       115590 : vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
   11874              :                               slp_tree node, stmt_vector_for_cost *cost_vec)
   11875              : {
   11876       115590 :   tree vectype = SLP_TREE_VECTYPE (node);
   11877       115590 :   lane_permutation_t &perm = SLP_TREE_LANE_PERMUTATION (node);
   11878       115590 :   int nperms = vectorizable_slp_permutation_1 (vinfo, gsi, node, perm,
   11879       115590 :                                                SLP_TREE_CHILDREN (node),
   11880              :                                                dump_enabled_p ());
   11881       115590 :   if (nperms < 0)
   11882              :     return false;
   11883              : 
   11884       114261 :   if (!gsi && nperms != 0)
   11885        92757 :     record_stmt_cost (cost_vec, nperms, vec_perm, node, vectype, 0, vect_body);
   11886              : 
   11887              :   return true;
   11888              : }
   11889              : 
   11890              : /* Vectorize SLP NODE, which is part of INSTANCE.
                      :
                      :    For constant and external nodes this only creates the vector
                      :    defs (via vect_create_constant_vectors), and only when the node
                      :    has a vector type and the defs do not exist yet.
                      :
                      :    For all other nodes SLP_TREE_VEC_DEFS is allocated (when the node
                      :    has a vector type), an insertion iterator is chosen and
                      :    vect_transform_stmt is called with the representative stmt.  The
                      :    insertion iterator is
                      :      - for non-permute nodes with a data reference, the one for the
                      :        first scalar stmt (loads) or the last scalar stmt (stores),
                      :      - gsi_none for non-permute cycle PHI, induction and PHI nodes,
                      :      - otherwise derived from the dominance-wise last definition
                      :        among the children's defs, with the special cases commented
                      :        in the code below.  */
   11891              : 
   11892              : static void
   11893      1466671 : vect_schedule_slp_node (vec_info *vinfo,
   11894              :                         slp_tree node, slp_instance instance)
   11895              : {
   11896      1466671 :   gimple_stmt_iterator si;
   11897      1466671 :   int i;
   11898      1466671 :   slp_tree child;
   11899              : 
   11900              :   /* Vectorize externals and constants.  */
   11901      1466671 :   if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
   11902      1466671 :       || SLP_TREE_DEF_TYPE (node) == vect_external_def)
   11903              :     {
   11904              :       /* ???  vectorizable_shift can end up using a scalar operand which is
   11905              :          currently denoted as !SLP_TREE_VECTYPE.  No need to vectorize the
   11906              :          node in this case.  */
   11907       496810 :       if (!SLP_TREE_VECTYPE (node))
   11908       496810 :         return;
   11909              : 
   11910              :       /* There are two reasons vector defs might already exist.  The first
   11911              :          is that we are vectorizing an existing vector def.  The second is
   11912              :          when performing BB vectorization shared constant/external nodes
   11913              :          are not split apart during partitioning so during the code-gen
   11914              :          DFS walk we can end up visiting them twice.  */
   11915       490512 :       if (! SLP_TREE_VEC_DEFS (node).exists ())
   11916       489703 :         vect_create_constant_vectors (vinfo, node);
   11917       490512 :       return;
   11918              :     }
   11919              : 
   11920       969861 :   stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
   11921              : 
   11922       969861 :   gcc_assert (SLP_TREE_VEC_DEFS (node).is_empty ());
   11923       969861 :   if (SLP_TREE_VECTYPE (node))
   11924       969855 :     SLP_TREE_VEC_DEFS (node).create (vect_get_num_copies (vinfo, node));
   11925              : 
   11926       969861 :   if (!SLP_TREE_PERMUTE_P (node) && STMT_VINFO_DATA_REF (stmt_info))
   11927              :     {
   11928              :       /* Vectorized loads go before the first scalar load to make it
   11929              :          ready early, vectorized stores go before the last scalar
   11930              :          stmt which is where all uses are ready.  */
   11931       709588 :       stmt_vec_info last_stmt_info = NULL;
   11932       709588 :       if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
   11933       165901 :         last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
   11934              :       else /* DR_IS_WRITE */
   11935       543687 :         last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
   11936       709588 :       si = gsi_for_stmt (last_stmt_info->stmt);
   11937       709588 :     }
   11938       260273 :   else if (!SLP_TREE_PERMUTE_P (node)
   11939       243796 :            && (SLP_TREE_TYPE (node) == cycle_phi_info_type
   11940              :                || SLP_TREE_TYPE (node) == induc_vec_info_type
   11941              :                || SLP_TREE_TYPE (node) == phi_info_type))
   11942              :     {
   11943              :       /* For PHI node vectorization we do not use the insertion iterator.  */
   11944        53997 :       si = gsi_none ();
   11945              :     }
   11946              :   else
   11947              :     {
   11948              :       /* Emit other stmts after the children vectorized defs which is
   11949              :          earliest possible.  LAST_STMT tracks the def seen so far that,
                      :          according to vect_stmt_dominates_stmt_p, is dominated by all
                      :          the others; SEEN_VECTOR_DEF is set when a child has no scalar
                      :          ops and vinfo->lookup_def finds nothing for its first vector
                      :          def.  */
   11950              :       gimple *last_stmt = NULL;
   11951              :       bool seen_vector_def = false;
   11952       574388 :       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
   11953       368112 :         if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
   11954              :           {
   11955              :             /* For fold-left reductions we are retaining the scalar
   11956              :                reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
   11957              :                set so the representation isn't perfect.  Resort to the
   11958              :                last scalar def here.  */
   11959       295360 :             if (SLP_TREE_VEC_DEFS (child).is_empty ())
   11960              :               {
   11961          866 :                 gcc_assert (SLP_TREE_TYPE (child) == cycle_phi_info_type);
   11962          866 :                 gphi *phi = as_a <gphi *>
   11963          866 :                               (vect_find_last_scalar_stmt_in_slp (child)->stmt);
   11964          866 :                 if (!last_stmt)
   11965              :                   last_stmt = phi;
   11966          648 :                 else if (vect_stmt_dominates_stmt_p (last_stmt, phi))
   11967              :                   last_stmt = phi;
   11968          637 :                 else if (vect_stmt_dominates_stmt_p (phi, last_stmt))
   11969              :                   ;
   11970              :                 else
   11971            0 :                   gcc_unreachable ();
   11972              :               }
   11973              :             /* We are emitting all vectorized stmts in the same place and
   11974              :                the last one is the last.
   11975              :                ???  Unless we have a load permutation applied and that
   11976              :                figures to re-use an earlier generated load.  */
   11977              :             unsigned j;
   11978              :             tree vdef;
   11979       697961 :             FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
   11980              :               {
   11981       402601 :                 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
   11982       402601 :                 if (!last_stmt)
   11983              :                   last_stmt = vstmt;
   11984       206964 :                 else if (vect_stmt_dominates_stmt_p (last_stmt, vstmt))
   11985              :                   last_stmt = vstmt;
   11986        45474 :                 else if (vect_stmt_dominates_stmt_p (vstmt, last_stmt))
   11987              :                   ;
   11988              :                 else
   11989            0 :                   gcc_unreachable ();
   11990              :               }
   11991              :           }
   11992        72752 :         else if (!SLP_TREE_VECTYPE (child))
   11993              :           {
   11994              :             /* For unvectorized externals we look at all scalar defs.  */
   11995              :             unsigned j;
   11996              :             tree def;
   11997        13434 :             FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
   11998         7770 :               if (TREE_CODE (def) == SSA_NAME
   11999         7770 :                   && !SSA_NAME_IS_DEFAULT_DEF (def))
   12000              :                 {
   12001          279 :                   gimple *stmt = SSA_NAME_DEF_STMT (def);
   12002          279 :                   if (gimple_uid (stmt) == -1u)
   12003              :                     /* If the stmt is not inside the region do not
   12004              :                        use it as possible insertion point.  */
   12005              :                     ;
   12006          271 :                   else if (!last_stmt)
   12007              :                     last_stmt = stmt;
   12008          255 :                   else if (vect_stmt_dominates_stmt_p (last_stmt, stmt))
   12009              :                     last_stmt = stmt;
   12010          153 :                   else if (vect_stmt_dominates_stmt_p (stmt, last_stmt))
   12011              :                     ;
   12012              :                   else
   12013            0 :                     gcc_unreachable ();
   12014              :                 }
   12015              :           }
   12016              :         else
   12017              :           {
   12018              :             /* For externals we have to look at all defs since their
   12019              :                insertion place is decided per vector.  But beware
   12020              :                of pre-existing vectors where we need to make sure
   12021              :                we do not insert before the region boundary.  */
   12022        67088 :             if (SLP_TREE_SCALAR_OPS (child).is_empty ()
   12023          657 :                 && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
   12024              :               seen_vector_def = true;
   12025              :             else
   12026              :               {
   12027              :                 unsigned j;
   12028              :                 tree vdef;
   12029       529966 :                 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
   12030        94883 :                   if (TREE_CODE (vdef) == SSA_NAME
   12031        94883 :                       && !SSA_NAME_IS_DEFAULT_DEF (vdef))
   12032              :                     {
   12033        19659 :                       gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
   12034        19659 :                       if (!last_stmt)
   12035              :                         last_stmt = vstmt;
   12036        10978 :                       else if (vect_stmt_dominates_stmt_p (last_stmt, vstmt))
   12037              :                         last_stmt = vstmt;
   12038         8725 :                       else if (vect_stmt_dominates_stmt_p (vstmt, last_stmt))
   12039              :                         ;
   12040              :                       else
   12041            0 :                         gcc_unreachable ();
   12042              :                     }
   12043              :               }
   12044              :           }
   12045              :       /* This can happen when all children are pre-existing vectors or
   12046              :          constants.  In that case fall back to the first scalar stmt of
                      :          NODE itself.  */
   12047       206276 :       if (!last_stmt)
   12048         1724 :         last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
   12049         1724 :       if (!last_stmt)
   12050              :         {
   12051            0 :           gcc_assert (seen_vector_def);
   12052            0 :           si = gsi_after_labels (vinfo->bbs[0]);
   12053              :         }
   12054       206276 :       else if (is_ctrl_altering_stmt (last_stmt))
   12055              :         {
   12056              :           /* We split regions to vectorize at control altering stmts
   12057              :              with a definition so this must be an external which
   12058              :              we can insert at the start of the region.  */
   12059            0 :           si = gsi_after_labels (vinfo->bbs[0]);
   12060              :         }
   12061       206276 :       else if (is_a <bb_vec_info> (vinfo)
   12062        18017 :                && !SLP_TREE_PERMUTE_P (node)
   12063        16591 :                && gimple_bb (last_stmt) != gimple_bb (stmt_info->stmt)
   12064       207717 :                && gimple_could_trap_p (stmt_info->stmt))
   12065              :         {
   12066              :           /* We've constrained possibly trapping operations to all come
   12067              :              from the same basic-block, if vectorized defs would allow earlier
   12068              :              scheduling still force vectorized stmts to the original block.
   12069              :              This is only necessary for BB vectorization since for loop vect
   12070              :              all operations are in a single BB and scalar stmt based
   12071              :              placement doesn't play well with epilogue vectorization.  */
   12072           53 :           gcc_assert (dominated_by_p (CDI_DOMINATORS,
   12073              :                                       gimple_bb (stmt_info->stmt),
   12074              :                                       gimple_bb (last_stmt)));
   12075           53 :           si = gsi_after_labels (gimple_bb (stmt_info->stmt));
   12076              :         }
   12077       206223 :       else if (is_a <gphi *> (last_stmt))
   12078        14380 :         si = gsi_after_labels (gimple_bb (last_stmt));
   12079              :       else
   12080              :         {
   12081       191843 :           si = gsi_for_stmt (last_stmt);
   12082       191843 :           gsi_next (&si);
   12083              : 
   12084              :           /* Avoid scheduling internal defs outside of the loop when
   12085              :              we might have only implicitly tracked loop mask/len defs.  */
   12086       191843 :           if (auto loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
   12087           74 :             if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
   12088       174121 :                 || LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
   12089              :               {
   12090           74 :                 gimple_stmt_iterator si2
   12091           74 :                   = gsi_after_labels (LOOP_VINFO_LOOP (loop_vinfo)->header);
   12092           74 :                 if ((gsi_end_p (si2)
   12093            0 :                      && (LOOP_VINFO_LOOP (loop_vinfo)->header
   12094            0 :                          != gimple_bb (last_stmt))
   12095            0 :                      && dominated_by_p (CDI_DOMINATORS,
   12096              :                                         LOOP_VINFO_LOOP (loop_vinfo)->header,
   12097            0 :                                         gimple_bb (last_stmt)))
   12098           74 :                     || (!gsi_end_p (si2)
   12099           74 :                         && last_stmt != *si2
   12100           72 :                         && vect_stmt_dominates_stmt_p (last_stmt, *si2)))
   12101            3 :                   si = si2;
   12102              :               }
   12103              :         }
   12104              :     }
   12105              : 
   12106       969861 :   if (dump_enabled_p ())
   12107              :     {
   12108        71780 :       if (stmt_info)
   12109        71727 :         dump_printf_loc (MSG_NOTE, vect_location,
   12110              :                          "------>vectorizing SLP node starting from: %G",
   12111              :                          stmt_info->stmt);
   12112              :       else
   12113              :         {
   12114           53 :           dump_printf_loc (MSG_NOTE, vect_location,
   12115              :                            "------>vectorizing SLP node:\n");
   12116           53 :           vect_print_slp_tree (MSG_NOTE, vect_location, node);
   12117              :         }
   12118              :     }
   12119       969861 :   vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
   12120              : }
   12121              : 
   12122              : /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
   12123              :    For loop vectorization this is done in vectorizable_call, but for SLP
   12124              :    it needs to be deferred until end of vect_schedule_slp, because multiple
   12125              :    SLP instances may refer to the same scalar stmt.
                      :
                      :    The walk recurses into the children of NODE first and only
                      :    handles vect_internal_def nodes; VISITED makes sure each node is
                      :    processed only once.  For every scalar stmt the original stmt
                      :    (vect_orig_stmt) is inspected; stmts that are not calls and calls
                      :    that are no longer in a basic block are skipped.  A call without
                      :    a lhs is replaced by a nop.  */
   12126              : 
   12127              : static void
   12128       601522 : vect_remove_slp_scalar_calls (vec_info *vinfo,
   12129              :                               slp_tree node, hash_set<slp_tree> &visited)
   12130              : {
   12131       601522 :   gimple *new_stmt;
   12132       601522 :   gimple_stmt_iterator gsi;
   12133       601522 :   int i;
   12134       601522 :   slp_tree child;
   12135       601522 :   tree lhs;
   12136       601522 :   stmt_vec_info stmt_info;
   12137              : 
   12138       601522 :   if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
   12139       188435 :     return;
   12140              : 
   12141       457090 :   if (visited.add (node))
   12142              :     return;
   12143              : 
   12144       924560 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
   12145       511473 :     vect_remove_slp_scalar_calls (vinfo, child, visited);
   12146              : 
   12147      1308157 :   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
   12148              :     {
   12149       486198 :       if (!stmt_info)
   12150         3974 :         continue;
   12151       482224 :       stmt_info = vect_orig_stmt (stmt_info);
   12152       482224 :       gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
   12153         5231 :       if (!stmt || gimple_bb (stmt) == NULL)
   12154       477031 :         continue;
   12155         5193 :       lhs = gimple_call_lhs (stmt);
   12156         5193 :       if (lhs)
   12157         4579 :         new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
   12158              :       else
   12159          614 :         new_stmt = gimple_build_nop ();
   12160         5193 :       unlink_stmt_vdef (stmt_info->stmt);
   12161         5193 :       gsi = gsi_for_stmt (stmt);
   12162         5193 :       vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
   12163         5193 :       if (lhs)
   12164         4579 :         SSA_NAME_DEF_STMT (lhs) = new_stmt;
   12165              :     }
   12166              : }
   12167              : /* Replace scalar calls in the SLP tree rooted at NODE by calling the
                      :    overload that takes a visited set, starting from an empty set.  */
   12168              : static void
   12169        90049 : vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
   12170              : {
   12171        90049 :   hash_set<slp_tree> visited;
   12172        90049 :   vect_remove_slp_scalar_calls (vinfo, node, visited);
   12173        90049 : }
   12174              : 
   12175              : /* Vectorize the instance root.  */
   12176              : 
   12177              : void
   12178        10984 : vectorize_slp_instance_root_stmt (vec_info *vinfo, slp_tree node, slp_instance instance)
   12179              : {
   12180        10984 :   gassign *rstmt = NULL;
   12181              : 
   12182        10984 :   if (instance->kind == slp_inst_kind_ctor)
   12183              :     {
   12184         5068 :       if (SLP_TREE_VEC_DEFS (node).length () == 1)
   12185              :         {
   12186         5031 :           tree vect_lhs = SLP_TREE_VEC_DEFS (node)[0];
   12187         5031 :           tree root_lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
   12188         5031 :           if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
   12189         5031 :                                           TREE_TYPE (vect_lhs)))
   12190            0 :             vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
   12191              :                                vect_lhs);
   12192         5031 :           rstmt = gimple_build_assign (root_lhs, vect_lhs);
   12193              :         }
   12194              :       else
   12195              :         {
   12196           37 :           gcc_assert (SLP_TREE_VEC_DEFS (node).length () > 1);
   12197           37 :           tree child_def;
   12198           37 :           int j;
   12199           37 :           vec<constructor_elt, va_gc> *v;
   12200           37 :           vec_alloc (v, SLP_TREE_VEC_DEFS (node).length ());
   12201              : 
   12202              :           /* A CTOR can handle V16HI composition from VNx8HI so we
   12203              :              do not need to convert vector elements if the types
   12204              :              do not match.  */
   12205          111 :           FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (node), j, child_def)
   12206           74 :             CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, child_def);
   12207           37 :           tree lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
   12208           37 :           tree rtype
   12209           37 :             = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmts[0]->stmt));
   12210           37 :           tree r_constructor = build_constructor (rtype, v);
   12211           37 :           rstmt = gimple_build_assign (lhs, r_constructor);
   12212              :         }
   12213              :     }
   12214         5916 :   else if (instance->kind == slp_inst_kind_bb_reduc)
   12215              :     {
   12216              :       /* Largely inspired by reduction chain epilogue handling in
   12217              :          vect_create_epilog_for_reduction.  */
   12218         4352 :       vec<tree> vec_defs = vNULL;
   12219         4352 :       vect_get_slp_defs (node, &vec_defs);
   12220         4352 :       enum tree_code reduc_code
   12221         4352 :         = gimple_assign_rhs_code (instance->root_stmts[0]->stmt);
   12222              :       /* ???  We actually have to reflect signs somewhere.  */
   12223         4352 :       if (reduc_code == MINUS_EXPR)
   12224            0 :         reduc_code = PLUS_EXPR;
   12225         4352 :       gimple_seq epilogue = NULL;
   12226              :       /* We may end up with more than one vector result, reduce them
   12227              :          to one vector.  */
   12228         4352 :       tree vec_def = vec_defs[0];
   12229         4352 :       tree vectype = TREE_TYPE (vec_def);
   12230         4352 :       tree compute_vectype = vectype;
   12231         4352 :       bool pun_for_overflow_p = (ANY_INTEGRAL_TYPE_P (vectype)
   12232         4152 :                                  && TYPE_OVERFLOW_UNDEFINED (vectype)
   12233         7332 :                                  && operation_can_overflow (reduc_code));
   12234         2840 :       if (pun_for_overflow_p)
   12235              :         {
   12236         2840 :           compute_vectype = unsigned_type_for (vectype);
   12237         2840 :           vec_def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
   12238              :                                   compute_vectype, vec_def);
   12239              :         }
   12240         6730 :       for (unsigned i = 1; i < vec_defs.length (); ++i)
   12241              :         {
   12242         2378 :           tree def = vec_defs[i];
   12243         2378 :           if (pun_for_overflow_p)
   12244         2273 :             def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
   12245              :                                 compute_vectype, def);
   12246         2378 :           vec_def = gimple_build (&epilogue, reduc_code, compute_vectype,
   12247              :                                   vec_def, def);
   12248              :         }
   12249         4352 :       vec_defs.release ();
   12250              :       /* ???  Support other schemes than direct internal fn.  */
   12251         4352 :       internal_fn reduc_fn;
   12252         4352 :       if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
   12253         4352 :           || reduc_fn == IFN_LAST)
   12254            0 :         gcc_unreachable ();
   12255         4352 :       tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
   12256         4352 :                                       TREE_TYPE (compute_vectype), vec_def);
   12257         4352 :       if (!SLP_INSTANCE_REMAIN_DEFS (instance).is_empty ())
   12258              :         {
   12259         2813 :           tree rem_def = NULL_TREE;
   12260        12403 :           for (auto def : SLP_INSTANCE_REMAIN_DEFS (instance))
   12261              :             {
   12262         9590 :               def = gimple_convert (&epilogue, TREE_TYPE (scalar_def), def);
   12263         9590 :               if (!rem_def)
   12264              :                 rem_def = def;
   12265              :               else
   12266         6777 :                 rem_def = gimple_build (&epilogue, reduc_code,
   12267         6777 :                                         TREE_TYPE (scalar_def),
   12268              :                                         rem_def, def);
   12269              :             }
   12270         2813 :           scalar_def = gimple_build (&epilogue, reduc_code,
   12271         2813 :                                      TREE_TYPE (scalar_def),
   12272              :                                      scalar_def, rem_def);
   12273              :         }
   12274         4352 :       scalar_def = gimple_convert (&epilogue,
   12275         4352 :                                    TREE_TYPE (vectype), scalar_def);
   12276         4352 :       gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
   12277         4352 :       gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT);
   12278         4352 :       gimple_assign_set_rhs_from_tree (&rgsi, scalar_def);
   12279         4352 :       update_stmt (gsi_stmt (rgsi));
   12280         4352 :       return;
   12281              :     }
   12282         1564 :   else if (instance->kind == slp_inst_kind_gcond)
   12283              :     {
   12284              :       /* Only support a single root for now as we can't codegen CFG yet and so we
   12285              :          can't support lane > 1 at this time.  */
   12286         1564 :       gcc_assert (instance->root_stmts.length () == 1);
   12287         1564 :       auto root_stmt_info = instance->root_stmts[0];
   12288         1564 :       auto last_stmt = STMT_VINFO_STMT (vect_orig_stmt (root_stmt_info));
   12289         1564 :       gimple_stmt_iterator rgsi = gsi_for_stmt (last_stmt);
   12290         1564 :       gcc_assert (!SLP_TREE_VEC_DEFS (node).is_empty ());
   12291         1564 :       bool res = vectorizable_early_exit (as_a <loop_vec_info> (vinfo),
   12292              :                                           root_stmt_info, &rgsi, node, NULL);
   12293         1564 :       gcc_assert (res);
   12294         1564 :       return;
   12295              :     }
   12296              :   else
   12297            0 :     gcc_unreachable ();
   12298              : 
   12299         5068 :   gcc_assert (rstmt);
   12300              : 
   12301         5068 :   gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
   12302         5068 :   gsi_replace (&rgsi, rstmt, true);
   12303              : }
   12304              : 
   12305              : struct slp_scc_info  /* Per-node state of the SCC walk in vect_schedule_scc.  */
   12306              : {
   12307              :   bool on_stack;  /* True while the node is on the DFS stack, i.e. not yet scheduled.  */
   12308              :   int dfs;  /* DFS number assigned when the node is first visited.  */
   12309              :   int lowlink;  /* Smallest DFS number found so far via children still on the stack.  */
   12310              : };
   12311              : 
   12312              : /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs.
                      :    NODE is the node visited by this call; it must not yet have an
                      :    entry in SCC_INFO.  MAXDFS is the next DFS number to hand out and
                      :    STACK holds the internal nodes visited but not yet scheduled.
                      :    Nodes that are not vect_internal_def are scheduled right away.
                      :    An internal node is scheduled once it is found to be the root of
                      :    an SCC (lowlink == dfs), together with the rest of that SCC; PHIs
                      :    scheduled this way get their arguments from internal-def children
                      :    filled in afterwards.  */
   12313              : 
   12314              : static void
   12315      1466671 : vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
   12316              :                    hash_map<slp_tree, slp_scc_info> &scc_info,
   12317              :                    int &maxdfs, vec<slp_tree> &stack)
   12318              : {
   12319      1466671 :   bool existed_p;
   12320      1466671 :   slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
   12321      1466671 :   gcc_assert (!existed_p);  /* Callers only recurse into nodes without an entry.  */
   12322      1466671 :   info->dfs = maxdfs;
   12323      1466671 :   info->lowlink = maxdfs;
   12324      1466671 :   maxdfs++;
   12325              : 
   12326              :   /* Leaf.  */
   12327      1466671 :   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
   12328              :     {
   12329       496810 :       info->on_stack = false;
   12330       496810 :       vect_schedule_slp_node (vinfo, node, instance);
   12331      1025280 :       return;
   12332              :     }
   12333              : 
   12334       969861 :   info->on_stack = true;
   12335       969861 :   stack.safe_push (node);
   12336              : 
   12337       969861 :   unsigned i;
   12338       969861 :   slp_tree child;
   12339              :   /* DFS recurse.  */
   12340      2001496 :   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
   12341              :     {
   12342      1031635 :       if (!child)
   12343        55111 :         continue;
   12344       976524 :       slp_scc_info *child_info = scc_info.get (child);
   12345       976524 :       if (!child_info)
   12346              :         {
   12347       886698 :           vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
   12348              :           /* Recursion might have re-allocated the node.  */
   12349       886698 :           info = scc_info.get (node);
   12350       886698 :           child_info = scc_info.get (child);
   12351       886698 :           info->lowlink = MIN (info->lowlink, child_info->lowlink);
   12352              :         }
   12353        89826 :       else if (child_info->on_stack)
   12354        25492 :         info->lowlink = MIN (info->lowlink, child_info->dfs);
   12355              :     }
   12356       969861 :   if (info->lowlink != info->dfs)
   12357              :     return;  /* Not an SCC root; NODE stays on STACK for the root to schedule.  */
   12358              : 
   12359       938201 :   auto_vec<slp_tree, 4> phis_to_fixup;
   12360              : 
   12361              :   /* Singleton.  */
   12362       938201 :   if (stack.last () == node)
   12363              :     {
   12364       914364 :       stack.pop ();
   12365       914364 :       info->on_stack = false;
   12366       914364 :       vect_schedule_slp_node (vinfo, node, instance);
   12367       914364 :       if (!SLP_TREE_PERMUTE_P (node)
   12368       914364 :           && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
   12369        30268 :         phis_to_fixup.quick_push (node);
   12370              :     }
   12371              :   else
   12372              :     {
   12373              :       /* SCC.  */
   12374        23837 :       int last_idx = stack.length () - 1;
   12375        55497 :       while (stack[last_idx] != node)
   12376        31660 :         last_idx--;
   12377              :       /* We can break the cycle at PHIs who have at least one child
   12378              :          code generated.  Then we could re-start the DFS walk until
   12379              :          all nodes in the SCC are covered (we might have new entries
   12380              :          for only back-reachable nodes).  But it's simpler to just
   12381              :          iterate and schedule those that are ready.  */
   12382        23837 :       unsigned todo = stack.length () - last_idx;
   12383        24164 :       do
   12384              :         {
   12385       105555 :           for (int idx = stack.length () - 1; idx >= last_idx; --idx)
   12386              :             {
   12387        57227 :               slp_tree entry = stack[idx];
   12388        57227 :               if (!entry)
   12389          934 :                 continue;  /* Slot was cleared when the entry got scheduled.  */
   12390        56293 :               bool phi = (!SLP_TREE_PERMUTE_P (entry)
   12391        56293 :                           && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
   12392        56293 :               bool ready = !phi;
   12393       142467 :               FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
   12394       111213 :                   if (!child)
   12395              :                     {
   12396        22983 :                       gcc_assert (phi);
   12397              :                       ready = true;
   12398              :                       break;
   12399              :                     }
   12400        88230 :                   else if (scc_info.get (child)->on_stack)
   12401              :                     {
   12402        24027 :                       if (!phi)
   12403              :                         {
   12404              :                           ready = false;
   12405              :                           break;
   12406              :                         }
   12407              :                     }
   12408              :                   else
   12409              :                     {
   12410        64203 :                       if (phi)
   12411              :                         {
   12412              :                           ready = true;
   12413              :                           break;
   12414              :                         }
   12415              :                     }
   12416        33310 :               if (ready)
   12417              :                 {
   12418        55497 :                   vect_schedule_slp_node (vinfo, entry, instance);
   12419        55497 :                   scc_info.get (entry)->on_stack = false;
   12420        55497 :                   stack[idx] = NULL;
   12421        55497 :                   todo--;
   12422        55497 :                   if (phi)
   12423        24273 :                     phis_to_fixup.safe_push (entry);
   12424              :                 }
   12425              :             }
   12426              :         }
   12427        24164 :       while (todo != 0);  /* Repeat until every member of the SCC is scheduled.  */
   12428              : 
   12429              :       /* Pop the SCC.  */
   12430        23837 :       stack.truncate (last_idx);
   12431              :     }
   12432              : 
   12433              :   /* Now fixup the backedge def of the vectorized PHIs in this SCC.  */
   12434              :   slp_tree phi_node;
   12435      1930943 :   FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
   12436              :     {
   12437        54541 :       gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
   12438        54541 :       edge_iterator ei;
   12439        54541 :       edge e;
   12440       171923 :       FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
   12441              :         {
   12442       117382 :           unsigned dest_idx = e->dest_idx;
   12443       117382 :           child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
   12444       117382 :           if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
   12445        66027 :             continue;
   12446        51355 :           unsigned n = SLP_TREE_VEC_DEFS (phi_node).length ();
   12447              :           /* Simply fill all args.  */
   12448        51355 :           if (STMT_VINFO_DEF_TYPE (SLP_TREE_REPRESENTATIVE (phi_node))
   12449              :               != vect_first_order_recurrence)
   12450       110347 :             for (unsigned i = 0; i < n; ++i)
   12451              :               {
   12452        59032 :                 tree phidef = SLP_TREE_VEC_DEFS (phi_node)[i];
   12453        59032 :                 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
   12454        59032 :                 add_phi_arg (phi, vect_get_slp_vect_def (child, i),
   12455              :                              e, gimple_phi_arg_location (phi, dest_idx));
   12456              :               }
   12457              :           else
   12458              :             {
   12459              :               /* Unless it is a first order recurrence which needs
   12460              :                  args filled in for both the PHI node and the permutes.  */
   12461           40 :               gimple *perm
   12462           40 :                 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[0]);
   12463           40 :               gimple *rphi = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (perm));
   12464           40 :               add_phi_arg (as_a <gphi *> (rphi),
   12465              :                            vect_get_slp_vect_def (child, n - 1),
   12466              :                            e, gimple_phi_arg_location (phi, dest_idx));
   12467          117 :               for (unsigned i = 0; i < n; ++i)
   12468              :                 {
   12469           77 :                   gimple *perm
   12470           77 :                     = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[i]);
   12471           77 :                   if (i > 0)
   12472           37 :                     gimple_assign_set_rhs1 (perm,
   12473              :                                             vect_get_slp_vect_def (child, i - 1));
   12474           77 :                   gimple_assign_set_rhs2 (perm,
   12475              :                                           vect_get_slp_vect_def (child, i));
   12476           77 :                   update_stmt (perm);
   12477              :                 }
   12478              :             }
   12479              :         }
   12480              :     }
   12481       938201 : }
   12482              : 
   12483              : /* Generate vector code for SLP_INSTANCES in the loop/basic block.
                      :    Each instance tree that does not yet have an entry in the
                      :    shared SCC info map is scheduled with vect_schedule_scc, and
                      :    instances with root stmts additionally get those vectorized.
                      :    Afterwards, for loop vectorization the scalar calls are removed,
                      :    and the scalar stores at the head of each root node's scalar
                      :    stmts are removed.  */
   12484              : 
   12485              : void
   12486       541109 : vect_schedule_slp (vec_info *vinfo, const vec<slp_instance> &slp_instances)
   12487              : {
   12488       541109 :   slp_instance instance;
   12489       541109 :   unsigned int i;
   12490              : 
   12491       541109 :   hash_map<slp_tree, slp_scc_info> scc_info;
   12492       541109 :   int maxdfs = 0;
   12493      1121187 :   FOR_EACH_VEC_ELT (slp_instances, i, instance)
   12494              :     {
   12495       580078 :       slp_tree node = SLP_INSTANCE_TREE (instance);
   12496       580078 :       if (dump_enabled_p ())
   12497              :         {
   12498        15987 :           dump_printf_loc (MSG_NOTE, vect_location,
   12499              :                            "Vectorizing SLP tree:\n");
   12500              :           /* ???  Dump all?  */
   12501        15987 :           if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
   12502          447 :             dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
   12503          447 :                          SLP_INSTANCE_ROOT_STMTS (instance)[0]->stmt);
   12504        15987 :           vect_print_slp_graph (MSG_NOTE, vect_location,
   12505              :                                 SLP_INSTANCE_TREE (instance));
   12506              :         }
   12507              :       /* Schedule the tree of INSTANCE, scheduling SCCs in a way to
   12508              :          have a PHI be the node breaking the cycle.  */
   12509       580078 :       auto_vec<slp_tree> stack;
   12510       580078 :       if (!scc_info.get (node))
   12511       579973 :         vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
   12512              : 
   12513       580078 :       if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
   12514        10984 :         vectorize_slp_instance_root_stmt (vinfo, node, instance);
   12515              : 
   12516       580078 :       if (dump_enabled_p ())
   12517        15987 :         dump_printf_loc (MSG_NOTE, vect_location,
   12518              :                          "vectorizing stmts using SLP.\n");
   12519       580078 :     }
   12520              : 
   12521      1662296 :   FOR_EACH_VEC_ELT (slp_instances, i, instance)
   12522              :     {
   12523       580078 :       slp_tree root = SLP_INSTANCE_TREE (instance);
   12524       580078 :       stmt_vec_info store_info;
   12525       580078 :       unsigned int j;
   12526              : 
   12527              :       /* Remove scalar call stmts.  Do not do this for basic-block
   12528              :          vectorization as not all uses may be vectorized.
   12529              :          ???  Why should this be necessary?  DCE should be able to
   12530              :          remove the stmts itself.
   12531              :          ???  For BB vectorization we can as well remove scalar
   12532              :          stmts starting from the SLP tree root if they have no
   12533              :          uses.  */
   12534       580078 :       if (is_a <loop_vec_info> (vinfo))
   12535        90049 :         vect_remove_slp_scalar_calls (vinfo, root);
   12536              : 
   12537              :       /* Remove vectorized stores original scalar stmts.  */
   12538      2586663 :       for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
   12539              :         {
   12540      1462898 :           if (!store_info
   12541      1462884 :               || !STMT_VINFO_DATA_REF (store_info)
   12542      1435214 :               || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
   12543              :             break;  /* Stop at the first entry that is not a store.  */
   12544              : 
   12545      1426507 :           store_info = vect_orig_stmt (store_info);
   12546              :           /* Free the attached stmt_vec_info and remove the stmt.  */
   12547      1426507 :           vinfo->remove_stmt (store_info);
   12548              : 
   12549              :           /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it
   12550              :              to not crash in vect_free_slp_tree later.  */
   12551      1426507 :           if (SLP_TREE_REPRESENTATIVE (root) == store_info)
   12552       543388 :             SLP_TREE_REPRESENTATIVE (root) = NULL;
   12553              :         }
   12554              :     }
   12555       541109 : }
        

Generated by: LCOV version 2.4-beta

The LCOV profile was generated on an x86_64 machine using the following configure options: configure --disable-bootstrap --enable-coverage=opt --enable-languages=c,c++,fortran,go,jit,lto,rust,m2 --enable-host-shared. The GCC test suite was run with the built compiler.